Manejo de datos con R
Índice
1 Lectura de datos
1.1 setwd, getwd, dir
En setwd hay que especificar el directorio que contiene el repositorio.
getwd() old <- setwd("~/github/intro") dir()
dir(pattern='.R')
[1] "ClasesMetodos.R" "datos.R" "estadistica.R" [4] "factorDateCharacter.R" "Funciones.R" "graficos.R" [7] "intro.R" "raster.R" "zoo.R"
dir('data')
[1] "aranjuez.csv" "aranjuez.RData" "CO2_GNI_BM.csv" [4] "El.Arenosillo.txt" "InformeDatos.zip" "NREL-Hawaii.csv" [7] "radiacion_datas.csv" "SIAR.csv" "SISmm2008_CMSAF.zip"
1.2 Lectura de datos con read.table
dats <- read.table('data/aranjuez.csv', sep=',', header=TRUE) head(dats)
X TempAvg TempMax TempMin HumidAvg HumidMax WindAvg WindMax Rain
1 2004-01-01 4.044 10.71 -1.969 88.3 95.9 0.746 3.528 0
2 2004-01-02 5.777 11.52 1.247 83.3 98.5 1.078 6.880 0
3 2004-01-03 5.850 13.32 0.377 75.0 94.4 0.979 6.576 0
4 2004-01-04 4.408 15.59 -2.576 82.0 97.0 0.633 3.704 0
5 2004-01-05 3.081 14.58 -2.974 83.2 97.0 0.389 2.244 0
6 2004-01-06 2.304 11.83 -3.379 84.5 96.5 0.436 2.136 0
Radiation ET
1 5.490 0.5352688
2 6.537 0.7710499
3 8.810 0.8361229
4 9.790 0.6861381
5 10.300 0.5152422
6 9.940 0.4886631
1.3 Lectura de datos con read.csv
aranjuez <- read.csv('data/aranjuez.csv') names(aranjuez)[1] <- 'Date'
class(aranjuez)
[1] "data.frame"
names(aranjuez)
[1] "Date" "TempAvg" "TempMax" "TempMin" "HumidAvg" "HumidMax" [7] "WindAvg" "WindMax" "Rain" "Radiation" "ET"
1.4 Inspeccionamos el resultado
head(aranjuez)
Date TempAvg TempMax TempMin HumidAvg HumidMax WindAvg WindMax Rain
1 2004-01-01 4.044 10.71 -1.969 88.3 95.9 0.746 3.528 0
2 2004-01-02 5.777 11.52 1.247 83.3 98.5 1.078 6.880 0
3 2004-01-03 5.850 13.32 0.377 75.0 94.4 0.979 6.576 0
4 2004-01-04 4.408 15.59 -2.576 82.0 97.0 0.633 3.704 0
5 2004-01-05 3.081 14.58 -2.974 83.2 97.0 0.389 2.244 0
6 2004-01-06 2.304 11.83 -3.379 84.5 96.5 0.436 2.136 0
Radiation ET
1 5.490 0.5352688
2 6.537 0.7710499
3 8.810 0.8361229
4 9.790 0.6861381
5 10.300 0.5152422
6 9.940 0.4886631
tail(aranjuez)
Date TempAvg TempMax TempMin HumidAvg HumidMax WindAvg WindMax Rain
2893 2011-12-26 3.366 13.88 -3.397 81.5 100 0.556 3.263 0.000
2894 2011-12-27 2.222 13.33 -4.005 87.0 100 0.369 1.842 0.000
2895 2011-12-28 1.810 12.33 -4.682 85.0 100 0.540 3.401 0.203
2896 2011-12-29 2.512 11.92 -4.682 77.2 100 0.546 4.420 0.203
2897 2011-12-30 1.006 11.05 -5.822 79.7 100 0.446 2.832 0.000
2898 2011-12-31 2.263 12.67 -3.938 80.3 100 0.270 1.950 0.000
Radiation ET
2893 9.44 0.5358751
2894 9.52 0.4386931
2895 9.59 0.5183545
2896 9.72 0.5428373
2897 9.74 0.4614953
2898 8.11 0.4246270
1.5 Inspeccionamos el resultado
summary(aranjuez)
Date TempAvg TempMax TempMin
2004-01-01: 1 Min. :-5.309 Min. :-2.362 Min. :-12.980
2004-01-02: 1 1st Qu.: 7.692 1st Qu.:14.530 1st Qu.: 1.515
2004-01-03: 1 Median :13.810 Median :21.670 Median : 7.170
2004-01-04: 1 Mean :14.405 Mean :22.531 Mean : 6.888
2004-01-05: 1 3rd Qu.:21.615 3rd Qu.:30.875 3rd Qu.: 12.590
2004-01-06: 1 Max. :30.680 Max. :41.910 Max. : 22.710
(Other) :2892 NA's :4
HumidAvg HumidMax WindAvg WindMax
Min. : 19.89 Min. : 35.88 Min. :0.251 Min. : 0.000
1st Qu.: 47.04 1st Qu.: 81.60 1st Qu.:0.667 1st Qu.: 3.783
Median : 62.58 Median : 90.90 Median :0.920 Median : 5.027
Mean : 62.16 Mean : 87.22 Mean :1.174 Mean : 5.208
3rd Qu.: 77.38 3rd Qu.: 94.90 3rd Qu.:1.431 3rd Qu.: 6.537
Max. :100.00 Max. :100.00 Max. :8.260 Max. :10.000
NA's :13 NA's :8 NA's :128
Rain Radiation ET
Min. : 0.000 Min. : 0.277 Min. :0.000
1st Qu.: 0.000 1st Qu.: 9.370 1st Qu.:1.168
Median : 0.000 Median :16.660 Median :2.758
Mean : 1.094 Mean :16.742 Mean :3.091
3rd Qu.: 0.200 3rd Qu.:24.650 3rd Qu.:4.926
Max. :49.730 Max. :32.740 Max. :8.564
NA's :4 NA's :13 NA's :18
1.6 Valores ausentes
NAestá definido comological
class(NA)
[1] "logical"
- Operar con
NAsiempre produce unNA
1 + NA
[1] NA
- Esto es un «problema» al usar funciones
mean(aranjuez$Radiation)
[1] NA
mean(aranjuez$Radiation, na.rm = TRUE)
[1] 16.74176
1.7 Valores ausentes
Las funciones is.na y anyNA los identifican
anyNA(aranjuez)
[1] TRUE
which(is.na(aranjuez$Radiation))
[1] 1861 1867 1873 1896 1897 1908 1923 2153 2413 2587 2600 2603 2684
sum(is.na(aranjuez$Radiation))
[1] 13
2 Indexado
2.1 Indexado con []
- Filas
aranjuez[1:5,]
Date TempAvg TempMax TempMin HumidAvg HumidMax WindAvg WindMax Rain
1 2004-01-01 4.044 10.71 -1.969 88.3 95.9 0.746 3.528 0
2 2004-01-02 5.777 11.52 1.247 83.3 98.5 1.078 6.880 0
3 2004-01-03 5.850 13.32 0.377 75.0 94.4 0.979 6.576 0
4 2004-01-04 4.408 15.59 -2.576 82.0 97.0 0.633 3.704 0
5 2004-01-05 3.081 14.58 -2.974 83.2 97.0 0.389 2.244 0
Radiation ET
1 5.490 0.5352688
2 6.537 0.7710499
3 8.810 0.8361229
4 9.790 0.6861381
5 10.300 0.5152422
- Filas y Columnas
aranjuez[10:14, 1:5]
Date TempAvg TempMax TempMin HumidAvg
10 2004-01-10 10.85 16.59 5.676 84.9
11 2004-01-11 7.59 9.23 4.806 95.4
12 2004-01-12 7.41 10.24 5.200 93.1
13 2004-01-13 8.35 11.38 4.137 91.3
14 2004-01-14 8.74 13.32 2.857 86.9
2.2 Indexado con []
- Condición basada en los datos
idx <- with(aranjuez, Radiation > 20 & TempAvg < 10)
head(aranjuez[idx, ])
Date TempAvg TempMax TempMin HumidAvg HumidMax WindAvg WindMax Rain
82 2004-03-22 9.78 16.12 4.340 51.65 87.9 1.526 7.660 0
83 2004-03-23 8.50 15.52 -0.290 50.10 83.3 1.533 6.027 0
85 2004-03-25 7.47 14.58 1.584 49.66 76.6 1.138 5.939 0
100 2004-04-09 8.83 15.52 2.056 47.50 70.8 1.547 6.125 0
101 2004-04-10 7.04 13.85 -0.155 54.45 85.8 1.448 6.958 0
102 2004-04-11 7.50 15.19 -1.699 54.98 91.0 1.126 7.590 0
Radiation ET
82 21.92 3.075785
83 20.62 2.881419
85 22.44 2.849603
100 25.45 3.566452
101 21.07 2.943239
102 20.99 2.905479
2.3 subset
subset(aranjuez,
subset = (Radiation > 20 & TempAvg < 10),
select = c(Radiation, TempAvg,
TempMax, TempMin))
Radiation TempAvg TempMax TempMin
82 21.92 9.780 16.12 4.340
83 20.62 8.500 15.52 -0.290
85 22.44 7.470 14.58 1.584
100 25.45 8.830 15.52 2.056
101 21.07 7.040 13.85 -0.155
102 20.99 7.500 15.19 -1.699
104 25.76 9.420 17.47 0.115
461 24.29 7.460 14.66 -0.081
462 25.25 7.930 17.35 -1.686
463 24.56 9.800 19.08 -1.484
1146 20.08 7.170 18.20 -3.746
1157 20.90 4.378 12.03 -6.353
1159 21.87 7.920 18.54 -2.941
1160 20.35 7.830 16.49 -2.807
1521 21.54 8.100 19.29 -4.075
2244 20.49 6.121 15.15 -0.940
2245 21.02 5.989 16.94 -3.208
2246 20.22 9.020 19.74 -2.068
2261 23.00 9.500 14.96 3.662
2262 20.40 9.910 14.70 4.668
2263 24.09 9.440 16.89 0.794
2265 23.64 9.680 16.35 2.938
2295 22.46 8.730 13.84 1.740
3 Cambio de formato
3.1 Forma simple con stack
aranjuezWide <- aranjuez[, c('Radiation', 'TempAvg', 'TempMax', 'WindAvg', 'WindMax')]
- Pasamos de formato
widealong
aranjuezLong <- stack(aranjuezWide)
head(aranjuezLong)
values ind 1 5.490 Radiation 2 6.537 Radiation 3 8.810 Radiation 4 9.790 Radiation 5 10.300 Radiation 6 9.940 Radiation
summary(aranjuezLong)
values ind
Min. :-5.309 Radiation:2898
1st Qu.: 3.158 TempAvg :2898
Median : 8.720 TempMax :2898
Mean :12.074 WindAvg :2898
3rd Qu.:19.970 WindMax :2898
Max. :41.910
NA's :149
3.2 Más flexible con reshape2
reshape2es un paquete que puede facilitar la transformación dedata.framey matrices.
library(reshape2)
3.3 melt para cambiar de wide a long
aranjuezLong2 <- melt(aranjuez, id.vars = 'Date', variable.name = 'Variable', value.name = 'Value') head(aranjuezLong2)
Date Variable Value
1 2004-01-01 TempAvg 4.044
2 2004-01-02 TempAvg 5.777
3 2004-01-03 TempAvg 5.850
4 2004-01-04 TempAvg 4.408
5 2004-01-05 TempAvg 3.081
6 2004-01-06 TempAvg 2.304
3.4 dcast para cambiar de long a wide
aranjuezWide2 <- dcast(aranjuezLong2,
Variable ~ Date)
head(aranjuezWide2[, 1:10])
Using Value as value column: use value.var to override. Variable 2004-01-01 2004-01-02 2004-01-03 2004-01-04 2004-01-05 2004-01-06 1 TempAvg 4.044 5.777 5.850 4.408 3.081 2.304 2 TempMax 10.710 11.520 13.320 15.590 14.580 11.830 3 TempMin -1.969 1.247 0.377 -2.576 -2.974 -3.379 4 HumidAvg 88.300 83.300 75.000 82.000 83.200 84.500 5 HumidMax 95.900 98.500 94.400 97.000 97.000 96.500 6 WindAvg 0.746 1.078 0.979 0.633 0.389 0.436 2004-01-07 2004-01-08 2004-01-09 1 2.080 6.405 12.060 2 11.500 13.380 15.330 3 -3.109 -1.301 9.830 4 87.000 88.600 86.800 5 96.600 97.400 94.500 6 0.449 1.188 2.737
4 Datos agregados
4.1 aggregate
aranjuez$rainy <- aranjuez$Rain > 0
aggregate(Radiation ~ rainy, data = aranjuez,
FUN = mean)
rainy Radiation 1 FALSE 19.63325 2 TRUE 10.26028
4.2 Variable categórica con cut
aranjuez$tempClass <- cut(aranjuez$TempAvg, 5)
aggregate(Radiation ~ tempClass, data = aranjuez,
FUN = mean)
tempClass Radiation
1 (-5.34,1.89] 8.805389
2 (1.89,9.09] 9.014178
3 (9.09,16.3] 14.554177
4 (16.3,23.5] 21.912414
5 (23.5,30.7] 26.192742
aggregate(Radiation ~ tempClass + rainy,
data = aranjuez, FUN = mean)
tempClass rainy Radiation
1 (-5.34,1.89] FALSE 9.869134
2 (1.89,9.09] FALSE 10.718837
3 (9.09,16.3] FALSE 17.238283
4 (16.3,23.5] FALSE 23.238145
5 (23.5,30.7] FALSE 26.392665
6 (-5.34,1.89] TRUE 6.822955
7 (1.89,9.09] TRUE 7.063932
8 (9.09,16.3] TRUE 11.091063
9 (16.3,23.5] TRUE 15.802522
10 (23.5,30.7] TRUE 22.545862
4.3 Agregamos varias variables
aggregate(cbind(Radiation, TempAvg) ~ tempClass,
data = aranjuez, FUN = mean)
tempClass Radiation TempAvg
1 (-5.34,1.89] 8.805389 0.3423095
2 (1.89,9.09] 9.014178 5.6663267
3 (9.09,16.3] 14.554177 12.5219084
4 (16.3,23.5] 21.912414 19.7486310
5 (23.5,30.7] 26.192742 26.0496953
aggregate(cbind(Radiation, TempAvg) ~ tempClass + rainy,
data = aranjuez, FUN = mean)
tempClass rainy Radiation TempAvg
1 (-5.34,1.89] FALSE 9.869134 0.3550122
2 (1.89,9.09] FALSE 10.718837 5.6657481
3 (9.09,16.3] FALSE 17.238283 12.6959488
4 (16.3,23.5] FALSE 23.238145 19.9486604
5 (23.5,30.7] FALSE 26.392665 26.0896408
6 (-5.34,1.89] TRUE 6.822955 0.3186364
7 (1.89,9.09] TRUE 7.063932 5.6669887
8 (9.09,16.3] TRUE 11.091063 12.2973563
9 (16.3,23.5] TRUE 15.802522 18.8267565
10 (23.5,30.7] TRUE 22.545862 25.3210345
4.4 Agregamos a partir de un formato long
head(aranjuezLong2)
Date Variable Value
1 2004-01-01 TempAvg 4.044
2 2004-01-02 TempAvg 5.777
3 2004-01-03 TempAvg 5.850
4 2004-01-04 TempAvg 4.408
5 2004-01-05 TempAvg 3.081
6 2004-01-06 TempAvg 2.304
aggregate(Value ~ Variable, data = aranjuezLong2,
FUN = mean)
Variable Value
1 TempAvg 14.404856
2 TempMax 22.531033
3 TempMin 6.887590
4 HumidAvg 62.160390
5 HumidMax 87.216887
6 WindAvg 1.173983
7 WindMax 5.208021
8 Rain 1.094255
9 Radiation 16.741759
10 ET 3.091191
5 Unión de data.frame
5.1 Con merge
- Primero construimos un
data.framede ejemplo
USStates <- as.data.frame(state.x77) USStates$Name <- rownames(USStates) rownames(USStates) <- NULL
- Lo partimos en estados «fríos» y estados «grandes»
coldStates <- USStates[USStates$Frost>150, c('Name', 'Frost')] largeStates <- USStates[USStates$Area>1e5, c('Name', 'Area')]
5.2 Con merge
- Unimos los dos conjuntos (estados «fríos» y «grandes»)
merge(coldStates, largeStates)
Name Frost Area
1 Alaska 152 566432
2 Colorado 166 103766
3 Montana 155 145587
4 Nevada 188 109889
5.3 merge usa match
- Estados grandes que también son fríos
idxLarge <- match(largeStates$Name,
coldStates$Name,
nomatch=0)
idxLarge
[1] 1 0 0 2 5 6 0 0
coldStates[idxLarge,]
Name Frost
2 Alaska 152
6 Colorado 166
26 Montana 155
28 Nevada 188
5.4 merge usa match
- Estados frios que también son grandes
idxCold <- match(coldStates$Name,
largeStates$Name,
nomatch=0)
idxCold
[1] 1 4 0 0 5 6 0 0 0 0 0
largeStates[idxCold,]
Name Area
2 Alaska 566432
6 Colorado 103766
26 Montana 145587
28 Nevada 109889