Wednesday, March 23, 2011

http://www.ats.ucla.edu/stat/R/notes/managing.htm

> hs1 <- read.table("http://www.ats.ucla.edu/stat/R/notes/hs1.csv", header=T, sep=",")
> # Keeping only the observations where the reading score is 60 or higher.
> hs1.read.well <- hs1[hs1$read >= 60, ]
> # Comparing means of read in the original hs1 data frame and the new smaller hs1.read.well data frame.
> mean(hs1.read.well$read)
[1] 65.48214
> mean(hs1$read)
[1] 52.23
> # Keeping only the variables read and write from the hs1 data frame.
> hs2<-hs1[, c("read", "write")]
> # another way of doing the same thing
> hs3<-hs1[, c(7, 8)]
> names(hs3)
[1] "read"  "write"
> # Dropping the variables read and write from the hs1 data frame by using the column indices corresponding to these two variables with a negative sign.
> hs2.drop<-hs1[, -c(7, 8)]
> names(hs2.drop)
 [1] "female"  "id"      "race"    "ses"     "schtyp"  "prgtype" "math"    "science" "socst"   "prog"   
> # We will split hs1 into two data sets, one for females and one for males. We then put them back together with rbind.
> attach(hs1)
> hsfemale<-hs1[female==1, ]
> hsmale<-hs1[female==0, ]
> dim(hsfemale)
[1] 109  12
> dim(hsmale)
[1] 91 12
> hs.all<-rbind(hsfemale, hsmale)
> dim(hs.all)
[1] 200  12
> dim(hs1)
[1] 200  12
> # We will create two data sets from hs1, one contains demographic variables and the other one contains test scores. We then merge the two data sets by the id variable.
> hs.demo<-hs1[, c("id", "ses", "female", "race")]
> hs.scores<-hs1[, c("id", "read", "write", "math", "science")]
> dim(hs.demo)
[1] 200   4
> dim(hs.scores)
[1] 200   5
> hs.merge <- merge(hs.demo, hs.scores, by="id", all=T)
> head(hs.merge)
  id ses female race read write math science
1  1   1      1    1   34    44   40      39
2  2   2      1    1   39    41   33      42
3  3   1      0    1   63    65   48      63
4  4   1      1    1   44    50   41      39
5  5   1      0    1   47    40   43      NA
6  6   1      1    1   47    41   46      40
> dim(hs.merge)
[1] 200   8
> hs.merge1 <- merge(hs.demo, hs.scores, by.x="id", by.y="id", all=T)
> dim(hs.merge1)
[1] 200   8
> head(hs.merge1)
  id ses female race read write math science
1  1   1      1    1   34    44   40      39
2  2   2      1    1   39    41   33      42
3  3   1      0    1   63    65   48      63
4  4   1      1    1   44    50   41      39
5  5   1      0    1   47    40   43      NA
6  6   1      1    1   47    41   46      40
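> # test: what happens when the two data sets are merged on unrelated key columns (id in hs.demo1 vs. read in hs.scores1)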
> hs.demo1 <-hs1[, c("id", "ses", "female", "race")]
> hs.scores1 <-hs1[, c("read", "write", "math", "science")]
> dim(hs.demo1)
[1] 200   4
> dim(hs.scores1)
[1] 200   4
> hs.merge2 <- merge(hs.demo1, hs.scores1, by.x="id", by.y="read", all=T)
> dim(hs.merge2)
[1] 370   7
> head(hs.merge2)
  id ses female race write math science
1  1   1      1    1    NA   NA      NA
2  2   2      1    1    NA   NA      NA
3  3   1      0    1    NA   NA      NA
4  4   1      1    1    NA   NA      NA
5  5   1      0    1    NA   NA      NA
6  6   1      1    1    NA   NA      NA



http://www.ats.ucla.edu/stat/R/notes/modifying.htm

> # reading in data
> hs0 <- read.table("http://www.ats.ucla.edu/stat/R/notes/hs0.csv", header=T, sep=",")
> # commenting the data set 
> comment(hs0)<-"High school and beyond data"
> # checking
> comment(hs0)
[1] "High school and beyond data"
> # variable labels using comment
> comment(hs0$write)<-"writing score"
> comment(hs0$read) <-"reading score"
> # more checking to make sure that our comments stay with the data frame
> save(hs0,file="hs0.rda") 
> rm(list=ls())
> load(file="hs0.rda")
> comment(hs0)
[1] "High school and beyond data"
> comment(hs0$write)
[1] "writing score"
> search()
[1] ".GlobalEnv"        "package:stats"     "package:graphics"  "package:grDevices" "package:utils"     "package:datasets"  "package:methods"  
[8] "Autoloads"         "package:base"     
> attach(hs0)
> search()
 [1] ".GlobalEnv"        "hs0"               "package:stats"     "package:graphics"  "package:grDevices" "package:utils"     "package:datasets" 
 [8] "package:methods"   "Autoloads"         "package:base"     

> # use the sapply function with the is.factor function to check if any of the variables in the hs0 data frame are factor variables.
> sapply(hs0, is.factor)
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
  FALSE   FALSE   FALSE   FALSE   FALSE    TRUE   FALSE   FALSE   FALSE   FALSE   FALSE 
> # Creating a factor (categorical) variable called schtyp.f for schtyp and a factor variable female for gender with value labels.
> schtyp.f <- factor(schtyp, levels=c(1, 2), labels=c("public", "private"))
> female <- factor(gender, levels=c(0, 1), labels=c("male", "female")) 
> table(schtyp.f)
schtyp.f
 public private 
    168      32 
> table(female)
female
  male female 
    91    109 


> # 4.0 Recoding variables and generating new variables
> table(hs0$race)
  1   2   3   4   5 
 24  11  20 143   2 
> hs0$race[hs0$race==5] <-NA
> table(hs0$race)
  1   2   3   4 
 24  11  20 143 
> # displaying the missing data as well
> table(hs0$race, useNA="ifany")
   1    2    3    4 <NA> 
  24   11   20  143    2 
> # Creating a variable called total=read+write+math+science
> total<-read+write+math+science
> # noticing the missing values generated
> summary(total)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  139.0   180.0   210.0   208.8   233.5   274.0     5.0 
> # Creating a variable called grade based on total.
> # initializing a variable
> grade<-0
> grade[total <=140]<-0
> grade[total > 140 & total <= 180] <-1
> grade[total > 180 & total <= 210] <-2
> grade[total > 210 & total <= 234] <-3
> grade[total > 234] <-4
> comment(grade)<-"combined grades of read, write, math, science"
> grade<-factor(grade, levels=c(0, 1, 2, 3, 4), labels=c("F", "D", "C", "B", "A"))
> table(grade)
grade
 F  D  C  B  A 
 1 50 50 47 47 
> # Creating mean scores in two ways - working with missing values differently.
> m1<-(read+write+math+science)/4
> m2<-rowMeans(cbind(read, write, math, science))
> m2<-rowMeans(cbind(read, write, math, science), na.rm=T)
> # At this point, we might want to combine the new variables we have created with the original data set. We can use the cbind function for this.
> hs1<-cbind(hs0, cbind(schtyp.f, female, total, grade))
> table(hs1$race)
  1   2   3   4 
 24  11  20 143 
> is.data.frame(hs1)
[1] TRUE



http://www.ats.ucla.edu/stat/R/notes/exploring.htm

> hs0 <- read.table("http://www.ats.ucla.edu/stat/R/notes/hs0.csv", header=T, sep=",")
> hs0[1:5, ]
   gender  id race ses schtyp  prgtype read write math science socst
1       0  70    4   1      1  general   57    52   41      47    57
2       1 121    4   2      1   vocati   68    59   53      63    61
3       0  86    4   3      1  general   44    33   54      58    31
4       0 141    4   3      1   vocati   63    44   47      53    56
5       0 172    4   2      1 academic   47    52   57      53    61


> names(hs0)
 [1] "gender"  "id"      "race"    "ses"     "schtyp"  "prgtype" "read"    "write"   "math"    "science" "socst"  
> # shorthand way of referring to read, write, math, science 
> read.sci <- hs0[ , 7:10]  
> # checking the type of object 
> class(read.sci)
[1] "data.frame"
> # listing the first 10 observations
> head(read.sci, n=10)
   read write math science
1    57    52   41      47
2    68    59   53      63
3    44    33   54      58
4    63    44   47      53
5    47    52   57      53
6    44    52   51      63
7    50    59   42      53
8    34    46   45      39
9    63    57   54      NA
10   57    55   52      50
> # displaying the dimensions 
> dim(read.sci)
[1] 200   4
> length(read.sci)
[1] 4
> length(read.sci$read)
[1] 200 
> summary(read.sci)
      read           write            math          science     
 Min.   :28.00   Min.   :31.00   Min.   :33.00   Min.   :26.00  
 1st Qu.:44.00   1st Qu.:45.75   1st Qu.:45.00   1st Qu.:44.00  
 Median :50.00   Median :54.00   Median :52.00   Median :53.00  
 Mean   :52.23   Mean   :52.77   Mean   :52.65   Mean   :51.66  
 3rd Qu.:60.00   3rd Qu.:60.00   3rd Qu.:59.00   3rd Qu.:58.00  
 Max.   :76.00   Max.   :67.00   Max.   :75.00   Max.   :74.00  
                                                 NA's   : 5.00  
> range(read.sci$write)
[1] 31 67
> range(read.sci$science)
[1] NA NA
> range(read.sci$science, na.rm=T)
[1] 26 74
> # the minimum and the maximum among all the variables
> range(read.sci, na.rm=T)
[1] 26 76
> mean(read.sci)
   read   write    math science 
 52.230  52.775  52.645      NA 
> mean(read.sci, na.rm=T)
    read    write     math  science 
52.23000 52.77500 52.64500 51.66154 
> sd(read.sci, na.rm=T)
     read     write      math   science 
10.252937  9.478586  9.368448  9.866026 
> table(hs0$prgtype)


academic  general   vocati 
     105       45       50 
> by(hs0, hs0$prgtype, mean)
hs0$prgtype: academic
    gender         id       race        ses     schtyp    prgtype       read      write       math    science      socst 
  0.552381 107.628571   3.495238   2.219048   1.228571         NA  56.161905  56.257143  56.733333         NA  56.695238 
-----------------------------------------------------------------
hs0$prgtype: general
    gender         id       race        ses     schtyp    prgtype       read      write       math    science      socst 
 0.5333333 97.3111111  3.4666667  1.8444444  1.1333333         NA 49.7555556 51.3333333 50.0222222         NA 50.6000000 
----------------------------------------------------------------- 
hs0$prgtype: vocati
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.54   88.40    3.30    1.90    1.04      NA   46.20   46.76   46.42   47.22   45.02 
Warning messages:
1: In mean.default(X[[6L]], ...) :
  argument is not numeric or logical: returning NA
2: In mean.default(X[[6L]], ...) :
  argument is not numeric or logical: returning NA
3: In mean.default(X[[6L]], ...) :
  argument is not numeric or logical: returning NA
> by(hs0, hs0$prgtype, sd)
hs0$prgtype: academic
    gender         id       race        ses     schtyp    prgtype       read      write       math    science      socst 
 0.4996336 61.0042964  1.0107481  0.7335498  0.4219265         NA  9.5887793  7.9433433  8.7302157         NA  9.1736703 
---------------------------------------------------------------------------------------------------------------------------------------------- 
hs0$prgtype: general
    gender         id       race        ses     schtyp    prgtype       read      write       math    science      socst 
 0.5045250 52.5793004  1.0135446  0.7371800  0.3437758         NA  9.2347062  9.3977754  7.4421676         NA  9.3088423 
---------------------------------------------------------------------------------------------------------------------------------------------- 
hs0$prgtype: vocati
    gender         id       race        ses     schtyp    prgtype       read      write       math    science      socst 
 0.5034574 54.3713610  1.1649647  0.6144518  0.1979487         NA  8.9076899  9.3187544  7.9541800 10.3337963 10.6569705 
Warning messages:
1: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
2: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
3: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
> # attaching hs0, so its variables will be searchable by R
> attach(hs0)
> getOption("digits")
[1] 7
> options(digits=2)
> by(hs0, prgtype, mean, na.rm=T)
prgtype: academic
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.55  107.63    3.50    2.22    1.23      NA   56.16   56.26   56.73   53.62   56.70 
-----------------------------------------------------------------
prgtype: general
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.53   97.31    3.47    1.84    1.13      NA   49.76   51.33   50.02   52.19   50.60 
-----------------------------------------------------------------
prgtype: vocati
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.54   88.40    3.30    1.90    1.04      NA   46.20   46.76   46.42   47.22   45.02 
Warning messages:
1: In mean.default(X[[6L]], ...) :
  argument is not numeric or logical: returning NA
2: In mean.default(X[[6L]], ...) :
  argument is not numeric or logical: returning NA
3: In mean.default(X[[6L]], ...) :
  argument is not numeric or logical: returning NA
> by(hs0, prgtype, sd, na.rm=T)
prgtype: academic
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.50   61.00    1.01    0.73    0.42      NA    9.59    7.94    8.73    9.01    9.17 
-----------------------------------------------------------------
prgtype: general
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.50   52.58    1.01    0.74    0.34      NA    9.23    9.40    7.44    9.83    9.31 
-----------------------------------------------------------------
prgtype: vocati
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.50   54.37    1.16    0.61    0.20      NA    8.91    9.32    7.95   10.33   10.66 
Warning messages:
1: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
2: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
3: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
> m   <- tapply(write, prgtype, mean)
> v   <- tapply(write, prgtype, var)
> med <- tapply(write, prgtype, median)
> n   <- tapply(write, prgtype, length)
> sd  <- tapply(write, prgtype, sd)
> cbind(mean=m, var=v, std.dev=sd, median=med, n=n)
         mean var std.dev median   n
academic   56  63     7.9     59 105
general    51  88     9.4     54  45
vocati     47  87     9.3     46  50
> # set the number of digits to 7
> options(digits=7)

> hist(write)
> # load trellis graphics
> library(lattice) 
> # trellis graphs 
> histogram(~write, hs0, type="count")






> # histogram of write by gender
> histogram(~write | gender, hs0, type="count")






> # change the number of bins to 15
> hist(write, breaks=15)





> # boxplot function in the graphics package
> boxplot(write)






> #trellis graphs
> bwplot(ses~ write, hs0)






> # boxplot by gender
> bwplot(ses~ write| gender, hs0)






> barplot(table(ses, gender), legend=c("low", "medium", "high"))






> barplot(table(ses, gender), beside=T, legend=c("low", "medium", "high"), ylim=c(0, 50))





> # changing the location of legend and adding a title, etc
> barplot(table(ses, gender), beside=T, legend.text=c("low", "medium", "high"), ylim=c(0, 50), space=c(.1, 1),
+ col=c("lightblue", "blue", "dark blue"), names.arg=c("male", "female"),
+ main="Distribution of SES by gender", args.legend=list(x =9, y=45, cex=.6))






> table(ses)
ses
 1  2  3 
47 95 58 
> tab1<-table(gender, ses)
> # row proportions
> prop.table(tab1,1) 
      ses
gender         1         2         3
     0 0.1648352 0.5164835 0.3186813
     1 0.2935780 0.4403670 0.2660550
> # row frequencies
> rowSums(tab1)  
  0   1 
 91 109 
> # column frequencies
> colSums(tab1) 
 1  2  3 
47 95 58 
> # correlation of a pair of variables
> cor(write, math)
[1] 0.6174493
> cor(write, science)
[1] NA
> cor(write, science, use="complete.obs")
[1] 0.5671298
> # correlation matrix
> cor(read.sci, use="complete.obs")
             read     write      math   science
read    1.0000000 0.5959677 0.6492202 0.6170562
write   0.5959677 1.0000000 0.6203022 0.5671298
math    0.6492202 0.6203022 1.0000000 0.6166288
science 0.6170562 0.5671298 0.6166288 1.0000000
> cor(read.sci, use="pairwise.complete.obs")
             read     write      math   science
read    1.0000000 0.5967765 0.6622801 0.6170562
write   0.5967765 1.0000000 0.6174493 0.5671298
math    0.6622801 0.6174493 1.0000000 0.6166288
science 0.6170562 0.5671298 0.6166288 1.0000000
> plot(math, write)






> # scatter plot matrix
> plot(read.sci)






> # Unless you are going to continue working with the hs0 data frame it is generally a good idea to detach all attached data frames.
> detach()





http://www.ats.ucla.edu/stat/R/notes/entering.htm

> data1 <- read.table("http://www.ats.ucla.edu/stat/R/notes/hs0.csv", header=T, sep=",")
> names(data1)
 [1] "gender"  "id"      "race"    "ses"     "schtyp"  "prgtype" "read"    "write"   "math"    "science" "socst"
> head(data1)
  gender  id race ses schtyp  prgtype read write math science socst
1      0  70    4   1      1  general   57    52   41      47    57
2      1 121    4   2      1   vocati   68    59   53      63    61
3      0  86    4   3      1  general   44    33   54      58    31
4      0 141    4   3      1   vocati   63    44   47      53    56
5      0 172    4   2      1 academic   47    52   57      53    61
6      0 113    4   2      1 academic   44    52   51      63    61
> dim(data1)
[1] 200  11
> # saves the data1 object to an R data file (data1.rda)
> save(data1,file="data1.rda")
> # checking to see if data1.rda has been created
> dir()
 [1] "asg1-40"                           "data1.rda"                         "desktop.ini"                       "FMS_data.txt"                      "FMS_data_edited.dat"            
 [6] "FMS_data_edited.sav"               "HGDP_AKT1.txt"                     "hs0.rda"                           "JHS_NPHP1_single_sheet_wy0111.dat" "MolecularWeight_tair7.xls"      
[11] "My Music"                          "My Pictures"                       "My Videos"                         "R"                                 "rtn.txt"                        
[16] "rtn.xlsx"                          "SafeNet Sentinel"                  "SPSSInc"                           "TargetP_analysis_tair7.xls"        "test1230.txt"                    
[21] "Virco_data.csv"                  
> # list the objects currently in the workspace (not the .rda files on disk)
> ls() 
[1] "data1" "hs0"
> # clear everything out of memory
> rm(list=ls()) 
> ls()
character(0)
> # load the R data into memory
> load("data1.rda")
> tail(data1)
    gender  id race ses schtyp  prgtype read write math science socst
195      1 179    4   2      2 academic   47    65   60      50    56
196      1  31    2   2      2  general   55    59   52      42    56
197      1 145    4   2      1   vocati   42    46   38      36    46
198      1 187    4   2      2  general   57    41   57      55    52
199      1 118    4   2      1  general   55    62   58      58    61
200      1 137    4   3      1 academic   63    65   65      53    61
>
> rm(list=ls())  # clear everything out of memory
> ls()
character(0)


> temp <- read.table("http://www.ats.ucla.edu/stat/R/notes/hs0_1.csv", header=F, sep=",")
> temp[9,]
  V1 V2 V3 V4 V5      V6 V7 V8 V9 V10 V11
9  0 84  4  2  1 general 63 57 54  NA  51
> names(temp) <- c("gender","id","race","ses","schtyp","prgtype","read","write","math","science","socst")
> # list observations 5 through 10 to check the data
> temp[5:10, ]
   gender  id race ses schtyp  prgtype read write math science socst
5       0 172    4   2      1 academic   47    52   57      53    61
6       0 113    4   2      1 academic   44    52   51      63    61
7       0  50    3   2      1  general   50    59   42      53    61
8       0  11    1   2      1 academic   34    46   45      39    36
9       0  84    4   2      1  general   63    57   54      NA    51
10      0  48    3   2      1 academic   57    55   52      50    51
>