Wednesday, March 23, 2011

http://www.ats.ucla.edu/stat/R/notes/modifying.htm

> # Reading in the High School and Beyond data.
> # stringsAsFactors=TRUE keeps character columns (e.g. prgtype) as factors on
> # R >= 4.0, where the default became FALSE, so the is.factor() check later in
> # this session still reports prgtype as a factor.
> hs0 <- read.table("http://www.ats.ucla.edu/stat/R/notes/hs0.csv", header=TRUE, sep=",", stringsAsFactors=TRUE)
> # commenting the data set 
> comment(hs0)<-"High school and beyond data"
> # checking
> comment(hs0)
[1] "High school and beyond data"
> # variable labels using comment
> comment(hs0$write)<-"writing score"
> comment(hs0$read) <-"reading score"
> # more checking to make sure that our comments stay with the data frame:
> # save hs0 to disk, clear the whole workspace on purpose, then load it back
> save(hs0,file="hs0.rda") 
> rm(list=ls())
> load(file="hs0.rda")
> comment(hs0)
[1] "High school and beyond data"
> comment(hs0$write)
[1] "writing score"
> search()
[1] ".GlobalEnv"        "package:stats"     "package:graphics"  "package:grDevices" "package:utils"     "package:datasets"  "package:methods"  
[8] "Autoloads"         "package:base"     
> # NOTE: attach() places a copy of hs0 on the search path; changes made to hs0
> # afterwards are not reflected in that attached copy.
> attach(hs0)
> search()
 [1] ".GlobalEnv"        "hs0"               "package:stats"     "package:graphics"  "package:grDevices" "package:utils"     "package:datasets" 
 [8] "package:methods"   "Autoloads"         "package:base"     

> # use the sapply function with the is.factor function to check if any of the variables in the hs0 data frame are factor variables.
> sapply(hs0, is.factor)
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
  FALSE   FALSE   FALSE   FALSE   FALSE    TRUE   FALSE   FALSE   FALSE   FALSE   FALSE 
> # Creating a factor (categorical) variable called schtyp.f for schtyp and a factor variable female for gender with value labels.
> # The columns are referenced through hs0$ explicitly, so the result always reflects the current data frame rather than the attached copy.
> schtyp.f <- factor(hs0$schtyp, levels=c(1, 2), labels=c("public", "private"))
> female <- factor(hs0$gender, levels=c(0, 1), labels=c("male", "female"))
> table(schtyp.f)
schtyp.f
 public private 
    168      32 
> table(female)
female
  male female 
    91    109 


> # 4.0 Recoding variables and generating new variables
> # frequency of each race code before recoding
> table(hs0[["race"]])
  1   2   3   4   5 
 24  11  20 143   2 
> # recode race code 5 as missing
> hs0$race[which(hs0$race == 5)] <- NA
> table(hs0[["race"]])
  1   2   3   4 
 24  11  20 143 
> # same table, this time with a column for the missing values
> table(hs0[["race"]], useNA = "ifany")
   1    2    3    4 <NA> 
  24   11   20  143    2 
> # Creating a variable called total=read+write+math+science
> # with() evaluates the sum inside hs0 itself, so the scores come from the
> # current data frame and not from the copy placed on the search path by attach().
> total <- with(hs0, read + write + math + science)
> # noticing the missing values generated: a row with any missing score has a missing total
> summary(total)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  139.0   180.0   210.0   208.8   233.5   274.0     5.0 
> # Creating a variable called grade based on total.
> # cut() bins total into right-closed intervals:
> #   total <= 140 -> F, (140,180] -> D, (180,210] -> C, (210,234] -> B, total > 234 -> A
> # A missing total gives a missing grade for every row, including the first one
> # (a scalar placeholder of 0 would otherwise leave row 1 graded "F" when its total is NA).
> grade <- cut(total, breaks=c(-Inf, 140, 180, 210, 234, Inf), labels=c("F", "D", "C", "B", "A"))
> # the comment is attached after the factor is built; factor() does not carry
> # a comment attribute over from its input, so setting it earlier would lose it
> comment(grade)<-"combined grades of read, write, math, science"
> table(grade)
grade
 F  D  C  B  A 
 1 50 50 47 47 
> # Creating mean scores in two ways - working with missing values differently.
> # m1: plain arithmetic, so a row with any missing score gets a missing mean
> m1 <- with(hs0, (read + write + math + science) / 4)
> # m2: rowMeans with na.rm=TRUE averages over the non-missing scores in each row
> m2 <- rowMeans(with(hs0, cbind(read, write, math, science)), na.rm=TRUE)
> # At this point, we might want to combine the new variables we have created with the original data set. We can use the cbind function for this.
> # The new variables are passed to cbind directly alongside the data frame hs0.
> # Binding them to each other first would build a numeric matrix and turn the
> # factors schtyp.f, female and grade into bare integer codes without labels.
> hs1 <- cbind(hs0, schtyp.f, female, total, grade)
> table(hs1$race)
  1   2   3   4 
 24  11  20 143 
> is.data.frame(hs1)
[1] TRUE



No comments:

Post a Comment