> # Read the High School and Beyond data (comma-separated, with a header row).
> # stringsAsFactors=TRUE is spelled out because R >= 4.0.0 no longer turns character
> # columns into factors by default, and the is.factor() check later in this session
> # reports prgtype as a factor. TRUE is written in full because T can be reassigned.
> hs0 <- read.table("http://www.ats.ucla.edu/stat/R/notes/hs0.csv", header=TRUE, sep=",", stringsAsFactors=TRUE)
> # commenting the data set: comment() stores a note as an attribute of the object
> comment(hs0)<-"High school and beyond data"
> # checking
> comment(hs0)
[1] "High school and beyond data"
> # variable labels using comment on individual columns
> comment(hs0$write)<-"writing score"
> comment(hs0$read) <-"reading score"
> # more checking to make sure that our comments stay with the data frame:
> # save it, clear the whole workspace, load it back, and read the comments again
> save(hs0,file="hs0.rda")
> rm(list=ls())
> load(file="hs0.rda")
> comment(hs0)
[1] "High school and beyond data"
> comment(hs0$write)
[1] "writing score"
> # search path before and after attach(): attach() places a copy of the hs0 columns
> # on the search path so that later steps can refer to them by bare name
> # (read, write, schtyp, ...). Later changes to hs0 itself do not alter that copy.
> search()
[1] ".GlobalEnv" "package:stats" "package:graphics" "package:grDevices" "package:utils" "package:datasets" "package:methods"
[8] "Autoloads" "package:base"
> attach(hs0)
> search()
[1] ".GlobalEnv" "hs0" "package:stats" "package:graphics" "package:grDevices" "package:utils" "package:datasets"
[8] "package:methods" "Autoloads" "package:base"
> # Check which columns of hs0 are factors: sapply applies is.factor to every column.
> # In the output below only prgtype is reported as a factor.
> sapply(hs0, is.factor)
gender id race ses schtyp prgtype read write math science socst
FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
> # Create labelled factor (categorical) versions of two coded columns.
> # The bare names schtyp and gender resolve through the earlier attach(hs0).
> #   schtyp.f: schtyp with 1 labelled "public" and 2 labelled "private"
> #   female:   gender with 0 labelled "male" and 1 labelled "female"
> schtyp.f <- factor(schtyp, levels=c(1, 2), labels=c("public", "private"))
> female <- factor(gender, levels=c(0, 1), labels=c("male", "female"))
> # frequency tables of the new factors, shown with their value labels
> table(schtyp.f)
schtyp.f
public private
168 32
> table(female)
female
male female
91 109
> # 4.0 Recoding variables and generating new variables
> # distribution of race before recoding (codes 1 to 5)
> table(hs0$race)
1 2 3 4 5
24 11 20 143 2
> # set every race value equal to 5 to missing (NA); this changes the column inside hs0
> hs0$race[hs0$race==5] <-NA
> # table() leaves NA out by default, so the two recoded rows no longer appear
> table(hs0$race)
1 2 3 4
24 11 20 143
> # displaying the missing data as well: useNA="ifany" adds an <NA> column when NAs exist
> table(hs0$race, useNA="ifany")
1 2 3 4 <NA>
24 11 20 143 2
> # Creating a variable called total=read+write+math+science
> # (element-wise sum of the four score columns made available by attach(hs0))
> total<-read+write+math+science
> # noticing the missing values generated: a missing score in any of the four columns
> # makes that element of total NA; summary() reports 5 such values under NA's
> summary(total)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
139.0 180.0 210.0 208.8 233.5 274.0 5.0
> # Creating a variable called grade based on total.
> # cut() bins total into right-closed intervals, which reproduces the thresholds
> #   total <= 140 -> F, (140, 180] -> D, (180, 210] -> C, (210, 234] -> B, total > 234 -> A
> # and returns a factor with those labels directly.
> # A missing total always gives a missing grade; filling a vector that starts as
> # grade<-0 by logical index would leave its first element at 0 (F) if that total were NA.
> grade <- cut(total, breaks=c(-Inf, 140, 180, 210, 234, Inf), labels=c("F", "D", "C", "B", "A"))
> # the comment is set after the factor is built so that it stays on the final object
> comment(grade)<-"combined grades of read, write, math, science"
> # frequency of each letter grade (missing grades are not counted)
> table(grade)
grade
F D C B A
1 50 50 47 47
> # Creating mean scores in two ways - working with missing values differently.
> # m1: plain arithmetic, so a student with any missing score gets NA
> m1<-(read+write+math+science)/4
> # m2: row-wise mean of the scores that are present; na.rm=TRUE drops the missing ones
> m2<-rowMeans(cbind(read, write, math, science), na.rm=TRUE)
> # At this point, we might want to combine the new variables we have created with the original data set. We can use the cbind function for this.
> # The new columns are passed to cbind directly alongside the data frame. Wrapping them in
> # an inner cbind() would first build a numeric matrix and reduce the factors
> # (schtyp.f, female, grade) to their integer codes, losing the value labels.
> hs1<-cbind(hs0, schtyp.f, female, total, grade)
> # the recoded race column is carried over from hs0
> table(hs1$race)
1 2 3 4
24 11 20 143
> # the combined object is still a data frame
> is.data.frame(hs1)
[1] TRUE
> # NOTE(review): from here on the transcript repeats the factor, recode, total, grade,
> # mean and cbind steps shown earlier - presumably a duplicated paste; confirm.
> # Check which columns of hs0 are factors; only prgtype is reported as one.
> sapply(hs0, is.factor)
gender id race ses schtyp prgtype read write math science socst
FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
> # Creating a factor (categorical) variable called schtyp.f for schtyp and a factor variable female for gender with value labels.
> # The bare names schtyp and gender resolve through the earlier attach(hs0).
> schtyp.f <- factor(schtyp, levels=c(1, 2), labels=c("public", "private"))
> female <- factor(gender, levels=c(0, 1), labels=c("male", "female"))
> # frequency tables of the new factors, shown with their value labels
> table(schtyp.f)
schtyp.f
public private
168 32
> table(female)
female
male female
91 109
> # 4.0 Recoding variables and generating new variables
> # NOTE(review): the first table below still lists race code 5 even though the same
> # recode appears earlier in this transcript, so this output looks copied from the
> # first run rather than produced by a second one - confirm.
> table(hs0$race)
1 2 3 4 5
24 11 20 143 2
> # set every race value equal to 5 to missing (NA); this changes the column inside hs0
> hs0$race[hs0$race==5] <-NA
> # table() leaves NA out by default
> table(hs0$race)
1 2 3 4
24 11 20 143
> # displaying the missing data as well: useNA="ifany" adds an <NA> column when NAs exist
> table(hs0$race, useNA="ifany")
1 2 3 4 <NA>
24 11 20 143 2
> # Creating a variable called total=read+write+math+science
> # (element-wise sum of the four score columns made available by attach(hs0))
> total<-read+write+math+science
> # noticing the missing values generated: summary() reports 5 values under NA's
> summary(total)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
139.0 180.0 210.0 208.8 233.5 274.0 5.0
> # Creating a variable called grade based on total.
> # cut() bins total into right-closed intervals, which reproduces the thresholds
> #   total <= 140 -> F, (140, 180] -> D, (180, 210] -> C, (210, 234] -> B, total > 234 -> A
> # and returns a factor with those labels directly.
> # A missing total always gives a missing grade; filling a vector that starts as
> # grade<-0 by logical index would leave its first element at 0 (F) if that total were NA.
> grade <- cut(total, breaks=c(-Inf, 140, 180, 210, 234, Inf), labels=c("F", "D", "C", "B", "A"))
> # the comment is set after the factor is built so that it stays on the final object
> comment(grade)<-"combined grades of read, write, math, science"
> # frequency of each letter grade (missing grades are not counted)
> table(grade)
grade
F D C B A
1 50 50 47 47
> # Creating mean scores in two ways - working with missing values differently.
> # m1: plain arithmetic, so a student with any missing score gets NA
> m1<-(read+write+math+science)/4
> # m2: row-wise mean of the scores that are present; na.rm=TRUE drops the missing ones
> m2<-rowMeans(cbind(read, write, math, science), na.rm=TRUE)
> # At this point, we might want to combine the new variables we have created with the original data set. We can use the cbind function for this.
> # The new columns are passed to cbind directly alongside the data frame. Wrapping them in
> # an inner cbind() would first build a numeric matrix and reduce the factors
> # (schtyp.f, female, grade) to their integer codes, losing the value labels.
> hs1<-cbind(hs0, schtyp.f, female, total, grade)
> # the recoded race column is carried over from hs0
> table(hs1$race)
1 2 3 4
24 11 20 143
> # the combined object is still a data frame
> is.data.frame(hs1)
[1] TRUE
No comments:
Post a Comment