> hs1 <- read.table("http://www.ats.ucla.edu/stat/R/notes/hs1.csv", header=T, sep=",")
> # Keeping only the observations where the reading score is 60 or higher.
> hs1.read.well <- hs1[hs1$read >= 60, ]
> # Comparing means of read in the original hs1 data frame and the new smaller hs1.read.well data frame.
> mean(hs1.read.well$read)
[1] 65.48214
> mean(hs1$read)
[1] 52.23
> # Keeping only the variables read and write from the hs1 data frame.
> hs2<-hs1[, c("read", "write")]
> # another way of doing the same thing
> hs3<-hs1[, c(7, 8)]
> names(hs3)
[1] "read" "write"
> # Dropping the variables read and write from the hs1 data frame by using the column indices corresponding to these two variables with a negative sign.
> hs2.drop<-hs1[, -c(7, 8)]
> names(hs2.drop)
[1] "female" "id" "race" "ses" "schtyp" "prgtype" "math" "science" "socst" "prog"
> # We will split hs1 into two data sets, one for females and one for males. We then stack them back together with rbind.
> attach(hs1)
> hsfemale<-hs1[female==1, ]
> hsmale<-hs1[female==0, ]
> dim(hsfemale)
[1] 109 12
> dim(hsmale)
[1] 91 12
> hs.all<-rbind(hsfemale, hsmale)
> dim(hs.all)
[1] 200 12
> dim(hs1)
[1] 200 12
> # We will create two data sets from hs1, one contains demographic variables and the other one contains test scores. We then merge the two data sets by the id variable.
> hs.demo<-hs1[, c("id", "ses", "female", "race")]
> hs.scores<-hs1[, c("id", "read", "write", "math", "science")]
> dim(hs.demo)
[1] 200 4
> dim(hs.scores)
[1] 200 5
> hs.merge <- merge(hs.demo, hs.scores, by="id", all=T)
> head(hs.merge)
id ses female race read write math science
1 1 1 1 1 34 44 40 39
2 2 2 1 1 39 41 33 42
3 3 1 0 1 63 65 48 63
4 4 1 1 1 44 50 41 39
5 5 1 0 1 47 40 43 NA
6 6 1 1 1 47 41 46 40
> dim(hs.merge)
[1] 200 8
> hs.merge1 <- merge(hs.demo, hs.scores, by.x="id", by.y="id", all=T)
> dim(hs.merge1)
[1] 200 8
> head(hs.merge1)
id ses female race read write math science
1 1 1 1 1 34 44 40 39
2 2 2 1 1 39 41 33 42
3 3 1 0 1 63 65 48 63
4 4 1 1 1 44 50 41 39
5 5 1 0 1 47 40 43 NA
6 6 1 1 1 47 41 46 40
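> # test: merging on key columns with different names (by.x="id", by.y="read"). Here id values are matched against read scores, so with all=T the unmatched rows from each side are kept and filled with NA.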
> hs.demo1 <-hs1[, c("id", "ses", "female", "race")]
> hs.scores1 <-hs1[, c("read", "write", "math", "science")]
> dim(hs.demo1)
[1] 200 4
> dim(hs.scores1)
[1] 200 4
> hs.merge2 <- merge(hs.demo1, hs.scores1, by.x="id", by.y="read", all=T)
> dim(hs.merge2)
[1] 370 7
> head(hs.merge2)
id ses female race write math science
1 1 1 1 1 NA NA NA
2 2 2 1 1 NA NA NA
3 3 1 0 1 NA NA NA
4 4 1 1 1 NA NA NA
5 5 1 0 1 NA NA NA
6 6 1 1 1 NA NA NA
> # reading in data
> hs0 <- read.table("http://www.ats.ucla.edu/stat/R/notes/hs0.csv", header=T, sep=",")
> # commenting the data set
> comment(hs0)<-"High school and beyond data"
> # checking
> comment(hs0)
[1] "High school and beyond data"
> # variable labels using comment
> comment(hs0$write)<-"writing score"
> comment(hs0$read) <-"reading score"
> # more checking to make sure that our comments stay with the data frame
> save(hs0,file="hs0.rda")
> rm(list=ls())
> load(file="hs0.rda")
> comment(hs0)
[1] "High school and beyond data"
> comment(hs0$write)
[1] "writing score"
> search()
[1] ".GlobalEnv" "package:stats" "package:graphics" "package:grDevices" "package:utils" "package:datasets" "package:methods"
[8] "Autoloads" "package:base"
> attach(hs0)
> search()
[1] ".GlobalEnv" "hs0" "package:stats" "package:graphics" "package:grDevices" "package:utils" "package:datasets"
[8] "package:methods" "Autoloads" "package:base"
> # use the sapply function with the is.factor function to check if any of the variables in the hs0 data frame are factor variables.
> sapply(hs0, is.factor)
gender id race ses schtyp prgtype read write math science socst
FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
> # Creating a factor (categorical) variable called schtyp.f for schtyp and a factor variable female for gender with value labels.
> schtyp.f <- factor(schtyp, levels=c(1, 2), labels=c("public", "private"))
> female <- factor(gender, levels=c(0, 1), labels=c("male", "female"))
> table(schtyp.f)
schtyp.f
public private
168 32
> table(female)
female
male female
91 109
> # 4.0 Recoding variables and generating new variables
> table(hs0$race)
1 2 3 4 5
24 11 20 143 2
> hs0$race[hs0$race==5] <-NA
> table(hs0$race)
1 2 3 4
24 11 20 143
> # displaying the missing data as well
> table(hs0$race, useNA="ifany")
1 2 3 4 <NA>
24 11 20 143 2
> # Creating a variable called total=read+write+math+science
> total<-read+write+math+science
> # noticing the missing values generated
> summary(total)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
139.0 180.0 210.0 208.8 233.5 274.0 5.0
> # Creating a variable called grade based on total.
> # initializing a variable
> grade<-0
> grade[total <=140]<-0
> grade[total > 140 & total <= 180] <-1
> grade[total > 180 & total <= 210] <-2
> grade[total > 210 & total <= 234] <-3
> grade[total > 234] <-4
> comment(grade)<-"combined grades of read, write, math, science"
> grade<-factor(grade, levels=c(0, 1, 2, 3, 4), labels=c("F", "D", "C", "B", "A"))
> table(grade)
grade
F D C B A
1 50 50 47 47
> # Creating mean scores in two ways - working with missing values differently.
> m1<-(read+write+math+science)/4
> m2<-rowMeans(cbind(read, write, math, science))
> m2<-rowMeans(cbind(read, write, math, science), na.rm=T)
> # At this point, we might want to combine the new variables we have created with the original data set. We can use the cbind function for this.
> hs1<-cbind(hs0, cbind(schtyp.f, female, total, grade))
> table(hs1$race)
1 2 3 4
24 11 20 143
> is.data.frame(hs1)
[1] TRUE
> hs0 <- read.table("http://www.ats.ucla.edu/stat/R/notes/hs0.csv", header=T, sep=",")
> hs0[1:5, ]
gender id race ses schtyp prgtype read write math science socst
1 0 70 4 1 1 general 57 52 41 47 57
2 1 121 4 2 1 vocati 68 59 53 63 61
3 0 86 4 3 1 general 44 33 54 58 31
4 0 141 4 3 1 vocati 63 44 47 53 56
5 0 172 4 2 1 academic 47 52 57 53 61
> names(hs0)
[1] "gender" "id" "race" "ses" "schtyp" "prgtype" "read" "write" "math" "science" "socst"
> # selecting read, write, math and science by column position (columns 7 through 10)
> read.sci <- hs0[ , 7:10]
> # checking the type of object
> class(read.sci)
[1] "data.frame"
> # listing the first 10 observations
> head(read.sci, n=10)
read write math science
1 57 52 41 47
2 68 59 53 63
3 44 33 54 58
4 63 44 47 53
5 47 52 57 53
6 44 52 51 63
7 50 59 42 53
8 34 46 45 39
9 63 57 54 NA
10 57 55 52 50
> # displaying the dimensions
> dim(read.sci)
[1] 200 4
> length(read.sci)
[1] 4
> length(read.sci$read)
[1] 200
> summary(read.sci)
read write math science
Min. :28.00 Min. :31.00 Min. :33.00 Min. :26.00
1st Qu.:44.00 1st Qu.:45.75 1st Qu.:45.00 1st Qu.:44.00
Median :50.00 Median :54.00 Median :52.00 Median :53.00
Mean :52.23 Mean :52.77 Mean :52.65 Mean :51.66
3rd Qu.:60.00 3rd Qu.:60.00 3rd Qu.:59.00 3rd Qu.:58.00
Max. :76.00 Max. :67.00 Max. :75.00 Max. :74.00
NA's : 5.00
> range(read.sci$write)
[1] 31 67
> range(read.sci$science)
[1] NA NA
> range(read.sci$science, na.rm=T)
[1] 26 74
> # the minimum and the maximum among all the variables
> range(read.sci, na.rm=T)
[1] 26 76
> mean(read.sci)
read write math science
52.230 52.775 52.645 NA
> mean(read.sci, na.rm=T)
read write math science
52.23000 52.77500 52.64500 51.66154
> sd(read.sci, na.rm=T)
read write math science
10.252937 9.478586 9.368448 9.866026
> table(hs0$prgtype)
academic general vocati
105 45 50
> by(hs0, hs0$prgtype, mean)
hs0$prgtype: academic
gender id race ses schtyp prgtype read write math science socst
0.552381 107.628571 3.495238 2.219048 1.228571 NA 56.161905 56.257143 56.733333 NA 56.695238
-----------------------------------------------------------------
hs0$prgtype: general
gender id race ses schtyp prgtype read write math science socst
0.5333333 97.3111111 3.4666667 1.8444444 1.1333333 NA 49.7555556 51.3333333 50.0222222 NA 50.6000000
-----------------------------------------------------------------
hs0$prgtype: vocati
gender id race ses schtyp prgtype read write math science socst
0.54 88.40 3.30 1.90 1.04 NA 46.20 46.76 46.42 47.22 45.02
Warning messages:
1: In mean.default(X[[6L]], ...) :
argument is not numeric or logical: returning NA
2: In mean.default(X[[6L]], ...) :
argument is not numeric or logical: returning NA
3: In mean.default(X[[6L]], ...) :
argument is not numeric or logical: returning NA
> by(hs0, hs0$prgtype, sd)
hs0$prgtype: academic
gender id race ses schtyp prgtype read write math science socst
0.4996336 61.0042964 1.0107481 0.7335498 0.4219265 NA 9.5887793 7.9433433 8.7302157 NA 9.1736703
----------------------------------------------------------------------------------------------------------------------------------------------
hs0$prgtype: general
gender id race ses schtyp prgtype read write math science socst
0.5045250 52.5793004 1.0135446 0.7371800 0.3437758 NA 9.2347062 9.3977754 7.4421676 NA 9.3088423
----------------------------------------------------------------------------------------------------------------------------------------------
hs0$prgtype: vocati
gender id race ses schtyp prgtype read write math science socst
0.5034574 54.3713610 1.1649647 0.6144518 0.1979487 NA 8.9076899 9.3187544 7.9541800 10.3337963 10.6569705
Warning messages:
1: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
2: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
3: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
> # attaching hs0, so its variables will be searchable by R
> attach(hs0)
> getOption("digits")
[1] 7
> options(digits=2)
> by(hs0, prgtype, mean, na.rm=T)
prgtype: academic
gender id race ses schtyp prgtype read write math science socst
0.55 107.63 3.50 2.22 1.23 NA 56.16 56.26 56.73 53.62 56.70
-----------------------------------------------------------------
prgtype: general
gender id race ses schtyp prgtype read write math science socst
0.53 97.31 3.47 1.84 1.13 NA 49.76 51.33 50.02 52.19 50.60
-----------------------------------------------------------------
prgtype: vocati
gender id race ses schtyp prgtype read write math science socst
0.54 88.40 3.30 1.90 1.04 NA 46.20 46.76 46.42 47.22 45.02
Warning messages:
1: In mean.default(X[[6L]], ...) :
argument is not numeric or logical: returning NA
2: In mean.default(X[[6L]], ...) :
argument is not numeric or logical: returning NA
3: In mean.default(X[[6L]], ...) :
argument is not numeric or logical: returning NA
> by(hs0, prgtype, sd, na.rm=T)
prgtype: academic
gender id race ses schtyp prgtype read write math science socst
0.50 61.00 1.01 0.73 0.42 NA 9.59 7.94 8.73 9.01 9.17
-----------------------------------------------------------------
prgtype: general
gender id race ses schtyp prgtype read write math science socst
0.50 52.58 1.01 0.74 0.34 NA 9.23 9.40 7.44 9.83 9.31
-----------------------------------------------------------------
prgtype: vocati
gender id race ses schtyp prgtype read write math science socst
0.50 54.37 1.16 0.61 0.20 NA 8.91 9.32 7.95 10.33 10.66
Warning messages:
1: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
2: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
3: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
> m <- tapply(write, prgtype, mean)
> v <- tapply(write, prgtype, var)
> med <- tapply(write, prgtype, median)
> n <- tapply(write, prgtype, length)
> sd <- tapply(write, prgtype, sd)
> cbind(mean=m, var=v, std.dev=sd, median=med, n=n)
mean var std.dev median n
academic 56 63 7.9 59 105
general 51 88 9.4 54 45
vocati 47 87 9.3 46 50
> # set the number of digits back to 7 (the value reported by getOption("digits") before it was changed)
> options(digits=7)
> hist(write)
> # load trellis graphics
> library(lattice)
> # trellis graphs
> histogram(~write, hs0, type="count")
> # histogram of write by gender
> histogram(~write | gender, hs0, type="count")
> # ask for about 15 bins (a single number given to breaks is treated by hist as a suggestion, so the actual count may differ)
> hist(write, breaks=15)
> # boxplot function in the graphics package
> boxplot(write)
> #trellis graphs
> bwplot(ses~ write, hs0)
> # boxplot by gender
> bwplot(ses~ write| gender, hs0)
> barplot(table(ses, gender), legend=c("low", "medium", "high"))
> barplot(table(ses, gender), beside=T, legend=c("low", "medium", "high"), ylim=c(0, 50))
> # changing the location of legend and adding a title, etc
> barplot(table(ses, gender), beside=T, legend.text=c("low", "medium", "high"), ylim=c(0, 50), space=c(.1, 1),
+ col=c("lightblue", "blue", "dark blue"), names.arg=c("male", "female"),
+ main="Distribution of SES by gender", args.legend=list(x =9, y=45, cex=.6))
> table(ses)
ses
1 2 3
47 95 58
> tab1<-table(gender, ses)
> # row proportions
> prop.table(tab1,1)
ses
gender 1 2 3
0 0.1648352 0.5164835 0.3186813
1 0.2935780 0.4403670 0.2660550
> # row frequencies
> rowSums(tab1)
0 1
91 109
> # column frequencies
> colSums(tab1)
1 2 3
47 95 58
> # correlation of a pair of variables
> cor(write, math)
[1] 0.6174493
> cor(write, science)
[1] NA
> cor(write, science, use="complete.obs")
[1] 0.5671298
> # correlation matrix
> cor(read.sci, use="complete.obs")
read write math science
read 1.0000000 0.5959677 0.6492202 0.6170562
write 0.5959677 1.0000000 0.6203022 0.5671298
math 0.6492202 0.6203022 1.0000000 0.6166288
science 0.6170562 0.5671298 0.6166288 1.0000000
> cor(read.sci, use="pairwise.complete.obs")
read write math science
read 1.0000000 0.5967765 0.6622801 0.6170562
write 0.5967765 1.0000000 0.6174493 0.5671298
math 0.6622801 0.6174493 1.0000000 0.6166288
science 0.6170562 0.5671298 0.6166288 1.0000000
> plot(math, write)
> # scatter plot matrix
> plot(read.sci)
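> # Unless you are going to continue working with the hs0 data frame it is generally a good idea to detach all attached data frames. Note that detach() with no arguments removes only the entry at position 2 of the search path, so it must be called once for each attach().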
> detach()
> data1 <- read.table("http://www.ats.ucla.edu/stat/R/notes/hs0.csv", header=T, sep=",")
> names(data1)
[1] "gender" "id" "race" "ses" "schtyp" "prgtype" "read" "write" "math" "science" "socst"
> head(data1)
gender id race ses schtyp prgtype read write math science socst
1 0 70 4 1 1 general 57 52 41 47 57
2 1 121 4 2 1 vocati 68 59 53 63 61
3 0 86 4 3 1 general 44 33 54 58 31
4 0 141 4 3 1 vocati 63 44 47 53 56
5 0 172 4 2 1 academic 47 52 57 53 61
6 0 113 4 2 1 academic 44 52 51 63 61
> dim(data1)
[1] 200 11
> # saves the data1 object to the file data1.rda in the working directory
> save(data1,file="data1.rda")
> # checking to see if data1.rda has been created
> dir()
[1] "asg1-40" "data1.rda" "desktop.ini" "FMS_data.txt" "FMS_data_edited.dat"
[6] "FMS_data_edited.sav" "HGDP_AKT1.txt" "hs0.rda" "JHS_NPHP1_single_sheet_wy0111.dat" "MolecularWeight_tair7.xls"
[11] "My Music" "My Pictures" "My Videos" "R" "rtn.txt"
[16] "rtn.xlsx" "SafeNet Sentinel" "SPSSInc" "TargetP_analysis_tair7.xls" "test1230.txt"
[21] "Virco_data.csv"
> # list the objects currently in the workspace (not the files on disk)
> ls()
[1] "data1" "hs0"
> # clear everything out of memory
> rm(list=ls())
> ls()
character(0)
> # load the R data into memory
> load("data1.rda")
> tail(data1)
gender id race ses schtyp prgtype read write math science socst
195 1 179 4 2 2 academic 47 65 60 50 56
196 1 31 2 2 2 general 55 59 52 42 56
197 1 145 4 2 1 vocati 42 46 38 36 46
198 1 187 4 2 2 general 57 41 57 55 52
199 1 118 4 2 1 general 55 62 58 58 61
200 1 137 4 3 1 academic 63 65 65 53 61
>
> rm(list=ls()) # clear everything out of memory
> ls()
character(0)
> temp <- read.table("http://www.ats.ucla.edu/stat/R/notes/hs0_1.csv", header=F, sep=",")
> temp[9,]
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
9 0 84 4 2 1 general 63 57 54 NA 51
> names(temp) <- c("gender","id","race","ses","schtyp","prgtype","read","write","math","science","socst")
> # list observations 5 through 10 to check the data
> temp[5:10, ]
gender id race ses schtyp prgtype read write math science socst
5 0 172 4 2 1 academic 47 52 57 53 61
6 0 113 4 2 1 academic 44 52 51 63 61
7 0 50 3 2 1 general 50 59 42 53 61
8 0 11 1 2 1 academic 34 46 45 39 36
9 0 84 4 2 1 general 63 57 54 NA 51
10 0 48 3 2 1 academic 57 55 52 50 51
>