Wednesday, March 23, 2011

http://www.ats.ucla.edu/stat/R/notes/exploring.htm

> hs0 <- read.table("http://www.ats.ucla.edu/stat/R/notes/hs0.csv", header=T, sep=",")
> hs0[1:5, ]
   gender  id race ses schtyp  prgtype read write math science socst
1       0  70    4   1      1  general   57    52   41      47    57
2       1 121    4   2      1   vocati   68    59   53      63    61
3       0  86    4   3      1  general   44    33   54      58    31
4       0 141    4   3      1   vocati   63    44   47      53    56
5       0 172    4   2      1 academic   47    52   57      53    61


> names(hs0)
 [1] "gender"  "id"      "race"    "ses"     "schtyp"  "prgtype" "read"    "write"   "math"    "science" "socst"  
> # shorthand way of referring to read, write, math, science 
> read.sci <- hs0[ , 7:10]  
> # checking the type of object 
> class(read.sci)
[1] "data.frame"
> # listing the first 10 observations
> head(read.sci, n=10)
   read write math science
1    57    52   41      47
2    68    59   53      63
3    44    33   54      58
4    63    44   47      53
5    47    52   57      53
6    44    52   51      63
7    50    59   42      53
8    34    46   45      39
9    63    57   54      NA
10   57    55   52      50
> # displaying the dimensions 
> dim(read.sci)
[1] 200   4
> length(read.sci)
[1] 4
> length(read.sci$read)
[1] 200 
> summary(read.sci)
      read           write            math          science     
 Min.   :28.00   Min.   :31.00   Min.   :33.00   Min.   :26.00  
 1st Qu.:44.00   1st Qu.:45.75   1st Qu.:45.00   1st Qu.:44.00  
 Median :50.00   Median :54.00   Median :52.00   Median :53.00  
 Mean   :52.23   Mean   :52.77   Mean   :52.65   Mean   :51.66  
 3rd Qu.:60.00   3rd Qu.:60.00   3rd Qu.:59.00   3rd Qu.:58.00  
 Max.   :76.00   Max.   :67.00   Max.   :75.00   Max.   :74.00  
                                                 NA's   : 5.00  
> range(read.sci$write)
[1] 31 67
> range(read.sci$science)
[1] NA NA
> range(read.sci$science, na.rm=T)
[1] 26 74
> # the minimum and the maximum among all the variables
> range(read.sci, na.rm=T)
[1] 26 76
> mean(read.sci)
   read   write    math science 
 52.230  52.775  52.645      NA 
> mean(read.sci, na.rm=T)
    read    write     math  science 
52.23000 52.77500 52.64500 51.66154 
> sd(read.sci, na.rm=T)
     read     write      math   science 
10.252937  9.478586  9.368448  9.866026 
> table(hs0$prgtype)


academic  general   vocati 
     105       45       50 
> by(hs0, hs0$prgtype, mean)
hs0$prgtype: academic
    gender         id       race        ses     schtyp    prgtype       read      write       math    science      socst 
  0.552381 107.628571   3.495238   2.219048   1.228571         NA  56.161905  56.257143  56.733333         NA  56.695238 
-----------------------------------------------------------------
hs0$prgtype: general
    gender         id       race        ses     schtyp    prgtype       read      write       math    science      socst 
 0.5333333 97.3111111  3.4666667  1.8444444  1.1333333         NA 49.7555556 51.3333333 50.0222222         NA 50.6000000 
----------------------------------------------------------------- 
hs0$prgtype: vocati
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.54   88.40    3.30    1.90    1.04      NA   46.20   46.76   46.42   47.22   45.02 
Warning messages:
1: In mean.default(X[[6L]], ...) :
  argument is not numeric or logical: returning NA
2: In mean.default(X[[6L]], ...) :
  argument is not numeric or logical: returning NA
3: In mean.default(X[[6L]], ...) :
  argument is not numeric or logical: returning NA
> by(hs0, hs0$prgtype, sd)
hs0$prgtype: academic
    gender         id       race        ses     schtyp    prgtype       read      write       math    science      socst 
 0.4996336 61.0042964  1.0107481  0.7335498  0.4219265         NA  9.5887793  7.9433433  8.7302157         NA  9.1736703 
---------------------------------------------------------------------------------------------------------------------------------------------- 
hs0$prgtype: general
    gender         id       race        ses     schtyp    prgtype       read      write       math    science      socst 
 0.5045250 52.5793004  1.0135446  0.7371800  0.3437758         NA  9.2347062  9.3977754  7.4421676         NA  9.3088423 
---------------------------------------------------------------------------------------------------------------------------------------------- 
hs0$prgtype: vocati
    gender         id       race        ses     schtyp    prgtype       read      write       math    science      socst 
 0.5034574 54.3713610  1.1649647  0.6144518  0.1979487         NA  8.9076899  9.3187544  7.9541800 10.3337963 10.6569705 
Warning messages:
1: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
2: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
3: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
> # attaching hs0, so its variables will be searchable by R
> attach(hs0)
> getOption("digits")
[1] 7
> options(digits=2)
> by(hs0, prgtype, mean, na.rm=T)
prgtype: academic
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.55  107.63    3.50    2.22    1.23      NA   56.16   56.26   56.73   53.62   56.70 
-----------------------------------------------------------------
prgtype: general
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.53   97.31    3.47    1.84    1.13      NA   49.76   51.33   50.02   52.19   50.60 
-----------------------------------------------------------------
prgtype: vocati
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.54   88.40    3.30    1.90    1.04      NA   46.20   46.76   46.42   47.22   45.02 
Warning messages:
1: In mean.default(X[[6L]], ...) :
  argument is not numeric or logical: returning NA
2: In mean.default(X[[6L]], ...) :
  argument is not numeric or logical: returning NA
3: In mean.default(X[[6L]], ...) :
  argument is not numeric or logical: returning NA
> by(hs0, prgtype, sd, na.rm=T)
prgtype: academic
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.50   61.00    1.01    0.73    0.42      NA    9.59    7.94    8.73    9.01    9.17 
-----------------------------------------------------------------
prgtype: general
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.50   52.58    1.01    0.74    0.34      NA    9.23    9.40    7.44    9.83    9.31 
-----------------------------------------------------------------
prgtype: vocati
 gender      id    race     ses  schtyp prgtype    read   write    math science   socst 
   0.50   54.37    1.16    0.61    0.20      NA    8.91    9.32    7.95   10.33   10.66 
Warning messages:
1: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
2: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
3: In var(as.vector(x), na.rm = na.rm) : NAs introduced by coercion
> m   <- tapply(write, prgtype, mean)
> v   <- tapply(write, prgtype, var)
> med <- tapply(write, prgtype, median)
> n   <- tapply(write, prgtype, length)
> sd  <- tapply(write, prgtype, sd)
> cbind(mean=m, var=v, std.dev=sd, median=med, n=n)
         mean var std.dev median   n
academic   56  63     7.9     59 105
general    51  88     9.4     54  45
vocati     47  87     9.3     46  50
> # reset the number of digits to the default of 7
> options(digits=7)

> hist(write)
> # load trellis graphics
> library(lattice) 
> # trellis graphs 
> histogram(~write, hs0, type="count")






> # histogram of write by gender
> histogram(~write | gender, hs0, type="count")






> # request about 15 bins (hist treats a single number for breaks as a suggestion)
> hist(write, breaks=15)





> # boxplot function in the graphics package
> boxplot(write)






> # trellis graphs: box-and-whisker plot of write for each level of ses
> bwplot(ses~ write, hs0)






> # boxplots of write for each level of ses, in a separate panel for each gender
> bwplot(ses~ write| gender, hs0)






> barplot(table(ses, gender), legend=c("low", "medium", "high"))






> barplot(table(ses, gender), beside=T, legend=c("low", "medium", "high"), ylim=c(0, 50))





> # changing the location of legend and adding a title, etc
> barplot(table(ses, gender), beside=T, legend.text=c("low", "medium", "high"), ylim=c(0, 50), space=c(.1, 1),
+ col=c("lightblue", "blue", "dark blue"), names.arg=c("male", "female"),
+ main="Distribution of SES by gender", args.legend=list(x =9, y=45, cex=.6))






> table(ses)
ses
 1  2  3 
47 95 58 
> tab1<-table(gender, ses)
> # row proportions
> prop.table(tab1,1) 
      ses
gender         1         2         3
     0 0.1648352 0.5164835 0.3186813
     1 0.2935780 0.4403670 0.2660550
> # row frequencies
> rowSums(tab1)  
  0   1 
 91 109 
> # column frequencies
> colSums(tab1) 
 1  2  3 
47 95 58 
> # correlation of a pair of variables
> cor(write, math)
[1] 0.6174493
> cor(write, science)
[1] NA
> cor(write, science, use="complete.obs")
[1] 0.5671298
> # correlation matrix
> cor(read.sci, use="complete.obs")
             read     write      math   science
read    1.0000000 0.5959677 0.6492202 0.6170562
write   0.5959677 1.0000000 0.6203022 0.5671298
math    0.6492202 0.6203022 1.0000000 0.6166288
science 0.6170562 0.5671298 0.6166288 1.0000000
> cor(read.sci, use="pairwise.complete.obs")
             read     write      math   science
read    1.0000000 0.5967765 0.6622801 0.6170562
write   0.5967765 1.0000000 0.6174493 0.5671298
math    0.6622801 0.6174493 1.0000000 0.6166288
science 0.6170562 0.5671298 0.6166288 1.0000000
> plot(math, write)






> # scatter plot matrix
> plot(read.sci)






> # Unless you are going to continue working with the hs0 data frame, it is generally a good idea to detach all attached data frames.
> detach()





No comments:

Post a Comment