Webinar  Link - http://bit.ly/1EOnj7E

The BIG DATA R code is shown below. Copy it into a plain-text file before we start the session.
Copy all text below this line ________________________________________________________________________________________

# BIG DATA Analytics - Basic R Code 
library(RCurl)
# Load the GISETTE training data into the data frame `BD_Raw`.
#
# Option A (needs internet access): download straight from the UCI repository.
# Uncomment the next three lines to use it instead of the local CSV.
# urlfile <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/gisette/GISETTE/gisette_train.data'
# x <- getURL(urlfile, ssl.verifypeer = FALSE)
# BD_Raw<-read.table(textConnection(x), sep = '', header = FALSE, stringsAsFactors = FALSE)
#
# Option B (default): read a local CSV copy of the same data.
# NOTE(review): the path below is machine-specific - change it to wherever
# your BD_RAW.csv lives (presumably the file written by the write.table()
# line at the end of this section - confirm).
#
# R object names are case-sensitive: the result must be stored as `BD_Raw`
# (not `BD_RAW`), because every later statement in this script reads `BD_Raw`.
BD_Raw <- read.csv("C:/STAT/BRIDGE/Module-3/Week-3/PCA_BIG_DATA/PCA_BIG_DATA/BD_RAW.csv")
# Rows x columns of the loaded data (the webinar run reported 6000 x 5000).
print(dim(BD_Raw))
# View(BD_Raw)
# Show the current working directory, useful for choosing the folder used
# in the optional write.table() call below.
getwd()
# Optional: save a local CSV copy after using Option A.
# write.table(BD_Raw,"C: [---- FOLDER NAME HERE ---] /BD_RAW.csv",sep=",")
#
# Feature screening - a first step of feature reduction before PCA
# (Principal Components Analysis).
# We look for features / columns that have:
#  -- 1) few unique values relative to the number of observations
#  -- 2) near zero variance, i.e. a very high frequency of the most common
#        (mode) value compared with the next most common value
#
library(caret)
#
# nearZeroVar() is a function of library(caret).
# With saveMetrics = TRUE it returns one row of metrics per column of BD_Raw
# (freqRatio, percentUnique, zeroVar, nzv) instead of just column positions.
Ftr_ZeroVAR <- nearZeroVar(BD_Raw, saveMetrics = TRUE)
class(Ftr_ZeroVAR)
#-- Uncomment ---# ?class
##-- Uncomment ---# ?nearZeroVar

# nearZeroVar diagnoses predictors that have
# one unique value (i.e. are zero variance predictors)
# or predictors that have both of the following
# characteristics: they have very few unique values
# relative to the number of samples and the ratio of
# the frequency of the most common value to the frequency
# of the second most common value is large

# Print the range of percentUnique as ONE string.
# range() returns two numbers (min, max); pasting the label directly onto it
# would recycle the label and print two separate "Range: ..." strings, so the
# two values are collapsed into a single piece of text first.
print(paste('Range:', paste(range(Ftr_ZeroVAR$percentUnique), collapse = ' to ')))
max(Ftr_ZeroVAR$percentUnique)
min(Ftr_ZeroVAR$percentUnique)
# percentUnique is the percentage of unique values in a column (it is not
# the variance itself). In the webinar run it ranged between -
# Min == 0.0166 : almost every value in the column is the same.
# Max == 8.6    : the column with the most distinct values.
print(head(Ftr_ZeroVAR))
str(Ftr_ZeroVAR)
#
# Number of columns / features in the raw data, before anything is removed.
print(paste("Column count prior removal:", ncol(BD_Raw)))
##-- Uncomment ---# Alternative to the print() call - store the text, then show it:
##-- Uncomment ---# xx <- paste('Column count prior removal:',ncol(BD_Raw))
##-- Uncomment ---# xx
##-- Uncomment ---# A bare paste(...) call without print() is only echoed at the
##-- Uncomment ---# interactive prompt; it shows nothing when the file is run via source().

# percentUnique holds, for each feature, the percentage of unique data
# points out of the total number of data points.
# A feature whose percentUnique falls below the chosen cutoff is a candidate
# for removal. Two candidate cutoffs are compared here: 0.2% and 0.5%.

# Rows of the metrics table (= features) that survive each cutoff.
# The first number printed by dim() is the count of surviving features
# (the webinar run reported 4478 for 0.2% and 4123 for 0.5%).
dim(Ftr_ZeroVAR[Ftr_ZeroVAR[["percentUnique"]] > 0.2, ])
dim(Ftr_ZeroVAR[Ftr_ZeroVAR[["percentUnique"]] > 0.5, ])

# SKIP FOR WEBINAR -1 -- Printing dimensions / number of Features - % unique value > 
# dim(Ftr_ZeroVAR[Ftr_ZeroVAR$percentUnique > 1,])
# dim(Ftr_ZeroVAR[Ftr_ZeroVAR$percentUnique > 8,])
# dim(Ftr_ZeroVAR[Ftr_ZeroVAR$percentUnique > 8.6,])

# The raw data had 5000 features (columns) and 6000 observations (rows).
# With 0.5% as the cutoff, (5000 - 4123) == 877 features are dropped, leaving
# 6000 rows in 4123 columns - a simple feature reduction before PCA.

# The row names of the metrics table are the column names of BD_Raw, so the
# row names that pass the cutoff select the columns to keep.
BD_nzv <- BD_Raw[rownames(Ftr_ZeroVAR[Ftr_ZeroVAR[["percentUnique"]] > 0.5, ])]
print(paste("Feature count after cutoff of Near Zero Var Features:", ncol(BD_nzv)))

# PCA will be run on this reduced data frame, BD_nzv.
# Confirm its dimensions (rows, columns) first.

print(dim(BD_nzv))

library(doParallel)

===========================R CONSOLE DUMP ================================ WEBINAR -1 =============== #No_Jargon 

#===================================

R version 3.1.3 (2015-03-09) -- "Smooth Sidewalk"
Copyright (C) 2015 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64 (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> library(vcd)
Loading required package: grid
> ?Arthritis
> data("Arthritis")
> counts <- table(Arthritis$Improved)
> counts

  None   Some Marked 
    42     14     28 
> attach("Arthritis")
Error in attach("Arthritis") : file 'Arthritis' not found
> attach(Arthritis)
> summary(Arthritis)
       ID          Treatment      Sex          Age          Improved 
 Min.   : 1.00   Placebo:43   Female:59   Min.   :23.00   None  :42  
 1st Qu.:21.75   Treated:41   Male  :25   1st Qu.:46.00   Some  :14  
 Median :42.50                            Median :57.00   Marked:28  
 Mean   :42.50                            Mean   :53.36              
 3rd Qu.:63.25                            3rd Qu.:63.00              
 Max.   :84.00                            Max.   :74.00              
> dim(Arthritis)
[1] 84  5
> View(Arthritis)
> barplot(counts, main="Simple Bar Plot",xlab="Improvement", ylab="Frequency")
> install.packages("ggplot2")
Installing package into ‘C:/Users/Rohit/Documents/R/win-library/3.1’
(as ‘lib’ is unspecified)
trying URL 'http://cran.rstudio.com/bin/windows/contrib/3.1/ggplot2_1.0.1.zip'
Content type 'application/zip' length 2676225 bytes (2.6 MB)
opened URL
downloaded 2.6 MB

package ‘ggplot2’ successfully unpacked and MD5 sums checked

The downloaded binary packages are in
C:\Users\Rohit\AppData\Local\Temp\RtmpumccYV\downloaded_packages
> library(ggplot2)
> library(vcd)
> library("arules", lib.loc="~/R/win-library/3.1")
Loading required package: Matrix

Attaching package: ‘arules’

The following objects are masked from ‘package:base’:

    %in%, write

> data("Arthritis")
> attach(Arthritis)
The following objects are masked from Arthritis (pos = 6):

    Age, ID, Improved, Sex, Treatment

> summary(Arthritis)
       ID          Treatment      Sex          Age          Improved 
 Min.   : 1.00   Placebo:43   Female:59   Min.   :23.00   None  :42  
 1st Qu.:21.75   Treated:41   Male  :25   1st Qu.:46.00   Some  :14  
 Median :42.50                            Median :57.00   Marked:28  
 Mean   :42.50                            Mean   :53.36              
 3rd Qu.:63.25                            3rd Qu.:63.00              
 Max.   :84.00                            Max.   :74.00              
> ?Arthritis
> class(Arthritis)
[1] "data.frame"
> dim(Arthritis)
[1] 84  5
> dim(Arthritis)[1]
[1] 84
> dim(Arthritis)[2]
[1] 5
> View(Arthritis)
> View(Arthritis)
> counts <- table(Arthritis$Improved)
> class(counts)
[1] "table"
> barplot(counts, main="Simple Bar Plot",xlab="Improvement", ylab="Frequency")
> barplot(counts, main="Simple Bar Plot for #No_jargon",xlab="Improvement", ylab="Frequency")
> barplot(counts, main="Simple Bar Plot for #No_jargon",xlab="Improvement_NoJargon", ylab="Frequency_NoJargon")
> barplot(counts,
+         main="Horizontal Bar Plot",
+         xlab="Frequency", ylab="Improvement",
+         horiz=TRUE)
> ?mtcars
> attach(mtcars)
The following object is masked from package:ggplot2:

    mpg

> View(mtcars)
> par(mfrow=c(2,1))
> View(mtcars)
> d <- density(mtcars$mpg)
> ?list
> plot(d)
> ?density
> par(mfrow=c(2,1))
> plot(d, main="Kernel Density of Miles Per Gallon")
> polygon(d, col="red", border="blue")
> rug(mtcars$mpg, col="brown")
> par(mfrow=c(2,1))
> d <- density(mtcars$mpg)
> # ?density
> # ?list
> plot(d)
> d <- density(mtcars$mpg)
> plot(d, main="Kernel Density of Miles Per Gallon")
> polygon(d, col="red", border="blue")
> rug(mtcars$mpg, col="brown")
> memory.size(max = FALSE) # amount currently in use by "R" 
[1] 124.35
> memory.size(max = NA) # the Total memory available on your System  
[1] 2990
> memory.size(max = TRUE) #
[1] 429.06
> urlfile <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/gisette/GISETTE/gisette_train.data'
> library(RCurl)
Loading required package: bitops
> library(pryr)

Attaching package: ‘pryr’

The following object is masked from ‘package:arules’:

    inspect

> library("filematrix", lib.loc="~/R/win-library/3.1")
> x <- getURL(urlfile, ssl.verifypeer = FALSE)
> BD_Raw<-read.table(textConnection(x), sep = '', header = FALSE, stringsAsFactors = FALSE)
> getwd()
[1] "C:/Webinar_1/A/Webinar-1"
> write.table(BD_Raw,"C:/C:/Webinar_1/A/Webinar-1/BD_RAW.csv",sep=",")
Error in file(file, ifelse(append, "a", "w")) : 
  cannot open the connection
In addition: Warning message:
In file(file, ifelse(append, "a", "w")) :
  cannot open file 'C:/C:/Webinar_1/A/Webinar-1/BD_RAW.csv': Invalid argument
> write.table(BD_Raw,"C:/Webinar_1/A/Webinar-1/BD_RAW.csv",sep=",")
> print(dim(BD_Raw))
[1] 6000 5000
> library(caret)
Loading required package: lattice
> t<-system.time({
+   Ftr_ZeroVAR<-nearZeroVar(BD_Raw, saveMetrics = TRUE)
+ })
> t
   user  system elapsed 
  28.74    0.37   30.21 
> class(Ftr_ZeroVAR)
[1] "data.frame"
> print(paste('Range:',range(Ftr_ZeroVAR$percentUnique)))
[1] "Range: 0.0166666666666667" "Range: 8.6"               
> max(Ftr_ZeroVAR$percentUnique)
[1] 8.6
> min(Ftr_ZeroVAR$percentUnique)
[1] 0.01666667
> print(head(Ftr_ZeroVAR))
    freqRatio percentUnique zeroVar  nzv
V1   48.25234     5.2166667   FALSE TRUE
V2 1180.80000     1.3666667   FALSE TRUE
V3   41.31579     6.1500000   FALSE TRUE
V4 5991.00000     0.1666667   FALSE TRUE
V5  980.00000     1.5333333   FALSE TRUE
V6  140.00000     3.5166667   FALSE TRUE
> str(Ftr_ZeroVAR)
'data.frame': 5000 obs. of  4 variables:
 $ freqRatio    : num  48.3 1180.8 41.3 5991 980 ...
 $ percentUnique: num  5.217 1.367 6.15 0.167 1.533 ...
 $ zeroVar      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ nzv          : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
> print(paste('Column count prior removal:',ncol(BD_Raw)))
[1] "Column count prior removal: 5000"
> xx <- paste('Column count prior removal:',ncol(BD_Raw))
> xx
[1] "Column count prior removal: 5000"
> paste('Column count prior removal:',ncol(BD_Raw)
+ dim(Ftr_ZeroVAR[Ftr_ZeroVAR$percentUnique > 0.2,])
Error: unexpected symbol in:
"paste('Column count prior removal:',ncol(BD_Raw)
dim"
> dim(Ftr_ZeroVAR[Ftr_ZeroVAR$percentUnique > 0.2,])
[1] 4478    4
> dim(Ftr_ZeroVAR[Ftr_ZeroVAR$percentUnique > 0.5,])
[1] 4123    4
> BD_nzv <- BD_Raw[c(rownames(Ftr_ZeroVAR[Ftr_ZeroVAR$percentUnique > 0.5,])) ]
=====================================================