# Initial post / proof of concept - contact me for the full code and further R examples.
#
# SVM - Simulated Data (Part 1)
# Source: https://lagunita.stanford.edu/c4x/HumanitiesandScience/StatLearning/asset/ch9.html

# Simulate 20 two-dimensional observations in two classes and shift the second class up/right
set.seed(10111)
x = matrix(rnorm(40), 20, 2)
y = rep(c(-1, 1), c(10, 10))
x[y == 1, ] = x[y == 1, ] + 1
plot(x, col = y + 3, pch = 19)
#
library(e1071)

d = data.frame(x, y = as.factor(y))
# Linear SVM with a fairly large cost; scale = FALSE keeps the raw units
svmfit = svm(y ~ ., data = d, kernel = "linear", cost = 10, scale = FALSE)
print(svmfit)
plot(svmfit, d)

# Build a lattice over the range of the two predictors for plotting the decision regions
make.grid = function(x, n = 75) {
  grange = apply(x, 2, range)
  x1 = seq(from = grange[1, 1], to = grange[2, 1], length = n)
  x2 = seq(from = grange[1, 2], to = grange[2, 2], length = n)
  expand.grid(X1 = x1, X2 = x2)
}
xgrid = make.grid(x)
ygrid = predict(svmfit, xgrid)
plot(xgrid, col = c("red", "blue")[as.numeric(ygrid)], pch = 20, cex = 0.2)
points(x, col = y + 3, pch = 19)
points(x[svmfit$index, ], pch = 5, cex = 2)   # mark the support vectors

# Recover the linear decision boundary: beta from the support-vector coefficients,
# beta0 from the fitted rho
beta = drop(t(svmfit$coefs) %*% x[svmfit$index, ])
beta0 = svmfit$rho
plot(xgrid, col = c("red", "blue")[as.numeric(ygrid)], pch = 20, cex = 0.2)
points(x, col = y + 3, pch = 19)
points(x[svmfit$index, ], pch = 5, cex = 2)
abline(beta0/beta[2], -beta[1]/beta[2])                 # decision boundary
abline((beta0 - 1)/beta[2], -beta[1]/beta[2], lty = 2)  # margins
abline((beta0 + 1)/beta[2], -beta[1]/beta[2], lty = 2)
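
# The fit above uses a fixed cost = 10. As a sanity check (a sketch, not part of
# the original course code), e1071::tune() can cross-validate the cost over a
# small grid; it assumes the data frame d defined above.
cv = tune(svm, y ~ ., data = d, kernel = "linear", scale = FALSE,
          ranges = list(cost = c(0.01, 0.1, 1, 10, 100)))
summary(cv)
cv$best.parameters   # cost picked by 10-fold cross-validation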


# Mixture example from ESL, used for the non-linear (radial kernel) example
load(url("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/ESL.mixture.rda"))
names(ESL.mixture)
rm(x, y)
attach(ESL.mixture)
plot(x, col = y + 1)
# Radial-kernel SVM on the mixture data
d1 = data.frame(y = factor(y), x)
fit = svm(factor(y) ~ ., data = d1, scale = FALSE, kernel = "radial", cost = 5)
str(fit)
str(fit)
# Predict over the (px1, px2) grid supplied with ESL.mixture and plot the regions
xgrid = expand.grid(X1 = px1, X2 = px2)
ygrid = predict(fit, xgrid)
plot(xgrid, col = as.numeric(ygrid), pch = 20, cex = 0.2)
points(x, col = y + 1, pch = 19)

# Decision values over the same grid, used to draw the fitted boundary
func = predict(fit, xgrid, decision.values = TRUE)
func = attributes(func)$decision

# Redraw the grid plot, then overlay the fitted boundary (decision value = 0)
# and the Bayes boundary (true probability = 0.5, in blue)
plot(xgrid, col = as.numeric(ygrid), pch = 20, cex = 0.2)
points(x, col = y + 1, pch = 19)
contour(px1, px2, matrix(func, 69, 99), level = 0, add = TRUE)
contour(px1, px2, matrix(prob, 69, 99), level = 0.5, add = TRUE, col = "blue",
        lwd = 2)
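
# A quick check (a sketch, not part of the original course code), assuming fit
# and d1 from above: training-set confusion matrix and error rate for the radial
# fit. This is an optimistic estimate because it reuses the training data.
train_pred = predict(fit, d1)
table(predicted = train_pred, actual = d1$y)
mean(train_pred != d1$y)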
------------------------------------------

# SVM for text classification (RTextTools)
# https://groups.google.com/forum/#!forum/rtexttools-help
library(RTextTools)
d <- read.csv("sunnyData.csv")   # the sunny/rainy toy data set (see the session log below)
attach(d)

# Create the document-term matrix from the Text column
dtMatrix <- create_matrix(d["Text"])
dtMatrix
# Note: create_matrix() (RTextTools 1.4.2) contains the line
#   if (attr(weighting, "Acronym") == "tf-idf") weight <- 1e-09
# which fails with "argument is of length zero" when originalMatrix is supplied
# (the prediction step below), because tm stores the attribute as lowercase "acronym".

# Configure the training container (all 11 documents used for training)
cont <- create_container(dtMatrix, d$IsSunny, trainSize = 1:11, virgin = FALSE)
cont

# Train an SVM model
SVM <- train_model(cont, "SVM", kernel = "linear", cost = 1)
SVM

# New data for prediction (the test documents)
Pred_d <- list("sunny sunny sunny rainy rainy", "rainy sunny rainy rainy", "hello", "", "this is another rainy world")
Pred_d

# Patch create_matrix() in place before building the prediction matrix:
# trace() opens the function source in an editor, where "Acronym" can be
# changed to "acronym" (see the error log at the end of this post).
trace("create_matrix", edit = TRUE)

# Create the prediction document-term matrix, reusing the training vocabulary
Pred_Matrix <- create_matrix(Pred_d, originalMatrix = dtMatrix)

# Create the corresponding container (labels are dummies; all documents are test documents)
Pred_size <- length(Pred_d)
Pred_cont <- create_container(Pred_Matrix, labels = rep(0, Pred_size), testSize = 1:Pred_size, virgin = FALSE)
Pred_cont

# Predict
results <- classify_model(Pred_cont, SVM)
results
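
# An alternative that avoids the create_matrix() bug altogether (a sketch, not
# the RTextTools route): build the document-term matrix directly with tm and
# train the SVM with e1071. The column names Text / IsSunny are assumed to
# match the sunnyData.csv layout used above.
library(tm)
library(e1071)
corpus   <- Corpus(VectorSource(d$Text))
dtm      <- DocumentTermMatrix(corpus)          # term-frequency weighting
train_x  <- as.matrix(dtm)
train_y  <- as.factor(d$IsSunny)
fit_text <- svm(train_x, train_y, kernel = "linear", cost = 1)

# Score new documents against the training vocabulary only
new_docs <- unlist(Pred_d)
new_dtm  <- DocumentTermMatrix(Corpus(VectorSource(new_docs)),
                               control = list(dictionary = Terms(dtm)))
predict(fit_text, as.matrix(new_dtm))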

---------------------------------------------------------------
Session log: installing the packages, hitting the create_matrix()
"Acronym" bug, and fixing it with trace("create_matrix", edit = TRUE)
---------------------------------------------------------------

package ‘tree’ successfully unpacked and MD5 sums checked
package ‘maxent’ successfully unpacked and MD5 sums checked
package ‘glmnet’ successfully unpacked and MD5 sums checked
package ‘tau’ successfully unpacked and MD5 sums checked
package ‘RTextTools’ successfully unpacked and MD5 sums checked

The downloaded binary packages are in
        C:\Users\Rohit\AppData\Local\Temp\Rtmp2bXxPY\downloaded_packages

> data(NYTimes)
Warning message:
In data(NYTimes) : data set ‘NYTimes’ not found
> data(IRIS)
Warning message:
In data(IRIS) : data set ‘IRIS’ not found
> ?iris

> d <- read.csv("C:/STAT/BRIDGE/Module-3/Week-100_Plus/______Main_SVM_RCode/SVM_R_CODE_GITHUB_SUNNY_RAINY/sunnyData.csv")
> View(d)
> attach(d)
> dtMatrix <- create_matrix(d["Text"])
Error: could not find function "create_matrix"
> library(RTextTools)
Loading required package: SparseM

Attaching package: ‘SparseM’

The following object is masked from ‘package:base’:

    backsolve

> dtMatrix <- create_matrix(d["Text"])
> dtMatrix
<<DocumentTermMatrix (documents: 11, terms: 2)>>
Non-/sparse entries: 17/5
Sparsity           : 23%
Maximal term length: 5
Weighting          : term frequency (tf)

> cont <- create_container(dtMatrix, d$IsSunny, trainSize=1:11, virgin=FALSE)
> cont
An object of class "matrix_container"
Slot "training_matrix":
An object of class "matrix.csr"
Slot "ra":
 [1] 1 1 2 1 1 1 1 2 3 1 2 1 2 1 2 2 1
Slot "ja":
 [1] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
Slot "ia":
 [1]  1  2  3  4  6  8  9 10 12 14 16 18
Slot "dimension":
[1] 11  2

Slot "classification_matrix":
An object of class "matrix.csr"
Slot "ra":
 [1] 1 1 2 1 1 1 1 2 3 1 2 1 2 1 2 2 1
Slot "ja":
 [1] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
Slot "ia":
 [1]  1  2  3  4  6  8  9 10 12 14 16 18
Slot "dimension":
[1] 11  2

Slot "training_codes":
 [1] 1  -1 1  -1 -1 -1 1  1  1  1  -1
Levels: -1 1

Slot "testing_codes":
 [1] 1  -1 1  -1 -1 -1 1  1  1  1  -1
Levels: -1 1

Slot "column_names":
[1] "rainy" "sunny"

Slot "virgin":
[1] FALSE

> d <- read.csv("C:/STAT/BRIDGE/Module-3/Week-100_Plus/______Main_SVM_RCode/SVM_R_CODE_GITHUB_SUNNY_RAINY/sunnyData.csv")
> SVM <- train_model(cont, "SVM", kernel="linear", cost=1)
> SVM

Call:
svm.default(x = container@training_matrix, y = container@training_codes,
    kernel = kernel, cost = cost, cross = cross, probability = TRUE,
    method = method)

Parameters:
   SVM-Type:  C-classification
 SVM-Kernel:  linear
       cost:  1
      gamma:  0.5

Number of Support Vectors:  4

> plot(SVM)
Error in ncol(data) : argument "data" is missing, with no default
> Pred_d <- list("sunny sunny sunny rainy rainy", "rainy sunny rainy rainy", "hello", "", "this is another rainy world")
> Pred_Matrix<- create_matrix(Pred_d, originalMatrix=dtMatrix)
Error in if (attr(weighting, "Acronym") == "tf-idf") weight <- 1e-09 :
  argument is of length zero

> Pred_d
[[1]]
[1] "sunny sunny sunny rainy rainy"

[[2]]
[1] "rainy sunny rainy rainy"

[[3]]
[1] "hello"

[[4]]
[1] ""

[[5]]
[1] "this is another rainy world"

> Pred_Matrix<- create_matrix(Pred_d, originalMatrix=dtMatrix)
Error in if (attr(weighting, "Acronym") == "tf-idf") weight <- 1e-09 :
  argument is of length zero

> tm::weightTfIdf
function (m, normalize = TRUE)
{
    isDTM <- inherits(m, "DocumentTermMatrix")
    if (isDTM)
        m <- t(m)
    if (normalize) {
        cs <- col_sums(m)
        if (any(cs == 0))
            warning("empty document(s): ", paste(Docs(m)[cs ==
                0], collapse = " "))
        names(cs) <- seq_len(nDocs(m))
        m$v <- m$v/cs[m$j]
    }
    rs <- row_sums(m > 0)
    if (any(rs == 0))
        warning("unreferenced term(s): ", paste(Terms(m)[rs ==
            0], collapse = " "))
    lnrs <- log2(nDocs(m)/rs)
    lnrs[!is.finite(lnrs)] <- 0
    m <- m * lnrs
    attr(m, "weighting") <- c(sprintf("%s%s", "term frequency - inverse document frequency",
        if (normalize) " (normalized)" else ""), "tf-idf")
    if (isDTM)
        t(m)
    else m
}
<environment: namespace:tm>
attr(,"class")
[1] "WeightFunction" "function"
attr(,"name")
[1] "term frequency - inverse document frequency"
attr(,"acronym")
[1] "tf-idf"

> session.info()
Error: could not find function "session.info"
> ??session.info()
> sessionInfo()
R version 3.1.3 (2015-03-09)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

locale:
[1] LC_COLLATE=English_India.1252  LC_CTYPE=English_India.1252  
[3] LC_MONETARY=English_India.1252 LC_NUMERIC=C                 
[5] LC_TIME=English_India.1252   

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base    

other attached packages:
[1] RTextTools_1.4.2 SparseM_1.6    

loaded via a namespace (and not attached):
 [1] bitops_1.0-6        caTools_1.17.1      class_7.3-12      
 [4] codetools_0.2-10    digest_0.6.8        e1071_1.6-4       
 [7] evaluate_0.5.5      foreach_1.4.2       formatR_1.0       
[10] glmnet_2.0-2        grid_3.1.3          htmltools_0.2.6   
[13] ipred_0.9-4         iterators_1.0.7     knitr_1.9         
[16] lattice_0.20-30     lava_1.4.0          magrittr_1.5      
[19] MASS_7.3-40         Matrix_1.1-5        maxent_1.3.3.1    
[22] NLP_0.1-8           nnet_7.3-9          parallel_3.1.3    
[25] prodlim_1.5.1       randomForest_4.6-10 Rcpp_0.12.0       
[28] rmarkdown_0.5.1     rpart_4.1-9         slam_0.1-32       
[31] splines_3.1.3       stringi_0.5-5       stringr_1.0.0     
[34] survival_2.38-1     tau_0.0-18          tm_0.6-2          
[37] tools_3.1.3         tree_1.0-36       

> RTextTools/R/create_matrix.R
Error: object 'RTextTools' not found
> data(NYTimes)
> View(NYTimes)
> ?tm::weightTfIdf
> trace("create_matrix",edit=TRUE)
Tracing function "create_matrix" in package "RTextTools"
[1] "create_matrix"

> Pred_Matrix<- create_matrix(Pred_d, originalMatrix=dtMatrix)
> Pred_size <- length(Pred_d)
> Pred_cont <- create_container(Pred_Matrix, labels=rep(0,Pred_size), testSize=1:Pred_size, virgin=FALSE)
> Pred_cont
An object of class "matrix_container"
Slot "training_matrix":
An object of class "matrix.csr"
Slot "ra":
[1] 2 3 3 1 1
Slot "ja":
[1] 1 2 1 2 1
Slot "ia":
[1] 1 3 5 5 5 6
Slot "dimension":
[1] 5 2

Slot "classification_matrix":
An object of class "matrix.csr"
Slot "ra":
[1] 2 3 3 1 1
Slot "ja":
[1] 1 2 1 2 1
Slot "ia":
[1] 1 3 5 5 5 6
Slot "dimension":
[1] 5 2

Slot "training_codes":
[1] 0 0 0 0 0
Levels: 0

Slot "testing_codes":
[1] 0 0 0 0 0
Levels: 0

Slot "column_names":
[1] "rainy" "sunny"

Slot "virgin":
[1] FALSE

> trace("create_matrix",edit=TRUE)
Tracing function "create_matrix" in package "RTextTools"
[1] "create_matrix"
> dtMatrix
<<DocumentTermMatrix (documents: 11, terms: 2)>>
Non-/sparse entries: 17/5
Sparsity           : 23%
Maximal term length: 5
Weighting          : term frequency (tf)

> create_matrix
Object with tracing code, class "functionWithTrace"
Original definition:
function (textColumns, language = "english", minDocFreq = 1,
    maxDocFreq = Inf, minWordLength = 3, maxWordLength = Inf,
    ngramLength = 1, originalMatrix = NULL, removeNumbers = FALSE,
    removePunctuation = TRUE, removeSparseTerms = 0, removeStopwords = TRUE,
    stemWords = FALSE, stripWhitespace = TRUE, toLower = TRUE,
    weighting = weightTf)
{
    stem_words <- function(x) {
        split <- strsplit(x, " ")
        return(wordStem(unlist(split), language = language))
    }
    tokenize_ngrams <- function(x, n = ngramLength) return(rownames(as.data.frame(unclass(textcnt(x,
        method = "string", n = n)))))
    control <- list(bounds = list(local = c(minDocFreq, maxDocFreq)),
        language = language, tolower = toLower, removeNumbers = removeNumbers,
        removePunctuation = removePunctuation, stopwords = removeStopwords,
        stripWhitespace = stripWhitespace, wordLengths = c(minWordLength,
            maxWordLength), weighting = weighting)
    if (ngramLength > 1) {
        control <- append(control, list(tokenize = tokenize_ngrams),
            after = 7)
    }
    else {
        control <- append(control, list(tokenize = scan_tokenizer),
            after = 4)
    }
    if (stemWords == TRUE && ngramLength == 1)
        control <- append(control, list(stemming = stem_words),
            after = 7)
    trainingColumn <- apply(as.matrix(textColumns), 1, paste,
        collapse = " ")
    trainingColumn <- sapply(as.vector(trainingColumn, mode = "character"),
        iconv, to = "UTF8", sub = "byte")
    corpus <- Corpus(VectorSource(trainingColumn), readerControl = list(language = language))
    matrix <- DocumentTermMatrix(corpus, control = control)
    if (removeSparseTerms > 0)
        matrix <- removeSparseTerms(matrix, removeSparseTerms)
    if (!is.null(originalMatrix)) {
        terms <- colnames(originalMatrix[, which(!colnames(originalMatrix) %in%
            colnames(matrix))])
        weight <- 0
        if (attr(weighting, "Acronym") == "tf-idf")
            weight <- 1e-09
        amat <- matrix(weight, nrow = nrow(matrix), ncol = length(terms))
        colnames(amat) <- terms
        rownames(amat) <- rownames(matrix)
        fixed <- as.DocumentTermMatrix(cbind(matrix[, which(colnames(matrix) %in%
            colnames(originalMatrix))], amat), weighting = weighting)
        matrix <- fixed
    }
    matrix <- matrix[, sort(colnames(matrix))]
    gc()
    return(matrix)
}
<environment: namespace:RTextTools>
## (to see the tracing code, look at body(object))

> results <- classify_model(Pred_cont,SVM)
> results
  SVM_LABEL  SVM_PROB
1         1 0.7410242
2        -1 0.8655672
3         1 0.5201115
4         1 0.5201115
5        -1 0.7089546
> View(results)
> library("e1071", lib.loc="~/R/win-library/3.1")
>
------------------------------------- TBD ---------------------------

# Graphical classification of text
# Plan: adapt the grid/contour plotting approach from the simulated-data section
# above (make.grid / predict over a grid / contour) to visualise the text SVM.


# --
# Error log (fixed with trace("create_matrix", edit = TRUE)):
#   Quitting from lines 3-25 (SVM_TEXT_.spin.Rmd)
#   Error in if (attr(weighting, "Acronym") == "tf-idf") weight <- 1e-09 :
#     argument is of length zero
#   Calls: <Anonymous> ... withCallingHandlers -> withVisible -> eval -> eval -> create_matrix
#   Execution halted
#
# Commands used while debugging:
#   sessionInfo()
#   ?tm::weightTfIdf
#   tm::weightTfIdf
#   C:\Users\Rohit\Documents\R\win-library\3.1\RTextTools\R   (local package source)
#   trace("create_matrix", edit = TRUE)