Social Network Analysis using R  - Twitter Data Analysis and Sentiment Analysis for a Network of Students 

Kindly note that this is a preliminary post: a sort of proof of concept with limited data and code.

Given Twitter network data for 200+ nodes (participants), we extract sentiments from their Tweets. As seen from the histograms of the sentiment analysis of the Twitter data, sentiment is more positive amongst the students of the 4th Year than amongst the students of the 2nd Year.

Also, as seen from the histograms of the sentiment analysis of the Twitter data, sentiment is much more positive in the Technology stream than in the Business stream of study.



## -------------------------------------------


library(igraph)

library(statnet) #statnet::update_statnet()

## Loading required package: network
# Step 1: restore the saved workspace file; it is expected to provide the
# network-class object `studentNetwork` used below.
load(file = "studentNetwork.RData")

# Step 2: draw the network with a title.
plot(x = studentNetwork, main = "Student Network")

## Network attributes:

##   vertices = 205
##   directed = FALSE
##   hyper = FALSE
##   loops = FALSE
##   multiple = FALSE
##   bipartite = FALSE
##  total edges = 203
##    missing edges = 0
##    non-missing edges = 203
##  density = 0.009708274


##   vertices = 205
##   directed = FALSE
##   hyper = FALSE
##   loops = FALSE
##   multiple = FALSE
##   bipartite = FALSE
##  total edges = 203
##    missing edges = 0
##    non-missing edges = 203
##  density = 0.009708274
##
## 4.2/ Vertex attributes: accessed using %v% operator
##
##  4.2.1/ Course_of_Study:
##    character valued attribute
##    attribute summary:
##          Business         Fine_Arts      Liberal_Arts Physical_Sciences
##               109                 4                68                 6
##        Technology
##                18
##
##  4.2.2/ Sex:
##    character valued attribute
##    attribute summary:
##   F   M
##  99 106

##  4.2.3/ StudentID:
##    integer valued attribute
##    205 values
##
##  4.2.4/ Tweets:
##    character valued attribute
##    attribute summary:
##    the 10 most common values are:___________truncated output _____
##  4.2.5/ Year:
##    numeric valued attribute
##    attribute summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   1.000   1.000   3.000   2.732   4.000   6.000
##
## 4.3/ No edge attributes. When present, edge attributes are accessed using the %e% operator.

library(parallel)
library(foreach)
# Extract the "Tweets" vertex attribute from the student network
std.twt=studentNetwork%v%"Tweets"

# Assign a sentiment score to each student's tweets.
# The index range 1:205 is hard-coded; it matches the 205 vertices reported in
# the network summary printed earlier in this file.
# NOTE(review): the loop body is cut off in this file - its closing "}" and the
# expression that yields each iteration's score are not shown here.
# NOTE(review): `Pos` and `Neg` are not defined in this part of the file;
# presumably they hold the positive and negative word lists - confirm.
# NOTE(review): no parallel backend registration is visible here; without one,
# %dopar% falls back to sequential execution and issues a warning.
tweet.score <- foreach(i=1:205,.combine='rbind') %dopar% { # ".combine='rbind'" binds the per-iteration results together row-wise
  # instead of returning them as a list
  # %dopar% requests parallel execution of the iterations
  words<-unlist(strsplit(std.twt[i],split="\\|")) # Split the i-th pipe-delimited ("|") string into its individual pieces
  Pos.match = match(words, unlist(Pos))
  Neg.match = match(words, unlist(Neg))
  Pos.match = !is.na(Pos.match) # TRUE where a word was found in Pos, FALSE otherwise
  # match() returns NA for a word with no match; is.na{base} flags those entries.
  Neg.match = !is.na(Neg.match)

## As seen in the histograms above, the sentiment is much more positive amongst the students of the 4th Year than amongst the students of the 2nd Year.

 

# Tweet-score histograms for two courses of study.
# Each call keeps only the scores whose student.CoS entry equals the stream.

# Business students
hist(
  x = tweet.score[student.CoS == "Business"],
  main = "Sentiment - Business",
  xlab = "Tweet Score",
  col = symbol
)

# Technology students
hist(
  x = tweet.score[student.CoS == "Technology"],
  main = "Sentiment - Technology",
  xlab = "Tweet Score",
  col = symbol
)


## -------------------------------------------


# Source Code Courtesy -- Jeff Leek (jtleek@gmail.com)

# source("http://biostat.jhsph.edu/~jleek/code/twitterMap.R")

# Main - https://github.com/JulianHill/R-Tutorials

# Also see (seems not to work on Windows) - https://github.com/vdimarco/twitterMap/blob/master/twitter.R



# Install devtools only when it is missing, so that re-running this script
# does not download and reinstall it every time.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}

# twitteR is installed from its GitHub repository (geoffjentry/twitteR).
devtools::install_github("geoffjentry/twitteR")

# # To be used -- install.packages("base64enc")

# devtools::install_version("httr", version="0.6.0", repos="http://cran.us.r-project.org")

library("RColorBrewer", lib.loc="~/R/win-library/3.1")

library("geosphere", lib.loc="~/R/win-library/3.1")

library("maps", lib.loc="~/R/win-library/3.1")


# Rohit Comment - Seen below packages from other Twitter Example 

library("XML", lib.loc="~/R/win-library/3.1")

library("twitteR", lib.loc="~/R/win-library/3.1")

library("Rcpp", lib.loc="~/R/win-library/3.1")

library("rjson", lib.loc="~/R/win-library/3.1")

library("xml2", lib.loc="~/R/win-library/3.1")

library("bit64", lib.loc="~/R/win-library/3.1")

library("httr", lib.loc="~/R/win-library/3.1")

library("RCurl", lib.loc="~/R/win-library/3.1")

library("Rcpp", lib.loc="~/R/win-library/3.1")

library("ROAuth", lib.loc="~/R/win-library/3.1")

library("stringr", lib.loc="~/R/win-library/3.1")

library("plyr", lib.loc="~/R/win-library/3.1")

library("digest", lib.loc="~/R/win-library/3.1")


# Download the CA certificate bundle into the working directory as "cacert.pem".
download.file(url = "http://curl.haxx.se/ca/cacert.pem", destfile = "cacert.pem")


# Twitter API credentials.
# Each value is taken from an environment variable when that variable is set,
# so real secrets never have to be written into this file. When the variable
# is unset the original masked placeholder is used, which keeps the previous
# behaviour unchanged.
api_key <- Sys.getenv("TWITTER_API_KEY", unset = "XcRuw#################umUG")

api_secret <- Sys.getenv("TWITTER_API_SECRET", unset = "IScXQXnz#############oGabr")

access_token <- Sys.getenv(
  "TWITTER_ACCESS_TOKEN",
  unset = "73090441-###############B3eI7kjBmq4Y"
)

access_token_secret <- Sys.getenv(
  "TWITTER_ACCESS_TOKEN_SECRET",
  unset = "za5MLQMPAA333333333333333333333vT"
)


# Authenticate this R session against the Twitter API.
setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)

## Rohit Comment - this OAuth setup needs to be run for every session, and
## also again after some break (the exact time-out is not known).

# Search Twitter for "DhankarRohit" and show the result.
RDT <- searchTwitter("DhankarRohit")

RDT

# searchTwitter("#rstats")

# # d.df=twListToDf(MLB.list) ### Rohit Comment ----  MLB.list is Doubtful 

# # ??twListToDf

# zz <- searchTwitter("#rstats")

# twListToDF(zz)


# -- Jeff Leek code - twitterMap("DhankarRohit") - 


# twitterMap: draw great-circle maps that link a Twitter user to the locations
# of their followers and/or of the accounts they follow, and save them as a PDF.
# Adapted from Jeff Leek's twitterMap code (jtleek@gmail.com).
#
# Arguments:
#   DhankarRohit  Twitter screen name to map, as a character string (for
#                 example "DhankarRohit"). The parameter keeps its original
#                 name so that existing calls continue to work.
#   userLocation  Location of the user. When NULL it is read from the user's
#                 Twitter profile.
#   fileName      Path of the PDF file to write.
#   nMax          Maximum number of followers / followed accounts to request.
#   plotType      One of "followers", "both" or "following". "both" draws a
#                 follower map and a following map on one page.
#
# Returns fileName, invisibly.
#
# Stops with an error when the user's location cannot be determined or cannot
# be converted to latitude/longitude, or when plotType is not a valid choice.
#
# NOTE(review): findLatLon(), getGreatCircle() and trim() are not defined in
# this part of the file; they are expected to come from Jeff Leek's
# twitterMap.R and must be available before this function is called.
twitterMap <- function(DhankarRohit, userLocation = NULL,
                       fileName = "twitterMap.pdf", nMax = 10,
                       plotType = c("followers", "both", "following")) {

  # Reduce the default choice vector to one value (the first, "followers") and
  # reject unknown plot types up front, before any data is requested.
  plotType <- match.arg(plotType)

  # Get location data
  cat("Getting data from Twitter, this may take a moment.\n")

  # The screen name is passed as a character string.
  tmp <- getUser(DhankarRohit)

  if (is.null(userLocation)) {
    userLocation <- trim(location(tmp))
    if (length(userLocation) == 0 || nchar(userLocation) < 2) {
      stop("We can not find your location from Twitter")
    }
  }

  # At most nMax followers and at most nMax followed accounts. The locations
  # are kept as lists so that the result type does not depend on the input.
  followers <- tmp$getFollowers(n = nMax)
  followersLocation <- lapply(followers, function(x) location(x))

  following <- tmp$getFriends(n = nMax)
  followingLocation <- lapply(following, function(x) location(x))

  # Load the geographic data. The US and Canada city tables used in the
  # original code by Jeff Leek stay disabled, as before.
  data(world.cities)

  # Find the latitude and longitude of the user (the user being "self").
  cat("Getting geographic (latitude/longitude) of Twitter users.\n")
  userLL <- findLatLon(userLocation)$latlon
  if (any(is.na(userLL))) {
    stop("We can't find the latitude and longitude of your location from Twitter")
  }

  # Build one row per account from a list of location strings. The row holds
  # the two values of findLatLon()$latlon, the distance to the user, and the
  # continent code. The result is always a 4-column matrix, also for zero or
  # one account.
  locate_accounts <- function(locations) {
    ll <- matrix(NA, nrow = length(locations), ncol = 4)
    for (i in seq_along(locations)) {
      if (length(locations[[i]]) > 0) {
        tmpLL <- findLatLon(trim(locations[[i]]))
        if (any(!is.na(tmpLL$latlon))) {
          ll[i, ] <- c(
            unlist(tmpLL$latlon),
            distCosine(userLL, tmpLL$latlon),
            unlist(tmpLL$cont)
          )
        }
      }
    }
    # Sort by decreasing distance so that the longest arcs are drawn first,
    # then drop the accounts that could not be located.
    ll <- ll[order(-ll[, 3]), , drop = FALSE]
    ll[!is.na(ll[, 1]), , drop = FALSE]
  }

  followersLL <- locate_accounts(followersLocation)
  followingLL <- locate_accounts(followingLocation)

  cat("Plotting results.\n")

  # Set up the colors; the continent code in column 4 indexes into this palette.
  cols <- brewer.pal(7, "Set2")
  continents <- c("Asia", "Africa", "N. America", "S. America",
                  "Australia/N.Z.", "Europe")

  # Draw one world map with an arc from the user to every located account and
  # a legend that counts the accounts of this same map per continent.
  draw_panel <- function(ll, title) {
    map("world", col = "#191919", bg = "black", fill = TRUE,
        mar = rep(0, 4), border = 0)
    mtext(title, col = "lightgrey")
    for (i in seq_len(nrow(ll))) {
      greatC <- getGreatCircle(userLL, ll[i, 1:2])
      lines(greatC, col = cols[ll[i, 4]], lwd = 0.8)
    }
    counts <- vapply(
      1:6,
      function(k) sum(ll[, 4] == k, na.rm = TRUE),
      integer(1)
    )
    legend(-180, 0, legend = paste(continents, counts),
           text.col = cols[1:6], bg = "black", cex = 0.75)
  }

  # "both" uses a taller page that holds two stacked maps.
  two_panels <- plotType == "both"
  pdf(fileName, height = if (two_panels) 12 else 6, width = 10)
  # Close the PDF device even when drawing fails part-way through.
  on.exit(dev.off(), add = TRUE)

  data(worldMapEnv)
  if (two_panels) {
    par(mfrow = c(2, 1), mar = rep(0, 4))
  }

  if (plotType %in% c("both", "followers")) {
    draw_panel(followersLL, paste0("@", DhankarRohit, " Follower Map"))
  }
  if (plotType %in% c("both", "following")) {
    draw_panel(followingLL, paste0("@", DhankarRohit, " Following Map"))
  }

  mtext("Created by @simplystats twitterMap", side = 1, adj = 1, cex = 0.8,
        col = "grey")

  invisible(fileName)
}


# Example run. It is placed after the definition so that twitterMap() exists
# when it is called.
twitterMap("DhankarRohit", userLocation = "Gurgaon", plotType = "both")


# Experiment --- to see how to get a Population of a City from 

# str(world.cities)

# How to find - population of Delhi ...in Data Frame pop_del<-

#  http://stackoverflow.com/questions/15303283/how-to-do-vlookup-and-fill-down-like-in-excel-in-r 

# Complete StackOverflow Page in the same Folder 

# The read.table call below turns a plain text entry into a data.frame.


# Build a small example data frame from inline text: one row per house, with
# the house type in the first column and its numeric code in the second.
houses <- read.table(
  text = paste(
    c(
      "Semi            1",
      "Single          2",
      "Row             3",
      "Single          2",
      "Apartment       4",
      "Apartment       4",
      "Row             3"
    ),
    collapse = "\n"
  ),
  col.names = c("HouseType", "HouseTypeNo")
)


External Links / Recommended Reads :- 

1/ http://arxiv.org/abs/1312.6635  --- Topic and Sentiment Analysis on Social Networks 
2/ https://www.youtube.com/watch?v=mK7qmZ2-wxc -- Analyzing Social Networks on Twitter