Filtering Sub Group Variables with R [ package - dplyr ]

UA-60924200-1

# - We create sub-group (clubbed) variables by using the filter() function from the dplyr library - 

library(dplyr)

>filter(WCD,Channel==1,Region==1)

> VAR_C1_R1<-filter(WCD,Channel==1,Region==1)

> VAR_C1_R1

> View(VAR_C1_R1)

> summary(VAR_C1_R1)

    Channel      Region      Fresh            Milk          Grocery          Frozen     

 Min.   :1   Min.   :1   Min.   :  514   Min.   :  258   Min.   :  489   Min.   :   91  

 1st Qu.:1   1st Qu.:1   1st Qu.: 4438   1st Qu.: 1071   1st Qu.: 1620   1st Qu.:  966  

 Median :1   Median :1   Median : 8656   Median : 2280   Median : 2576   Median : 1859  

 Mean   :1   Mean   :1   Mean   :12902   Mean   : 3870   Mean   : 4026   Mean   : 3127  

 3rd Qu.:1   3rd Qu.:1   3rd Qu.:18135   3rd Qu.: 4996   3rd Qu.: 5172   3rd Qu.: 4479  

 Max.   :1   Max.   :1   Max.   :56083   Max.   :23527   Max.   :16966   Max.   :18711  

 Detergents_Paper   Delicassen  

 Min.   :   5.0   Min.   :   7  

 1st Qu.: 237.0   1st Qu.: 374  

 Median : 412.0   Median : 749  

 Mean   : 950.5   Mean   :1197  

 3rd Qu.: 874.0   3rd Qu.:1622  

 Max.   :5828.0   Max.   :6854

# - We have created a data frame, VAR_C1_R1, which represents Channel 1, Region 1. We also create an integer vector, FRESH_VAR_C1_R1, so that we can plot boxplots and histograms for VAR_C1_R1$Fresh. We note that FRESH_VAR_C1_R1 has outliers which need to be capped. (Note: the BoxPlot() function used below appears to be the one from the lessR package, which would need to be loaded first.) 

FRESH_VAR_C1_R1<-VAR_C1_R1$Fresh

> head(FRESH_VAR_C1_R1)

[1] 30624 11686  9670 25203   583  1956

 > BoxPlot(FRESH_VAR_C1_R1, col.stroke="red", horiz=TRUE, col.fill="plum",xlab="_FRESH_VAR_C1_R1_",main="_Variable_FRESH_Channel-1_Region-1_with OUTLIERS_")

 --- _FRESH_VAR_C1_R1_, _Variable_FRESH_Channel-1_Region-1_with OUTLIERS_ ---

 Present: 59

Missing: 0

Total  : 59

 Minimum      : 514

Lower Whisker: 514

Lower Hinge  : 4438

Median       : 8656

Upper Hinge  : 18135

Upper Whisker: 31614

Maximum      : 56083

 

1st Quartile : 4438

3rd Quartile : 18135

IQR          : 13698

 

Number of outliers: 3

Small: none

Large: 47493  53205  56083 

 

> MILK_VAR_C1_R1<-VAR_C1_R1$Milk

> BoxPlot(MILK_VAR_C1_R1, col.stroke="black", horiz=TRUE, col.fill="green",xlab="_MILK_VAR_C1_R1_",main="_Variable_MILK_Channel-1_Region-1_with OUTLIERS_")

 

--- _MILK_VAR_C1_R1_, _Variable_MILK_Channel-1_Region-1_with OUTLIERS_ ---

 Present: 59

Missing: 0

Total  : 59

Minimum      : 258

Lower Whisker: 258

Lower Hinge  : 1071

Median       : 2280

Upper Hinge  : 4996

Upper Whisker: 10765

Maximum      : 23527

 

1st Quartile : 1071

3rd Quartile : 4996

IQR          : 3924

 

Number of outliers: 3

Small: none

Large: 11487  17972  23527 


# - Now we share the summary boxplot statistics for the rest of the variable groupings, which are: 

Frozen_VAR_C1_R1_

Detergents_VAR_C1_R1_

Delicassen_VAR_C1_R1_

GROCERY_VAR_C1_R1_

VAR_C1_R2_ 

Fresh_VAR_C1_R2_

Milk_VAR_C1_R2_

Frozen_VAR_C1_R2_

Detergents_Paper_VAR_C1_R2_

Delicassen_VAR_C1_R2_

VAR_C1_R3_

Fresh_VAR_C1_R3_

Milk_VAR_C1_R3_

Frozen_VAR_C1_R3_

Detergents_Paper_VAR_C1_R3_

Delicassen_VAR_C1_R3_

VAR_C2_R1_

Fresh_VAR_C2_R1_

Milk_VAR_C2_R1_

Frozen_VAR_C2_R1_

Detergents_Paper_VAR_C2_R1_

Delicassen_VAR_C2_R1_

VAR_C2_R2_

Fresh_VAR_C2_R2_

Milk_VAR_C2_R2_

Frozen_VAR_C2_R2_

Detergents_Paper_VAR_C2_R2_

Delicassen_VAR_C2_R2_

VAR_C2_R3_

Fresh_VAR_C2_R3_

Milk_VAR_C2_R3_

Frozen_VAR_C2_R3_

Detergents_Paper_VAR_C2_R3_

Delicassen_VAR_C2_R3_

## - for VAR_C1_R2_ 

> VAR_C1_R2_ <-filter(WCD,Channel==1,Region==2)

> summary(VAR_C1_R2_)

    Channel      Region      Fresh            Milk      

 Min.   :1   Min.   :2   Min.   :    3   Min.   :  333  

 1st Qu.:1   1st Qu.:2   1st Qu.: 4938   1st Qu.: 1146  

 Median :1   Median :2   Median : 9787   Median : 1560  

 Mean   :1   Mean   :2   Mean   :11651   Mean   : 2304  

 3rd Qu.:1   3rd Qu.:2   3rd Qu.:17032   3rd Qu.: 2345  

 Max.   :1   Max.   :2   Max.   :32717   Max.   :16784  

    Grocery          Frozen        Detergents_Paper   Delicassen    

 Min.   : 1330   Min.   :  264.0   Min.   :  15.0   Min.   :  51.0  

 1st Qu.: 2374   1st Qu.:  962.2   1st Qu.: 182.8   1st Qu.: 567.2  

 Median : 3352   Median : 2696.5   Median : 325.0   Median : 883.0  

 Mean   : 4396   Mean   : 5745.0   Mean   : 482.7   Mean   :1105.9  

 3rd Qu.: 5528   3rd Qu.: 4617.0   3rd Qu.: 707.0   3rd Qu.:1146.0  

 Max.   :13626   Max.   :60869.0   Max.   :1679.0   Max.   :5609.0  

> MILK_VAR_C1_R2_<-VAR_C1_R2_$Milk

> summary(MILK_VAR_C1_R2_)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 

    333    1146    1560    2304    2345   16780 

> BoxPlot(MILK_VAR_C1_R2_, col.stroke="black", horiz=TRUE, col.fill="green",xlab="_MILK_VAR_C1_R2_",main="_Variable_MILK_Channel-1_Region-2_with OUTLIERS_")

--- _MILK_VAR_C1_R2_, _Variable_MILK_Channel-1_Region-2_with OUTLIERS_ --- 


Present: 28 

Missing: 0 

Total  : 28 


Minimum      : 333 

Lower Whisker: 333 

Lower Hinge  : 1104 

Median       : 1560 

Upper Hinge  : 2354 

Upper Whisker: 3696 

Maximum      : 16784 


1st Quartile : 1146 

3rd Quartile : 2345 

IQR          : 1199 


Number of outliers: 1 

Small: none

Large: 16784  


Frozen_VAR_C1_R2_<- VAR_C1_R2_$Frozen

> BoxPlot(Frozen_VAR_C1_R2_, col.stroke="black", horiz=TRUE, col.fill="green",xlab="_FROZEN_VAR_C1_R2_",main="_Variable_FROZEN_Channel-1_Region-2_with OUTLIERS_")

--- _FROZEN_VAR_C1_R2_, _Variable_FROZEN_Channel-1_Region-2_with OUTLIERS_ --- 


Present: 28 

Missing: 0 

Total  : 28 

Minimum      : 264 

Lower Whisker: 264 

Lower Hinge  : 946 

Median       : 2696 

Upper Hinge  : 4787 

Upper Whisker: 9584 

Maximum      : 60869 

1st Quartile : 962 

3rd Quartile : 4617 

IQR          : 3655 


Number of outliers: 3 

Small: none

Large: 12569  15601  60869  


Detergents_Paper_VAR_C1_R2_<- VAR_C1_R2_$Detergents_Paper

> BoxPlot(Detergents_Paper_VAR_C1_R2_, col.stroke="black", horiz=TRUE, col.fill="green",xlab="_Detergents_Paper_VAR_C1_R2_",main="_Variable_Detergents_Paper_Channel-1_Region-2_with OUTLIERS_")

--- _Detergents_Paper_VAR_C1_R2_, _Variable_Detergents_Paper_Channel-1_Region-2_with OUTLIERS_ --- 

Present: 28 

Missing: 0 

Total  : 28 

Minimum      : 15 

Lower Whisker: 15 

Lower Hinge  : 182 

Median       : 325 

Upper Hinge  : 708 

Upper Whisker: 1470 

Maximum      : 1679 


1st Quartile : 183 

3rd Quartile : 707 

IQR          : 524 


Number of outliers: 1 

Small: none

Large: 1679  


#- In the same way, we create summary plots for all the other variables, mutate (cap) the outliers, and then inspect the summary statistics again while creating correlation plots ... 


Delicassen_VAR_C1_R2_<- VAR_C1_R2_$Delicassen

> BoxPlot(Delicassen_VAR_C1_R2_, col.stroke="black", horiz=TRUE, col.fill="green",xlab="_Delicassen_VAR_C1_R2_",main="_Variable_Delicassen_Channel-1_Region-2_with OUTLIERS_")

--- _Delicassen_VAR_C1_R2_, _Variable_Delicassen_Channel-1_Region-2_with OUTLIERS_ --- 


Present: 28 

Missing: 0 

Total  : 28 


Minimum      : 51 

Lower Whisker: 51 

Lower Hinge  : 562 

Median       : 883 

Upper Hinge  : 1175 

Upper Whisker: 1942 

Maximum      : 5609 


1st Quartile : 567 

3rd Quartile : 1146 

IQR          : 579 


Number of outliers: 2 

Small: none

Large: 2602  5609  


For the correlation plots created using corrplot() (from the corrplot package) - LINK