###################################
# Code for: R for Marketing Research and Analytics, 2nd ed: Chapter 11
#
# Authors:  Chris Chapman               Elea McDonnell Feit
#           cnchapman+rbook@gmail.com   efeit@drexel.edu
#
# Copyright 2019, Springer
#
# Last update: May 14, 2019
# Version: 2.01
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
#
# You may obtain a copy of the License at
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#################################################################
# BRIEF HOW TO USE
# This file contains scripts used in Chapter 11 of Chapman & Feit (2019),
#   "R for Marketing Research and Analytics, 2nd edition", Springer.
#################################################################

# R code for Chapter 11 -- segmentation

# UPDATE May 2019:
# set random number generator to use pre R 3.6 method, to match book
# this affects data simulation and Bayesian methods, which use randomization
# ==> Run this line:
if (getRversion() >= "3.6.0") suppressWarnings(RNGversion("3.5.0"))
# you could always change back to current with:
# RNGversion(getRversion())


# load chapter 5 data
# ==> locally, if saved earlier
load("~/segdf-Rintro-Ch5.RData")
seg.raw <- seg.df
seg.df  <- seg.raw[ , -7]

# OR from web site:
seg.raw <- read.csv("http://goo.gl/qw303p")
seg.df  <- seg.raw[ , -7]     # a copy without the known segment assignments

summary(seg.df)

# a simple function to report means by group
seg.summ <- function(data, groups) {
  aggregate(data, list(groups), function(x) mean(as.numeric(x)))
}

seg.summ(seg.df, seg.raw$Segment)


#### HCLUST

# distance example -- by hand
c(1,2,3) - c(2,3,2)
sum((c(1,2,3) - c(2,3,2))^2)
sqrt(sum((c(1,2,3) - c(2,3,2))^2))

# using dist()
dist(rbind(c(1,2,3), c(2,3,2)))

# distances using numeric columns of seg.df (first 5 observations)
d <- dist(seg.df[, c("age", "income", "kids")])
as.matrix(d)[1:5, 1:5]
rm(d)    # clean up, not in book

# now the real hclust() work
library(cluster)                  # daisy works with mixed data types
seg.dist <- daisy(seg.df)
# inspect some of the results
as.matrix(seg.dist)[1:5, 1:5]

seg.hc <- hclust(seg.dist, method="complete")
plot(seg.hc)

# zoom in on just part of it
plot(cut(as.dendrogram(seg.hc), h=0.5)$lower[[1]])

# check some of the proposed similarities
seg.df[c(101, 107), ]   # similar
seg.df[c(278, 294), ]   # similar
seg.df[c(173, 141), ]   # less similar

# examine cophenetic correlation
cor(cophenetic(seg.hc), seg.dist)

# see hclust's proposal for 4 groups
plot(seg.hc)
rect.hclust(seg.hc, k=4, border="red")

# actually get 4 groups
seg.hc.segment <- cutree(seg.hc, k=4)     # membership vector for 4 groups
table(seg.hc.segment)

# what did hclust come up with?
seg.summ(seg.df, seg.hc.segment)

# plot this
plot(jitter(as.numeric(seg.df$gender)) ~ 
     jitter(as.numeric(seg.df$subscribe)), 
     col=seg.hc.segment, yaxt="n", xaxt="n", ylab="", xlab="")
axis(1, at=c(1, 2), labels=c("Subscribe: No", "Subscribe: Yes"))
axis(2, at=c(1, 2), labels=levels(seg.df$gender))
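# OPTIONAL (not in the book): a minimal sketch to inspect other group counts
# from the same tree before settling on k=4. It shows group sizes from cutree()
# and the average silhouette width (cluster::silhouette, using the daisy()
# distances) for k = 2..6; a higher average silhouette suggests better
# separated groups. Exploratory only, not the authors' procedure.
sapply(2:6, function(k) table(cutree(seg.hc, k=k)))
sapply(2:6, function(k)
  mean(silhouette(cutree(seg.hc, k=k), seg.dist)[ , "sil_width"]))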
#### K-MEANS

# convert factor variables to numeric (kmeans requires). OK b/c all are binary.
seg.df.num <- seg.df
seg.df.num$gender    <- ifelse(seg.df$gender=="Male", 0, 1)
seg.df.num$ownHome   <- ifelse(seg.df$ownHome=="ownNo", 0, 1)
seg.df.num$subscribe <- ifelse(seg.df$subscribe=="subNo", 0, 1)
summary(seg.df.num)

set.seed(96743)
seg.k <- kmeans(seg.df.num, centers=4)

# inspect it
seg.summ(seg.df, seg.k$cluster)

# plot one of the variables
boxplot(seg.df.num$income ~ seg.k$cluster, ylab="Income", xlab="Cluster")

# plot the result
library(cluster)
clusplot(seg.df, seg.k$cluster, color=TRUE, shade=TRUE, 
         labels=4, lines=0, main="K-means cluster plot")


#### MCLUST
# do mclust for segments
library(mclust)

###
# convert factor variables to numeric (mclust requires). OK b/c all are binary.
# these lines are the same as above for k-means [not repeated in book]
seg.df.num <- seg.df
seg.df.num$gender    <- ifelse(seg.df$gender=="Male", 0, 1)
seg.df.num$ownHome   <- ifelse(seg.df$ownHome=="ownNo", 0, 1)
seg.df.num$subscribe <- ifelse(seg.df$subscribe=="subNo", 0, 1)
summary(seg.df.num)
###

# fit the model
seg.mc <- Mclust(seg.df.num)
summary(seg.mc)

# what if we estimate 4 clusters?
seg.mc4 <- Mclust(seg.df.num, G=4)
summary(seg.mc4)

# compare the two models
BIC(seg.mc, seg.mc4)

# examine the 3-cluster model
seg.summ(seg.df, seg.mc$class)

# check all solutions up to 9 clusters
mclustBIC(seg.df.num)

# plot the 3-cluster model
library(cluster)
clusplot(seg.df, seg.mc$class, color=TRUE, shade=TRUE, 
         labels=4, lines=0, main="Model-based cluster plot")


#### poLCA

# recode the continuous variables at their medians, so all variables are categorical
seg.df.cut <- seg.df
seg.df.cut$age    <- factor(ifelse(seg.df$age < median(seg.df$age), 1, 2))
seg.df.cut$income <- factor(ifelse(seg.df$income < median(seg.df$income), 1, 2))
seg.df.cut$kids   <- factor(ifelse(seg.df$kids < median(seg.df$kids), 1, 2))
summary(seg.df.cut)

# create a model formula
seg.f <- with(seg.df.cut, 
              cbind(age, gender, income, kids, ownHome, subscribe) ~ 1)

# fit the model
library(poLCA)
set.seed(02807)
seg.LCA3 <- poLCA(seg.f, data=seg.df.cut, nclass=3)
seg.LCA4 <- poLCA(seg.f, data=seg.df.cut, nclass=4)

seg.LCA4$bic
seg.LCA3$bic

# examine the solutions
# 3 clusters
seg.summ(seg.df, seg.LCA3$predclass)
table(seg.LCA3$predclass)
clusplot(seg.df, seg.LCA3$predclass, color=TRUE, shade=TRUE, 
         labels=4, lines=0, main="LCA plot (K=3)")

# 4 clusters
seg.summ(seg.df, seg.LCA4$predclass)
table(seg.LCA4$predclass)
clusplot(seg.df, seg.LCA4$predclass, color=TRUE, shade=TRUE, 
         labels=4, lines=0, main="LCA plot (K=4)")

# compare 3-cluster and 4-cluster solutions
table(seg.LCA3$predclass, seg.LCA4$predclass)

library(mclust)
mapClass(seg.LCA3$predclass, seg.LCA4$predclass)
adjustedRandIndex(seg.LCA3$predclass, seg.LCA4$predclass)

# compare random assignment to LCA4
set.seed(11021)
random.data <- sample(4, length(seg.LCA4$predclass), replace=TRUE)
adjustedRandIndex(random.data, seg.LCA4$predclass)

# compare to known segments
table(seg.raw$Segment, seg.LCA4$predclass)
adjustedRandIndex(seg.raw$Segment, seg.LCA4$predclass)
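# OPTIONAL (not in the book): a minimal sketch that fits a range of class
# counts and compares BIC (lower is preferred), extending the 3- vs 4-class
# comparison above. Object name seg.LCA.list is illustrative; nrep restarts
# help guard against local maxima in poLCA.
set.seed(02807)
seg.LCA.list <- lapply(2:5, function(k)
  poLCA(seg.f, data=seg.df.cut, nclass=k, nrep=3, verbose=FALSE))
sapply(seg.LCA.list, function(m) m$bic)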
#### CLASSIFICATION
#### NAIVE BAYES
# install.packages("e1071")   # if needed
set.seed(04625)
train.prop   <- 0.65
train.cases  <- sample(nrow(seg.raw), nrow(seg.raw)*train.prop)
seg.df.train <- seg.raw[train.cases, ]
seg.df.test  <- seg.raw[-train.cases, ]

library(e1071)
(seg.nb <- naiveBayes(Segment ~ ., data=seg.df.train))

(seg.nb.class <- predict(seg.nb, seg.df.test))

# frequencies in predicted data
prop.table(table(seg.nb.class))

# plot it
clusplot(seg.df.test[, -7], seg.nb.class, color=TRUE, shade=TRUE, 
         labels=4, lines=0, main="Naive Bayes classification, holdout data")

# compare to known segments (which we can do with this test data)
mean(seg.df.test$Segment==seg.nb.class)

# adjusted for chance
library(mclust)
adjustedRandIndex(seg.nb.class, seg.df.test$Segment)

table(seg.nb.class, seg.df.test$Segment)

# summary data for proposed segments in the test data
seg.summ(seg.df.test, seg.nb.class)
# summary data for the known segments in the test data
seg.summ(seg.df.test, seg.df.test$Segment)

# predict raw probabilities
predict(seg.nb, seg.df.test, type="raw")


#### random forest
library(randomForest)
set.seed(98040)
(seg.rf <- randomForest(Segment ~ ., data=seg.df.train, ntree=3000))


######## START digression: visualize trees
#
# The following function is from:
# P. Caldon, "to.dendrogram", posted Dec 2012, at http://stats.stackexchange.com/a/45161
# retrieved Apr 9, 2014
#
# This function is NOT reproduced in the book, only here for convenience

to.dendrogram <- function(dfrep, rownum=1, height.increment=0.1) {
  if(dfrep[rownum, 'status'] == -1) {
    rval <- list()
    attr(rval, "members") <- 1
    attr(rval, "height")  <- 0.0
    attr(rval, "label")   <- dfrep[rownum, 'prediction']
    attr(rval, "leaf")    <- TRUE
  } else {   ## note: "to.dendrogram", not "to.dendogram"
    left  <- to.dendrogram(dfrep, dfrep[rownum, 'left daughter'], height.increment)
    right <- to.dendrogram(dfrep, dfrep[rownum, 'right daughter'], height.increment)
    rval  <- list(left, right)
    attr(rval, "members")  <- attr(left, "members") + attr(right, "members")
    attr(rval, "height")   <- max(attr(left, "height"), attr(right, "height")) + height.increment
    attr(rval, "leaf")     <- FALSE
    attr(rval, "edgetext") <- dfrep[rownum, 'split var']
  }
  class(rval) <- "dendrogram"
  return(rval)
}
##### End of function from P. Caldon.

##### Following code generates chart in the book; code is not in the book
tree1 <- getTree(seg.rf, 987, labelVar=TRUE)
d1 <- to.dendrogram(tree1)
tree2 <- getTree(seg.rf, 1245, labelVar=TRUE)
d2 <- to.dendrogram(tree2)
plot(d1, center=TRUE, edgePar=list(t.cex=0.8, p.col=NA, p.lty=0), 
     main="Example: Tree 987", yaxt="n")
plot(d2, center=TRUE, edgePar=list(t.cex=0.8, p.col=NA, p.lty=0), 
     main="Example: Tree 1245", yaxt="n")
########
######## END digression: visualize trees


# predict the test data for random forest
seg.rf.class <- predict(seg.rf, seg.df.test)   # predicted classes

# plot the solution
library(cluster)
clusplot(seg.df.test[, -7], seg.rf.class, color=TRUE, shade=TRUE, 
         labels=4, lines=0, main="Random Forest classification, holdout data")

# get the individual prediction distribution
seg.rf.class.all <- predict(seg.rf, seg.df.test, predict.all=TRUE)

# look at the distribution for the first 5 test data cases
apply(seg.rf.class.all$individual[1:5, ], 1, function(x) prop.table(table(x)))

# summaries for the proposed and actual segments
seg.summ(seg.df.test, seg.rf.class)          # proposed segments
seg.summ(seg.df.test, seg.df.test$Segment)   # actual segments

# confusion matrix in test data
mean(seg.df.test$Segment==seg.rf.class)
table(seg.df.test$Segment, seg.rf.class)

library(mclust)
adjustedRandIndex(seg.df.test$Segment, seg.rf.class)


### random forest variable importance
set.seed(98040)
(seg.rf <- randomForest(Segment ~ ., data=seg.df.train, ntree=3000, 
                        importance=TRUE))

importance(seg.rf)
varImpPlot(seg.rf, main="Variable importance by segment")

library(gplots)
library(RColorBrewer)
heatmap.2(t(importance(seg.rf)[ , 1:4]), 
          col=brewer.pal(9, "Blues"), 
          dend="none", trace="none", key=FALSE, 
          margins=c(10, 10), 
          main="Variable importance by segment"
)
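# OPTIONAL (not in the book): a tabular complement to the heatmap -- rank the
# predictors by overall permutation importance. "MeanDecreaseAccuracy" is the
# overall column returned by importance() when the forest was fit with
# importance=TRUE, as above. A quick sketch for inspection only.
round(sort(importance(seg.rf)[ , "MeanDecreaseAccuracy"], decreasing=TRUE), 3)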
#### predict subscription status
#### using random forest

set.seed(92118)
train.prop   <- 0.65
train.cases  <- sample(nrow(seg.df), nrow(seg.df)*train.prop)
sub.df.train <- seg.raw[train.cases, ]
sub.df.test  <- seg.raw[-train.cases, ]

# see how differentiated the subscribers are, in the training data
clusplot(sub.df.train[, -6], sub.df.train$subscribe, color=TRUE, shade=TRUE, 
         labels=4, lines=0, main="Subscriber clusters, training data")

library(randomForest)
set.seed(11954)
(sub.rf <- randomForest(subscribe ~ ., data=sub.df.train, ntree=3000))

# try again, with balanced classes using sampsize
set.seed(11954)
(sub.rf <- randomForest(subscribe ~ ., data=sub.df.train, ntree=3000, 
                        sampsize=c(25, 25)) )

# predict the holdout data
sub.rf.sub <- predict(sub.rf, sub.df.test)

# confusion matrix
table(sub.rf.sub, sub.df.test$subscribe)

library(mclust)
adjustedRandIndex(sub.rf.sub, sub.df.test$subscribe)

library(psych)
cohen.kappa(cbind(sub.rf.sub, sub.df.test$subscribe))
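# OPTIONAL (not in the book): with imbalanced classes, overall accuracy can be
# misleading. A minimal sketch of per-class recall (sensitivity) computed from
# the holdout confusion matrix; conf.mat is an illustrative name, not from the book.
conf.mat <- table(predicted=sub.rf.sub, actual=sub.df.test$subscribe)
diag(conf.mat) / colSums(conf.mat)    # proportion of each actual class recovered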