###################################
# EXERCISE Code: R for Marketing Research and Analytics, 2nd ed: Chapter 11
#
# Authors: Chris Chapman               Elea McDonnell Feit
#          cnchapman+rbook@gmail.com   efeit@drexel.edu
#
# Copyright 2019, Springer
#
# Last update: March 31, 2019
# Version: 1.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#################################################################

# BRIEF HOW TO USE
# This file contains answers to exercises in Chapter 11 of Chapman & Feit
# (2019), "R for Marketing Research and Analytics, 2nd edition", Springer.
#
# RECOMMENDATION
# 1. Read the comments -- or the text in the book -- for the questions.
# 2. Try to answer each question on your own in a separate R file.
# 3. Compare your solution to the ones listed here. There are often many
#    different ways to solve a problem in R!
# 4. Step through the code carefully and make sure you understand each line.
#################################################################

# chapter 11 exercises
# chapman & feit 2019

# SETUP
# Load the data, then omit the known segment assignments.
# Read the data from the web site or locally.

# ONLINE
seg.ex.raw <- read.csv("https://goo.gl/s1KEiF")   # original with segments
# OR get from LOCAL -- set the folder path to wherever you downloaded it
seg.ex.raw <- read.csv("music-sub.csv")           # original with segments

seg.ex         <- seg.ex.raw      # copy without segments
seg.ex$Segment <- NULL
summary(seg.ex.raw)
summary(seg.ex)


## CLUSTERING

# 1. In the chapter, we suggested using a quick function to examine group
#    differences. Develop a quick summary function for the seg.ex data.
#    Demonstrate its basic usage.

# this is only ONE of an infinite number of possibilities ...
ex.summ <- function(data, seg=data$Segment) {   # set a default for "seg"
  aggregate(. ~ seg, data=data, mean)           # seg can be replaced by any var
}

# demonstration
ex.summ(seg.ex.raw)            # defaults to grouping by Segment
ex.summ(seg.ex, seg.ex$sex)    # use different data & grouping variable


# HCLUST
# 2. Using hierarchical clustering, cluster the seg.ex data. Cut it into
#    some number of segments, and visualize those. How many segments did
#    you choose, and why?

library(cluster)
ex.dist <- daisy(seg.ex)
ex.hc   <- hclust(ex.dist, method="complete")

# see hclust's proposal for 5 groups (note: many possible answers)
plot(ex.hc)
rect.hclust(ex.hc, k=5, border="red")

# get the membership vector for the 5-group solution
seg.ex.segment <- cutree(ex.hc, k=5)
table(seg.ex.segment)

# what did hclust come up with?
ex.summ(seg.ex, seg.ex.segment)


# 3. Are the hclust() results interesting? Show a plot that demonstrates
#    why or why not.
# [I'd say the result is not very interesting ... why not?]

plot(jitter(as.numeric(seg.ex$commuteCar)) ~
       jitter(as.numeric(seg.ex$subscribeToMusic)),
     col=seg.ex.segment, yaxt="n", xaxt="n", ylab="", xlab="")
axis(1, at=c(1, 2), labels=c("Subscribe: No", "Subscribe: Yes"))
axis(2, at=c(0, 1), labels=c("Commute: No", "Commute: Yes"))
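
# OPTIONAL CHECK (an addition, not part of the book's printed answers): as
# with the main chapter example, one way to judge how faithfully the
# dendrogram represents the original dissimilarities is the cophenetic
# correlation coefficient; values near 1 suggest a good representation.
cor(cophenetic(ex.hc), ex.dist)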

# KMEANS
# 4. Using kmeans, find a four-group solution for the data. (Note: you'll
#    need to do some data conversion first.) Is the solution interesting?
#    Plot two of the continuous variables by segment.

# convert factor variables to numeric (kmeans requires it). OK b/c all are binary.
seg.ex.num     <- seg.ex
seg.ex.num$sex <- ifelse(seg.ex.num$sex=="Male", 0, 1)
seg.ex.num$subscribeToMusic <- ifelse(seg.ex.num$subscribeToMusic=="subNo", 0, 1)
summary(seg.ex.num)

set.seed(96743)
seg.k <- kmeans(seg.ex.num, centers=4)

# inspect it
ex.summ(seg.ex, seg.k$cluster)

# plot income and age by the segments found
boxplot(seg.ex.num$householdIncome ~ seg.k$cluster, ylab="Income", xlab="Cluster")
boxplot(seg.ex.num$age ~ seg.k$cluster, ylab="Age", xlab="Cluster")


# 5. Plot the clusters by principal components. How do you interpret this plot?

library(cluster)
clusplot(seg.ex.num, seg.k$cluster, color=TRUE, shade=TRUE,
         labels=4, lines=0, main="K-means cluster plot")


# MCLUST
# 6. Use the mclust package to fit a model-based cluster solution for the
#    music subscription data. How many clusters does it suggest? Are they
#    well-differentiated? How do they compare to the K-means solution from
#    exercise 5?

library(mclust)
ex.mc <- Mclust(seg.ex.num)
summary(ex.mc)

# examine the 3-cluster model
ex.summ(seg.ex, ex.mc$class)

# plot the 3-cluster model
library(cluster)
clusplot(seg.ex, ex.mc$class, color=TRUE, shade=TRUE,
         labels=4, lines=0, main="Model-based cluster plot")


# 7. Mclust can also fit a specified number of clusters (parameter ``G'').
#    Fit solutions for G=2 and G=4 clusters. When fit to the data, are they
#    much worse than Mclust's suggested G=3 solution?

ex.mc2 <- Mclust(seg.ex.num, G=2)
ex.mc4 <- Mclust(seg.ex.num, G=4)

# compare the models
BIC(ex.mc, ex.mc2, ex.mc4)


# POLYTOMOUS CLUSTERING
# 8. Prepare the data for poLCA, recoding variables to binary factors,
#    splitting as follows: age: less than 30, or 30+. Household income:
#    less than 55000, or greater. Kids at home: 0, or more than 0. Music
#    enthusiasm: a score of 5 or higher, or less than that. We will not
#    use miles driven or driving enthusiasm for this exercise.

# Prepare the data for poLCA
seg.ex.po     <- seg.ex
seg.ex.po$age <- factor(ifelse(seg.ex.po$age < 30, "LessThan30", "Over30"))
seg.ex.po$householdIncome <- factor(ifelse(seg.ex.po$householdIncome < 55000,
                                           "LessThan55k", "Over55k"))
seg.ex.po$kidsAtHome   <- factor(ifelse(seg.ex.po$kidsAtHome > 0,
                                        "Kids", "NoKids"))
seg.ex.po$musicEnthuse <- factor(ifelse(seg.ex.po$musicEnthuse > 4,
                                        "MusicEnthuse", "NoEnthuse"))
seg.ex.po$commuteCar   <- factor(seg.ex.po$commuteCar,
                                 labels=c("noCarCommute", "yesCarCommute"))
seg.ex.po$milesDrive     <- NULL
seg.ex.po$drivingEnthuse <- NULL
summary(seg.ex.po)
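
# OPTIONAL CHECK (an addition, not part of the book's printed answers):
# cross-tabulating each recoded factor against its source variable is a
# quick way to confirm the splits landed where intended; each row should
# map to exactly one column.
table(seg.ex$age < 30,                seg.ex.po$age)
table(seg.ex$householdIncome < 55000, seg.ex.po$householdIncome)
table(seg.ex$kidsAtHome > 0,          seg.ex.po$kidsAtHome)
table(seg.ex$musicEnthuse > 4,        seg.ex.po$musicEnthuse)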

# 9. Fit polytomous latent class models with 3-class and 4-class solutions
#    to the data. Visualize them. How different are the two solutions in
#    respondents' assignments? Which one is more useful? (Note: solutions
#    depend in part on the random number sequence.)

# create a model formula
seg.f <- with(seg.ex.po,
              cbind(age, sex, householdIncome, kidsAtHome,
                    commuteCar, musicEnthuse, subscribeToMusic) ~ 1)

# fit the models
library(poLCA)
set.seed(55605)
seg.LCA3 <- poLCA(seg.f, data=seg.ex.po, nclass=3)
seg.LCA4 <- poLCA(seg.f, data=seg.ex.po, nclass=4)

seg.LCA4$bic
seg.LCA3$bic

# examine the solutions
# 3 clusters
ex.summ(seg.ex, seg.LCA3$predclass)
table(seg.LCA3$predclass)
clusplot(seg.ex, seg.LCA3$predclass, color=TRUE, shade=TRUE,
         labels=4, lines=0, main="LCA plot (K=3)")

# 4 clusters
ex.summ(seg.ex, seg.LCA4$predclass)
table(seg.LCA4$predclass)
clusplot(seg.ex, seg.LCA4$predclass, color=TRUE, shade=TRUE,
         labels=4, lines=0, main="LCA plot (K=4)")

# compare the 3-cluster and 4-cluster solutions
table(seg.LCA3$predclass, seg.LCA4$predclass)


## CLASSIFICATION

# 10. Split the music subscription data set --- with the segment
#     assignments --- into 65%/35% sets for classification model training
#     and assessment. Compare the two sets. Are they suitably similar?
#     (Note: be sure to set a random number seed for replicability.)

# split data into 65/35 training and test samples
set.seed(05458)
samp.train <- sample(nrow(seg.ex.raw), nrow(seg.ex.raw) * 0.65)
ex.train   <- seg.ex.raw[ samp.train, ]
ex.test    <- seg.ex.raw[-samp.train, ]

# compare summaries for them
ex.summ(ex.train)
ex.summ(ex.test)

# income looks the most different; test it
t.test(ex.train$householdIncome, ex.test$householdIncome)
# ==> "kids and talk" segment differs, but overall non-significant


# 11. Fit a naive Bayes model to predict segment membership from the other
#     variables in the training data. Check its performance on the test
#     data. Does it perform better than chance?

library(e1071)
(seg.nb       <- naiveBayes(Segment ~ ., data=ex.train))
(seg.nb.class <- predict(seg.nb, ex.test))

# frequencies in the predicted data
prop.table(table(seg.nb.class))

# plot it
clusplot(ex.test[, -10], seg.nb.class, color=TRUE, shade=TRUE,
         labels=4, lines=0, main="Naive Bayes classification, holdout data")

# compare to the known segments
mean(ex.test$Segment==seg.nb.class)
table(seg.nb.class, ex.test$Segment)

# adjusted for chance
library(mclust)
adjustedRandIndex(seg.nb.class, ex.test$Segment)


# 12. Fit a random forest model to predict segment membership. What is its
#     out-of-bag error rate? Did you do anything to control class imbalance?

library(randomForest)
set.seed(05458)
(seg.rf <- randomForest(Segment ~ ., data=ex.train, ntree=3000,
                        sampsize=rep(36, 6)))  # balanced draw across segments


# 13. With the random forest model, predict segments in the test data.
#     Compare those to the actual segments. Is it better than chance?

seg.pred <- predict(seg.rf, ex.test)

# confusion matrix
table(seg.pred, ex.test$Segment)

library(mclust)
adjustedRandIndex(seg.pred, ex.test$Segment)
library(psych)
cohen.kappa(cbind(seg.pred, ex.test$Segment))


# 14. In the random forest model, which variables are most important for
#     the prediction?

set.seed(05458)
(seg.rf2 <- randomForest(Segment ~ ., data=ex.train, ntree=3000,
                         sampsize=rep(36, 6), importance=TRUE))
varImpPlot(seg.rf2, main="Variable importance by segment")

library(gplots)
library(RColorBrewer)
heatmap.2(t(importance(seg.rf2)[ , 1:6]),
          col=brewer.pal(9, "Blues"),
          dend="none", trace="none", key=FALSE, margins=c(10, 10),
          main="Variable importance by segment")
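
# OPTIONAL NOTE (an addition, not part of the book's printed answers): the
# calls above hardcode sampsize=rep(36, 6) to draw a balanced sample from
# the six segments. A more general alternative is to derive the per-class
# draw from the training data itself; this assumes Segment is a factor.
seg.n <- min(table(ex.train$Segment))       # size of the smallest segment
set.seed(05458)
seg.rf.alt <- randomForest(Segment ~ ., data=ex.train, ntree=3000,
                           sampsize=rep(seg.n, nlevels(ex.train$Segment)))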

# 15. Predict subscription status in the data, using a random forest model.
#     How well does it predict the test data? Which variables are most
#     important?

set.seed(05458)
(seg.rf3 <- randomForest(subscribeToMusic ~ ., data=ex.train, ntree=5000,
                         sampsize=c(61, 61), importance=TRUE))

# holdout prediction
seg.pred <- predict(seg.rf3, ex.test)

# confusion matrix
table(seg.pred, ex.test$subscribeToMusic)

library(mclust)
adjustedRandIndex(seg.pred, ex.test$subscribeToMusic)
library(psych)
cohen.kappa(cbind(seg.pred, ex.test$subscribeToMusic))

# variable importance
varImpPlot(seg.rf3, main="Variable importance for subscription")


# 16. (stretch exercise) Use the hotel satisfaction data from the
#     exercises in Chapter 7. Build a predictive model to predict elite
#     status (as the ``segment'' of interest). How well does it perform?
#
# ... code omitted because there are many possible paths! One hedged
# sketch of a possible path follows below.
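
# OPTIONAL SKETCH (an addition, not the book's answer): one of many possible
# paths, shown commented out because the data location and the column name
# eliteStatus are placeholders -- substitute the file and variable names
# from your Chapter 7 exercise data before running.
#
# hotel.df <- read.csv("PATH-TO-CHAPTER-7-HOTEL-DATA")   # placeholder path
# hotel.df$eliteStatus <- factor(hotel.df$eliteStatus)   # placeholder name
#
# set.seed(98245)
# hotel.samp <- sample(nrow(hotel.df), nrow(hotel.df) * 0.65)
# library(randomForest)
# (hotel.rf  <- randomForest(eliteStatus ~ ., data=hotel.df[hotel.samp, ],
#                            ntree=3000, importance=TRUE))
# hotel.pred <- predict(hotel.rf, hotel.df[-hotel.samp, ])
# table(hotel.pred, hotel.df$eliteStatus[-hotel.samp])   # confusion matrix
# library(mclust)
# adjustedRandIndex(hotel.pred, hotel.df$eliteStatus[-hotel.samp])
# varImpPlot(hotel.rf, main="Variable importance for elite status")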