###################################
# Code for: R for Marketing Research and Analytics, 2nd ed: Chapter 5
#
# Authors:  Chris Chapman               Elea McDonnell Feit
#           cnchapman+rbook@gmail.com   efeit@drexel.edu
#
# Copyright 2019, Springer 
#
# Last update: May 14, 2019
# Version: 2.01
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
#
# You may obtain a copy of the License at
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#################################################################
# BRIEF HOW TO USE
# This file contains scripts used in Chapter 5 of Chapman & Feit (2019),
#   "R for Marketing Research and Analytics, 2nd edition", Springer. 
#################################################################


# UPDATE May 2019:
# set random number generator to use pre R 3.6 method, to match book
# this affects data simulation and Bayesian methods, which use randomization
# ==> Run this line:
if (getRversion() >= "3.6.0") suppressWarnings(RNGversion("3.5.0"))
# you could always change back to current with: 
#   RNGversion(getRversion())


# R code for Chapter 5 -- assessing differences between groups

#### quick alternative to load the data
seg.df <- read.csv("http://goo.gl/qw303p")
summary(seg.df)


#### Create the data

# Names of the variables we will define for each segment
segVars <- c("age", "gender", "income", "kids", "ownHome", "subscribe")

# the data type for each segment
segVarType <- c("norm", "binom", "norm", "pois", "binom", "binom")

# names of the segments
segNames <- c("Suburb mix", "Urban hip", "Travelers", "Moving up")

# the size of each segment (N)
segSize <- c(100, 50, 80, 70)

# the means for each variable for each segment
segMeans <- matrix( c(
  40, .5, 55000, 2, .5, .1,
  24, .7, 21000, 1, .2, .2,
  58, .5, 64000, 0, .7, .05,
  36, .3, 52000, 2, .3, .2  ), ncol=length(segVars), byrow=TRUE)

# the standard deviations for each segment (NA = not applicable for the variable)
segSDs <- matrix( c(
  5, NA, 12000, NA, NA, NA,
  2, NA,  5000, NA, NA, NA,
  8, NA, 21000, NA, NA, NA,
  4, NA, 10000, NA, NA, NA  ), ncol=length(segVars), byrow=TRUE)


#### digression to learn about for() loops
for (i in 1:10) { print(i) }

(i.seq <- rep(sqrt(seq(from=2.1, to=6.2, by=1.7)), 3))
for (i in i.seq ) { print(i) }

for (i in c("Hello ","world, ","welcome to R!")) { cat(i) }

for (i in 1:length(i.seq)) { cat("Entry", i, "=", i.seq[i], "\n") }

for (i in seq_along(i.seq)) { cat("Entry", i, "=", i.seq[i], "\n") }

i.seq <- NULL
# maybe we have a bunch of other code, and then ...
for (i in 1:length(i.seq)) { print (i) }     # bad

i.seq <- NULL
# maybe we have a bunch of other code, and then ...
for (i in seq_along(i.seq)) { print (i) }     # better


#### if statements
x <- 1:5
if (x > 1) {      # bad code, gives warning!
  print ("hi") 
} else { 
  print ("bye") 
}

ifelse(x > 1, "hi", "bye")

fn.hi  <- function() { "hi" }
fn.bye <- function() { "bye" }
ifelse(x > 1, fn.hi(), fn.bye() )


#### return to data generation

# make sure we're starting our dataset from a known state
seg.df <- NULL
set.seed(02554)

# iterate over all the segments and create data for each
for (i in seq_along(segNames)) {    
  cat(i, segNames[i], "\n")

  # create an empty matrix to hold this particular segment's data
  this.seg <- data.frame(matrix(NA, nrow=segSize[i], ncol=length(segVars)))

  # within a segment, iterate over the variables and draw appropriate random data
  for (j in seq_along(segVars)) {    # and iterate over each variable
    if (segVarType[j] == "norm") {   # draw random normals
      this.seg[, j] <- rnorm(segSize[i], mean=segMeans[i, j], sd=segSDs[i, j])
    } else if (segVarType[j] == "pois") {    # draw counts
      this.seg[, j] <- rpois(segSize[i], lambda=segMeans[i, j])
    } else if (segVarType[j] == "binom") {   # draw binomials
      this.seg[, j] <- rbinom(segSize[i], size=1, prob=segMeans[i, j])
    } else {
      stop("Bad segment data type: ", segVarType[j])
    }
  }
  # add this segment to the total dataset
  seg.df <- rbind(seg.df, this.seg)     
}

# make the data frame names match what we defined
names(seg.df) <- segVars

# add segment membership for each row
seg.df$Segment   <- factor(rep(segNames, times=segSize))

# convert the binomial variables to nicely labeled factors
seg.df$ownHome   <- factor(seg.df$ownHome, labels=c("ownNo", "ownYes"))
seg.df$gender    <- factor(seg.df$gender, labels=c("Female", "Male"))
seg.df$subscribe <- factor(seg.df$subscribe, labels=c("subNo", "subYes"))

# check the data and confirm it
summary(seg.df)

### save the data
### adjust for your system and preference if you wish
save(seg.df, file="~/segdf-Rintro-Ch5.RData")


######

# descriptives

mean(seg.df$income[seg.df$Segment == "Moving up"])
mean(seg.df$income[seg.df$Segment == "Moving up" & 
                   seg.df$subscribe=="subNo"])

by(seg.df$income, seg.df$Segment, mean)
by(seg.df$income, list(seg.df$Segment, seg.df$subscribe), mean)

aggregate(seg.df$income, list(seg.df$Segment), mean)

seg.income.mean <- aggregate(seg.df$income, list(seg.df$Segment), mean)
seg.df$segIncome <- seg.income.mean[seg.df$Segment, 2]
library(car)
some(seg.df)

seg.df$Segment
seg.income.mean[seg.df$Segment, ]
seg.income.mean[seg.df$Segment, 2]

# clear that variable
seg.df$segIncome <- NULL


#### formula version

aggregate(income ~ Segment, data=seg.df, mean)

##########
# two-way data aggregation
aggregate(income ~ Segment + ownHome, data=seg.df, mean)
aggregate(income ~ Segment + ownHome + subscribe, data=seg.df, mean)

agg.data <- aggregate(income ~ Segment + ownHome, data=seg.df, mean)
agg.data[2, ]
agg.data[2, 3]

# Count of factor level occurence by factor
with(seg.df, table(Segment, ownHome))
with(seg.df, table(kids, Segment))

# total of variables by factor
xtabs(kids ~ Segment, data=seg.df)

aggregate(kids ~ Segment, data=seg.df, sum)

seg.tab <- with(seg.df, table(kids, Segment))
apply(seg.tab*0:7, 2, sum)
colSums(seg.tab*0:7)


#### visualize counts by group
library(lattice)

# histogram by 1 factor
histogram(~subscribe | Segment, data=seg.df)

# counts instead of proportions, and some visual options
histogram(~subscribe | Segment, data=seg.df, type="count", 
          layout=c(4,1), col=c("burlywood", "darkolivegreen"))


# histogram by 2 factors
histogram(~subscribe | Segment + ownHome, data=seg.df)

# use prop.table to get just positive proportion
prop.table(table(seg.df$subscribe, seg.df$Segment), margin=2)

barchart(prop.table(table(seg.df$subscribe, seg.df$Segment), margin=2)[2, ], 
          xlab="Subscriber proportion by Segment", col="darkolivegreen")


#### visualize continuous data by group

## bar chart for continuous variable, the "spreadsheet" way to graph it
# aggregate our data
seg.mean <- aggregate(income ~ Segment, data=seg.df, mean)
# plot it
library(lattice)

barchart(income~Segment, data=seg.mean, col="grey")

seg.income.agg <- aggregate(income ~ Segment + ownHome, data=seg.df, mean)
# then plot it
barchart(income ~ Segment, data=seg.income.agg, 
         groups=ownHome, auto.key=TRUE,
         par.settings = simpleTheme(col=c("gray95", "gray50"))   # try rainbow, topo.colors, heat.colors, cm.colors
)


## better = boxplot for continuous variable

# base graphics way to do this

boxplot(income ~ Segment, data=seg.df, yaxt="n", ylab="Income ($k)")
ax.seq <- seq(from=0, to=120000, by=20000)
axis(side=2, at=ax.seq, labels=paste(ax.seq/1000, "k", sep=""), las=1)


# lattice gives more options, especially for multiway breakouts ("conditioning")
library(lattice)

bwplot(Segment ~ income, data=seg.df, horizontal=TRUE, xlab = "Income")

# add second conditioning variable
bwplot(Segment ~ income | ownHome, data=seg.df, horizontal=TRUE, 
       xlab="Income")