library(data.table) # library to use data.table objects
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(ggplot2)
# Read the campaign CSV into a data.table.
dt.em <- fread(input = "20210412-email-campaign.csv")
Looking at the Structure of the Data
# Show column names, types and the first few values.
str(object = dt.em)
## Classes 'data.table' and 'data.frame': 45572 obs. of 11 variables:
## $ last_purchase : int 9 7 7 3 1 8 1 2 1 5 ...
## $ hist_spend : num 115.6 67.9 70.7 1221.9 512.4 ...
## $ books : int 0 0 0 1 1 1 0 1 0 1 ...
## $ electronics : int 1 1 1 1 1 0 1 0 1 1 ...
## $ pop_density : chr "Rural" "Urban" "Suburban" "Suburban" ...
## $ new_customer : int 1 1 1 1 1 0 1 0 0 0 ...
## $ device : chr "Mobile" "Mobile" "Mobile" "Laptop" ...
## $ treatment : chr "Books Email" "Electronics Email" "No Email" "Books Email" ...
## $ visit_after : int 0 0 0 0 0 1 0 0 0 0 ...
## $ purchased_after: int 0 0 0 0 0 0 0 0 0 0 ...
## $ spend_after : num 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Banner for the numeric summary table.
cat("\n Numerical Variables Summary Table\n", sep = "")
##
## Numerical Variables Summary Table
# Keep only the pre-treatment covariates; in the table below stargazer reports
# the numeric columns only (pop_density and device do not appear).
dt.em.predictors <- dt.em[
  ,
  .(last_purchase, hist_spend, books, electronics, pop_density, new_customer, device)
]
stargazer(dt.em.predictors, type = "text")
##
## ========================================================================
## Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
## ------------------------------------------------------------------------
## last_purchase 45,572 5.764 3.505 1 2 9 12
## hist_spend 45,572 241.555 254.597 29.990 64.840 325.202 3,345.930
## books 45,572 0.550 0.498 0 0 1 1
## electronics 45,572 0.552 0.497 0 0 1 1
## new_customer 45,572 0.500 0.500 0 0 1 1
## ------------------------------------------------------------------------
# Banner for the categorical summary table.
cat("\n Categorical Variables Summary Table\n\n", sep = "")
##
## Categorical Variables Summary Table
# NOTE(review): both columns are character, so summary() only reports
# Length/Class/Mode (see output below); converting them to factors would
# give per-level counts instead.
summary(dt.em.predictors[, .(pop_density, device)])
## pop_density device
## Length:45572 Length:45572
## Class :character Class :character
## Mode :character Mode :character
# check device mode
# Bar chart of the number of customers per purchase device.
# Uses ggplot() + geom_bar() because qplot() is deprecated since ggplot2 3.4.0.
ggplot(data = dt.em, aes(x = factor(device), fill = factor(device))) +
  geom_bar() +
  labs(
    title = "Device Mode is Laptop and Mobile",
    x = "Device", y = "", fill = "Device"
  )
# Bar chart of the number of customers per population-density class.
# Uses ggplot() + geom_bar() because qplot() is deprecated since ggplot2 3.4.0.
ggplot(data = dt.em, aes(x = factor(pop_density), fill = factor(pop_density))) +
  geom_bar() +
  labs(
    title = "Population Density Mode is Sub-Urban",
    x = "Population Density", y = "", fill = "Population Density"
  )
| Variable | Role | Type | Description |
|---|---|---|---|
| last_purchase | predictor | numeric | Months since last purchase. |
| hist_spend | predictor | numeric | Actual dollar value spent in the past year. |
| books | predictor | binomial | 1/0 indicator, 1 = customer purchased from the books section in the past year. |
| electronics | predictor | binomial | 1/0 indicator, 1 = customer purchased from the electronics section in the past year. |
| pop_density | predictor | multinomial | User location classified as Urban, Suburban, or Rural. |
| new_customer | predictor | binomial | 1/0 indicator, 1 = New customer in the past twelve months. |
| device | predictor | multinomial | Describes the devices the customer purchased from in the past year. |
# Customer counts per "months since last purchase", with new and existing
# customers drawn side by side.
ggplot(dt.em, aes(x = factor(last_purchase), fill = factor(new_customer))) +
  geom_bar(position = "dodge") +
  labs(
    title = "Months since last purchase per Customer",
    x = "(Months since) Last Purchase",
    y = "",
    fill = "New Customer"
  )
# Overlaid histograms of historical spend for new vs. existing customers.
# position = "identity" draws both groups on top of each other, so the fill
# must be translucent; with alpha = 1 the group drawn last hides the other.
ggplot(data = dt.em, aes(x = hist_spend, fill = factor(new_customer))) +
  geom_histogram(alpha = 0.5, position = "identity") +
  labs(
    title = "Historical Spending Histogram per New Customer",
    x = "(Last year's) Historical Spending", y = "", fill = "New Customer"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of historical spend, one panel per new_customer value.
ggplot(dt.em, aes(x = hist_spend)) +
  geom_histogram(position = "identity", alpha = 1) +
  facet_wrap(vars(new_customer)) +
  labs(
    title = "Historical Spending Histogram per New Customer",
    x = "(Last year's) Historical Spending",
    y = "",
    fill = "New Customer"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This is what we call a “Power Law” distribution, so we can apply a log to see something closer to a normal distribution.
# Overlaid histograms of log(historical spend) for new vs. existing customers.
# position = "identity" draws both groups on top of each other, so the fill
# must be translucent; with alpha = 1 the group drawn last hides the other.
ggplot(data = dt.em, aes(x = log(hist_spend), fill = factor(new_customer))) +
  geom_histogram(alpha = 0.5, position = "identity") +
  labs(
    title = "Log of Historical Spending Histogram per New Customer",
    x = "Log of (Last year's) Historical Spending", y = "", fill = "New Customer"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of log(historical spend), one panel per new_customer value.
ggplot(dt.em, aes(x = log(hist_spend))) +
  geom_histogram(position = "identity", alpha = 1) +
  facet_wrap(vars(new_customer)) +
  labs(
    title = "Log of Historical Spending Histogram per New Customer",
    x = "Log of (Last year's) Historical Spending",
    y = "",
    fill = "New Customer"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Boxplots of historical spend for new vs. existing customers, one panel per
# value of the books indicator.
# Uses ggplot() + geom_boxplot() because qplot() is deprecated since ggplot2 3.4.0.
ggplot(
  data = dt.em,
  aes(x = factor(new_customer), y = hist_spend, fill = factor(new_customer))
) +
  geom_boxplot() +
  facet_grid(~ books) +
  labs(
    title = "Books",
    x = "New Customer", y = "Historical Spending", fill = "New Customer"
  )
# Boxplots of historical spend for new vs. existing customers, one panel per
# value of the electronics indicator.
# Uses ggplot() + geom_boxplot() because qplot() is deprecated since ggplot2 3.4.0.
ggplot(
  data = dt.em,
  aes(x = factor(new_customer), y = hist_spend, fill = factor(new_customer))
) +
  geom_boxplot() +
  facet_grid(~ electronics) +
  labs(
    title = "Electronics",
    x = "New Customer", y = "Historical Spending", fill = "New Customer"
  )
# Boxplots of historical spend for new vs. existing customers, one panel per
# books x electronics combination.
ggplot(
  data = dt.em,
  mapping = aes(
    x = factor(new_customer),
    y = hist_spend,
    fill = factor(new_customer)
  )
) +
  geom_boxplot() +
  facet_grid(cols = vars(books, electronics)) +
  labs(
    title = "Books & Electronics",
    x = "New Customer",
    y = "Historical Spending",
    fill = "New Customer"
  )
There seems to be a difference between means of new_customers per books and per electronics.
New customers seem to have a substantial number of outliers in spending, with the largest one being someone who bought both books and electronics last year. This tells us that, for customers who bought either only books or only electronics last year, their distribution will have longer tails than that of a regular customer.
There also seems to be a difference in means between new_customer and both books and electronics.
# Welch two-sample t-test: mean of the 0/1 new_customer indicator (i.e. the
# share of new customers) compared between books = 0 and books = 1.
t.test(dt.em$new_customer ~ dt.em$books)
##
## Welch Two Sample t-test
##
## data: dt.em$new_customer by dt.em$books
## t = -5.8381, df = 43806, p-value = 5.318e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.03669940 -0.01825089
## sample estimates:
## mean in group 0 mean in group 1
## 0.4853730 0.5128481
# Welch two-sample t-test: mean of the 0/1 new_customer indicator (i.e. the
# share of new customers) compared between electronics = 0 and electronics = 1.
t.test(dt.em$new_customer ~ dt.em$electronics)
##
## Welch Two Sample t-test
##
## data: dt.em$new_customer by dt.em$electronics
## t = -3.3417, df = 43656, p-value = 0.0008334
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.024968347 -0.006506955
## sample estimates:
## mean in group 0 mean in group 1
## 0.4917944 0.5075321
The t-tests indicate a statistically significant difference between the means.
# Linear probability model: new_customer regressed on the two category
# indicators; the F-statistic in the summary tests them jointly.
lm.books.eletronics <- lm(
  formula = new_customer ~ books + electronics,
  data = dt.em
)
summary(lm.books.eletronics)
##
## Call:
## lm(formula = new_customer ~ books + electronics, data = dt.em)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6053 -0.4918 0.3947 0.5082 0.5146
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.371876 0.008822 42.16 <2e-16 ***
## books 0.119918 0.008102 14.80 <2e-16 ***
## electronics 0.113497 0.008105 14.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4988 on 45569 degrees of freedom
## Multiple R-squared: 0.005029, Adjusted R-squared: 0.004985
## F-statistic: 115.2 on 2 and 45569 DF, p-value: < 2.2e-16
By performing the F-test we can also see that books and electronics are jointly statistically significant
# Linear probability model of new_customer on all other pre-treatment
# covariates.
mfit <- lm(
  formula = new_customer ~ last_purchase + hist_spend + books + electronics +
    pop_density + device,
  data = dt.em
)
summary(mfit)
##
## Call:
## lm(formula = new_customer ~ last_purchase + hist_spend + books +
## electronics + pop_density + device, data = dt.em)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9597 -0.4638 -0.3334 0.5331 0.6135
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.607e-01 1.379e-02 26.163 < 2e-16 ***
## last_purchase 3.849e-04 6.716e-04 0.573 0.56664
## hist_spend 4.723e-04 1.063e-05 44.440 < 2e-16 ***
## books -2.008e-02 8.495e-03 -2.364 0.01810 *
## electronics -2.630e-02 8.497e-03 -3.095 0.00197 **
## pop_densitySuburban 1.535e-02 6.827e-03 2.249 0.02452 *
## pop_densityUrban 1.030e-02 6.926e-03 1.487 0.13705
## deviceLaptop 3.760e-02 8.021e-03 4.687 2.77e-06 ***
## deviceMobile 4.895e-02 8.029e-03 6.097 1.09e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4872 on 45563 degrees of freedom
## Multiple R-squared: 0.05089, Adjusted R-squared: 0.05072
## F-statistic: 305.4 on 8 and 45563 DF, p-value: < 2.2e-16
By fitting a multiple linear regression to explain new_customer we find that, ceteris paribus, hist_spend (last year's expenditure) and device (laptop and mobile) are statistically significant at the 99.9% confidence level. Interestingly, the time since last purchase is not statistically significant when it comes to explaining new customers.
ggpairs - Inspect relations between all combinations of variable pairs
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise plots and correlations for the covariates (new_customer excluded).
ggpairs(
  dt.em[, .(last_purchase, hist_spend, books, electronics, pop_density, device)]
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The correlation between historical spending (last year’s spending) and last purchase is statistically significant at -0.246. Therefore, there is a weak negative correlation, implying that we expect higher spending from customers who made their last purchase at the end of the year, which seems intuitive given that the end of the year includes the Christmas season, a well-known and planned expense for many customers. The weak correlation comes from the spike in March and April in the middle of the year, in what would otherwise be a downward trend throughout the year. It is important to reinforce the need to investigate what is causing such a significant increase in last purchases this early in the year, as the ideal would be to have a strong negative correlation between historical spending and time since last purchase, as it would imply larger revenue for the company.
The slightly negative correlations of books and electronics with last_purchase suggest that the timing of the last purchase has little relation to whether that purchase was books or electronics; in fact, with this data it is necessarily one or the other, or both.
Books and electronics have a slightly positive correlation with historical spending (last year’s spending), which likely comes from having 2 out of 3 possible states of purchase. These are (Books, No Electronics), (No Books, Electronics) or (Books, Electronics). And, as we can see in the box plot for Books and Electronics, there is a statistically significant difference in means between old and new customers, with the highest historical spending mean of all combinations coming from the purchase of both books and electronics.
Books and electronics have a strong negative correlation, which is to be expected given that, as we can see below, only a small share of customers bought both products during the year.
# Counts of new vs. existing customers, one panel per books x electronics
# combination.
# Uses ggplot() + geom_bar() because qplot() is deprecated since ggplot2 3.4.0.
ggplot(data = dt.em, aes(x = factor(new_customer), fill = factor(new_customer))) +
  geom_bar() +
  facet_grid(~ books + electronics) +
  labs(
    title = "The majority of customers buy either Books or Electronics",
    x = "New Customer", y = "", fill = "New Customer"
  )
### 2 RANDOMIZATION CHECKS
#### 2.1 Creating a summary statistics table
# Summary statistics for the numeric columns, outcomes included.
stargazer(dt.em, type = "text")
##
## ==========================================================================
## Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
## --------------------------------------------------------------------------
## last_purchase 45,572 5.764 3.505 1 2 9 12
## hist_spend 45,572 241.555 254.597 29.990 64.840 325.202 3,345.930
## books 45,572 0.550 0.498 0 0 1 1
## electronics 45,572 0.552 0.497 0 0 1 1
## new_customer 45,572 0.500 0.500 0 0 1 1
## visit_after 45,572 0.148 0.355 0 0 0 1
## purchased_after 45,572 0.009 0.094 0 0 0 1
## spend_after 45,572 1.022 14.607 0 0 0 499
## --------------------------------------------------------------------------
#### 2.2 Checking if both groups are comparable
In this first chunk we test whether the values of the variables for the group that received the email campaign featuring electronics differ from those of the group that did not receive any email (control group). If the randomization was properly done, the p-value of each test should be greater than 0.1, so that we cannot reject the null hypothesis of equal means. For the last 3 variables (visit_after, purchased_after, spend_after) the p-value is expected to be close to 0, since these are the output variables and it is expected that they behave differently across the two groups; otherwise the campaign wouldn’t have any result (e.g. a group of customers that receives an email campaign featuring electronics will probably spend more dollars than one that did not receive any email campaign). All the tests yield a p-value greater than 0.1 (except, as expected, the last 3); therefore we can say that this treatment group (the ones who received an email campaign featuring electronics) was properly randomly assigned and that the treatment and control groups are comparable.
# Balance check (Welch t-test): last_purchase, Electronics Email vs. No Email.
t.test(last_purchase ~ treatment, data = dt.em[treatment != "Books Email"])
##
## Welch Two Sample t-test
##
## data: last_purchase by treatment
## t = 0.60336, df = 30279, p-value = 0.5463
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.05456909 0.10310618
## sample estimates:
## mean in group Electronics Email mean in group No Email
## 5.764982 5.740714
# Balance check (Welch t-test): hist_spend, Electronics Email vs. No Email.
t.test(hist_spend ~ treatment, data = dt.em[treatment != "Books Email"])
##
## Welch Two Sample t-test
##
## data: hist_spend by treatment
## t = 0.6871, df = 30245, p-value = 0.492
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.730161 7.757002
## sample estimates:
## mean in group Electronics Email mean in group No Email
## 241.9132 239.8998
# Balance check (Welch t-test): books, Electronics Email vs. No Email.
t.test(books ~ treatment, data = dt.em[treatment != "Books Email"])
##
## Welch Two Sample t-test
##
## data: books by treatment
## t = -0.4546, df = 30282, p-value = 0.6494
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.013806495 0.008607807
## sample estimates:
## mean in group Electronics Email mean in group No Email
## 0.5485729 0.5511723
# Balance check (Welch t-test): electronics, Electronics Email vs. No Email.
t.test(electronics ~ treatment, data = dt.em[treatment != "Books Email"])
##
## Welch Two Sample t-test
##
## data: electronics by treatment
## t = 0.69207, df = 30282, p-value = 0.4889
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.007245521 0.015154927
## sample estimates:
## mean in group Electronics Email mean in group No Email
## 0.5547977 0.5508430
# Balance check (Welch t-test): new_customer, Electronics Email vs. No Email.
t.test(new_customer ~ treatment, data = dt.em[treatment != "Books Email"])
##
## Welch Two Sample t-test
##
## data: new_customer by treatment
## t = 0.22446, df = 30282, p-value = 0.8224
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.009973481 0.012553196
## sample estimates:
## mean in group Electronics Email mean in group No Email
## 0.5010264 0.4997366
# Outcome comparison (Welch t-test): visit_after, Electronics Email vs. No Email.
t.test(visit_after ~ treatment, data = dt.em[treatment != "Books Email"])
##
## Welch Two Sample t-test
##
## data: visit_after by treatment
## t = 20.164, df = 28637, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.07333789 0.08913065
## sample estimates:
## mean in group Electronics Email mean in group No Email
## 0.1862791 0.1050448
# Outcome comparison (Welch t-test): purchased_after, Electronics Email vs. No Email.
t.test(purchased_after ~ treatment, data = dt.em[treatment != "Books Email"])
##
## Welch Two Sample t-test
##
## data: purchased_after by treatment
## t = 6.9367, df = 25794, p-value = 4.109e-12
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.005437086 0.009719892
## sample estimates:
## mean in group Electronics Email mean in group No Email
## 0.012913052 0.005334563
# Outcome comparison (Welch t-test): spend_after, Electronics Email vs. No Email.
t.test(spend_after ~ treatment, data = dt.em[treatment != "Books Email"])
##
## Welch Two Sample t-test
##
## data: spend_after by treatment
## t = 4.6341, df = 26073, p-value = 3.602e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.4463453 1.1006734
## sample estimates:
## mean in group Electronics Email mean in group No Email
## 1.3957367 0.6222273
In this second chunk we test whether the values of the variables for the group that received the email campaign featuring books differ from those of the group that did not receive any email (control group). If the randomization was properly done, the p-value of each test should be greater than 0.1, so that we cannot reject the null hypothesis of equal means. For the last 3 variables (visit_after, purchased_after, spend_after) the p-value is expected to be close to 0, since these are the output variables and it is expected that they behave differently across the two groups; otherwise the campaign wouldn’t have any result (e.g. a group of customers that receives an email campaign featuring electronics will probably spend more dollars than one that did not receive any email campaign). Again, all the tests yield a p-value greater than 0.1 (except, as expected, the last 3); therefore we can say that this treatment group (the ones who received an email campaign featuring books) was properly randomly assigned and that the treatment and control groups are comparable.
# Balance check (Welch t-test): last_purchase, Books Email vs. No Email.
t.test(last_purchase ~ treatment, data = dt.em[treatment != "Electronics Email"])
##
## Welch Two Sample t-test
##
## data: last_purchase by treatment
## t = 1.1553, df = 30469, p-value = 0.248
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.03229262 0.12501277
## sample estimates:
## mean in group Books Email mean in group No Email
## 5.787074 5.740714
# Balance check (Welch t-test): hist_spend, Books Email vs. No Email.
t.test(hist_spend ~ treatment, data = dt.em[treatment != "Electronics Email"])
##
## Welch Two Sample t-test
##
## data: hist_spend by treatment
## t = 1.0177, df = 30468, p-value = 0.3088
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.726253 8.614625
## sample estimates:
## mean in group Books Email mean in group No Email
## 242.8440 239.8998
# Balance check (Welch t-test): books, Books Email vs. No Email.
t.test(books ~ treatment, data = dt.em[treatment != "Electronics Email"])
##
## Welch Two Sample t-test
##
## data: books by treatment
## t = -0.19248, df = 30468, p-value = 0.8474
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.01226828 0.01007417
## sample estimates:
## mean in group Books Email mean in group No Email
## 0.5500752 0.5511723
# Balance check (Welch t-test): electronics, Books Email vs. No Email.
t.test(electronics ~ treatment, data = dt.em[treatment != "Electronics Email"])
##
## Welch Two Sample t-test
##
## data: electronics by treatment
## t = -0.04289, df = 30468, p-value = 0.9658
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.01141546 0.01092657
## sample estimates:
## mean in group Books Email mean in group No Email
## 0.5505985 0.5508430
# Balance check (Welch t-test): new_customer, Books Email vs. No Email.
t.test(new_customer ~ treatment, data = dt.em[treatment != "Electronics Email"])
##
## Welch Two Sample t-test
##
## data: new_customer by treatment
## t = 0.16588, df = 30468, p-value = 0.8683
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.01027864 0.01217923
## sample estimates:
## mean in group Books Email mean in group No Email
## 0.5006869 0.4997366
# Outcome comparison (Welch t-test): visit_after, Books Email vs. No Email.
t.test(visit_after ~ treatment, data = dt.em[treatment != "Electronics Email"])
##
## Welch Two Sample t-test
##
## data: visit_after by treatment
## t = 12.664, df = 29763, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.04103574 0.05606381
## sample estimates:
## mean in group Books Email mean in group No Email
## 0.1535946 0.1050448
# Outcome comparison (Welch t-test): purchased_after, Books Email vs. No Email.
t.test(purchased_after ~ treatment, data = dt.em[treatment != "Electronics Email"])
##
## Welch Two Sample t-test
##
## data: purchased_after by treatment
## t = 3.521, df = 28917, p-value = 0.0004305
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.001492092 0.005239188
## sample estimates:
## mean in group Books Email mean in group No Email
## 0.008700203 0.005334563
# Outcome comparison (Welch t-test): spend_after, Books Email vs. No Email.
t.test(spend_after ~ treatment, data = dt.em[treatment != "Electronics Email"])
##
## Welch Two Sample t-test
##
## data: spend_after by treatment
## t = 2.8376, df = 28559, p-value = 0.004548
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1321767 0.7226032
## sample estimates:
## mean in group Books Email mean in group No Email
## 1.0496173 0.6222273
We also need to make sure that the two treatment groups are comparable with each other, so that the results of the experiment are valid. For the last 3 variables (visit_after, purchased_after, spend_after) the p-value is expected to be close to 0, since these are the output variables and it is expected that they behave differently across the two groups; otherwise the campaign wouldn’t have any result (e.g. a group of customers that receives an email campaign featuring electronics will probably spend more dollars than one that did not receive any email campaign). Again, all the tests yield a p-value greater than 0.1 (except, as expected, the last 3); therefore we can say that both treatment groups are comparable.
# Balance check (Welch t-test): last_purchase, Books Email vs. Electronics Email.
t.test(last_purchase ~ treatment, data = dt.em[treatment != "No Email"])
##
## Welch Two Sample t-test
##
## data: last_purchase by treatment
## t = 0.54798, df = 30382, p-value = 0.5837
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.05692716 0.10111023
## sample estimates:
## mean in group Books Email mean in group Electronics Email
## 5.787074 5.764982
# Balance check (Welch t-test): hist_spend, Books Email vs. Electronics Email.
t.test(hist_spend ~ treatment, data = dt.em[treatment != "No Email"])
##
## Welch Two Sample t-test
##
## data: hist_spend by treatment
## t = 0.31645, df = 30356, p-value = 0.7517
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.834216 6.695747
## sample estimates:
## mean in group Books Email mean in group Electronics Email
## 242.8440 241.9132
# Balance check (Welch t-test): books, Books Email vs. Electronics Email.
t.test(books ~ treatment, data = dt.em[treatment != "No Email"])
##
## Welch Two Sample t-test
##
## data: books by treatment
## t = 0.26315, df = 30381, p-value = 0.7924
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.009687279 0.012691849
## sample estimates:
## mean in group Books Email mean in group Electronics Email
## 0.5500752 0.5485729
# Balance check (Welch t-test): electronics, Books Email vs. Electronics Email.
t.test(electronics ~ treatment, data = dt.em[treatment != "No Email"])
##
## Welch Two Sample t-test
##
## data: electronics by treatment
## t = -0.73607, df = 30382, p-value = 0.4617
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.015380802 0.006982506
## sample estimates:
## mean in group Books Email mean in group Electronics Email
## 0.5505985 0.5547977
# Balance check (Welch t-test): new_customer, Books Email vs. Electronics Email.
t.test(new_customer ~ treatment, data = dt.em[treatment != "No Email"])
##
## Welch Two Sample t-test
##
## data: new_customer by treatment
## t = -0.05919, df = 30381, p-value = 0.9528
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.01158396 0.01090483
## sample estimates:
## mean in group Books Email mean in group Electronics Email
## 0.5006869 0.5010264
# Outcome comparison (Welch t-test): visit_after, Books Email vs. Electronics Email.
t.test(visit_after ~ treatment, data = dt.em[treatment != "No Email"])
##
## Welch Two Sample t-test
##
## data: visit_after by treatment
## t = -7.5902, df = 30148, p-value = 3.287e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.04112477 -0.02424422
## sample estimates:
## mean in group Books Email mean in group Electronics Email
## 0.1535946 0.1862791
# Outcome comparison (Welch t-test): purchased_after, Books Email vs. Electronics Email.
t.test(purchased_after ~ treatment, data = dt.em[treatment != "No Email"])
##
## Welch Two Sample t-test
##
## data: purchased_after by treatment
## t = -3.55, df = 29162, p-value = 0.0003859
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.006538900 -0.001886798
## sample estimates:
## mean in group Books Email mean in group Electronics Email
## 0.008700203 0.012913052
# Outcome comparison (Welch t-test): spend_after, Books Email vs. Electronics Email.
t.test(spend_after ~ treatment, data = dt.em[treatment != "No Email"])
##
## Welch Two Sample t-test
##
## data: spend_after by treatment
## t = -1.8825, df = 29635, p-value = 0.05977
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.70648799 0.01424922
## sample estimates:
## mean in group Books Email mean in group Electronics Email
## 1.049617 1.395737
# One row per treatment group with the summed post-treatment outcomes.
by.treatment <- dt.em[
  ,
  .(
    Total_Visit_after = sum(visit_after),
    Total_Purchased_after = sum(purchased_after),
    Total_Spend_after = sum(spend_after)
  ),
  by = .(treatment)
]
# Total visits per treatment group, with the count printed inside each bar.
# The earlier ggtitle() call was removed: the labs(title = ...) that followed
# it overrode its text, so it never reached the plot.
ggplot(
  data = by.treatment,
  aes(x = factor(treatment), y = Total_Visit_after, fill = factor(treatment))
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(
    aes(label = Total_Visit_after),
    vjust = 1.6, color = "white",
    position = position_dodge(0.9), size = 3.5
  ) +
  labs(
    title = "Total Visitors 4 weeks after Treatment",
    x = "Treatment", y = "Total Visits After", fill = "Treatment"
  )
# Total purchases per treatment group, with the count printed inside each bar.
# The earlier ggtitle() call was removed: the labs(title = ...) that followed
# it overrode its text, so it never reached the plot.
ggplot(
  data = by.treatment,
  aes(x = factor(treatment), y = Total_Purchased_after, fill = factor(treatment))
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(
    aes(label = Total_Purchased_after),
    vjust = 1.6, color = "white",
    position = position_dodge(0.9), size = 3.5
  ) +
  labs(
    title = "Total Purchases 4 weeks after Treatment",
    x = "Treatment", y = "Total Purchased After", fill = "Treatment"
  )
# Total spend per treatment group, with the amount printed inside each bar.
# The earlier ggtitle() call was removed: the labs(title = ...) that followed
# it overrode its text, so it never reached the plot.
ggplot(
  data = by.treatment,
  aes(x = factor(treatment), y = Total_Spend_after, fill = factor(treatment))
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(
    aes(label = Total_Spend_after),
    vjust = 1.6, color = "white",
    position = position_dodge(0.9), size = 3.5
  ) +
  labs(
    title = "Total Spending 4 weeks after Treatment",
    x = "Treatment", y = "Total Spend After", fill = "Treatment"
  )
# Control group ("No Email"): head count, total spend, spend per customer and
# share of all post-treatment spend.
grand_total_spend <- sum(dt.em$spend_after)
n_customer_no <- dt.em[treatment == "No Email", .N]
total_no_spend <- dt.em[treatment == "No Email", sum(spend_after)]
no_spend_per_customer <- total_no_spend / n_customer_no
percent_no <- total_no_spend / grand_total_spend * 100

# Same figures for the "Books Email" group.
n_customer_b <- dt.em[treatment == "Books Email", .N]
total_books_spend <- dt.em[treatment == "Books Email", sum(spend_after)]
books_spend_per_customer <- total_books_spend / n_customer_b
percent_books <- total_books_spend / grand_total_spend * 100

# Report the per-customer uplift of the books email over the control group.
cat(
  "\nThe books version of the campaign yielded an incremental ",
  books_spend_per_customer - no_spend_per_customer,
  "€ per customer. With ",
  n_customer_b,
  " customers spending ",
  total_books_spend,
  "€. Which equates to ",
  percent_books,
  "% of the total sales 4 weeks after the experiment\n"
)
##
## The books version of the campaign yielded an incremental 0.42739 € per customer. With 15287 customers spending 16045.5 €. Which equates to 34.45427 % of the total sales 4 weeks after the experiment
# "Electronics Email" group: head count, total spend, spend per customer and
# share of all post-treatment spend.
# .N counts the rows directly, and the group total is computed once and reused
# (the original wrapped an already-summed scalar in a second sum()).
n_customer_el <- dt.em[treatment == "Electronics Email", .N]
total_electronics_spend <- dt.em[treatment == "Electronics Email", sum(spend_after)]
electronics_spend_per_customer <- total_electronics_spend / n_customer_el
percent_electronics <- total_electronics_spend / sum(dt.em$spend_after) * 100

# Report the per-customer uplift over the control group
# (no_spend_per_customer is computed earlier in the script).
# The message now ends with a newline, matching the books message.
cat(
  "\nThe Electronics version of the campaign yielded an incremental ",
  electronics_spend_per_customer - no_spend_per_customer,
  "€ per customer. With ",
  n_customer_el,
  " customers spending ",
  total_electronics_spend,
  "€. Which equates to ",
  percent_electronics,
  "% of the total sales 4 weeks after the experiment\n"
)
##
## The Electronics version of the campaign yielded an incremental 0.7735094 € per customer. With 15101 customers spending 21077.02 €. Which equates to 45.25839 % of the total sales 4 weeks after the experiment
# Incremental spend per customer for each treatment, relative to the
# "No Email" control group (whose increment is 0 by construction).
treatment <- c("Books Email", "Electronics Email", "No Email")
values <- c(
  books_spend_per_customer - no_spend_per_customer,
  electronics_spend_per_customer - no_spend_per_customer,
  0
)
data <- data.frame(treatment, values)

# Bar labels are formatted to two decimals; the raw doubles would otherwise be
# printed at full precision. The earlier ggtitle() call was removed because
# the labs(title = ...) that followed it overrode its text.
ggplot(data = data, aes(x = factor(treatment), y = values, fill = factor(treatment))) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(
    aes(label = sprintf("%.2f", values)),
    vjust = 1.6, color = "white",
    position = position_dodge(0.9), size = 3.5
  ) +
  labs(
    title = "Incremental Spending per Treatment per customer, No Email as baseline",
    x = "Treatment", y = "Spend After per Customer", fill = "Treatment"
  )
# Total post-treatment spend for every combination of treatment and covariates.
new.data <- dt.em[
  ,
  .(Total_Spend_after = sum(spend_after)),
  by = c("treatment", "books", "electronics", "device", "pop_density", "new_customer")
]
# Short axis labels; currently unused (they were only referenced by a
# commented-out scale_x_discrete() call).
labels_x <- c("Books\nEmail", " Elect.\nEmail", "No\nEmail")

# Plot total post-treatment spend per treatment, split along the x axis by one
# covariate and faceted by books x electronics.
#
# spend_dt : table shaped like new.data (covariate columns + Total_Spend_after)
# x_var    : name of the covariate column to put on the x axis
# x_label  : x-axis title
#
# The data is re-aggregated to exactly the variables shown in the plot first.
# new.data is broken down by more covariates than any single plot displays, so
# plotting it directly puts several rows on the same bar position, where they
# are drawn over each other instead of being added up.
plot_total_spend_by <- function(spend_dt, x_var, x_label) {
  totals <- spend_dt[
    ,
    .(Total_Spend_after = sum(Total_Spend_after)),
    by = c("treatment", "books", "electronics", x_var)
  ]
  setnames(totals, x_var, "x_value")
  ggplot(
    data = totals,
    aes(x = factor(x_value), y = Total_Spend_after, fill = factor(treatment))
  ) +
    geom_bar(stat = "identity", position = position_dodge()) +
    facet_grid(~ books + electronics) +
    labs(
      title = "Total Spending 4 weeks after Treatment",
      x = x_label, y = "Total Spend After", fill = "Treatment"
    )
}

# The x-axis title now names the variable actually plotted; all three plots
# were previously labelled "Treatment".
plot_total_spend_by(new.data, "device", "Device")
plot_total_spend_by(new.data, "pop_density", "Population Density")
plot_total_spend_by(new.data, "new_customer", "New Customer")