import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
df_emailinfo=pd.read_csv("email_table.csv")
df_emailinfo.head(5)
df_emailinfo.info()
df_emailinfo.describe()
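# check the class balance of the target up front; a strong imbalance here is
# what motivates the resampling applied before modelling further below
df_emailinfo['converted'].value_counts(normalize=True)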
categorical_feature_mask = df_emailinfo.dtypes==object
categorical_feature_mask
# filter categorical columns using the mask and turn them into a list
categorical_cols = df_emailinfo.columns[categorical_feature_mask].tolist()
# keep 'weekday' and 'user_country' as strings; they are one-hot encoded later
categorical_cols.remove('weekday')
categorical_cols.remove('user_country')
from sklearn.preprocessing import LabelEncoder
# instantiate a LabelEncoder object
le = LabelEncoder()
df_emailinfo[categorical_cols] = df_emailinfo[categorical_cols].apply(lambda col: le.fit_transform(col))
df_emailinfo[categorical_cols].head(10)
df_emailinfo.head(5)
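# caveat: the same LabelEncoder instance was refit on each column above, so
# le.classes_ now holds only the mapping for the last column encoded; keeping
# one fitted encoder per column (e.g. a dict of LabelEncoders) would keep
# every mapping invertible via inverse_transform
le.classes_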
import seaborn as sns
sns.boxplot(x='user_country',y='hour',data=df_emailinfo,palette='rainbow')
sns.countplot(x='user_country',data=df_emailinfo,palette='rainbow')
sns.countplot(x='converted' ,data=df_emailinfo)
sns.countplot(x='converted',hue='user_country' ,data=df_emailinfo)
sns.countplot(x='converted', hue='email_version', data=df_emailinfo)
# from the plot, personalized emails show a higher conversion count than generic emails
sns.countplot(x='converted', hue='email_text', data=df_emailinfo)
# no major difference between long and short email text; quantified below
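# back the plots with numbers: mean conversion rate per encoded level
# (LabelEncoder assigns codes alphabetically, so 0/1 follow the sorted raw
# label strings)
print(df_emailinfo.groupby('email_version')['converted'].mean())
print(df_emailinfo.groupby('email_text')['converted'].mean())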
df_emailinfo['user_past_purchases'].hist(color='green',bins=50,figsize=(8,4))
sns.countplot(x='hour', hue='user_country', data=df_emailinfo)
# most emails were sent during the daytime, roughly 5:00 am to 3:00 pm, across countries
sns.countplot(x='user_past_purchases', hue='user_country', data=df_emailinfo)
# the number of users declines steadily as past-purchase count rises
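# conversion rate per country in numbers (user_country is still a raw string
# column at this point)
df_emailinfo.groupby('user_country')['converted'].mean().sort_values(ascending=False)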
df_emailopen=pd.read_csv("email_opened_table.csv")
df_emailopen = df_emailopen.reset_index(drop=True)
df_emailopen.shape
df_emaillink=pd.read_csv("link_clicked_table.csv")
df_emaillink.info()
df_emaillink.shape
df_emailopen['email_opened']=1
df_emailopen.head(5)
# merge the open flag onto the main table; emails never opened get NaN (filled with 0 below)
df_emailinfo = pd.merge(left=df_emailinfo, right=df_emailopen, on='email_id', how='outer')
df_emailinfo.info()
df_emailinfo['email_opened']=df_emailinfo['email_opened'].fillna(0)
df_emailinfo['email_opened']=df_emailinfo['email_opened'].astype(int)
df_emaillink['link_opened']=1
df_emaillink.head(5)
df_emailinfo = pd.merge(left=df_emailinfo, right=df_emaillink, on='email_id',how='outer')
df_emailinfo.info()
df_emailinfo['link_opened']=df_emailinfo['link_opened'].fillna(0)
df_emailinfo['link_opened']=df_emailinfo['link_opened'].astype(int)
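# with both flags merged in, the whole funnel can be summarized numerically
print('open rate      :', df_emailinfo['email_opened'].mean())
print('click rate     :', df_emailinfo['link_opened'].mean())
print('conversion rate:', df_emailinfo['converted'].mean())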
df_emailinfo['user_past_purchases']=df_emailinfo['user_past_purchases'].astype(int)
sns.countplot(x='email_opened',data=df_emailinfo)
sns.countplot(x='email_opened', hue='email_text', data=df_emailinfo)
# short emails are preferable (opened more often)
sns.countplot(x='email_opened', hue='email_version', data=df_emailinfo)
sns.countplot(x='email_opened', hue='weekday', data=df_emailinfo)
# mostly opened on weekdays
sns.countplot(x='email_opened', hue='user_country', data=df_emailinfo)
sns.countplot(x='link_opened', hue='email_text', data=df_emailinfo)
# long vs. short email text makes little difference to clicks - roughly equal chance
sns.countplot(x='link_opened', hue='email_version', data=df_emailinfo)
# personalized emails drive more clicks, so prefer them to convert more buyers
sns.countplot(x='link_opened', hue='weekday', data=df_emailinfo)
# most emails should be triggered on weekdays
sns.countplot(x='link_opened', hue='user_country', data=df_emailinfo)
# should focus on increasing the click counts further in the US and UK
# new strategies are needed to attract customers in France and Spain
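# the same comparison in numbers: click-through rate by country and by weekday
print(df_emailinfo.groupby('user_country')['link_opened'].mean())
print(df_emailinfo.groupby('weekday')['link_opened'].mean())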
# changing the remaining categorical columns to numerical (one-hot encoding)
df_emailinfo = pd.get_dummies(df_emailinfo, columns=['weekday', 'user_country'])
df_emailinfo.head()
df_emailinfo.shape
df_emailinfo.describe()
from sklearn import preprocessing
# scale user_past_purchases column-wise; preprocessing.normalize() rescales a
# whole vector to unit norm, which is not per-feature scaling, so a min-max
# scaler is used here instead
minmax = preprocessing.MinMaxScaler()
df_emailinfo['user_past_purchases'] = minmax.fit_transform(df_emailinfo[['user_past_purchases']])
df_emailinfo['user_past_purchases'].head(10)
df_emailinfo.describe()
df_emailinfo.hour.hist(bins=10)
# standardize 'hour' to zero mean and unit variance
scaler = preprocessing.StandardScaler()
df_emailinfo['hour']=scaler.fit_transform(df_emailinfo[['hour']])
df_emailinfo['hour'].head()
df_emailinfo.describe()
df_emailinfo.hour.hist(bins=10)
sns.kdeplot(df_emailinfo.hour)
sns.heatmap(df_emailinfo.corr(), cmap='coolwarm')
plt.title('email_info.corr()')
# drop the post-send outcome flags (email_opened, link_opened), which would
# not be known at send time, and drop user_country_ES as the one-hot baseline
df_emailinfo.drop('link_opened', axis=1, inplace=True)
df_emailinfo.drop('user_country_ES', axis=1, inplace=True)
df_emailinfo.drop('email_opened', axis=1, inplace=True)
sns.heatmap(df_emailinfo.corr(), cmap='coolwarm')
plt.title('email_info.corr()')
df_emailinfo.shape
df_emailinfo.info()
# email_id is an arbitrary identifier and carries no signal, so it is excluded
X = df_emailinfo[['hour','user_past_purchases','email_text','weekday_Friday','weekday_Monday','weekday_Saturday','weekday_Sunday','weekday_Thursday','weekday_Tuesday','weekday_Wednesday','email_version','user_country_FR','user_country_UK','user_country_US']]
y = df_emailinfo['converted']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)
print (X_train.shape)
print (y_train.shape)
print (X_test.shape)
print (y_test.shape)
# resample on the TRAINING split only; upsampling before the split would leak
# duplicated test rows into the training set
train = pd.concat([X_train, y_train], axis=1)
non_converted = train[train['converted'] == 0]
converted = train[train['converted'] == 1]
print(non_converted.shape, converted.shape)
from sklearn.utils import resample
converted_upsampled = resample(converted,
                               replace=True,                  # sample with replacement
                               n_samples=len(non_converted),  # match the majority class
                               random_state=27)
upsampled = pd.concat([converted_upsampled, non_converted])
upsampled.head(5)
upsampled.converted.value_counts()
y_train = upsampled.converted
X_train = upsampled.drop('converted', axis=1)
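# sanity check before fitting: the upsampled training frame must expose the
# same feature columns, in the same order, as X_test built earlier
X_train = X_train[X_test.columns]
assert list(X_train.columns) == list(X_test.columns)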
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
logmodel.score(X_train, y_train)
logmodel.score(X_test, y_test)
predictions = logmodel.predict(X_test)
predictions
from sklearn.metrics import classification_report,accuracy_score
from sklearn.metrics import confusion_matrix
print(accuracy_score(y_test,predictions))
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))
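# a labelled confusion-matrix plot is easier to read than the raw array
# (ConfusionMatrixDisplay requires scikit-learn >= 0.22)
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions),
                       display_labels=['Non Converted', 'Converted']).plot()
plt.show()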
from sklearn.metrics import roc_auc_score,roc_curve,auc
# ROC needs scores/probabilities rather than hard 0/1 labels to trace a curve
probs = logmodel.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, probs)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc
plt.plot(false_positive_rate,true_positive_rate)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
predicted_probs=logmodel.predict_proba(X_test)
predicted_probs[100:110]
# lower the decision threshold below the default 0.5 (e.g. 0.3) to trade
# precision for recall on the rare positive class
y_pred_lower_threshold = logmodel.predict_proba(X_test)[:, 1] >= 0.3
print(classification_report(y_test, y_pred_lower_threshold))
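# sweep a few cut-offs to make the precision/recall trade-off explicit rather
# than judging a single threshold
from sklearn.metrics import precision_score, recall_score
probs_test = logmodel.predict_proba(X_test)[:, 1]
for t in [0.3, 0.4, 0.5, 0.6, 0.7]:
    preds_t = (probs_test >= t).astype(int)
    print('threshold=%.1f  precision=%.3f  recall=%.3f' % (
        t,
        precision_score(y_test, preds_t, zero_division=0),
        recall_score(y_test, preds_t, zero_division=0)))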
from sklearn.model_selection import GridSearchCV
log_model_1=LogisticRegression()
param_grid = {'C': [0.001, 0.04, 0.07,0.08, 1,10],'max_iter':[50,100,200,500]}
grid_search = GridSearchCV(log_model_1, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test)))
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))
print("Best estimator:\n{}".format(grid_search.best_estimator_))
# refit with class weighting and a small hand-picked C (stronger regularization)
logreg = LogisticRegression(class_weight='balanced', C=0.002, max_iter=50).fit(X_train, y_train)
logreg.score(X_test, y_test)
predictions=logreg.predict(X_test)
print(classification_report(y_test, predictions,target_names=["Non Converted", "Converted"]))
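# accuracy is misleading on imbalanced data; compare both models on ROC AUC
# computed from predicted probabilities
print('baseline AUC:', roc_auc_score(y_test, logmodel.predict_proba(X_test)[:, 1]))
print('balanced AUC:', roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1]))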