In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
df_emailinfo=pd.read_csv("email_table.csv")
#df_emailinfo = df_emailinfo.reset_index(drop=True)
In [3]:
df_emailinfo.head(5)
Out[3]:
email_id email_text email_version hour weekday user_country user_past_purchases converted
0 85120 short_email personalized 2 Sunday US 5 0
1 966622 long_email personalized 12 Sunday UK 2 1
2 777221 long_email personalized 11 Wednesday US 2 0
3 493711 short_email generic 6 Monday UK 1 0
4 106887 long_email generic 14 Monday US 6 0
In [4]:
df_emailinfo.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
email_id               100000 non-null int64
email_text             100000 non-null object
email_version          100000 non-null object
hour                   100000 non-null int64
weekday                100000 non-null object
user_country           100000 non-null object
user_past_purchases    100000 non-null int64
converted              100000 non-null int64
dtypes: int64(4), object(4)
memory usage: 6.1+ MB
In [5]:
df_emailinfo.describe()
Out[5]:
email_id hour user_past_purchases converted
count 100000.000000 100000.000000 100000.00000 100000.000000
mean 498690.196160 9.059300 3.87845 0.021190
std 289230.727534 4.439637 3.19611 0.144018
min 8.000000 1.000000 0.00000 0.000000
25% 246708.250000 6.000000 1.00000 0.000000
50% 498447.000000 9.000000 3.00000 0.000000
75% 749942.750000 12.000000 6.00000 0.000000
max 999998.000000 24.000000 22.00000 1.000000
In [6]:
categorical_feature_mask = df_emailinfo.dtypes==object
categorical_feature_mask

# filter categorical columns using mask and turn it into a list
categorical_cols = df_emailinfo.columns[categorical_feature_mask].tolist()

categorical_cols.remove('weekday')
categorical_cols.remove('user_country')
from sklearn.preprocessing import LabelEncoder
# # # instantiate labelencoder object
le = LabelEncoder()
df_emailinfo[categorical_cols] = df_emailinfo[categorical_cols].apply(lambda col: le.fit_transform(col))
df_emailinfo[categorical_cols].head(10)
Out[6]:
email_text email_version
0 1 1
1 0 1
2 0 1
3 1 0
4 0 0
5 0 1
6 1 1
7 0 0
8 0 1
9 1 1
In [7]:
df_emailinfo.head(5)
Out[7]:
email_id email_text email_version hour weekday user_country user_past_purchases converted
0 85120 1 1 2 Sunday US 5 0
1 966622 0 1 12 Sunday UK 2 1
2 777221 0 1 11 Wednesday US 2 0
3 493711 1 0 6 Monday UK 1 0
4 106887 0 0 14 Monday US 6 0
In [ ]:
 

Data Visualisation

In [8]:
import seaborn as sns
In [9]:
sns.boxplot(x='user_country',y='hour',data=df_emailinfo,palette='rainbow')
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1e30cf90>
In [10]:
sns.countplot(x='user_country',data=df_emailinfo,palette='rainbow')
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b65aed0>
In [11]:
sns.countplot(x='converted' ,data=df_emailinfo)
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b6d06d0>
In [12]:
sns.countplot(x='converted',hue='user_country' ,data=df_emailinfo)
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b65ad90>

***from above we can observe that e-mails sent across France and Spain was not converted

**maximum e-mails were sent across to US and UK citizens

In [13]:
sns.countplot(x='converted', hue='email_version', data=df_emailinfo)

# from graph we can observe that compared to generic emails, personalized emails is being converted
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b741250>
In [14]:
sns.countplot(x='converted', hue='email_text', data=df_emailinfo)

# no major difference
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1e848090>
In [15]:
df_emailinfo['user_past_purchases'].hist(color='green',bins=50,figsize=(8,4))
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b6ac8d0>
In [16]:
sns.countplot(x='hour', hue='user_country', data=df_emailinfo)
# most of mails were sent during day-time from 5:00 am to 3:00 pm across various countries
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1ece61d0>
In [17]:
sns.countplot(x='user_past_purchases', hue='user_country', data=df_emailinfo)
# we can observe a constant decline of purchases 
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1ee8ab10>

Merging other two tables wil email_info table

In [18]:
df_emailopen=pd.read_csv("email_opened_table.csv")
df_emailopen = df_emailopen.reset_index(drop=True)
df_emailopen.shape
Out[18]:
(10345, 1)
In [19]:
df_emaillink=pd.read_csv("link_clicked_table.csv")
df_emaillink.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2119 entries, 0 to 2118
Data columns (total 1 columns):
email_id    2119 non-null int64
dtypes: int64(1)
memory usage: 16.7 KB
In [20]:
df_emaillink.shape
Out[20]:
(2119, 1)
In [21]:
df_emailopen['email_opened']=1
df_emailopen.head(5)
Out[21]:
email_id email_opened
0 284534 1
1 609056 1
2 220820 1
3 905936 1
4 164034 1
In [22]:
df_emailinfo = pd.merge(left=df_emailinfo, right=df_emailopen, on='email_id',how='outer')
df_emailinfo.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 9 columns):
email_id               100000 non-null int64
email_text             100000 non-null int64
email_version          100000 non-null int64
hour                   100000 non-null int64
weekday                100000 non-null object
user_country           100000 non-null object
user_past_purchases    100000 non-null int64
converted              100000 non-null int64
email_opened           10345 non-null float64
dtypes: float64(1), int64(6), object(2)
memory usage: 7.6+ MB
In [23]:
df_emailinfo['email_opened']=df_emailinfo['email_opened'].fillna(0)
df_emailinfo['email_opened']=df_emailinfo['email_opened'].astype(int)
In [24]:
df_emaillink['link_opened']=1
df_emaillink.head(5)
Out[24]:
email_id link_opened
0 609056 1
1 870980 1
2 935124 1
3 158501 1
4 177561 1
In [25]:
df_emailinfo = pd.merge(left=df_emailinfo, right=df_emaillink, on='email_id',how='outer')
df_emailinfo.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 10 columns):
email_id               100000 non-null int64
email_text             100000 non-null int64
email_version          100000 non-null int64
hour                   100000 non-null int64
weekday                100000 non-null object
user_country           100000 non-null object
user_past_purchases    100000 non-null int64
converted              100000 non-null int64
email_opened           100000 non-null int64
link_opened            2119 non-null float64
dtypes: float64(1), int64(7), object(2)
memory usage: 8.4+ MB
In [26]:
df_emailinfo['link_opened']=df_emailinfo['link_opened'].fillna(0)
df_emailinfo['link_opened']=df_emailinfo['link_opened'].astype(int)
df_emailinfo['user_past_purchases']=df_emailinfo['user_past_purchases'].astype(int)

Data Visualization after merging

In [27]:
sns.countplot(x='email_opened',data=df_emailinfo)
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x1196ef690>
In [28]:
sns.countplot(x='email_opened', hue='email_text', data=df_emailinfo)
# short emails are preferrable
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b8fd190>
In [29]:
sns.countplot(x='email_opened', hue='email_version', data=df_emailinfo)
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1e645d50>
In [30]:
sns.countplot(x='email_opened', hue='weekday', data=df_emailinfo)

# mostly opened during weekdays
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x119759810>
In [31]:
sns.countplot(x='email_opened', hue='user_country', data=df_emailinfo)
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1f1d05d0>
In [32]:
sns.countplot(x='email_opened', hue='user_country', data=df_emailinfo)
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a20a99150>
In [33]:
sns.countplot(x='link_opened', hue='email_text', data=df_emailinfo)
#does not matter whether long or short emails are sent- equal chance
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1e700150>
In [34]:
sns.countplot(x='email_opened', hue='email_version', data=df_emailinfo)
# to prefer more personalised emails to convert into more buyers 
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a20c92cd0>
In [35]:
sns.countplot(x='link_opened', hue='weekday', data=df_emailinfo)
# maximum mails should be triggered on weekdays 
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a20e427d0>
In [36]:
sns.countplot(x='link_opened', hue='user_country', data=df_emailinfo)
# should focus on increasing the count for US and UK
# should come up with new strategies to attract France and Spain customers
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a20ed1a50>

Feature engineering

Changing categorical to numerical

In [37]:
df_text_ohe=pd.get_dummies(df_emailinfo, columns=['weekday','user_country'])
In [38]:
df_text_ohe.head(5)
Out[38]:
email_id email_text email_version hour user_past_purchases converted email_opened link_opened weekday_Friday weekday_Monday weekday_Saturday weekday_Sunday weekday_Thursday weekday_Tuesday weekday_Wednesday user_country_ES user_country_FR user_country_UK user_country_US
0 85120 1 1 2 5 0 0 0 0 0 0 1 0 0 0 0 0 0 1
1 966622 0 1 12 2 1 1 1 0 0 0 1 0 0 0 0 0 1 0
2 777221 0 1 11 2 0 0 0 0 0 0 0 0 0 1 0 0 0 1
3 493711 1 0 6 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0
4 106887 0 0 14 6 0 0 0 0 1 0 0 0 0 0 0 0 0 1
In [39]:
df_emailinfo.drop(['email_text','weekday','email_version','user_country'],axis=1,inplace=True)
In [40]:
df_emailinfo.head(5)
Out[40]:
email_id hour user_past_purchases converted email_opened link_opened
0 85120 2 5 0 0 0
1 966622 12 2 1 1 1
2 777221 11 2 0 0 0
3 493711 6 1 0 0 0
4 106887 14 6 0 0 0
In [41]:
df_emailinfo=df_emailinfo.merge(df_text_ohe)
df_emailinfo.head()
Out[41]:
email_id hour user_past_purchases converted email_opened link_opened email_text email_version weekday_Friday weekday_Monday weekday_Saturday weekday_Sunday weekday_Thursday weekday_Tuesday weekday_Wednesday user_country_ES user_country_FR user_country_UK user_country_US
0 85120 2 5 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1
1 966622 12 2 1 1 1 0 1 0 0 0 1 0 0 0 0 0 1 0
2 777221 11 2 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1
3 493711 6 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0
4 106887 14 6 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1
In [42]:
df_emailinfo.shape
Out[42]:
(100000, 19)
In [43]:
#join_df=df_emailinfo.join(df_emailopen,how='outer',rsuffix='_open',on='email_id')
#join_df.info()
In [44]:
#join_df.sample(50)
In [45]:
#df_emailinfo[df_emailinfo['email_id'] == 950040]
In [46]:
#merge_df=pd.concat([df_emailinfo,df_emailopen],ignore_index=True)
#merge_df.info()
#merged_inner = pd.merge(left=df_emailinfo, right=df_emailopen, on='email_id',how='outer',suffixes=('','_open'))
#merged_inner.info()

Transformation - Rescaling and Standardization

In [47]:
df_emailinfo.describe()
Out[47]:
email_id hour user_past_purchases converted email_opened link_opened email_text email_version weekday_Friday weekday_Monday weekday_Saturday weekday_Sunday weekday_Thursday weekday_Tuesday weekday_Wednesday user_country_ES user_country_FR user_country_UK user_country_US
count 100000.000000 100000.000000 100000.00000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.00000 100000.00000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000
mean 498690.196160 9.059300 3.87845 0.021190 0.103450 0.021190 0.497240 0.497910 0.141770 0.143630 0.145690 0.14387 0.14277 0.141430 0.140840 0.099670 0.099950 0.199390 0.600990
std 289230.727534 4.439637 3.19611 0.144018 0.304547 0.144018 0.499995 0.499998 0.348816 0.350716 0.352797 0.35096 0.34984 0.348466 0.347858 0.299561 0.299935 0.399544 0.489697
min 8.000000 1.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 246708.250000 6.000000 1.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 498447.000000 9.000000 3.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
75% 749942.750000 12.000000 6.00000 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
max 999998.000000 24.000000 22.00000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.00000 1.00000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [48]:
from sklearn import preprocessing
x1_array = pd.Series(df_emailinfo['user_past_purchases'])
normalized_X = preprocessing.normalize([x1_array])
df_emailinfo['user_past_purchases'].shape
Out[48]:
(100000,)
In [49]:
user_purchase=normalized_X.reshape(100000,)
user_purchase
Out[49]:
array([0.00314612, 0.00125845, 0.00125845, ..., 0.00377535, 0.        ,
       0.00062922])
In [50]:
df_emailinfo['user_past_purchases']=user_purchase
In [51]:
df_emailinfo['user_past_purchases'].head(10)
Out[51]:
0    0.003146
1    0.001258
2    0.001258
3    0.000629
4    0.003775
5    0.001888
6    0.005034
7    0.001258
8    0.002517
9    0.001888
Name: user_past_purchases, dtype: float64
In [52]:
df_emailinfo.describe()
Out[52]:
email_id hour user_past_purchases converted email_opened link_opened email_text email_version weekday_Friday weekday_Monday weekday_Saturday weekday_Sunday weekday_Thursday weekday_Tuesday weekday_Wednesday user_country_ES user_country_FR user_country_UK user_country_US
count 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.00000 100000.00000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000
mean 498690.196160 9.059300 0.002440 0.021190 0.103450 0.021190 0.497240 0.497910 0.141770 0.143630 0.145690 0.14387 0.14277 0.141430 0.140840 0.099670 0.099950 0.199390 0.600990
std 289230.727534 4.439637 0.002011 0.144018 0.304547 0.144018 0.499995 0.499998 0.348816 0.350716 0.352797 0.35096 0.34984 0.348466 0.347858 0.299561 0.299935 0.399544 0.489697
min 8.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 246708.250000 6.000000 0.000629 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 498447.000000 9.000000 0.001888 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
75% 749942.750000 12.000000 0.003775 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
max 999998.000000 24.000000 0.013843 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.00000 1.00000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [53]:
#df_emailinfo.user_past_purchases.hist(bins=50)
In [54]:
#df_emailinfo.hour.hist(bins=10)
In [55]:
# x2_array = pd.Series(df_emailinfo['hour'])
# normalized_X2 = preprocessing.normalize([x2_array])
# df_emailinfo['hour'].shape
# hour=normalized_X2.reshape(100000,)
# hour
# df_emailinfo['hour']=hour
# df_emailinfo['hour'].head(10)
In [56]:
df_emailinfo.describe()
Out[56]:
email_id hour user_past_purchases converted email_opened link_opened email_text email_version weekday_Friday weekday_Monday weekday_Saturday weekday_Sunday weekday_Thursday weekday_Tuesday weekday_Wednesday user_country_ES user_country_FR user_country_UK user_country_US
count 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.00000 100000.00000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000
mean 498690.196160 9.059300 0.002440 0.021190 0.103450 0.021190 0.497240 0.497910 0.141770 0.143630 0.145690 0.14387 0.14277 0.141430 0.140840 0.099670 0.099950 0.199390 0.600990
std 289230.727534 4.439637 0.002011 0.144018 0.304547 0.144018 0.499995 0.499998 0.348816 0.350716 0.352797 0.35096 0.34984 0.348466 0.347858 0.299561 0.299935 0.399544 0.489697
min 8.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 246708.250000 6.000000 0.000629 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 498447.000000 9.000000 0.001888 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
75% 749942.750000 12.000000 0.003775 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
max 999998.000000 24.000000 0.013843 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.00000 1.00000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [57]:
df_emailinfo.hour.hist(bins=10)
Out[57]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a21019b10>
In [58]:
scaler = preprocessing.StandardScaler()
df_emailinfo['hour']=scaler.fit_transform(df_emailinfo[['hour']])
df_emailinfo['hour'].head()
Out[58]:
0   -1.590070
1    0.662377
2    0.437132
3   -0.689091
4    1.112867
Name: hour, dtype: float64
In [59]:
df_emailinfo.describe()
Out[59]:
email_id hour user_past_purchases converted email_opened link_opened email_text email_version weekday_Friday weekday_Monday weekday_Saturday weekday_Sunday weekday_Thursday weekday_Tuesday weekday_Wednesday user_country_ES user_country_FR user_country_UK user_country_US
count 100000.000000 1.000000e+05 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.00000 100000.00000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000
mean 498690.196160 -6.131573e-16 0.002440 0.021190 0.103450 0.021190 0.497240 0.497910 0.141770 0.143630 0.145690 0.14387 0.14277 0.141430 0.140840 0.099670 0.099950 0.199390 0.600990
std 289230.727534 1.000005e+00 0.002011 0.144018 0.304547 0.144018 0.499995 0.499998 0.348816 0.350716 0.352797 0.35096 0.34984 0.348466 0.347858 0.299561 0.299935 0.399544 0.489697
min 8.000000 -1.815315e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 246708.250000 -6.890912e-01 0.000629 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 498447.000000 -1.335701e-02 0.001888 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
75% 749942.750000 6.623772e-01 0.003775 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
max 999998.000000 3.365314e+00 0.013843 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.00000 1.00000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [60]:
df_emailinfo.hour.hist(bins=10)
Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a6ee790>
In [61]:
import seaborn as sns
sns.kdeplot(df_emailinfo.hour)
Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1e4b4890>

Data Visualisation

In [62]:
# correlation=df_emailinfo.corr()
# fig=plt.figure
# plt.show()
In [63]:
sns.heatmap(df_emailinfo.corr(),cmap='coolwarm',)
plt.title('email_info.corr()')
Out[63]:
Text(0.5, 1, 'email_info.corr()')
In [64]:
#df_emailinfo.drop('email_text_short_email',axis=1,inplace=True)
#df_emailinfo.drop('email_version_generic',axis=1,inplace=True)
df_emailinfo.drop('link_opened',axis=1,inplace=True)
df_emailinfo.drop('user_country_ES',axis=1,inplace=True)
df_emailinfo.drop('email_opened',axis=1,inplace=True)
#df_emailinfo.drop('user_past_purchases',axis=1,inplace=True)
In [65]:
sns.heatmap(df_emailinfo.corr(),cmap='coolwarm',)
plt.title('email_info.corr()')
Out[65]:
Text(0.5, 1, 'email_info.corr()')
In [66]:
df_emailinfo.shape
Out[66]:
(100000, 16)
In [67]:
df_emailinfo.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 16 columns):
email_id               100000 non-null int64
hour                   100000 non-null float64
user_past_purchases    100000 non-null float64
converted              100000 non-null int64
email_text             100000 non-null int64
email_version          100000 non-null int64
weekday_Friday         100000 non-null uint8
weekday_Monday         100000 non-null uint8
weekday_Saturday       100000 non-null uint8
weekday_Sunday         100000 non-null uint8
weekday_Thursday       100000 non-null uint8
weekday_Tuesday        100000 non-null uint8
weekday_Wednesday      100000 non-null uint8
user_country_FR        100000 non-null uint8
user_country_UK        100000 non-null uint8
user_country_US        100000 non-null uint8
dtypes: float64(2), int64(4), uint8(10)
memory usage: 6.3 MB
In [68]:
#input_columns=[columns for columns in df_emailinfo.columns if columns!='converted']
In [69]:
#output_column=df_emailinfo['converted']

#not able to execute
#X = df.loc[:,input_columns].values
#y = df.loc[:,output_column]
#print (X.shape, y.shape)
In [70]:
X = df_emailinfo[['email_id','hour','user_past_purchases','email_text','weekday_Friday','weekday_Monday','weekday_Saturday','weekday_Sunday','weekday_Thursday','weekday_Tuesday','weekday_Wednesday','email_version','user_country_FR','user_country_UK','user_country_US']]
y = df_emailinfo['converted']
In [71]:
from sklearn.model_selection import train_test_split
In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)
In [73]:
print (X_train.shape)
print (y_train.shape)

print (X_test.shape)
print (y_test.shape)
(70000, 15)
(70000,)
(30000, 15)
(30000,)
In [74]:
non_converted=df_emailinfo[df_emailinfo['converted']==0]
converted=df_emailinfo[df_emailinfo['converted']==1]
In [75]:
print(non_converted.shape, converted.shape)
(97881, 16) (2119, 16)
In [76]:
from sklearn.utils import resample
converted_upsampled = resample(converted,
                          replace=True, # sample with replacement
                          n_samples=len(non_converted), # match number in majority class
                          random_state=27)
In [77]:
converted_upsampled.shape
Out[77]:
(97881, 16)
In [78]:
non_converted.shape
Out[78]:
(97881, 16)
In [79]:
upsampled=pd.concat([converted_upsampled,non_converted])
upsampled.head(5)
Out[79]:
email_id hour user_past_purchases converted email_text email_version weekday_Friday weekday_Monday weekday_Saturday weekday_Sunday weekday_Thursday weekday_Tuesday weekday_Wednesday user_country_FR user_country_UK user_country_US
49114 939228 1.338111 0.005663 1 0 0 1 0 0 0 0 0 0 0 1 0
35950 815518 -0.238602 0.006921 1 0 1 0 0 0 0 0 0 1 0 0 1
61976 217459 0.437132 0.001258 1 1 1 0 0 0 1 0 0 0 0 0 1
6780 195134 -0.463847 0.001888 1 1 1 0 0 0 0 1 0 0 0 0 1
48520 17253 -0.463847 0.005663 1 1 1 0 0 0 0 0 1 0 0 1 0
In [80]:
upsampled.converted.value_counts()
Out[80]:
1    97881
0    97881
Name: converted, dtype: int64
In [81]:
y_train = upsampled.converted
X_train = upsampled.drop('converted', axis=1)
In [82]:
from sklearn.linear_model import LogisticRegression
In [83]:
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Out[83]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [84]:
logmodel.score(X_train, y_train)
Out[84]:
0.6050867890601853
In [85]:
logmodel.score(X_test, y_test)
Out[85]:
0.6947
In [86]:
predictions = logmodel.predict(X_test)
predictions
Out[86]:
array([0, 0, 1, ..., 1, 0, 0])
In [87]:
from sklearn.metrics import classification_report,accuracy_score
from sklearn.metrics import confusion_matrix
In [88]:
print(accuracy_score(y_test,predictions))
0.6947
In [89]:
print(classification_report(y_test,predictions))
              precision    recall  f1-score   support

           0       0.98      0.70      0.82     29364
           1       0.03      0.38      0.05       636

    accuracy                           0.69     30000
   macro avg       0.50      0.54      0.43     30000
weighted avg       0.96      0.69      0.80     30000

In [90]:
print(confusion_matrix(y_test,predictions))
[[20601  8763]
 [  396   240]]
In [91]:
from sklearn.metrics import roc_auc_score,roc_curve,auc
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc
Out[91]:
0.5394659228473834
In [92]:
plt.plot(false_positive_rate,true_positive_rate)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
In [93]:
predicted_probs=logmodel.predict_proba(X_test)
In [94]:
predicted_probs[100:110]
Out[94]:
array([[0.6284114 , 0.3715886 ],
       [0.50485609, 0.49514391],
       [0.48033055, 0.51966945],
       [0.55562665, 0.44437335],
       [0.65107419, 0.34892581],
       [0.37728975, 0.62271025],
       [0.74302312, 0.25697688],
       [0.64252236, 0.35747764],
       [0.59497748, 0.40502252],
       [0.46132606, 0.53867394]])
In [95]:
y_pred_lower_threshold = logmodel.predict_proba(X_test)[:,1] <0.5
In [96]:
print(classification_report(y_test, y_pred_lower_threshold))
              precision    recall  f1-score   support

           0       0.97      0.30      0.46     29364
           1       0.02      0.62      0.04       636

    accuracy                           0.31     30000
   macro avg       0.50      0.46      0.25     30000
weighted avg       0.95      0.31      0.45     30000

In [97]:
from sklearn.model_selection import GridSearchCV
In [98]:
log_model_1=LogisticRegression()
In [114]:
param_grid = {'C': [0.001, 0.04, 0.07,0.08, 1,10],'max_iter':[50,100,200,500]}
In [115]:
grid_search = GridSearchCV(log_model_1, param_grid, cv=5)
In [116]:
grid_search.fit(X_train, y_train)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Out[116]:
GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.04, 0.07, 0.08, 1, 10],
                         'max_iter': [50, 100, 200, 500]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
In [117]:
print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test)))
Test set score: 0.70
In [118]:
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))
Best parameters: {'C': 0.04, 'max_iter': 50}
Best cross-validation score: 0.60
In [ ]:
 
In [119]:
print("Best estimator:\n{}".format(grid_search.best_estimator_))
Best estimator:
LogisticRegression(C=0.04, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=50,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [120]:
logreg = LogisticRegression(class_weight='balanced',C=0.002,max_iter=50).fit(X_train,y_train)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
In [121]:
logreg.score(X_test, y_test)
Out[121]:
0.7040666666666666
In [122]:
predictions=logreg.predict(X_test)
In [123]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(classification_report(y_test, predictions,target_names=["Non Converted", "Converted"]))
               precision    recall  f1-score   support

Non Converted       0.98      0.71      0.82     29364
    Converted       0.03      0.36      0.05       636

     accuracy                           0.70     30000
    macro avg       0.50      0.54      0.44     30000
 weighted avg       0.96      0.70      0.81     30000