Sales Analysis

In [13]:
import pandas as pd
import os
In [ ]:
# Merging the data
In [17]:
files = [file for file in os.listdir(r"C:\Users\ashwi\Desktop\Insidesherpa\Pandas-Data-Science-Tasks-master\SalesAnalysis\Sales_Data")]

all_months_data = pd.DataFrame()

for file in files:
    df = pd.read_csv(r"C:\Users\ashwi\Desktop\Insidesherpa\Pandas-Data-Science-Tasks-master\SalesAnalysis\Sales_Data/" + file)
    all_months_data = pd.concat([all_months_data, df])
    
all_months_data.to_csv("all_data.csv", index=False)
In [18]:
all_data = pd.read_csv("all_data.csv")
all_data.head()
Out[18]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address
0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001
1 NaN NaN NaN NaN NaN NaN
2 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215
3 176560 Google Phone 1 600 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001
4 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001
In [22]:
all_data = all_data[all_data['Order Date'].str[0:2] != 'Or']
all_data = all_data.dropna(how='all')
In [27]:
all_data['Quantity Ordered'] = pd.to_numeric(all_data['Quantity Ordered'])
all_data['Price Each'] = pd.to_numeric(all_data['Price Each'])
In [25]:
#Adding "Months" Column

all_data['Month'] = all_data['Order Date'].str[0:2]
all_data['Month'] = all_data['Month'].astype('int32')
all_data.head()
Out[25]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month
0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001 4
2 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215 4
3 176560 Google Phone 1 600 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4
4 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4
5 176561 Wired Headphones 1 11.99 04/30/19 09:27 333 8th St, Los Angeles, CA 90001 4
In [29]:
# Adding a "sales" column
all_data['Sales'] = all_data['Quantity Ordered'] * all_data['Price Each']
all_data.head()
Out[29]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month Sales
0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001 4 23.90
2 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215 4 99.99
3 176560 Google Phone 1 600.00 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 600.00
4 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 11.99
5 176561 Wired Headphones 1 11.99 04/30/19 09:27 333 8th St, Los Angeles, CA 90001 4 11.99

Q.1. What is the best month for sales? What was the revenue?

In [36]:
results = all_data.groupby('Month').sum()
import matplotlib.pyplot as plt
months = range(1,13)
plt.bar(months, results['Sales'])
plt.xticks(months)
plt.ylabel('Sales in $')
plt.xlabel('Month')
plt.show()
In [38]:
#Answer1 - December 2019 noticed the highest sales in 2019
In [50]:
#Adding a 'City' column

def get_city(address):
    return address.split(',')[1]

def get_state(address):
    return address.split(',')[2].split(' ')[1]

all_data['City'] = all_data['Purchase Address'].apply(lambda x: f"{get_city(x)}  ({get_state(x)})")


all_data.head()
Out[50]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month Sales City
0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001 4 23.90 Dallas (TX)
2 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215 4 99.99 Boston (MA)
3 176560 Google Phone 1 600.00 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 600.00 Los Angeles (CA)
4 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 11.99 Los Angeles (CA)
5 176561 Wired Headphones 1 11.99 04/30/19 09:27 333 8th St, Los Angeles, CA 90001 4 11.99 Los Angeles (CA)

Q.2. Which US City has the higehst number of sales

In [54]:
results = all_data.groupby('City').sum()
results

import matplotlib.pyplot as plt
Cities = [city for city, df in all_data.groupby('City')] 

plt.bar(Cities, results['Sales'])
plt.xticks(months, rotation='vertical', size=10)
plt.ylabel('Sales in $')
plt.xlabel('City Name')
plt.show()
In [55]:
#Answer2 - San Francisco had the highest number of Sales

Q.3. What time to advertise the product to maximise the sales

In [59]:
all_data['Order Date'] = pd.to_datetime(all_data['Order Date'])
all_data['Hour'] = all_data['Order Date'].dt.hour
all_data['Minute']  = all_data['Order Date'].dt.minute
all_data.head()
Out[59]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month Sales City Hour Minute
0 176558 USB-C Charging Cable 2 11.95 2019-04-19 08:46:00 917 1st St, Dallas, TX 75001 4 23.90 Dallas (TX) 8 46
2 176559 Bose SoundSport Headphones 1 99.99 2019-04-07 22:30:00 682 Chestnut St, Boston, MA 02215 4 99.99 Boston (MA) 22 30
3 176560 Google Phone 1 600.00 2019-04-12 14:38:00 669 Spruce St, Los Angeles, CA 90001 4 600.00 Los Angeles (CA) 14 38
4 176560 Wired Headphones 1 11.99 2019-04-12 14:38:00 669 Spruce St, Los Angeles, CA 90001 4 11.99 Los Angeles (CA) 14 38
5 176561 Wired Headphones 1 11.99 2019-04-30 09:27:00 333 8th St, Los Angeles, CA 90001 4 11.99 Los Angeles (CA) 9 27
In [64]:
Hours = [hour for hour, df in all_data.groupby('Hour')] 

plt.plot(Hours, all_data.groupby(['Hour']).count())
plt.xticks(Hours)
plt.xlabel('Hour')
plt.ylabel('No. of Orders')
plt.grid()
plt.show()
In [ ]:
#Answer3 - In the morning around 11AM-12Pm and 7Pm-8Pm in the evening are the apt times to advertise the product

Q.4. What products are most often sold together

In [68]:
df = all_data[all_data['Order ID'].duplicated(keep=False)]
df['Grouped'] = df.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))

df = df[['Order ID', 'Grouped']].drop_duplicates()
df.head()
C:\Users\ashwi\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
Out[68]:
Order ID Grouped
3 176560 Google Phone,Wired Headphones
18 176574 Google Phone,USB-C Charging Cable
30 176585 Bose SoundSport Headphones,Bose SoundSport Hea...
32 176586 AAA Batteries (4-pack),Google Phone
119 176672 Lightning Charging Cable,USB-C Charging Cable
In [73]:
from itertools import combinations
from collections import Counter

count = Counter()

for row in df['Grouped']:
    row_list= row.split(',')
    count.update(Counter(combinations(row_list, 3)))
for key, value in count.most_common(10):
    print(key, value)
('Google Phone', 'USB-C Charging Cable', 'Wired Headphones') 87
('iPhone', 'Lightning Charging Cable', 'Wired Headphones') 62
('iPhone', 'Lightning Charging Cable', 'Apple Airpods Headphones') 47
('Google Phone', 'USB-C Charging Cable', 'Bose SoundSport Headphones') 35
('Vareebadd Phone', 'USB-C Charging Cable', 'Wired Headphones') 33
('iPhone', 'Apple Airpods Headphones', 'Wired Headphones') 27
('Google Phone', 'Bose SoundSport Headphones', 'Wired Headphones') 24
('Vareebadd Phone', 'USB-C Charging Cable', 'Bose SoundSport Headphones') 16
('USB-C Charging Cable', 'Bose SoundSport Headphones', 'Wired Headphones') 5
('Vareebadd Phone', 'Bose SoundSport Headphones', 'Wired Headphones') 5

Q.5. Which product was sold the most and why?

In [75]:
product_group = all_data.groupby('Product')
quantity_ordered = product_group.sum()['Quantity Ordered']

products = [product for product, df in product_group]

plt.bar(products, quantity_ordered)
plt.xticks(products, rotation='vertical', size=10)
plt.ylabel('Quantity Ordered')
plt.xlabel('Product')
plt.show()
In [79]:
prices = all_data.groupby('Product').mean()['Price Each']

fig, ax1=plt.subplots()

ax2=ax1.twinx()
ax1.bar(products, quantity_ordered, color='b')
ax2.plot(products, prices, 'r-')

ax1.set_xlabel('Product Name')
ax1.set_ylabel('Quantity Ordered', color='b')
ax2.set_ylabel('Price ($)', color = 'r')
ax1.set_xticklabels(products, rotation='vertical', size=10)

plt.show()
In [ ]: