import pandas as pd
import os
# Merging the data
files = [file for file in os.listdir(r"C:\Users\ashwi\Desktop\Insidesherpa\Pandas-Data-Science-Tasks-master\SalesAnalysis\Sales_Data")]
all_months_data = pd.DataFrame()
for file in files:
df = pd.read_csv(r"C:\Users\ashwi\Desktop\Insidesherpa\Pandas-Data-Science-Tasks-master\SalesAnalysis\Sales_Data/" + file)
all_months_data = pd.concat([all_months_data, df])
all_months_data.to_csv("all_data.csv", index=False)
all_data = pd.read_csv("all_data.csv")
all_data.head()
all_data = all_data[all_data['Order Date'].str[0:2] != 'Or']
all_data = all_data.dropna(how='all')
all_data['Quantity Ordered'] = pd.to_numeric(all_data['Quantity Ordered'])
all_data['Price Each'] = pd.to_numeric(all_data['Price Each'])
#Adding "Months" Column
all_data['Month'] = all_data['Order Date'].str[0:2]
all_data['Month'] = all_data['Month'].astype('int32')
all_data.head()
# Adding a "sales" column
all_data['Sales'] = all_data['Quantity Ordered'] * all_data['Price Each']
all_data.head()
results = all_data.groupby('Month').sum()
import matplotlib.pyplot as plt
months = range(1,13)
plt.bar(months, results['Sales'])
plt.xticks(months)
plt.ylabel('Sales in $')
plt.xlabel('Month')
plt.show()
#Answer1 - December 2019 noticed the highest sales in 2019
#Adding a 'City' column
def get_city(address):
return address.split(',')[1]
def get_state(address):
return address.split(',')[2].split(' ')[1]
all_data['City'] = all_data['Purchase Address'].apply(lambda x: f"{get_city(x)} ({get_state(x)})")
all_data.head()
results = all_data.groupby('City').sum()
results
import matplotlib.pyplot as plt
Cities = [city for city, df in all_data.groupby('City')]
plt.bar(Cities, results['Sales'])
plt.xticks(months, rotation='vertical', size=10)
plt.ylabel('Sales in $')
plt.xlabel('City Name')
plt.show()
#Answer2 - San Francisco had the highest number of Sales
all_data['Order Date'] = pd.to_datetime(all_data['Order Date'])
all_data['Hour'] = all_data['Order Date'].dt.hour
all_data['Minute'] = all_data['Order Date'].dt.minute
all_data.head()
Hours = [hour for hour, df in all_data.groupby('Hour')]
plt.plot(Hours, all_data.groupby(['Hour']).count())
plt.xticks(Hours)
plt.xlabel('Hour')
plt.ylabel('No. of Orders')
plt.grid()
plt.show()
#Answer3 - In the morning around 11AM-12Pm and 7Pm-8Pm in the evening are the apt times to advertise the product
df = all_data[all_data['Order ID'].duplicated(keep=False)]
df['Grouped'] = df.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))
df = df[['Order ID', 'Grouped']].drop_duplicates()
df.head()
from itertools import combinations
from collections import Counter
count = Counter()
for row in df['Grouped']:
row_list= row.split(',')
count.update(Counter(combinations(row_list, 3)))
for key, value in count.most_common(10):
print(key, value)
product_group = all_data.groupby('Product')
quantity_ordered = product_group.sum()['Quantity Ordered']
products = [product for product, df in product_group]
plt.bar(products, quantity_ordered)
plt.xticks(products, rotation='vertical', size=10)
plt.ylabel('Quantity Ordered')
plt.xlabel('Product')
plt.show()
prices = all_data.groupby('Product').mean()['Price Each']
fig, ax1=plt.subplots()
ax2=ax1.twinx()
ax1.bar(products, quantity_ordered, color='b')
ax2.plot(products, prices, 'r-')
ax1.set_xlabel('Product Name')
ax1.set_ylabel('Quantity Ordered', color='b')
ax2.set_ylabel('Price ($)', color = 'r')
ax1.set_xticklabels(products, rotation='vertical', size=10)
plt.show()