import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.animation
df = pd.read_csv("classicmodel.csv", encoding = "ISO-8859-1")
df.head(3)
CUSTOMERNUMBER | CUSTOMERNAME | CONTACTLASTNAME | CONTACTFIRSTNAME | PHONE | ADDRESSLINE1 | ADDRESSLINE2 | CITY | STATE | POSTALCODE | ... | PRICEEACH | ORDERLINENUMBER | PRODUCTNAME | PRODUCTLINE | PRODUCTSCALE | PRODUCTVENDOR | PRODUCTDESCRIPTION | QUANTITYINSTOCK | BUYPRICE | MSRP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 242 | Alpha Cognac | Roulet | Annette | 61.77.6555 | 1 rue Alsace-Lorraine | NaN | Toulouse | NaN | 31000 | ... | 104.81 | 9 | 1941 Chevrolet Special Deluxe Cabriolet | Vintage Cars | 1:18 | Exoto Designs | Features opening hood, opening doors, opening ... | 2378 | 64.58 | 105.87 |
1 | 242 | Alpha Cognac | Roulet | Annette | 61.77.6555 | 1 rue Alsace-Lorraine | NaN | Toulouse | NaN | 31000 | ... | 67.82 | 5 | 1900s Vintage Bi-Plane | Planes | 1:24 | Autoart Studio Design | Hand crafted diecast-like metal bi-plane is re... | 5942 | 34.25 | 68.51 |
2 | 242 | Alpha Cognac | Roulet | Annette | 61.77.6555 | 1 rue Alsace-Lorraine | NaN | Toulouse | NaN | 31000 | ... | 65.75 | 6 | 1937 Horch 930V Limousine | Vintage Cars | 1:24 | Autoart Studio Design | Features opening hood, opening doors, opening ... | 2902 | 26.30 | 65.75 |
3 rows × 38 columns
df.axes #2996 observations and 38 variables
df.shape
(2996, 38)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2996 entries, 0 to 2995 Data columns (total 38 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CUSTOMERNUMBER 2996 non-null int64 1 CUSTOMERNAME 2996 non-null object 2 CONTACTLASTNAME 2996 non-null object 3 CONTACTFIRSTNAME 2996 non-null object 4 PHONE 2996 non-null object 5 ADDRESSLINE1 2996 non-null object 6 ADDRESSLINE2 689 non-null object 7 CITY 2996 non-null object 8 STATE 1404 non-null object 9 POSTALCODE 2823 non-null object 10 COUNTRY 2996 non-null object 11 SALESREPEMPLOYEENUMBER 2996 non-null int64 12 CREDITLIMIT 2996 non-null int64 13 LASTNAME 2996 non-null object 14 FIRSTNAME 2996 non-null object 15 EXTENSION 2996 non-null object 16 EMAIL 2996 non-null object 17 OFFICECODE 2996 non-null int64 18 REPORTSTO 2996 non-null int64 19 JOBTITLE 2996 non-null object 20 ORDERNUMBER 2996 non-null int64 21 ORDERDATE 2996 non-null object 22 REQUIREDDATE 2996 non-null object 23 SHIPPEDDATE 2855 non-null object 24 STATUS 2996 non-null object 25 COMMENTS 2996 non-null object 26 PRODUCTCODE 2996 non-null object 27 QUANTITYORDERED 2996 non-null int64 28 PRICEEACH 2996 non-null float64 29 ORDERLINENUMBER 2996 non-null int64 30 PRODUCTNAME 2996 non-null object 31 PRODUCTLINE 2996 non-null object 32 PRODUCTSCALE 2996 non-null object 33 PRODUCTVENDOR 2996 non-null object 34 PRODUCTDESCRIPTION 2996 non-null object 35 QUANTITYINSTOCK 2996 non-null int64 36 BUYPRICE 2996 non-null float64 37 MSRP 2996 non-null float64 dtypes: float64(3), int64(9), object(26) memory usage: 889.6+ KB
df.dtypes.value_counts()
object 26 int64 9 float64 3 dtype: int64
df.isna().sum() #Checking for number of null values in each column
CUSTOMERNUMBER 0 CUSTOMERNAME 0 CONTACTLASTNAME 0 CONTACTFIRSTNAME 0 PHONE 0 ADDRESSLINE1 0 ADDRESSLINE2 2307 CITY 0 STATE 1592 POSTALCODE 173 COUNTRY 0 SALESREPEMPLOYEENUMBER 0 CREDITLIMIT 0 LASTNAME 0 FIRSTNAME 0 EXTENSION 0 EMAIL 0 OFFICECODE 0 REPORTSTO 0 JOBTITLE 0 ORDERNUMBER 0 ORDERDATE 0 REQUIREDDATE 0 SHIPPEDDATE 141 STATUS 0 COMMENTS 0 PRODUCTCODE 0 QUANTITYORDERED 0 PRICEEACH 0 ORDERLINENUMBER 0 PRODUCTNAME 0 PRODUCTLINE 0 PRODUCTSCALE 0 PRODUCTVENDOR 0 PRODUCTDESCRIPTION 0 QUANTITYINSTOCK 0 BUYPRICE 0 MSRP 0 dtype: int64
df[df.duplicated(subset = None, keep="first")] #Checking if there are any duplicated rows to drop
CUSTOMERNUMBER | CUSTOMERNAME | CONTACTLASTNAME | CONTACTFIRSTNAME | PHONE | ADDRESSLINE1 | ADDRESSLINE2 | CITY | STATE | POSTALCODE | ... | PRICEEACH | ORDERLINENUMBER | PRODUCTNAME | PRODUCTLINE | PRODUCTSCALE | PRODUCTVENDOR | PRODUCTDESCRIPTION | QUANTITYINSTOCK | BUYPRICE | MSRP |
---|
0 rows × 38 columns
df["QUANTITYORDERED"] = df["QUANTITYORDERED"].astype("float")
df["PRICEEACH"]=df["PRICEEACH"].astype("float")
df["Total_Sales"] = (df["QUANTITYORDERED"] * df["PRICEEACH"])
Productline = df["PRODUCTLINE"].value_counts()
plt.style.use("bmh")
Productline.plot(kind = "bar")
plt.xlabel("Productline")
plt.ylabel("Counts")
plt.title("Count of Number Of Productline")
Text(0.5, 1.0, 'Count of Number Of Productline')
df["QUANTITYORDERED"] = df["QUANTITYORDERED"].astype("float")
df["PRICEEACH"]=df["PRICEEACH"].astype("float")
df["Total_Sales"] = (df["QUANTITYORDERED"] * df["PRICEEACH"])
def ranking(price):
if price >= mean_price:
return "Above Average"
else:
return "Below Average"
mean_price = df["Total_Sales"].mean()
price_evaluation = df["Total_Sales"].apply(ranking).value_counts()
price_evaluation.plot(kind = "pie", colors = ["#ff9999","#66b3ff"], autopct='%1.1f%%', fontsize = 12, legend = True)
plt.title("Percentage Of Total Sales")
plt.ylabel('')
Text(0, 0.5, '')
mask = df.groupby(["PRODUCTLINE", "PRODUCTNAME"])["Total_Sales"].agg(["mean", "count"]).reset_index()
mask = mask.sort_values("mean", ascending=False).reset_index(drop=True)
plt.figure(figsize=(7, 7))
ax = sns.barplot(data=mask[:20], x="mean", y="PRODUCTNAME", hue="PRODUCTLINE", palette="muted", dodge= False)
plt.xlabel("Mean")
plt.ylabel("Product Name")
plt.title("Top 20 Product Mean Prices")
plt.show() #Showing the TOP 20 neighbourhoods with highest mean prices
import seaborn as sns
sns.distplot(df["Total_Sales"], hist=True, kde=True)
#The data for price is right skewed, therefore we can use a log10 transformation to get the data
#into normal distrubition.
C:\Users\Owner\Anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='Total_Sales', ylabel='Density'>
sns.distplot(np.log10(df["Total_Sales"]), hist=True, kde=True)
C:\Users\Owner\Anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='Total_Sales', ylabel='Density'>
df["ORDERDATE"] = pd.to_datetime(df["ORDERDATE"])
ts=df.groupby(df["ORDERDATE"])["Total_Sales"].sum()
plt.figure(figsize=(16,8))
plt.title('Total Sales of the company')
plt.xlabel('Time')
plt.ylabel('Sales')
plt.plot(ts);
Months_Years_Only = df.set_index("ORDERDATE").groupby(pd.Grouper(freq='M')).sum()
ts=ts=Months_Years_Only["Total_Sales"]
plt.figure(figsize=(16,8))
plt.title('Total Sales of the company')
plt.xlabel('Month/Year')
plt.ylabel('Total Sales')
plt.plot(ts);
Months_Years_Only.reset_index(inplace=True)
import matplotlib.pyplot as plt
import matplotlib.animation
import numpy as np
x = Months_Years_Only["ORDERDATE"]
y = Months_Years_Only["Total_Sales"]
fig, ax = plt.subplots()
l, = ax.plot(x,y)
ax.set(xlabel = 'Month/Year', ylabel = 'Total Sale' ,title = 'Total Sales of the company')
ax.grid()
def animate(i):
l.set_data(x[:i], y[:i])
ani = matplotlib.animation.FuncAnimation(fig, animate, frames=len(x),interval=300)
from IPython.display import HTML
HTML(ani.to_jshtml())
jupyter nbconvert --execute --to html notebClassic_Model_Sales_Analysis.ipynb
File "<ipython-input-37-868f45d9bbd9>", line 1 jupyter nbconvert --to html Classic_Model_Sales_Analysis.ipynb ^ SyntaxError: invalid syntax