import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.animation

df = pd.read_csv("classicmodel.csv", encoding = "ISO-8859-1")
df.head(3)


df.axes #2996 observations and 38 variables
df.shape

(2996, 38)


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2996 entries, 0 to 2995
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   CUSTOMERNUMBER          2996 non-null   int64  
 1   CUSTOMERNAME            2996 non-null   object 
 2   CONTACTLASTNAME         2996 non-null   object 
 3   CONTACTFIRSTNAME        2996 non-null   object 
 4   PHONE                   2996 non-null   object 
 5   ADDRESSLINE1            2996 non-null   object 
 6   ADDRESSLINE2            689 non-null    object 
 7   CITY                    2996 non-null   object 
 8   STATE                   1404 non-null   object 
 9   POSTALCODE              2823 non-null   object 
 10  COUNTRY                 2996 non-null   object 
 11  SALESREPEMPLOYEENUMBER  2996 non-null   int64  
 12  CREDITLIMIT             2996 non-null   int64  
 13  LASTNAME                2996 non-null   object 
 14  FIRSTNAME               2996 non-null   object 
 15  EXTENSION               2996 non-null   object 
 16  EMAIL                   2996 non-null   object 
 17  OFFICECODE              2996 non-null   int64  
 18  REPORTSTO               2996 non-null   int64  
 19  JOBTITLE                2996 non-null   object 
 20  ORDERNUMBER             2996 non-null   int64  
 21  ORDERDATE               2996 non-null   object 
 22  REQUIREDDATE            2996 non-null   object 
 23  SHIPPEDDATE             2855 non-null   object 
 24  STATUS                  2996 non-null   object 
 25  COMMENTS                2996 non-null   object 
 26  PRODUCTCODE             2996 non-null   object 
 27  QUANTITYORDERED         2996 non-null   int64  
 28  PRICEEACH               2996 non-null   float64
 29  ORDERLINENUMBER         2996 non-null   int64  
 30  PRODUCTNAME             2996 non-null   object 
 31  PRODUCTLINE             2996 non-null   object 
 32  PRODUCTSCALE            2996 non-null   object 
 33  PRODUCTVENDOR           2996 non-null   object 
 34  PRODUCTDESCRIPTION      2996 non-null   object 
 35  QUANTITYINSTOCK         2996 non-null   int64  
 36  BUYPRICE                2996 non-null   float64
 37  MSRP                    2996 non-null   float64
dtypes: float64(3), int64(9), object(26)
memory usage: 889.6+ KB


df.dtypes.value_counts()

object     26
int64       9
float64     3
dtype: int64


df.isna().sum() #Checking for number of null values in each column

CUSTOMERNUMBER               0
CUSTOMERNAME                 0
CONTACTLASTNAME              0
CONTACTFIRSTNAME             0
PHONE                        0
ADDRESSLINE1                 0
ADDRESSLINE2              2307
CITY                         0
STATE                     1592
POSTALCODE                 173
COUNTRY                      0
SALESREPEMPLOYEENUMBER       0
CREDITLIMIT                  0
LASTNAME                     0
FIRSTNAME                    0
EXTENSION                    0
EMAIL                        0
OFFICECODE                   0
REPORTSTO                    0
JOBTITLE                     0
ORDERNUMBER                  0
ORDERDATE                    0
REQUIREDDATE                 0
SHIPPEDDATE                141
STATUS                       0
COMMENTS                     0
PRODUCTCODE                  0
QUANTITYORDERED              0
PRICEEACH                    0
ORDERLINENUMBER              0
PRODUCTNAME                  0
PRODUCTLINE                  0
PRODUCTSCALE                 0
PRODUCTVENDOR                0
PRODUCTDESCRIPTION           0
QUANTITYINSTOCK              0
BUYPRICE                     0
MSRP                         0
dtype: int64


df[df.duplicated(subset = None, keep="first")] #Checking if there are any duplicated rows to drop


df["QUANTITYORDERED"] = df["QUANTITYORDERED"].astype("float")
df["PRICEEACH"]=df["PRICEEACH"].astype("float")
df["Total_Sales"] = (df["QUANTITYORDERED"] * df["PRICEEACH"])


Productline = df["PRODUCTLINE"].value_counts()
plt.style.use("bmh")
Productline.plot(kind = "bar")
plt.xlabel("Productline")
plt.ylabel("Counts")
plt.title("Count of Number Of Productline")

Text(0.5, 1.0, 'Count of Number Of Productline')


df["QUANTITYORDERED"] = df["QUANTITYORDERED"].astype("float")
df["PRICEEACH"]=df["PRICEEACH"].astype("float")
df["Total_Sales"] = (df["QUANTITYORDERED"] * df["PRICEEACH"])

def ranking(price):
    if price >= mean_price:
        return "Above Average"
    else:
        return "Below Average"

mean_price = df["Total_Sales"].mean()

price_evaluation = df["Total_Sales"].apply(ranking).value_counts()
price_evaluation.plot(kind = "pie", colors = ["#ff9999","#66b3ff"], autopct='%1.1f%%', fontsize = 12, legend = True)
plt.title("Percentage Of Total Sales")
plt.ylabel('')

Text(0, 0.5, '')


mask = df.groupby(["PRODUCTLINE", "PRODUCTNAME"])["Total_Sales"].agg(["mean", "count"]).reset_index()
mask = mask.sort_values("mean", ascending=False).reset_index(drop=True)
plt.figure(figsize=(7, 7))
ax = sns.barplot(data=mask[:20], x="mean", y="PRODUCTNAME", hue="PRODUCTLINE", palette="muted", dodge= False)
plt.xlabel("Mean")
plt.ylabel("Product Name")
plt.title("Top 20 Product Mean Prices")
plt.show() #Showing the TOP 20 neighbourhoods with highest mean prices


import seaborn as sns
sns.distplot(df["Total_Sales"], hist=True, kde=True) 
#The data for price is right skewed, therefore we can use a log10 transformation to get the data
#into normal distrubition.

C:\Users\Owner\Anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<AxesSubplot:xlabel='Total_Sales', ylabel='Density'>


sns.distplot(np.log10(df["Total_Sales"]), hist=True, kde=True)

C:\Users\Owner\Anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<AxesSubplot:xlabel='Total_Sales', ylabel='Density'>


df["ORDERDATE"] = pd.to_datetime(df["ORDERDATE"])
ts=df.groupby(df["ORDERDATE"])["Total_Sales"].sum()
plt.figure(figsize=(16,8))
plt.title('Total Sales of the company')
plt.xlabel('Time')
plt.ylabel('Sales')
plt.plot(ts);


Months_Years_Only = df.set_index("ORDERDATE").groupby(pd.Grouper(freq='M')).sum()
ts=ts=Months_Years_Only["Total_Sales"]
plt.figure(figsize=(16,8))
plt.title('Total Sales of the company')
plt.xlabel('Month/Year')
plt.ylabel('Total Sales')
plt.plot(ts);


Months_Years_Only.reset_index(inplace=True)


import matplotlib.pyplot as plt
import matplotlib.animation
import numpy as np

x = Months_Years_Only["ORDERDATE"]
y = Months_Years_Only["Total_Sales"]


fig, ax = plt.subplots()
l, = ax.plot(x,y)
ax.set(xlabel = 'Month/Year', ylabel = 'Total Sale' ,title = 'Total Sales of the company')
ax.grid()

def animate(i):
    l.set_data(x[:i], y[:i])

ani = matplotlib.animation.FuncAnimation(fig, animate, frames=len(x),interval=300)

from IPython.display import HTML
HTML(ani.to_jshtml())


jupyter nbconvert --execute --to html notebClassic_Model_Sales_Analysis.ipynb

  File "<ipython-input-37-868f45d9bbd9>", line 1
    jupyter nbconvert --to html Classic_Model_Sales_Analysis.ipynb
                    ^
SyntaxError: invalid syntax

	CUSTOMERNUMBER	CUSTOMERNAME	CONTACTLASTNAME	CONTACTFIRSTNAME	PHONE	ADDRESSLINE1	ADDRESSLINE2	CITY	STATE	POSTALCODE	...	PRICEEACH	ORDERLINENUMBER	PRODUCTNAME	PRODUCTLINE	PRODUCTSCALE	PRODUCTVENDOR	PRODUCTDESCRIPTION	QUANTITYINSTOCK	BUYPRICE	MSRP
0	242	Alpha Cognac	Roulet	Annette	61.77.6555	1 rue Alsace-Lorraine	NaN	Toulouse	NaN	31000	...	104.81	9	1941 Chevrolet Special Deluxe Cabriolet	Vintage Cars	1:18	Exoto Designs	Features opening hood, opening doors, opening ...	2378	64.58	105.87
1	242	Alpha Cognac	Roulet	Annette	61.77.6555	1 rue Alsace-Lorraine	NaN	Toulouse	NaN	31000	...	67.82	5	1900s Vintage Bi-Plane	Planes	1:24	Autoart Studio Design	Hand crafted diecast-like metal bi-plane is re...	5942	34.25	68.51
2	242	Alpha Cognac	Roulet	Annette	61.77.6555	1 rue Alsace-Lorraine	NaN	Toulouse	NaN	31000	...	65.75	6	1937 Horch 930V Limousine	Vintage Cars	1:24	Autoart Studio Design	Features opening hood, opening doors, opening ...	2902	26.30	65.75

Classic Model Sales Analysis By: Fullwin Liang¶

In this project, I will be analyzing and creating visualizations on the classmodel.csv dataset. We can create visualziations to seek trends on sales and profits for specific car models and productlines.¶

Source: This dataset is a combined view of multiple tables from a relational transaction processing database of car sales exported with SQL.¶

Data Profiling, Cleaning and Deduping¶

Creating Calculated Field For Total Sales¶

Data Visualization (EDA)¶

Animation Time Series¶