import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the countries dataset; the file is decoded as ISO-8859-1.
df = pd.read_csv("countries of the world.csv", encoding = "ISO-8859-1")


# Preview the first rows of the countries table.
df.head()


# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Country                             227 non-null    object 
 1   Region                              227 non-null    object 
 2   Population                          227 non-null    int64  
 3   Area (sq. mi.)                      227 non-null    int64  
 4   Pop. Density (per sq. mi.)          227 non-null    object 
 5   Coastline (coast/area ratio)        227 non-null    object 
 6   Net migration                       224 non-null    object 
 7   Infant mortality (per 1000 births)  224 non-null    object 
 8   GDP ($ per capita)                  226 non-null    float64
 9   Literacy (%)                        209 non-null    object 
 10  Phones (per 1000)                   223 non-null    object 
 11  Arable (%)                          225 non-null    object 
 12  Crops (%)                           225 non-null    object 
 13  Other (%)                           225 non-null    object 
 14  Climate                             205 non-null    object 
 15  Birthrate                           224 non-null    object 
 16  Deathrate                           223 non-null    object 
 17  Agriculture                         212 non-null    object 
 18  Industry                            211 non-null    object 
 19  Service                             212 non-null    object 
dtypes: float64(1), int64(2), object(17)
memory usage: 35.6+ KB


# Display rows that exactly repeat an earlier row, i.e. candidates for dropping.
df[df.duplicated()]


# Count the missing values in each column.
df.isna().sum()

Country                                0
Region                                 0
Population                             0
Area (sq. mi.)                         0
Pop. Density (per sq. mi.)             0
Coastline (coast/area ratio)           0
Net migration                          3
Infant mortality (per 1000 births)     3
GDP ($ per capita)                     1
Literacy (%)                          18
Phones (per 1000)                      4
Arable (%)                             2
Crops (%)                              2
Other (%)                              2
Climate                               22
Birthrate                              3
Deathrate                              4
Agriculture                           15
Industry                              16
Service                               15
dtype: int64


# Replace the original headers with short snake_case names, assigned by column position.
df.columns = [
    "country", "region", "population", "area", "density",
    "coastline", "migration", "infant_mortality", "gdp", "literacy",
    "phones", "arable", "crops", "other", "climate",
    "birthrate", "deathrate", "agriculture", "industry", "service",
]


# Adjusting the datatypes to category / float
for label_column in ("country", "region"):
    df[label_column] = df[label_column].astype("category")

# These columns were read as strings because the file writes decimals with a
# comma (e.g. "48,0"); swap it for a point before casting to float.
decimal_comma_columns = [
    "density", "coastline", "migration", "infant_mortality", "literacy",
    "phones", "arable", "crops", "other", "climate",
    "birthrate", "deathrate", "agriculture", "industry", "service",
]
for numeric_column in decimal_comma_columns:
    # regex=False: treat the comma as a literal regardless of the pandas
    # version's default for str.replace.
    df[numeric_column] = (
        df[numeric_column].str.replace(",", ".", regex=False).astype(float)
    )


# Count the countries in each region, draw the counts as a bar chart, then show the counts.
Regions = df.region.value_counts()
plt.figure(figsize=(10, 5))
Regions.plot.bar()
plt.gca().set(xlabel="Regions", ylabel="Counts", title="Number of Countries")
Regions

SUB-SAHARAN AFRICA                     51
LATIN AMER. & CARIB                    45
WESTERN EUROPE                         28
ASIA (EX. NEAR EAST)                   28
OCEANIA                                21
NEAR EAST                              16
EASTERN EUROPE                         12
C.W. OF IND. STATES                    12
NORTHERN AFRICA                         6
NORTHERN AMERICA                        5
BALTICS                                 3
Name: region, dtype: int64


# Correlation heatmap of the numeric columns only: 'country' and 'region' are
# categorical, and recent pandas versions raise if they are passed to corr().
f, ax = plt.subplots(figsize=(20, 16))
numeric_corr = df.select_dtypes(include="number").corr()
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(numeric_corr, annot=True, cmap="Blues", ax=ax)

<AxesSubplot:>


# Plot phones against GDP with seaborn's lmplot, then print their correlation.
sns.lmplot(data=df, x="gdp", y="phones", height=10)
gdp_phone_corr = df["gdp"].corr(df["phones"])
print("Correlation between GDP and Phone = ", gdp_phone_corr)

Correlation between GDP and Phone =  0.834499275414044


# Keep only the rows that have no missing value in any column.
df1 = df.dropna()


# Predictor columns and the regression target (GDP).
feature_columns = ['population', 'phones', 'literacy', 'migration', 'service']
X = df1[feature_columns]
y = df1['gdp']


#Splitting to train and testing data with 40% test size
# random_state is fixed so the same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=60)


#Fitting a linear regression model on the training data
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train,y_train)

LinearRegression()


# Intercept of the fitted regression.
print(lm.intercept_)

35.50168302296879


# One fitted coefficient per predictor, labelled with the predictor's column name.
coeff_df = pd.DataFrame({'Coefficient': lm.coef_}, index=X.columns)
coeff_df


# Predict GDP for the test rows and scatter the predictions against the actual values.
predictions = lm.predict(X_test)
plt.scatter(y_test,predictions)

<matplotlib.collections.PathCollection at 0x15d594c8>


# First ten predicted values.
predictions[:10]

array([15797.57192665,  2016.05151292,  5262.1925031 , 16043.80474604,
        2530.79609579, 15092.27649283, 15967.46055783,  -242.6673032 ,
        2030.01129828, 27675.90558634])


# First ten actual values, for comparison with the predictions.
y_test[:10]

92     13900.0
1       4500.0
61      4800.0
18      6100.0
162     4600.0
53     15700.0
157     9000.0
89      1600.0
199     1000.0
54     31100.0
Name: gdp, dtype: float64


# Error metrics on the test set; RMSE is the square root of the MSE computed just above.
from sklearn import metrics

mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

MAE: 3227.47189444646
MSE: 21325963.66334463
RMSE: 4618.004294426828


# R^2 of the predictions against the actual test values.
from sklearn.metrics import r2_score
print("Coefficent of Determination:", r2_score(y_test, predictions))

Coefficent of Determination: 0.7848811120644366


# Load the app-usage records; the "Date" column is parsed into datetimes on read.
phone = pd.read_csv("train.csv", encoding = "ISO-8859-1", parse_dates = ["Date"])


# Preview the first rows of the app-usage table.
phone.head()


# Entries in the "App" column to exclude from the usage analysis.
# (Named excluded_apps rather than `list`, which would shadow the builtin.)
excluded_apps = [
    'Screen on (unlocked)', 'Screen off (locked)', 'Screen on (locked)',
    'Screen off', 'Permission controller', 'System UI',
    'Package installer', 'Device shutdown', 'Call Management',
]

# ~mask instead of "== False"; .copy() so that columns added to `phone` later
# are written to an independent frame rather than to a slice of the original.
phone = phone[~phone["App"].isin(excluded_apps)].copy()


def _duration_to_seconds(duration):
    """Convert an "H:MM:SS" duration string to a whole number of seconds."""
    fields = duration.split(':')
    return int(fields[0]) * 3600 + int(fields[1]) * 60 + int(fields[2])


# Duration of each record expressed in seconds.
phone['Time_Seconds'] = phone['Duration'].map(_duration_to_seconds)


# The 20 apps with the largest total Time_Seconds.
# numeric_only=True keeps the date and text columns out of the sum; recent
# pandas versions no longer drop them automatically and fail on the datetime column.
phone.groupby('App').sum(numeric_only=True).nlargest(20, 'Time_Seconds').reset_index()


# The same duration expressed in minutes.
phone["Time_Minutes"] = phone["Time_Seconds"] / 60


# Bar chart of the 20 apps with the largest total Time_Seconds.
plt.figure(figsize=(15, 6))
plt.xticks(rotation=40)
# numeric_only=True keeps the date and text columns out of the sum; recent
# pandas versions no longer drop them automatically and fail on the datetime column.
total = (
    phone.groupby('App')
    .sum(numeric_only=True)
    .nlargest(20, 'Time_Seconds')
    .reset_index()
)
sns.barplot(x="App", y="Time_Seconds", data=total, palette="Blues_r")

<AxesSubplot:xlabel='App', ylabel='Time_Seconds'>


# Month number taken from each record's date.
phone["Month"] = phone["Date"].dt.month


# Display the table with the added Time_Seconds, Time_Minutes and Month columns.
phone


# Total seconds per date, drawn as a line chart.
ts = phone.groupby("Date")["Time_Seconds"].sum()
plt.figure(figsize=(16, 8))
plt.plot(ts)
plt.title('Total Time On Phone')
plt.xlabel('Date')
plt.ylabel('Total Time');


# Turn the per-date totals into a two-column frame (Date, Time_Seconds) for the animation.
animatedplot = ts.reset_index()
animatedplot["Date"]

0     2019-05-17
1     2019-05-18
2     2019-05-19
3     2019-05-20
4     2019-05-21
         ...    
163   2019-10-27
164   2019-10-28
165   2019-10-29
166   2019-10-30
167   2019-10-31
Name: Date, Length: 168, dtype: datetime64[ns]


import matplotlib.pyplot as plt
import matplotlib.animation
import numpy as np
from IPython.display import HTML

# Animate the daily totals so the line is revealed one date at a time.
x = animatedplot["Date"]
y = animatedplot["Time_Seconds"]

fig, ax = plt.subplots()
# Plotting the full series first lets matplotlib choose axis limits that fit every frame.
l, = ax.plot(x, y)
ax.set(xlabel='Month/Year', ylabel='Total Seconds', title='Total Time On Phone')
ax.grid()


def animate(i):
    """Show the first ``i + 1`` points of the series on frame ``i``.

    Args:
        i: Zero-based frame number supplied by FuncAnimation.

    Returns:
        A one-element tuple holding the updated line artist.
    """
    # Frames run from 0 to len(x) - 1, so slice up to i + 1; slicing up to i
    # would leave the final date out of the last frame.
    l.set_data(x.iloc[:i + 1], y.iloc[:i + 1])
    return (l,)


ani = matplotlib.animation.FuncAnimation(fig, animate, frames=len(x), interval=300)

# Render the animation as an interactive HTML/JavaScript widget in the notebook.
HTML(ani.to_jshtml())


# index=False leaves the DataFrame index out of the exported file.
phone.to_csv('phoneapp.csv', index =False) #Exporting the new csv to load into Tableau


# IPython shell escape: convert the Project2 notebook to an HTML file.
!jupyter nbconvert --to html Project2

[NbConvertApp] Converting notebook Project2.ipynb to html
[NbConvertApp] Writing 6077022 bytes to Project2.html

	Country	Region	Population	Area (sq. mi.)	Pop. Density (per sq. mi.)	Coastline (coast/area ratio)	Net migration	Infant mortality (per 1000 births)	GDP ($ per capita)	Literacy (%)	Phones (per 1000)	Arable (%)	Crops (%)	Other (%)	Climate	Birthrate	Deathrate	Agriculture	Industry	Service
0	Afghanistan	ASIA (EX. NEAR EAST)	31056997	647500	48,0	0,00	23,06	163,07	700.0	36,0	3,2	12,13	0,22	87,65	1	46,6	20,34	0,38	0,24	0,38
1	Albania	EASTERN EUROPE	3581655	28748	124,6	1,26	-4,93	21,52	4500.0	86,5	71,2	21,09	4,42	74,49	3	15,11	5,22	0,232	0,188	0,579
2	Algeria	NORTHERN AFRICA	32930091	2381740	13,8	0,04	-0,39	31	6000.0	70,0	78,1	3,22	0,25	96,53	1	17,14	4,61	0,101	0,6	0,298
3	American Samoa	OCEANIA	57794	199	290,4	58,29	-20,71	9,27	8000.0	97,0	259,5	10	15	75	2	22,46	3,27	NaN	NaN	NaN
4	Andorra	WESTERN EUROPE	71201	468	152,1	0,00	6,6	4,05	19000.0	100,0	497,2	2,22	0	97,78	3	8,71	6,25	NaN	NaN	NaN

	App	Time_Seconds
0	Instagram	1030845
1	WhatsApp	460123
2	Phone	348651
3	YouTube	283133
4	Call of Duty	252630
5	Opera Mini	152205
6	Amazon Kindle	89942
7	Chrome	76634
8	MX Player	57245
9	Hotstar	48908
10	Opera	41901
11	Amazon Shopping	39015
12	Device boot	38175
13	Uber Eats	29566
14	Swiggy	29187
15	Goodreads	28204
16	Gallery	17618
17	Google Pay	16158
18	redBus	13925
19	Duolingo	12459

	App	Date	Time	Duration	Time_Seconds	Time_Minutes	Month
4	Settings	2019-05-17	18:28:30	0:00:03	3	0.050000	5
8	Settings	2019-05-17	18:30:17	0:00:03	3	0.050000	5
15	MTP application	2019-05-17	18:30:24	0:00:00	0	0.000000	5
17	MTP application	2019-05-17	18:30:28	0:00:02	2	0.033333	5
20	Contacts	2019-05-17	18:42:24	0:00:04	4	0.066667	5
...	...	...	...	...	...	...	...
79330	Opera	2019-10-31	23:44:11	0:03:29	209	3.483333	10
79331	WhatsApp	2019-10-31	23:47:40	0:00:47	47	0.783333	10
79332	Opera	2019-10-31	23:48:27	0:04:18	258	4.300000	10
79336	Clock	2019-10-31	23:52:59	0:00:13	13	0.216667	10
79337	Clock	2019-10-31	23:53:15	0:00:06	6	0.100000	10

Data viz analysis Project 2 (Analysis on Global GDP)¶

By: Fullwin Liang ¶

Source: Both datasets used in this project were retrieved from Kaggle.¶

Data Scrubbing and transformation¶

Exploratory Data Analysis¶

Linear Regression Model¶

Phone Dataset¶

Data cleaning/transformation¶

Data Visualization and Exporting CSV to Import to Tableau¶

	Coefficient
population	-0.000002
phones	40.046536
literacy	27.557063
migration	504.079980
service	-1319.118652

	App	Date	Time	Duration
0	Screen off (locked)	2019-05-17	18:25:07	0:00:02
1	Screen on (unlocked)	2019-05-17	18:25:10	0:00:01
2	Screen off (locked)	2019-05-17	18:25:10	0:03:19
3	Screen on (unlocked)	2019-05-17	18:28:29	0:00:01
4	Settings	2019-05-17	18:28:30	0:00:03