import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the per-country statistics; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    "countries of the world.csv",
    encoding="ISO-8859-1",
)
# Preview the first five rows.
df.head(n=5)
Country | Region | Population | Area (sq. mi.) | Pop. Density (per sq. mi.) | Coastline (coast/area ratio) | Net migration | Infant mortality (per 1000 births) | GDP ($ per capita) | Literacy (%) | Phones (per 1000) | Arable (%) | Crops (%) | Other (%) | Climate | Birthrate | Deathrate | Agriculture | Industry | Service | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | ASIA (EX. NEAR EAST) | 31056997 | 647500 | 48,0 | 0,00 | 23,06 | 163,07 | 700.0 | 36,0 | 3,2 | 12,13 | 0,22 | 87,65 | 1 | 46,6 | 20,34 | 0,38 | 0,24 | 0,38 |
1 | Albania | EASTERN EUROPE | 3581655 | 28748 | 124,6 | 1,26 | -4,93 | 21,52 | 4500.0 | 86,5 | 71,2 | 21,09 | 4,42 | 74,49 | 3 | 15,11 | 5,22 | 0,232 | 0,188 | 0,579 |
2 | Algeria | NORTHERN AFRICA | 32930091 | 2381740 | 13,8 | 0,04 | -0,39 | 31 | 6000.0 | 70,0 | 78,1 | 3,22 | 0,25 | 96,53 | 1 | 17,14 | 4,61 | 0,101 | 0,6 | 0,298 |
3 | American Samoa | OCEANIA | 57794 | 199 | 290,4 | 58,29 | -20,71 | 9,27 | 8000.0 | 97,0 | 259,5 | 10 | 15 | 75 | 2 | 22,46 | 3,27 | NaN | NaN | NaN |
4 | Andorra | WESTERN EUROPE | 71201 | 468 | 152,1 | 0,00 | 6,6 | 4,05 | 19000.0 | 100,0 | 497,2 | 2,22 | 0 | 97,78 | 3 | 8,71 | 6,25 | NaN | NaN | NaN |
# Print each column's dtype and non-null count to see which columns need cleaning.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 227 entries, 0 to 226 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 227 non-null object 1 Region 227 non-null object 2 Population 227 non-null int64 3 Area (sq. mi.) 227 non-null int64 4 Pop. Density (per sq. mi.) 227 non-null object 5 Coastline (coast/area ratio) 227 non-null object 6 Net migration 224 non-null object 7 Infant mortality (per 1000 births) 224 non-null object 8 GDP ($ per capita) 226 non-null float64 9 Literacy (%) 209 non-null object 10 Phones (per 1000) 223 non-null object 11 Arable (%) 225 non-null object 12 Crops (%) 225 non-null object 13 Other (%) 225 non-null object 14 Climate 205 non-null object 15 Birthrate 224 non-null object 16 Deathrate 223 non-null object 17 Agriculture 212 non-null object 18 Industry 211 non-null object 19 Service 212 non-null object dtypes: float64(1), int64(2), object(17) memory usage: 35.6+ KB
# Checking if there are any duplicated rows to drop: a row is flagged when all
# of its columns repeat an earlier row (the first occurrence is not flagged).
duplicate_mask = df.duplicated()
df[duplicate_mask]
Country | Region | Population | Area (sq. mi.) | Pop. Density (per sq. mi.) | Coastline (coast/area ratio) | Net migration | Infant mortality (per 1000 births) | GDP ($ per capita) | Literacy (%) | Phones (per 1000) | Arable (%) | Crops (%) | Other (%) | Climate | Birthrate | Deathrate | Agriculture | Industry | Service |
---|
# Checking for null values: number of missing entries in each column.
df.isna().sum()
Country 0 Region 0 Population 0 Area (sq. mi.) 0 Pop. Density (per sq. mi.) 0 Coastline (coast/area ratio) 0 Net migration 3 Infant mortality (per 1000 births) 3 GDP ($ per capita) 1 Literacy (%) 18 Phones (per 1000) 4 Arable (%) 2 Crops (%) 2 Other (%) 2 Climate 22 Birthrate 3 Deathrate 4 Agriculture 15 Industry 16 Service 15 dtype: int64
# Replace the verbose CSV headers with short snake_case names. The assignment
# is positional, so the order must match the 20 columns of the file.
df.columns = [
    "country", "region", "population", "area", "density", "coastline",
    "migration", "infant_mortality", "gdp", "literacy", "phones", "arable",
    "crops", "other", "climate", "birthrate", "deathrate", "agriculture",
    "industry", "service",
]

# Country and region are labels, so store them as categoricals.
for label_column in ("country", "region"):
    df[label_column] = df[label_column].astype("category")

# These columns were loaded as strings because they use a decimal comma
# (e.g. "48,0"); swap it for a decimal point and convert to float.
decimal_comma_columns = (
    "density", "coastline", "migration", "infant_mortality", "literacy",
    "phones", "arable", "crops", "other", "climate", "birthrate",
    "deathrate", "agriculture", "industry", "service",
)
for numeric_column in decimal_comma_columns:
    df[numeric_column] = df[numeric_column].str.replace(",", ".").astype(float)
# Number of countries per region, drawn as a bar chart and then displayed
# as a table.
Regions = df["region"].value_counts()

plt.figure(figsize=(10, 5))
Regions.plot.bar()
plt.title("Number of Countries")
plt.xlabel("Regions")
plt.ylabel("Counts")

Regions
SUB-SAHARAN AFRICA 51 LATIN AMER. & CARIB 45 WESTERN EUROPE 28 ASIA (EX. NEAR EAST) 28 OCEANIA 21 NEAR EAST 16 EASTERN EUROPE 12 C.W. OF IND. STATES 12 NORTHERN AFRICA 6 NORTHERN AMERICA 5 BALTICS 3 Name: region, dtype: int64
# Annotated heatmap of the pairwise correlations between the numeric columns.
f, ax = plt.subplots(figsize=(20, 16))
# Select numeric dtypes explicitly: pandas >= 2.0 no longer drops the
# categorical country/region columns silently and raises instead.
numeric_corr = df.select_dtypes(include="number").corr()
# Draw onto the axes created above rather than relying on the implicit
# "current axes".
sns.heatmap(numeric_corr, annot=True, cmap="Blues", ax=ax)
<AxesSubplot:>
# Plot phones against GDP, then report the correlation between the two columns.
sns.lmplot(data=df, x="gdp", y="phones", height=10)
gdp_phone_corr = df["gdp"].corr(df["phones"])
print("Correlation between GDP and Phone = ", gdp_phone_corr)
Correlation between GDP and Phone = 0.834499275414044
# Keep only complete rows, then pick the predictors (X) and the target (y).
# NOTE(review): a row is dropped when ANY column is missing, including columns
# that are not used as features below.
df1 = df.dropna()
feature_columns = ['population', 'phones', 'literacy', 'migration', 'service']
X = df1[feature_columns]
y = df1['gdp']
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=60,
    test_size=0.4,
)
#Fitting a linear regression model on the training split
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train,y_train)
LinearRegression()
# Intercept term of the fitted linear model.
print(lm.intercept_)
35.50168302296879
# Pair each fitted coefficient with the name of the feature it belongs to.
coeff_df = pd.DataFrame({'Coefficient': lm.coef_}, index=X.columns)
coeff_df
Coefficient | |
---|---|
population | -0.000002 |
phones | 40.046536 |
literacy | 27.557063 |
migration | 504.079980 |
service | -1319.118652 |
# Predict GDP for the held-out rows and plot actual values (x axis) against
# the predictions (y axis).
predictions = lm.predict(X_test)
plt.scatter(x=y_test, y=predictions)
<matplotlib.collections.PathCollection at 0x15d594c8>
# First ten predicted values.
predictions[:10]
array([15797.57192665, 2016.05151292, 5262.1925031 , 16043.80474604, 2530.79609579, 15092.27649283, 15967.46055783, -242.6673032 , 2030.01129828, 27675.90558634])
# First ten actual target values, for comparison with the predictions.
y_test[:10]
92 13900.0 1 4500.0 61 4800.0 18 6100.0 162 4600.0 53 15700.0 157 9000.0 89 1600.0 199 1000.0 54 31100.0 Name: gdp, dtype: float64
from sklearn import metrics

# Error metrics on the held-out set; RMSE is the square root of the MSE, so
# the MSE is computed once and reused.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
MAE: 3227.47189444646 MSE: 21325963.66334463 RMSE: 4618.004294426828
from sklearn.metrics import r2_score

# R^2 of the predictions on the held-out set (label typo "Coefficent" fixed).
print("Coefficient of Determination:", r2_score(y_test, predictions))
Coefficent of Determination: 0.7848811120644366
# Load the phone usage log, parsing the "Date" column as datetimes.
phone = pd.read_csv(
    "train.csv",
    encoding="ISO-8859-1",
    parse_dates=["Date"],
)
# Preview the first five rows.
phone.head(n=5)
App | Date | Time | Duration | |
---|---|---|---|---|
0 | Screen off (locked) | 2019-05-17 | 18:25:07 | 0:00:02 |
1 | Screen on (unlocked) | 2019-05-17 | 18:25:10 | 0:00:01 |
2 | Screen off (locked) | 2019-05-17 | 18:25:10 | 0:03:19 |
3 | Screen on (unlocked) | 2019-05-17 | 18:28:29 | 0:00:01 |
4 | Settings | 2019-05-17 | 18:28:30 | 0:00:03 |
# "App" values to drop from the log (presumably screen/system events rather
# than real app usage). Named so it no longer shadows the builtin `list`.
excluded_apps = [
    'Screen on (unlocked)', 'Screen off (locked)', 'Screen on (locked)',
    'Screen off', 'Permission controller', 'System UI', 'Package installer',
    'Device shutdown', 'Call Management',
]
# .copy() gives an independent frame, so adding columns to it below does not
# trigger pandas' SettingWithCopyWarning.
phone = phone[~phone["App"].isin(excluded_apps)].copy()

# "Duration" is an "H:MM:SS" string; convert it to a number of seconds.
phone['Time_Seconds'] = phone['Duration'].str.split(':').apply(
    lambda parts: int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
)

# Top 20 apps by total time. numeric_only=True keeps the aggregation to the
# numeric columns: summing the datetime "Date" column raises on pandas >= 2.0.
phone.groupby('App').sum(numeric_only=True).nlargest(20, 'Time_Seconds').reset_index()
App | Time_Seconds | |
---|---|---|
0 | 1030845 | |
1 | 460123 | |
2 | Phone | 348651 |
3 | YouTube | 283133 |
4 | Call of Duty | 252630 |
5 | Opera Mini | 152205 |
6 | Amazon Kindle | 89942 |
7 | Chrome | 76634 |
8 | MX Player | 57245 |
9 | Hotstar | 48908 |
10 | Opera | 41901 |
11 | Amazon Shopping | 39015 |
12 | Device boot | 38175 |
13 | Uber Eats | 29566 |
14 | Swiggy | 29187 |
15 | Goodreads | 28204 |
16 | Gallery | 17618 |
17 | Google Pay | 16158 |
18 | redBus | 13925 |
19 | Duolingo | 12459 |
# Usage time in minutes, derived from the per-row seconds.
phone["Time_Minutes"] = phone["Time_Seconds"] / 60

plt.figure(figsize=(15, 6))
plt.xticks(rotation=40)
# Top 20 apps by total seconds. numeric_only=True restricts the sum to the
# numeric columns: summing the datetime "Date" column raises on pandas >= 2.0.
total = phone.groupby('App').sum(numeric_only=True).nlargest(20, 'Time_Seconds').reset_index()
sns.barplot(x="App", y="Time_Seconds", data=total, palette="Blues_r")
<AxesSubplot:xlabel='App', ylabel='Time_Seconds'>
# Month number taken from the parsed "Date" column.
phone["Month"] = phone["Date"].dt.month
# Display the frame with the derived columns.
phone
App | Date | Time | Duration | Time_Seconds | Time_Minutes | Month | |
---|---|---|---|---|---|---|---|
4 | Settings | 2019-05-17 | 18:28:30 | 0:00:03 | 3 | 0.050000 | 5 |
8 | Settings | 2019-05-17 | 18:30:17 | 0:00:03 | 3 | 0.050000 | 5 |
15 | MTP application | 2019-05-17 | 18:30:24 | 0:00:00 | 0 | 0.000000 | 5 |
17 | MTP application | 2019-05-17 | 18:30:28 | 0:00:02 | 2 | 0.033333 | 5 |
20 | Contacts | 2019-05-17 | 18:42:24 | 0:00:04 | 4 | 0.066667 | 5 |
... | ... | ... | ... | ... | ... | ... | ... |
79330 | Opera | 2019-10-31 | 23:44:11 | 0:03:29 | 209 | 3.483333 | 10 |
79331 | 2019-10-31 | 23:47:40 | 0:00:47 | 47 | 0.783333 | 10 | |
79332 | Opera | 2019-10-31 | 23:48:27 | 0:04:18 | 258 | 4.300000 | 10 |
79336 | Clock | 2019-10-31 | 23:52:59 | 0:00:13 | 13 | 0.216667 | 10 |
79337 | Clock | 2019-10-31 | 23:53:15 | 0:00:06 | 6 | 0.100000 | 10 |
29463 rows × 7 columns
# Total seconds logged per date.
ts = phone.groupby("Date")["Time_Seconds"].sum()

plt.figure(figsize=(16, 8))
plt.title('Total Time On Phone')
plt.xlabel('Date')
plt.ylabel('Total Time')
plt.plot(ts);

# Flatten the date index back into a column: a two-column frame
# ("Date", "Time_Seconds") used for the animated plot.
animatedplot = ts.reset_index()
animatedplot["Date"]
0 2019-05-17 1 2019-05-18 2 2019-05-19 3 2019-05-20 4 2019-05-21 ... 163 2019-10-27 164 2019-10-28 165 2019-10-29 166 2019-10-30 167 2019-10-31 Name: Date, Length: 168, dtype: datetime64[ns]
import matplotlib.pyplot as plt
import matplotlib.animation
import numpy as np
# Animated version of the daily-total line plot, rendered as JS/HTML.
x = animatedplot["Date"]
y = animatedplot["Time_Seconds"]

fig, ax = plt.subplots()
# Plot the full series once so the axis limits are fixed; animate() then
# overwrites the line's data frame by frame.
l, = ax.plot(x, y)
ax.set(xlabel='Month/Year', ylabel='Total Seconds', title='Total Time On Phone')
ax.grid()


def animate(i):
    """Show the first ``i`` points of the series on the line artist."""
    l.set_data(x[:i], y[:i])
    return (l,)


# len(x) + 1 frames so the final frame (i == len(x)) shows the complete series;
# with frames=len(x) the last data point was never drawn.
ani = matplotlib.animation.FuncAnimation(fig, animate, frames=len(x) + 1, interval=300)

from IPython.display import HTML
HTML(ani.to_jshtml())
# Write the filtered log, including the derived columns, without the index column.
phone.to_csv('phoneapp.csv', index =False) #Exporting the new csv to load into Tableau
# IPython shell escape: convert this notebook (Project2.ipynb) to HTML with nbconvert.
!jupyter nbconvert --to html Project2
[NbConvertApp] Converting notebook Project2.ipynb to html [NbConvertApp] Writing 6077022 bytes to Project2.html