2021 World Happiness Report Analysis

import pandas as pd

# Load the 2021 World Happiness Report data set.
report = pd.read_csv('world-happiness-report-2021.csv')

# Count missing values per column. This has to be called on the frame itself:
# `report.columns.isnull()` would only inspect the column *labels*.
print(report.isnull().sum())
# there are no null values in the dataset so we can continue
# (per the original analysis -- confirm against the per-column counts printed above)

print(report.dtypes)
# all columns are in order in terms of variable types

# Horizontal bar chart of the 20 highest ladder scores, summed per country name.
top_20_scores = (
    report.groupby('Country name')['Ladder score']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
top_20_scores.plot(kind='barh')
Western Europe                  13
North America and ANZ            4
Latin America and Caribbean      1
Central and Eastern Europe       1
Middle East and North Africa     1
# Mean ladder score and mean logged GDP per capita for each region.
regional_avg_gdp = (
    report.groupby('Regional indicator')[['Ladder score', 'Logged GDP per capita']]
    .mean()
    .reset_index()
)

# Same table with column names that will not collide with the per-country
# columns once it is merged back onto `report`.
regional_score_avg = regional_avg_gdp.copy()
regional_score_avg.columns = ['Regional indicator', 'avg_ladder_score', 'avg_gdp']

# Attach each country's regional averages to its own row.
report2 = report.merge(regional_score_avg, on='Regional indicator')

# Signed and absolute gap between a country's score and its regional mean.
report2['avg_diff'] = report2['Ladder score'] - report2['avg_ladder_score']
report2['avg_diff_abs'] = report2['avg_diff'].abs()

# Columns shown in every summary table below.
summary_cols = ['Country name', 'Ladder score', 'avg_ladder_score', 'avg_diff']

# top 10 differences
largest_gaps = report2.sort_values('avg_diff_abs', ascending=False)[summary_cols].head(10)
# top 10 best performers
best_performers = report2.sort_values('avg_diff', ascending=False)[summary_cols].head(10)
# top 10 worst performers
worst_performers = report2.sort_values('avg_diff', ascending=True)[summary_cols].head(10)

# biggest differences df
diff_df = pd.concat([best_performers, worst_performers], join='inner').sort_values('avg_diff')

# Horizontal bar chart of the deviation from the regional mean for those
# 20 countries, ordered from most negative to most positive.
diff_df.groupby('Country name')['avg_diff'].sum().sort_values().plot(kind='barh')
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Select modelling columns by position, then one-hot encode any non-numeric
# ones. NOTE(review): the positions assume a specific CSV column order
# (presumably region, ladder score and the six explanatory factors) -- confirm.
data = report.iloc[:, [1, 2, 6, 7, 8, 9, 10, 11]]
data = pd.get_dummies(data)

# Column 0 is used as the target and columns 1..15 as the features.
# NOTE(review): the fixed upper bound of 16 excludes any columns from index 16
# onward (e.g. a trailing dummy column) -- confirm this is intended.
X, y = data.iloc[:, 1:16], data.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit an ordinary least squares model on the training split.
lr = LinearRegression().fit(X_train, y_train)

print(f"lr.coef_: {lr.coef_}")
print(f"lr.intercept_: {lr.intercept_}")
# `score` reports R^2 on the given split.
print(f"Training set score: {lr.score(X_train, y_train):.2f}")
print(f"Test set score: {lr.score(X_test, y_test):.2f}")

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store