import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")


# Import all dataset
cert_df = pd.read_csv("salaries_certification_sex.csv")
ethnics_df = pd.read_csv("salaries_ethnicity_sex.csv")
industry_df = pd.read_csv("salaries_industry_sex.csv")


cert_df.head()


cert_df.describe()


cert_df.isnull().sum()

variable       0
variable_en    0
variable_bm    0
sex            0
year           0
mean           0
median         0
recipients     0
dtype: int64


cert_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   variable     216 non-null    object 
 1   variable_en  216 non-null    object 
 2   variable_bm  216 non-null    object 
 3   sex          216 non-null    object 
 4   year         216 non-null    int64  
 5   mean         216 non-null    float64
 6   median       216 non-null    float64
 7   recipients   216 non-null    int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 13.6+ KB


print('Education', cert_df.variable_en.unique())

Education ['Overall' 'No certificate/Not applicable' 'SPM and below'
 'STPM/Certificate ' 'Diploma' 'Degree']


print('Year', cert_df.year.unique())

Year [2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021]


# create a separate df for Overall value
overalldf = cert_df[cert_df['variable_en'].str.contains("Overall") == True]

# Plot the relationship between year and mean wage
g1 = sns.lineplot(data=overalldf, x="year", y="mean")
g1.set(xlabel="", ylabel="")

plt.title("Time-Series of Mean Wage in Malaysia")
plt.show()


cert2_df = cert_df[cert_df['variable_en'] != 'Overall']
cert2_df[['variable_en', 'mean']].groupby(['variable_en'], as_index=False).mean().sort_values(by='mean', ascending=False)


# Plot the distribution of mean wage by education
g2 = sns.boxplot(data=cert2_df, x="mean", y="variable_en", width=0.5, palette='husl')
g2.set(xlabel="", ylabel="")

plt.title("Mean Wage Distribution by Education Level")
plt.show()


# Remove 'Overall' in the 'Sex' column
cert2_df = cert2_df[cert2_df['sex'] != 'overall']

# Plot the distribution of mean wage by education and gender
g3 = sns.swarmplot(data=cert2_df, x="mean", y="variable_en", hue="sex")
g3.set(xlabel="", ylabel="")

plt.title("Mean Wage Distribution by Education and Gender")
plt.show()


#Next, we explore ethnicity dataset
ethnics_df.head()


print('Race',ethnics_df.variable_en.unique())

Race ['Overall' 'Citizen' 'Bumiputera' 'Chinese' 'Indian ' 'Other (Citizen)'
 'Non-Citizen']


# Remove Overall and Sort by Mean
ethnics2_df = ethnics_df[ethnics_df['variable_en'] != 'Overall']
ethnics2_df[['variable_en', 'mean']].groupby(['variable_en'], as_index=False).mean().sort_values(by='mean', ascending=False)


# Plot the distribution of mean wage by ethnicity
g4 = sns.boxplot(data=ethnics2_df, x="mean", y="variable_en", 
                 order=['Chinese', 'Citizen', 'Bumiputera', 'Indian ', 'Other (Citizen)', 'Non-Citizen'], 
                 width=0.7, palette='husl')
g4.set(xlabel="", ylabel="")

plt.title("Mean Wage Distribution by Ethnicity")
plt.show()


#Finally, we look into industry dataset
industry_df.head()


print('Industry',industry_df.variable_en.unique())

Industry ['Overall' 'Agriculture' 'Mining and quarrying ' 'Manufacturing'
 'Electricity, gas, steam and air conditioning supply '
 'Water supply; sewerage, waste management and remediation activities '
 'Construction '
 'Wholesale and retail trade; repair of motor vehicles and motorcycles '
 'Transportation and storage'
 'Accommodation and food and beverage service activities'
 'Information and communication'
 'Financial and insurance/takaful activities ' 'Real estate activities '
 'Professional, scientific and technical activities'
 'Administrative and support service activities '
 'Public administration and defence; compulsory social security'
 'Education ' 'Human health and social work activities '
 'Arts, entertainment and recreation' 'Other service activities']


# Remove Overall and Sort by Mean
industry2_df = industry_df[industry_df['variable_en'] != 'Overall']
industry2_df[['variable_en', 'mean']].groupby(['variable_en'], as_index=False).mean().sort_values(by='mean', ascending=False)


# Plot the distribution of mean wage by industry
g5 = sns.boxplot(data=industry2_df, x="mean", y="variable_en", 
                 order=['Mining and quarrying ', 'Education ', 'Real estate activities ', 'Information and communication', 'Financial and insurance/takaful activities ', 
                        'Professional, scientific and technical activities', 'Electricity, gas, steam and air conditioning supply ', 'Human health and social work activities ', 
                        'Public administration and defence; compulsory social security', 'Transportation and storage', 'Construction ', 'Manufacturing', 
                        'Water supply; sewerage, waste management and remediation activities ', 'Arts, entertainment and recreation', 'Other service activities', 
                        'Wholesale and retail trade; repair of motor vehicles and motorcycles ', 'Administrative and support service activities ', 
                        'Accommodation and food and beverage service activities', 'Agriculture'],
                 width=0.5, palette='husl')
g5.set(xlabel="", ylabel="")

plt.title("Mean Wage Distribution by Industry")
plt.show()

	year	mean	median	recipients
count	216.000000	216.000000	216.000000	2.160000e+02
mean	2015.500000	2691.349074	2286.402778	1.877225e+06
std	3.460071	1303.158846	1171.029914	2.268984e+06
min	2010.000000	684.000000	600.000000	4.030000e+04
25%	2012.750000	1783.750000	1500.000000	3.534500e+05
50%	2015.500000	2408.600000	2000.000000	7.110000e+05
75%	2018.250000	3226.750000	2806.500000	3.245450e+06
max	2021.000000	6838.000000	6250.000000	9.711900e+06

	variable	variable_en	variable_bm	sex	year	mean	median	recipients
0	overall	Overall	Keseluruhan	overall	2010	1936.0	1500.0	7149200
1	overall	Overall	Keseluruhan	overall	2011	1959.0	1500.0	7634000
2	overall	Overall	Keseluruhan	overall	2012	2052.0	1566.0	7782900
3	overall	Overall	Keseluruhan	overall	2013	2186.0	1700.0	7903100
4	overall	Overall	Keseluruhan	overall	2014	2377.0	1800.0	8392400

	variable_en	mean
0	Degree	5015.658333
1	Diploma	3146.016667
4	STPM/Certificate	2380.583333
3	SPM and below	1822.477778
2	No certificate/Not applicable	1227.616667

	variable_en	mean
1	Chinese	2888.189749
2	Citizen	2555.729951
0	Bumiputera	2460.856442
3	Indian	2387.707240
5	Other (Citizen)	1807.029689
4	Non-Citizen	1330.942241

	variable_en	mean
11	Mining and quarrying	4628.683333
5	Education	4176.644444
15	Real estate activities	3930.411111
9	Information and communication	3854.263889
7	Financial and insurance/takaful activities	3822.325000
13	Professional, scientific and technical activities	3555.769444
6	Electricity, gas, steam and air conditioning s...	3425.075000
8	Human health and social work activities	3378.111111
14	Public administration and defence; compulsory ...	3329.558333
16	Transportation and storage	2365.522222
4	Construction	2282.605556
10	Manufacturing	2169.775000
17	Water supply; sewerage, waste management and r...	2161.508333
3	Arts, entertainment and recreation	2124.527778
12	Other service activities	1821.388889
18	Wholesale and retail trade; repair of motor ve...	1817.541667
1	Administrative and support service activities	1648.647222
0	Accommodation and food and beverage service ac...	1553.355556
2	Agriculture	1480.583333

Income Distribution in Malaysia Different Demographics