import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import ttest_ind, f_oneway, chi2_contingency
import statsmodels.api as sm

from google.colab import files
# Ask for the CSV through the Colab upload widget, then load and preview it
uploaded = files.upload()

csv_name = 'Glassdoor Gender Pay Gap.csv'
ladder_df = pd.read_csv(csv_name)
ladder_df.head()

# Report the frame's dimensions, then its schema and a per-column summary
n_rows, n_cols = ladder_df.shape
print(f'Dataset contains {n_rows} rows and {n_cols} columns \n')
ladder_df.info()
ladder_df.describe(include='all').T

Dataset contains 1000 rows and 9 columns 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   JobTitle   1000 non-null   object
 1   Gender     1000 non-null   object
 2   Age        1000 non-null   int64 
 3   PerfEval   1000 non-null   int64 
 4   Education  1000 non-null   object
 5   Dept       1000 non-null   object
 6   Seniority  1000 non-null   int64 
 7   BasePay    1000 non-null   int64 
 8   Bonus      1000 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 70.4+ KB

# Count missing values in every column
null_counts = ladder_df.isnull().sum()
print(f'Null values: {null_counts}')

Null values: JobTitle     0
Gender       0
Age          0
PerfEval     0
Education    0
Dept         0
Seniority    0
BasePay      0
Bonus        0
dtype: int64

# Count rows that are exact duplicates of an earlier row
duplicate_count = ladder_df.duplicated().sum()
print(f'Duplicate rows: {duplicate_count}')

Duplicate rows: 0

# List the distinct values of every object-dtype column
categorical_cols = ladder_df.select_dtypes('object').columns
for col in categorical_cols:
    print(col, ladder_df[col].unique())

JobTitle ['Graphic Designer' 'Software Engineer' 'Warehouse Associate' 'IT'
 'Sales Associate' 'Driver' 'Financial Analyst' 'Marketing Associate'
 'Data Scientist' 'Manager']
Gender ['Female' 'Male']
Education ['College' 'PhD' 'Masters' 'High School']
Dept ['Operations' 'Management' 'Administration' 'Sales' 'Engineering']

# Summary statistics for the pay and rating columns
numeric_cols = ['BasePay', 'Bonus', 'Seniority', 'PerfEval']
print(ladder_df[numeric_cols].describe())

             BasePay         Bonus    Seniority     PerfEval
count    1000.000000   1000.000000  1000.000000  1000.000000
mean    94472.653000   6467.161000     2.971000     3.037000
std     25337.493272   2004.377365     1.395029     1.423959
min     34208.000000   1703.000000     1.000000     1.000000
25%     76850.250000   4849.500000     2.000000     2.000000
50%     93327.500000   6507.000000     3.000000     3.000000
75%    111558.000000   8026.000000     4.000000     4.000000
max    179726.000000  11293.000000     5.000000     5.000000

# Headcount of each gender at every seniority level (absent combinations become 0)
gender_counts = (
    ladder_df.groupby('Seniority')['Gender']
    .value_counts()
    .unstack()
    .fillna(0)
)

# Share of each gender within a seniority level, in percent
row_totals = gender_counts.sum(axis=1)
gender_percentages = gender_counts.div(row_totals, axis=0) * 100

print("Gender Counts by Seniority:")
print(gender_counts)

Gender Counts by Seniority:
Gender     Female  Male
Seniority              
1              83   112
2             102   107
3             106   113
4              80   104
5              97    96

# Histogram of every numeric column in the frame
print("\nDistribution of Continuous Variables:\n")
hist_kwargs = dict(figsize=(15, 10), edgecolor='white')
ladder_df.hist(**hist_kwargs)
plt.show()

Distribution of Continuous Variables:

# KDEplot of Base Pay by Job Title
## can be useful to add in STD
## will need cosmetic adjustment if we're including this in the infographics

# mean base pay of each job title, copied onto every row so the frame can be sorted by it
job_mean_serie = ladder_df.groupby('JobTitle')['BasePay'].mean()
ladder_df['job_meanpay'] = ladder_df['JobTitle'].map(job_mean_serie)

# order rows from the highest-paid job title to the lowest
ladder_df = ladder_df.sort_values(by="job_meanpay", ascending=False)

# one palette colour per distinct job title
# (len('JobTitle') measured the column *name*, i.e. always 8, regardless of the data)
pal = sns.color_palette(palette='coolwarm', n_colors=ladder_df['JobTitle'].nunique())

# one thin facet row per job title, all sharing the same x/y limits
grid = sns.FacetGrid(ladder_df, row='JobTitle', hue='JobTitle', aspect=8, height=1, palette=pal, xlim=(30000, 180000), ylim=(0, 0.000025))

# filled density curve for each job title
grid.map(sns.kdeplot, 'BasePay', bw_adjust=0.8, clip_on=True, fill=True, alpha=1, linewidth=1, cut=0)

# black outline drawn over each filled curve
grid.map(sns.kdeplot, 'BasePay', bw_adjust=0.8, clip_on=True, color='black', lw=1, cut=0)

# baseline at y=0 in every facet
grid.refline(y=0, linewidth=1, linestyle='-', color=None, clip_on=False)

# dashed vertical guides at round salary values
for x_value in [50000, 100000, 150000]:
    grid.refline(x=x_value, linewidth=1, linestyle='--', color='gray', clip_on=False)


def label(job, color, label):
    """Write the facet's label to the left of the current axes.

    Used as a ``grid.map`` callback: ``job`` receives the facet's 'JobTitle'
    values (unused here); ``color`` and ``label`` are supplied by seaborn.
    """
    ax = plt.gca()
    # NOTE(review): numeric fontweight is on a 0-1000 scale, so 8 is close to the
    # lightest weight -- confirm whether a font *size* or 'bold' was intended.
    ax.text(-0.2, 0.2, label, fontweight=8, color='black',
            ha="left", va="center", transform=ax.transAxes)


grid.map(label, 'JobTitle')

# get the subplots to overlap
# NOTE(review): plt.tight_layout() further down recomputes subplot spacing and may
# undo this negative hspace -- confirm the overlap survives in the rendered figure.
grid.fig.subplots_adjust(hspace=-0.4)

# remove axes titles, yticks and spines
grid.set_titles('')
grid.set(yticks=[], ylabel="")
grid.despine(bottom=True, left=True)

plt.xlabel('Base Pay', fontweight=12, fontsize=10)
grid.fig.suptitle('Base Pay Distribution by Job',
               ha='center',
               fontsize=12,
               fontweight=12)

plt.tight_layout()
plt.show()

# KDEplot of Base Pay by Dept
## can be useful to add in STD

# mean base pay of each department, copied onto every row so the frame can be sorted by it
dept_mean_serie = ladder_df.groupby('Dept')['BasePay'].mean()
ladder_df['dept_meanpay'] = ladder_df['Dept'].map(dept_mean_serie)

# order rows from the highest-paid department to the lowest
ladder_df = ladder_df.sort_values(by="dept_meanpay", ascending=False)

# one palette colour per distinct department
# (len('Dept') measured the column *name*, i.e. always 4, regardless of the data)
pal2 = sns.color_palette(palette='coolwarm', n_colors=ladder_df['Dept'].nunique())

# one thin facet row per department, coloured with this cell's own palette (pal2)
grid2 = sns.FacetGrid(ladder_df, row='Dept', hue='Dept', aspect=8, height=1, palette=pal2, xlim=(30000, 180000), ylim=(0, 0.000025))

# filled density curve for each department
grid2.map(sns.kdeplot, 'BasePay', bw_adjust=0.8, clip_on=True, fill=True, alpha=1, linewidth=1, cut=0)

# black outline drawn over each filled curve
grid2.map(sns.kdeplot, 'BasePay', bw_adjust=0.8, clip_on=True, color='black', lw=1, cut=0)

# baseline at y=0 in every facet
grid2.refline(y=0, linewidth=1, linestyle='-', color=None, clip_on=False)

# dashed vertical guides at round salary values
for x_value in [50000, 100000, 150000]:
    grid2.refline(x=x_value, linewidth=1, linestyle='--', color='gray', clip_on=False)


def label2(job, color, label):
    """Write the facet's label to the left of the current axes.

    Used as a ``grid2.map`` callback: ``job`` receives the facet's 'Dept'
    values (unused here); ``color`` and ``label`` are supplied by seaborn.
    """
    ax = plt.gca()
    # NOTE(review): numeric fontweight is on a 0-1000 scale, so 8 is close to the
    # lightest weight -- confirm whether a font *size* or 'bold' was intended.
    ax.text(-0.2, 0.2, label, fontweight=8, color='black',
            ha="left", va="center", transform=ax.transAxes)


grid2.map(label2, 'Dept')

# get the subplots to overlap
# NOTE(review): plt.tight_layout() further down recomputes subplot spacing and may
# undo this negative hspace -- confirm the overlap survives in the rendered figure.
grid2.fig.subplots_adjust(hspace=-0.4)

# remove axes titles, yticks and spines
grid2.set_titles('')
grid2.set(yticks=[], ylabel="")
grid2.despine(bottom=True, left=True)

plt.xlabel('Base Pay', fontweight=12, fontsize=10)
grid2.fig.suptitle('Base Pay Distribution by Dept',
               ha='center',
               fontsize=12,
               fontweight=12)

plt.tight_layout()
plt.show()

# Average Base Pay by Gender

# Two-sample t-test on base pay, male vs female.
# ttest_ind is called with its defaults, i.e. the equal-variance (Student) form.
male_pay = ladder_df.loc[ladder_df['Gender'] == 'Male', 'BasePay']
female_pay = ladder_df.loc[ladder_df['Gender'] == 'Female', 'BasePay']
t_stat, p_val = ttest_ind(male_pay, female_pay)
print(f"\nT-Test Gender Pay Gap: T={t_stat:.2f}, p={p_val:.3e}\n")

sns.boxplot(x='Gender', y='BasePay', data=ladder_df)
# Scientific notation, matching the printed line: with ':.3f' a very small
# p-value is rendered as "0.000", which reads as exactly zero.
plt.title(f'Base Pay by Gender (p={p_val:.3e})')
plt.show()

T-Test Gender Pay Gap: T=5.38, p=9.479e-08

# Gender Pay Gap Across Departments
# One bar per gender within each department, drawn with seaborn's default estimator

barplot_args = dict(data=ladder_df, x='Dept', y='BasePay', hue='Gender')
sns.barplot(**barplot_args)
plt.title('Gender Pay Gap Across Departments')
plt.xticks(rotation=45)

([0, 1, 2, 3, 4],
 [Text(0, 0, 'Sales'),
  Text(1, 0, 'Management'),
  Text(2, 0, 'Engineering'),
  Text(3, 0, 'Administration'),
  Text(4, 0, 'Operations')])

# Mean base pay per education level, sorted by pay in ascending order
edu_pay = ladder_df[['Education', 'BasePay']]
edu_pay.groupby(['Education'], as_index=False).mean().sort_values(by='BasePay', ascending=True)

# BasePay vs Education Level by Gender (Transparent)
# Bare line chart (no title, axis labels, ticks or legend) exported as a PNG
# with a transparent background.

# Define the desired order for Education
edu_order = ['High School', 'College', 'Masters', 'PhD']

# Set the 'Education' column as a categorical variable with the desired order
ladder_df['Education'] = pd.Categorical(ladder_df['Education'], categories=edu_order, ordered=True)

# legend=False stops seaborn from adding a legend at all, instead of creating
# one and removing it afterwards
ax = sns.lineplot(data=ladder_df, x='Education', y='BasePay', hue='Gender', legend=False)

# Strip axis labels and ticks; no title is ever set, so none needs clearing
ax.set(xlabel='', ylabel='', xticks=[], yticks=[])

# Save with transparent background
plt.savefig("BasePay_Growth_Over_Education_by_Gender.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()

# Base Pay by Education
# One-way ANOVA of base pay across education levels, followed by a violin plot.

# Display order shared by the violin positions and their colours
edu_levels = ['High School', 'College', 'Masters', 'PhD']

# One array of base pay per education level.
# observed=True keeps a categorical 'Education' from yielding empty groups.
groups = [grp['BasePay'].values for _, grp in ladder_df.groupby('Education', observed=True)]
f_stat, p_val = f_oneway(*groups)
print(f"\nANOVA for Education Levels: F={f_stat:.2f}, p={p_val:.3e}\n")

# Violin plot
plt.figure(figsize=(10, 6))
sns.violinplot(
    data=ladder_df,
    x='BasePay',
    y='Education',
    # seaborn deprecates `palette` without `hue`; assigning the y variable to
    # hue with legend=False gives the same picture without the FutureWarning
    hue='Education',
    order=edu_levels,
    hue_order=edu_levels,
    palette='Accent',
    inner='box',  # You can use 'quartile' or 'point' if preferred
    legend=False,
)

# Scientific notation: with ':.3f' a very small p-value is rendered as "0.000"
plt.title(f'Base Pay by Education (p={p_val:.3e})')
plt.xticks(rotation=45)
plt.ylabel('Education')
plt.xlabel('Base Pay')
plt.tight_layout()
plt.show()

ANOVA for Education Levels: F=10.42, p=9.391e-07

<ipython-input-18-5c770c1633b4>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(

# Education Level by Gender
## can include some summary statistics (e.g. how likely it is for male to be more educated than female)

# Number of employees per education level, one bar per gender
countplot_order = ['High School', 'College', 'Masters', 'PhD']
sns.countplot(data=ladder_df, x='Education', hue='Gender', order=countplot_order)
plt.title('Distribution of Education Levels by Gender')
plt.xticks(rotation=45)

([0, 1, 2, 3],
 [Text(0, 0, 'High School'),
  Text(1, 0, 'College'),
  Text(2, 0, 'Masters'),
  Text(3, 0, 'PhD')])

# Seniority VS Base Pay
# Scatter with a fitted regression line for each gender

lm_kwargs = dict(x='Seniority', y='BasePay', hue='Gender')
sns.lmplot(data=ladder_df, **lm_kwargs)
plt.title('Seniority vs. Base Pay by Gender')

Text(0.5, 1.0, 'Seniority vs. Base Pay by Gender')

# Pay Growth by Education & Gender
##too messy to be informative  --can remove or simplify--

# Combined view: one line per education level, line style by gender
sns.lineplot(data=ladder_df, x='Seniority', y='BasePay', hue='Education', style='Gender')
plt.title('Pay Growth Over Seniority by Education & Gender')
# Finish this figure here; otherwise the first per-education plot below is
# drawn onto the same axes and overwrites this title.
plt.show()

# One separate figure per education level
lst = ladder_df['Education'].unique()
for ls in lst:
    df1 = ladder_df[ladder_df['Education'] == ls]
    sns.lineplot(data=df1, x='Seniority', y='BasePay', hue='Gender', style='Gender')
    plt.title(f'Pay Growth Over Seniority for Education: {ls} by Gender')
    plt.show()

# Bonus Growth vs Performance Evaluation by Gender (Transparent)
# Bare line chart (no title, axis labels, ticks or legend) exported as a PNG
# with a transparent background.

# legend=False stops seaborn from adding a legend at all, instead of creating
# one and removing it afterwards
ax = sns.lineplot(data=ladder_df, x='PerfEval', y='Bonus', hue='Gender', legend=False)

# Strip axis labels and ticks; no title is ever set, so none needs clearing
ax.set(xlabel='', ylabel='', xticks=[], yticks=[])

plt.savefig("Bonus_Growth_Over_PerfEval_by_Gender.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()

# Bonus Growth vs Performance Evaluation by Gender (labelled version)
# Same chart as the bare export, but keeping title, axes and legend.

sns.lineplot(data=ladder_df, x='PerfEval', y='Bonus', hue='Gender')
plt.title('Bonus Growth Over PerfEval by Gender')

# Saved under its own file name so it does not overwrite the bare
# "Bonus_Growth_Over_PerfEval_by_Gender.png" export.
plt.savefig("Bonus_Growth_Over_PerfEval_by_Gender_labeled.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()

# Performance Evaluation vs Bonus
## can decide on hue (Education or Gender)
### can specify/group by job title to be more accurate i.e. comparing apples to apples

# One point per employee, coloured by gender
scatter_args = dict(x='PerfEval', y='Bonus', hue='Gender')
sns.scatterplot(data=ladder_df, **scatter_args)
plt.title('Performance Evaluation vs Bonus')

Text(0.5, 1.0, 'Performance Evaluation vs Bonus')

# Performance + Pay Heatmap
## education level is not yet sorted

# Mean bonus for every (performance rating, education level) pair
pivot = ladder_df.pivot_table(
    index='PerfEval',
    columns='Education',
    values='Bonus',
    aggfunc='mean',
)
sns.heatmap(pivot, annot=True, cmap='coolwarm')
plt.title('Avg Bonus by Performance Eval & Education Level')

Text(0.5, 1.0, 'Avg Bonus by Performance Eval & Education Level')

# Pairplot of Key Variables
## might take a HUGE space in infographics
### better to comprehend and select a specific informative pairplot to include in infographic

# Pairwise relationships between the numeric columns, coloured by gender
pair_vars = ['Age', 'Seniority', 'PerfEval', 'BasePay', 'Bonus']
sns.pairplot(ladder_df, vars=pair_vars, hue='Gender')

<seaborn.axisgrid.PairGrid at 0x7d0a5fd01610>

# Correlation Matrix
## informative but might take sometime to explain
### consider to do ordinal encoding for Education and include in the Correlation Matrix

# Pairwise correlations between the numeric columns, shown as an annotated heatmap
corr_cols = ['Age', 'Seniority', 'PerfEval', 'BasePay', 'Bonus']
corr = ladder_df[corr_cols].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Regression Model
# Optional: encode categorical variables

# Ordinary least squares: BasePay on Seniority, PerfEval and Age plus an intercept
predictors = ['Seniority', 'PerfEval', 'Age']
X = sm.add_constant(ladder_df[predictors])
y = ladder_df['BasePay']

model = sm.OLS(y, X).fit()
print("\nRegression Summary:\n")
print(model.summary())

Regression Summary:

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                BasePay   R-squared:                       0.591
Model:                            OLS   Adj. R-squared:                  0.589
Method:                 Least Squares   F-statistic:                     478.8
Date:                Sat, 10 May 2025   Prob (F-statistic):          1.52e-192
Time:                        07:08:50   Log-Likelihood:                -11112.
No. Observations:                1000   AIC:                         2.223e+04
Df Residuals:                     996   BIC:                         2.225e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       2.472e+04   2281.205     10.835      0.000    2.02e+04    2.92e+04
Seniority   9502.0509    368.449     25.789      0.000    8779.025    1.02e+04
PerfEval    -177.2637    361.466     -0.490      0.624    -886.586     532.059
Age         1016.1999     36.007     28.222      0.000     945.542    1086.858
==============================================================================
Omnibus:                       32.645   Durbin-Watson:                   1.417
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               35.136
Skew:                           0.436   Prob(JB):                     2.35e-08
Kurtosis:                       3.291   Cond. No.                         197.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# Mean BasePay for each gender
avg_pay_by_gender = ladder_df.groupby('Gender')['BasePay'].mean()

# .get() returns None when a gender label is absent from the index
male_avg = avg_pay_by_gender.get('Male')
female_avg = avg_pay_by_gender.get('Female')

# Percentage by which the female mean is below the male mean.
# Absence is tested explicitly with `is not None`: a plain truthiness check
# would also treat a mean of 0.0 as "missing".
if male_avg is not None and female_avg is not None:
    percent_diff = ((male_avg - female_avg) / male_avg) * 100
    print(f"On average, females earn {percent_diff:.2f}% less than males.")
else:
    print("One of the gender values is missing from the data.")

On average, females earn 8.65% less than males.

# Mean BasePay for every (JobTitle, Education, Seniority) combination,
# with one column per gender
pay_by_gender = ladder_df.groupby(['JobTitle', 'Education', 'Seniority', 'Gender'])['BasePay'].mean()
grouped = pay_by_gender.unstack()

# Keep only the combinations where both a Male and a Female mean exist
grouped = grouped.dropna(subset=['Male', 'Female'])

# Percentage by which the female mean is below the male mean (negative = above)
pay_gap = grouped['Male'] - grouped['Female']
grouped['% Difference (Female vs Male)'] = (pay_gap / grouped['Male']) * 100

# Flatten the group keys back into columns next to the percentage
result = grouped['% Difference (Female vs Male)'].reset_index()

# Print result
print(result)

                JobTitle    Education  Seniority  \
0         Data Scientist      College          1   
1         Data Scientist      College          3   
2         Data Scientist      College          5   
3         Data Scientist  High School          1   
4         Data Scientist  High School          2   
..                   ...          ...        ...   
132  Warehouse Associate      Masters          4   
133  Warehouse Associate          PhD          2   
134  Warehouse Associate          PhD          3   
135  Warehouse Associate          PhD          4   
136  Warehouse Associate          PhD          5   

     % Difference (Female vs Male)  
0                        -0.085623  
1                         0.433427  
2                       -13.587748  
3                         9.843575  
4                        20.352445  
..                             ...  
132                       7.550824  
133                     -37.333732  
134                      -1.328469  
135                      11.923900  
136                       7.867828  

[137 rows x 4 columns]

# Step 1-2: Mean BasePay per (JobTitle, Education, Seniority), one column per gender
grouped = ladder_df.groupby(['JobTitle', 'Education', 'Seniority', 'Gender'])['BasePay'].mean().unstack()

# Step 3: Drop rows with missing Male or Female salary
grouped = grouped.dropna(subset=['Male', 'Female'])

# Step 4: Percentage by which the female mean is below the male mean
# (a negative value means the female mean is higher)
grouped['Pct_Female_Lower'] = ((grouped['Male'] - grouped['Female']) / grouped['Male']) * 100

# Step 5: Unweighted average of that percentage across all matched groups
average_pct_diff = grouped['Pct_Female_Lower'].mean()

# Report the direction in words: printing a negative number followed by "less"
# (e.g. "-3.62% less") hides that the female mean is actually higher.
direction = "less" if average_pct_diff >= 0 else "more"
print(f"\nOn average, females earn {abs(average_pct_diff):.2f}% {direction} than males across matched roles.\n")

On average, females earn -3.62% less than males across matched roles.

# Collect the male-vs-female % pay difference for every education level in
# which the difference is statistically significant (p < 0.05)
significant_diffs = []

# Minimum number of observations required from EACH gender before testing
# (previously 5 for males but 10 for females)
min_obs = 10

# One group per education level
grouped = ladder_df.groupby('Education')

for name, group in grouped:
    males = group.loc[group['Gender'] == 'Male', 'BasePay']
    females = group.loc[group['Gender'] == 'Female', 'BasePay']

    # Only proceed if both genders have at least min_obs observations
    if len(males) >= min_obs and len(females) >= min_obs:
        # equal_var=False selects Welch's t-test (no equal-variance assumption)
        t_stat, p_val = ttest_ind(males, females, equal_var=False)

        # NOTE(review): the test is two-sided, so a group where females earn
        # significantly MORE would also pass and add a negative value -- confirm
        # that is acceptable for the message printed below.
        if p_val < 0.05:
            male_mean = males.mean()
            female_mean = females.mean()
            pct_diff = ((male_mean - female_mean) / male_mean) * 100
            significant_diffs.append(pct_diff)

# Final result
if significant_diffs:
    avg_significant_diff = np.mean(significant_diffs)
    print(f"\nAverage % difference where females earn significantly less: {avg_significant_diff:.2f}%")
else:
    print("\nNo statistically significant gender pay differences found in the specified groups.")

Average % difference where females earn significantly less: 10.36%

	JobTitle	Gender	Age	PerfEval	Education	Dept	Seniority	BasePay	Bonus
0	Graphic Designer	Female	18	5	College	Operations	2	42363	9938
1	Software Engineer	Male	21	5	College	Management	5	108476	11128
2	Warehouse Associate	Female	19	4	PhD	Administration	5	90208	9268
3	Software Engineer	Male	20	5	Masters	Sales	4	108080	10154
4	Graphic Designer	Male	26	5	Masters	Engineering	5	99464	9319

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
JobTitle	1000	10	Marketing Associate	118	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Gender	1000	2	Male	532	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Age	1000.0	NaN	NaN	NaN	41.393	14.294856	18.0	29.0	41.0	54.25	65.0
PerfEval	1000.0	NaN	NaN	NaN	3.037	1.423959	1.0	2.0	3.0	4.0	5.0
Education	1000	4	High School	265	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Dept	1000	5	Operations	210	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Seniority	1000.0	NaN	NaN	NaN	2.971	1.395029	1.0	2.0	3.0	4.0	5.0
BasePay	1000.0	NaN	NaN	NaN	94472.653	25337.493272	34208.0	76850.25	93327.5	111558.0	179726.0
Bonus	1000.0	NaN	NaN	NaN	6467.161	2004.377365	1703.0	4849.5	6507.0	8026.0	11293.0

	Education	BasePay
3	PhD	99880.777311
2	Masters	97595.542969
0	College	92126.597510
1	High School	88732.298113