Climbing the Ladder: How Gender, Education, and Seniority Shape Pay in the Workplace
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, f_oneway, chi2_contingency
import statsmodels.api as sm
from google.colab import files
# Upload the dataset through Colab's file picker, then load it into a DataFrame.
uploaded = files.upload()
# NOTE(review): the CSV name is hard-coded, so the uploaded file presumably has to
# carry exactly this name — confirm, or read the name from `uploaded` instead.
ladder_df = pd.read_csv('Glassdoor Gender Pay Gap.csv')
# Show the first rows as a quick sanity check of the load.
ladder_df.head()
| JobTitle | Gender | Age | PerfEval | Education | Dept | Seniority | BasePay | Bonus | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Graphic Designer | Female | 18 | 5 | College | Operations | 2 | 42363 | 9938 |
| 1 | Software Engineer | Male | 21 | 5 | College | Management | 5 | 108476 | 11128 |
| 2 | Warehouse Associate | Female | 19 | 4 | PhD | Administration | 5 | 90208 | 9268 |
| 3 | Software Engineer | Male | 20 | 5 | Masters | Sales | 4 | 108080 | 10154 |
| 4 | Graphic Designer | Male | 26 | 5 | Masters | Engineering | 5 | 99464 | 9319 |
# Overview of the dataset: dimensions, schema, then summary statistics.
print(f'Dataset contains {ladder_df.shape[0]} rows and {ladder_df.shape[1]} columns \n')

# Column dtypes and non-null counts.
ladder_df.info()

# Summary statistics for every column (numeric and categorical),
# transposed so each original column becomes one row.
full_summary = ladder_df.describe(include='all')
full_summary.transpose()
Dataset contains 1000 rows and 9 columns <class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 JobTitle 1000 non-null object 1 Gender 1000 non-null object 2 Age 1000 non-null int64 3 PerfEval 1000 non-null int64 4 Education 1000 non-null object 5 Dept 1000 non-null object 6 Seniority 1000 non-null int64 7 BasePay 1000 non-null int64 8 Bonus 1000 non-null int64 dtypes: int64(5), object(4) memory usage: 70.4+ KB
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| JobTitle | 1000 | 10 | Marketing Associate | 118 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Gender | 1000 | 2 | Male | 532 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Age | 1000.0 | NaN | NaN | NaN | 41.393 | 14.294856 | 18.0 | 29.0 | 41.0 | 54.25 | 65.0 |
| PerfEval | 1000.0 | NaN | NaN | NaN | 3.037 | 1.423959 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 |
| Education | 1000 | 4 | High School | 265 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Dept | 1000 | 5 | Operations | 210 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Seniority | 1000.0 | NaN | NaN | NaN | 2.971 | 1.395029 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 |
| BasePay | 1000.0 | NaN | NaN | NaN | 94472.653 | 25337.493272 | 34208.0 | 76850.25 | 93327.5 | 111558.0 | 179726.0 |
| Bonus | 1000.0 | NaN | NaN | NaN | 6467.161 | 2004.377365 | 1703.0 | 4849.5 | 6507.0 | 8026.0 | 11293.0 |
# Check NULL values: print the number of missing entries in each column.
print(f'Null values: {ladder_df.isnull().sum()}')
Null values: JobTitle 0 Gender 0 Age 0 PerfEval 0 Education 0 Dept 0 Seniority 0 BasePay 0 Bonus 0 dtype: int64
# Check duplicate rows: count rows that exactly repeat an earlier row.
print(f'Duplicate rows: {ladder_df.duplicated().sum()}')
Duplicate rows: 0
Categorical Variables
# List the distinct values of every object-dtype (text) column.
object_columns = ladder_df.select_dtypes(include='object').columns
for col in object_columns:
    distinct_values = ladder_df[col].unique()
    print(col, distinct_values)
JobTitle ['Graphic Designer' 'Software Engineer' 'Warehouse Associate' 'IT' 'Sales Associate' 'Driver' 'Financial Analyst' 'Marketing Associate' 'Data Scientist' 'Manager'] Gender ['Female' 'Male'] Education ['College' 'PhD' 'Masters' 'High School'] Dept ['Operations' 'Management' 'Administration' 'Sales' 'Engineering']
--- Education is ordinal (High School < College < Masters < PhD); ordinal encoding can be applied later for better analysis ---
Continuous Variables
Descriptive Statistics
# Summary statistics for the pay and rating columns.
# print("\nDescriptive Statistics:\n")
summary_columns = ['BasePay', 'Bonus', 'Seniority', 'PerfEval']
pay_summary = ladder_df[summary_columns].describe()
print(pay_summary)
BasePay Bonus Seniority PerfEval count 1000.000000 1000.000000 1000.000000 1000.000000 mean 94472.653000 6467.161000 2.971000 3.037000 std 25337.493272 2004.377365 1.395029 1.423959 min 34208.000000 1703.000000 1.000000 1.000000 25% 76850.250000 4849.500000 2.000000 2.000000 50% 93327.500000 6507.000000 3.000000 3.000000 75% 111558.000000 8026.000000 4.000000 4.000000 max 179726.000000 11293.000000 5.000000 5.000000
# Headcount of each gender at every seniority level
# (rows: Seniority, columns: Gender; missing combinations become 0).
counts_long = ladder_df.groupby('Seniority')['Gender'].value_counts()
gender_counts = counts_long.unstack().fillna(0)

# Share of each gender within a seniority level, in percent.
level_totals = gender_counts.sum(axis=1)
gender_percentages = gender_counts.div(level_totals, axis=0) * 100

print("Gender Counts by Seniority:")
print(gender_counts)
Gender Counts by Seniority: Gender Female Male Seniority 1 83 112 2 102 107 3 106 113 4 80 104 5 97 96
# Histogram of every numeric column, drawn on one grid of subplots.
print("\nDistribution of Continuous Variables:\n")
hist_options = {'figsize': (15, 10), 'edgecolor': 'white'}
ladder_df.hist(**hist_options)
plt.show()
Distribution of Continuous Variables:
Distribution of Base Pay
# KDEplot of Base Pay by Job Title (ridgeline: one overlapping density per job)
## can be useful to add in STD
## will need cosmetic adjustment if we're including this in the infographics

# mean base pay per job title, copied onto every row of that title
job_mean_serie = ladder_df.groupby('JobTitle')['BasePay'].mean()
ladder_df['job_meanpay'] = ladder_df['JobTitle'].map(job_mean_serie)

# sort rows by that mean, highest first (this reorders ladder_df for later cells too)
ladder_df = ladder_df.sort_values(by="job_meanpay", ascending=False)

# one colour per job title; len('JobTitle') would be the length of the column
# *name* (8), which is fewer colours than there are job titles
pal = sns.color_palette(palette='coolwarm', n_colors=ladder_df['JobTitle'].nunique())

# one thin facet row per job title, all sharing the same x/y limits
grid = sns.FacetGrid(ladder_df, row='JobTitle', hue='JobTitle', aspect=8, height=1,
                     palette=pal, xlim=(30000, 180000), ylim=(0, 0.000025))

# filled density for each job title
grid.map(sns.kdeplot, 'BasePay', bw_adjust=0.8, clip_on=True, fill=True, alpha=1, linewidth=1, cut=0)

# black outline drawn over each filled density
grid.map(sns.kdeplot, 'BasePay', bw_adjust=0.8, clip_on=True, color='black', lw=1, cut=0)

# horizontal baseline under each density
grid.refline(y=0, linewidth=1, linestyle='-', color=None, clip_on=False)

# vertical reference lines at round pay levels
for x_value in [50000, 100000, 150000]:
    grid.refline(x=x_value, linewidth=1, linestyle='--', color='gray', clip_on=False)


def label(job, color, label):
    """Write ``label`` near the left edge of the current axes.

    Used as a ``FacetGrid.map`` callback; ``job`` and ``color`` are accepted
    to satisfy that call but are not used.
    """
    ax = plt.gca()
    # NOTE(review): numeric fontweight is on matplotlib's 0-1000 scale, so 8 is
    # extremely light — confirm this was not meant to be a font size.
    ax.text(-0.2, 0.2, label, fontweight=8, color='black',
            ha="left", va="center", transform=ax.transAxes)


grid.map(label, 'JobTitle')

# get the subplots to overlap (`figure` replaces the deprecated `fig` accessor)
grid.figure.subplots_adjust(hspace=-0.4)

# remove axes titles, yticks and spines
grid.set_titles('')
grid.set(yticks=[], ylabel="")
grid.despine(bottom=True, left=True)

plt.xlabel('Base Pay', fontweight=12, fontsize=10)
grid.figure.suptitle('Base Pay Distribution by Job',
                     ha='center',
                     fontsize=12,
                     fontweight=12)
plt.tight_layout()
plt.show()
# KDEplot of Base Pay by Dept (ridgeline: one overlapping density per department)
## can be useful to add in STD

# mean base pay per department, copied onto every row of that department
dept_mean_serie = ladder_df.groupby('Dept')['BasePay'].mean()
ladder_df['dept_meanpay'] = ladder_df['Dept'].map(dept_mean_serie)

# sort rows by the department mean, highest first (reorders ladder_df for later cells too)
ladder_df = ladder_df.sort_values(by="dept_meanpay", ascending=False)

# one colour per department; len('Dept') would be the length of the column
# *name* (4), not the number of departments
pal2 = sns.color_palette(palette='coolwarm', n_colors=ladder_df['Dept'].nunique())

# one thin facet row per department, coloured with this plot's own palette
grid2 = sns.FacetGrid(ladder_df, row='Dept', hue='Dept', aspect=8, height=1,
                      palette=pal2, xlim=(30000, 180000), ylim=(0, 0.000025))

# filled density for each department
grid2.map(sns.kdeplot, 'BasePay', bw_adjust=0.8, clip_on=True, fill=True, alpha=1, linewidth=1, cut=0)

# black outline drawn over each filled density
grid2.map(sns.kdeplot, 'BasePay', bw_adjust=0.8, clip_on=True, color='black', lw=1, cut=0)

# horizontal baseline under each density
grid2.refline(y=0, linewidth=1, linestyle='-', color=None, clip_on=False)

# vertical reference lines at round pay levels
for x_value in [50000, 100000, 150000]:
    grid2.refline(x=x_value, linewidth=1, linestyle='--', color='gray', clip_on=False)


def label2(job, color, label):
    """Write ``label`` near the left edge of the current axes.

    Used as a ``FacetGrid.map`` callback; ``job`` and ``color`` are accepted
    to satisfy that call but are not used.
    """
    ax = plt.gca()
    # NOTE(review): numeric fontweight is on matplotlib's 0-1000 scale, so 8 is
    # extremely light — confirm this was not meant to be a font size.
    ax.text(-0.2, 0.2, label, fontweight=8, color='black',
            ha="left", va="center", transform=ax.transAxes)


grid2.map(label2, 'Dept')

# get the subplots to overlap (`figure` replaces the deprecated `fig` accessor)
grid2.figure.subplots_adjust(hspace=-0.4)

# remove axes titles, yticks and spines
grid2.set_titles('')
grid2.set(yticks=[], ylabel="")
grid2.despine(bottom=True, left=True)

plt.xlabel('Base Pay', fontweight=12, fontsize=10)
grid2.figure.suptitle('Base Pay Distribution by Dept',
                      ha='center',
                      fontsize=12,
                      fontweight=12)
plt.tight_layout()
plt.show()
Gender Pay Gap
# Average Base Pay by Gender: two-sample t-test on BasePay, then a boxplot.
male_pay = ladder_df[ladder_df['Gender'] == 'Male']['BasePay']
female_pay = ladder_df[ladder_df['Gender'] == 'Female']['BasePay']

# ttest_ind is called with its defaults here.
t_stat, p_val = ttest_ind(male_pay, female_pay)
print(f"\nT-Test Gender Pay Gap: T={t_stat:.2f}, p={p_val:.3e}\n")

sns.boxplot(x='Gender', y='BasePay', data=ladder_df)
# Scientific notation, matching the printed line: with three fixed decimals a
# very small p-value would be displayed as "p=0.000".
plt.title(f'Base Pay by Gender (p={p_val:.3e})')
plt.show()
T-Test Gender Pay Gap: T=5.38, p=9.479e-08
# Gender Pay Gap Across Departments: BasePay bars per department, split by gender.
sns.barplot(
    x='Dept',
    y='BasePay',
    hue='Gender',
    data=ladder_df,
)
plt.title('Gender Pay Gap Across Departments')
# Tilt the department names so they do not overlap.
plt.xticks(rotation=45)
([0, 1, 2, 3, 4], [Text(0, 0, 'Sales'), Text(1, 0, 'Management'), Text(2, 0, 'Engineering'), Text(3, 0, 'Administration'), Text(4, 0, 'Operations')])
Education Level Impact
# Mean BasePay for each education level, sorted by that mean in ascending order.
education_pay_means = ladder_df.groupby('Education', as_index=False)['BasePay'].mean()
education_pay_means.sort_values(by='BasePay', ascending=True)
| Education | BasePay | |
|---|---|---|
| 3 | PhD | 99880.777311 |
| 2 | Masters | 97595.542969 |
| 0 | College | 92126.597510 |
| 1 | High School | 88732.298113 |
# BasePay vs Education Level by Gender (Transparent)
# The chart is exported without title, labels, ticks or legend on a
# transparent background.

# Education levels from lowest to highest
edu_order = ['High School', 'College', 'Masters', 'PhD']

# Store Education as an ordered categorical using that order
education_dtype = pd.CategoricalDtype(categories=edu_order, ordered=True)
ladder_df['Education'] = ladder_df['Education'].astype(education_dtype)

# Draw one line per gender
sns.lineplot(x='Education', y='BasePay', hue='Gender', data=ladder_df)
plt.title('BasePay Growth Over Education by Gender')

# Strip all text and tick marks from the axes
plt.title('')
plt.xlabel('')
plt.ylabel('')
plt.xticks([])
plt.yticks([])

# Drop the legend as well
plt.legend().remove()

# Write the bare chart to a transparent PNG
plt.savefig(
    "BasePay_Growth_Over_Education_by_Gender.png",
    dpi=300,
    bbox_inches='tight',
    transparent=True,
)
plt.show()
# Base Pay by Education: one-way ANOVA across education levels, then a violin plot.

# observed=True keeps only the levels actually present, so an unused category
# cannot contribute an empty sample to the ANOVA
groups = [pay.values for _, pay in ladder_df.groupby('Education', observed=True)['BasePay']]
f_stat, p_val = f_oneway(*groups)
print(f"\nANOVA for Education Levels: F={f_stat:.2f}, p={p_val:.3e}\n")

# Violin plot
plt.figure(figsize=(10, 6))
sns.violinplot(
    data=ladder_df,
    x='BasePay',
    y='Education',
    order=['High School', 'College', 'Masters', 'PhD'],
    # seaborn deprecated `palette` without `hue`; map hue to the same variable
    # and hide the redundant legend, as the FutureWarning recommends
    hue='Education',
    palette='Accent',
    legend=False,
    inner='box'  # You can use 'quartile' or 'point' if preferred
)
# Scientific notation, matching the printed line: with three fixed decimals a
# very small p-value would be displayed as "p=0.000".
plt.title(f'Base Pay by Education (p={p_val:.3e})')
plt.xticks(rotation=45)
plt.ylabel('Education')
plt.xlabel('Base Pay')
plt.tight_layout()
plt.show()
ANOVA for Education Levels: F=10.42, p=9.391e-07
<ipython-input-18-5c770c1633b4>:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.violinplot(
# Education Level by Gender: number of employees at each education level, per gender.
## can include some summary statistics (e.g. how likely it is for male to be more educated than female)
education_levels = ['High School', 'College', 'Masters', 'PhD']
sns.countplot(
    x='Education',
    hue='Gender',
    order=education_levels,
    data=ladder_df,
)
plt.title('Distribution of Education Levels by Gender')
plt.xticks(rotation=45)
([0, 1, 2, 3], [Text(0, 0, 'High School'), Text(1, 0, 'College'), Text(2, 0, 'Masters'), Text(3, 0, 'PhD')])
Seniority & Pay Growth
# Seniority VS Base Pay: scatter with a fitted regression line per gender.
sns.lmplot(
    x='Seniority',
    y='BasePay',
    hue='Gender',
    data=ladder_df,
)
plt.title('Seniority vs. Base Pay by Gender')
Text(0.5, 1.0, 'Seniority vs. Base Pay by Gender')
# Pay Growth by Education & Gender: colour encodes education, line style encodes gender.
##too messy to be informative --can remove or simplify--
sns.lineplot(
    x='Seniority',
    y='BasePay',
    hue='Education',
    style='Gender',
    data=ladder_df,
)
plt.title('Pay Growth Over Seniority by Education & Gender')
# One chart per education level: BasePay over Seniority, one line per gender.
for education_level in ladder_df['Education'].unique():
    # rows belonging to this education level only
    level_rows = ladder_df.loc[ladder_df['Education'] == education_level]
    sns.lineplot(x='Seniority', y='BasePay', hue='Gender', style='Gender', data=level_rows)
    plt.title(f'Pay Growth Over Seniority for Education: {education_level} by Gender')
    plt.show()
# Bonus Growth vs Performance Evaluation by Gender (Transparent)
# The chart is exported without title, labels, ticks or legend on a
# transparent background.
sns.lineplot(x='PerfEval', y='Bonus', hue='Gender', data=ladder_df)
plt.title('Bonus Growth Over PerfEval by Gender')

# Strip all text and tick marks from the axes
plt.title('')
plt.xlabel('')
plt.ylabel('')
plt.xticks([])
plt.yticks([])

# Drop the legend as well
plt.legend().remove()

plt.savefig(
    "Bonus_Growth_Over_PerfEval_by_Gender.png",
    dpi=300,
    bbox_inches='tight',
    transparent=True,
)
plt.show()
# Bonus Growth vs Performance Evaluation by Gender (labelled: title, axes and legend kept)
sns.lineplot(data=ladder_df, x='PerfEval', y='Bonus', hue='Gender')
plt.title('Bonus Growth Over PerfEval by Gender')
# Saved under its own name so it does not overwrite the stripped-down export
# written to "Bonus_Growth_Over_PerfEval_by_Gender.png".
plt.savefig("Bonus_Growth_Over_PerfEval_by_Gender_labeled.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()
Performance Evaluation vs Pay
# Performance Evaluation vs Bonus: one point per employee, coloured by gender.
## can decide on hue (Education or Gender)
### can specify/group by job title to be more accurate i.e. comparing apples to apples
sns.scatterplot(
    x='PerfEval',
    y='Bonus',
    hue='Gender',
    data=ladder_df,
)
plt.title('Performance Evaluation vs Bonus')
Text(0.5, 1.0, 'Performance Evaluation vs Bonus')
# Performance + Pay Heatmap: mean Bonus for each (PerfEval, Education) pair.
## education level is not yet sorted
pivot = ladder_df.pivot_table(
    values='Bonus',
    index='PerfEval',
    columns='Education',
    aggfunc='mean',
)
sns.heatmap(pivot, cmap='coolwarm', annot=True)
plt.title('Avg Bonus by Performance Eval & Education Level')
Text(0.5, 1.0, 'Avg Bonus by Performance Eval & Education Level')
Multivariate Salary Influences
# Pairplot of Key Variables: pairwise scatter plots coloured by gender.
## might take a HUGE space in infographics
### better to comprehend and select a specific informative pairplot to include in infographic
pairplot_columns = ['Age', 'Seniority', 'PerfEval', 'BasePay', 'Bonus']
sns.pairplot(ladder_df, hue='Gender', vars=pairplot_columns)
<seaborn.axisgrid.PairGrid at 0x7d0a5fd01610>
# Correlation Matrix: pairwise correlations of the numeric columns, as an annotated heatmap.
## informative but might take sometime to explain
### consider to do ordinal encoding for Education and include in the Correlation Matrix
numeric_columns = ['Age', 'Seniority', 'PerfEval', 'BasePay', 'Bonus']
corr = ladder_df[numeric_columns].corr()
sns.heatmap(corr, cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()
# Regression Model: OLS of BasePay on Seniority, PerfEval and Age, with an intercept.
# Optional: encode categorical variables
predictor_columns = ['Seniority', 'PerfEval', 'Age']
X = sm.add_constant(ladder_df[predictor_columns])
y = ladder_df['BasePay']

model = sm.OLS(y, X).fit()

print("\nRegression Summary:\n")
print(model.summary())
Regression Summary:
OLS Regression Results
==============================================================================
Dep. Variable: BasePay R-squared: 0.591
Model: OLS Adj. R-squared: 0.589
Method: Least Squares F-statistic: 478.8
Date: Sat, 10 May 2025 Prob (F-statistic): 1.52e-192
Time: 07:08:50 Log-Likelihood: -11112.
No. Observations: 1000 AIC: 2.223e+04
Df Residuals: 996 BIC: 2.225e+04
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 2.472e+04 2281.205 10.835 0.000 2.02e+04 2.92e+04
Seniority 9502.0509 368.449 25.789 0.000 8779.025 1.02e+04
PerfEval -177.2637 361.466 -0.490 0.624 -886.586 532.059
Age 1016.1999 36.007 28.222 0.000 945.542 1086.858
==============================================================================
Omnibus: 32.645 Durbin-Watson: 1.417
Prob(Omnibus): 0.000 Jarque-Bera (JB): 35.136
Skew: 0.436 Prob(JB): 2.35e-08
Kurtosis: 3.291 Cond. No. 197.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
--- More statistical tests are needed before drawing conclusions ---
# Overall gender pay gap: how much lower the female mean BasePay is than the male mean.
avg_pay_by_gender = ladder_df.groupby('Gender')['BasePay'].mean()

# Series.get returns None when a gender label is absent from the data
male_avg = avg_pay_by_gender.get('Male')
female_avg = avg_pay_by_gender.get('Female')

# Test for absence explicitly: a truthiness check would also treat a
# legitimate mean of 0 as "missing".
if male_avg is None or female_avg is None:
    print("One of the gender values is missing from the data.")
else:
    # gap expressed as a percentage of the male mean
    percent_diff = ((male_avg - female_avg) / male_avg) * 100
    print(f"On average, females earn {percent_diff:.2f}% less than males.")
On average, females earn 8.65% less than males.
# Pay gap within matched groups (same JobTitle, Education and Seniority).

# Mean BasePay per group and gender, with Female/Male as columns
match_keys = ['JobTitle', 'Education', 'Seniority', 'Gender']
mean_pay = ladder_df.groupby(match_keys)['BasePay'].mean()
grouped = mean_pay.unstack()

# Keep only the groups where both genders have a mean
grouped = grouped.dropna(subset=['Male', 'Female'])

# Gap as a percentage of the male mean (positive: females earn less)
male_minus_female = grouped['Male'] - grouped['Female']
grouped['% Difference (Female vs Male)'] = (male_minus_female / grouped['Male']) * 100

# Flatten the group keys back into columns and show the gap per group
result = grouped['% Difference (Female vs Male)'].reset_index()
print(result)
JobTitle Education Seniority \
0 Data Scientist College 1
1 Data Scientist College 3
2 Data Scientist College 5
3 Data Scientist High School 1
4 Data Scientist High School 2
.. ... ... ...
132 Warehouse Associate Masters 4
133 Warehouse Associate PhD 2
134 Warehouse Associate PhD 3
135 Warehouse Associate PhD 4
136 Warehouse Associate PhD 5
% Difference (Female vs Male)
0 -0.085623
1 0.433427
2 -13.587748
3 9.843575
4 20.352445
.. ...
132 7.550824
133 -37.333732
134 -1.328469
135 11.923900
136 7.867828
[137 rows x 4 columns]
# Average pay gap across matched groups (same JobTitle, Education and Seniority).

# Step 1-2: mean BasePay per group and gender, with Female/Male as columns
grouped = ladder_df.groupby(['JobTitle', 'Education', 'Seniority', 'Gender'])['BasePay'].mean().unstack()

# Step 3: keep only the groups where both genders have a mean
grouped = grouped.dropna(subset=['Male', 'Female'])

# Step 4: gap as a percentage of the male mean (positive: females earn less)
grouped['Pct_Female_Lower'] = ((grouped['Male'] - grouped['Female']) / grouped['Male']) * 100

# Step 5: unweighted average of the per-group gaps
average_pct_diff = grouped['Pct_Female_Lower'].mean()

# Word the sentence by the sign of the gap, so a negative value is reported as
# "X% more" rather than the confusing "-X% less".
direction = 'less' if average_pct_diff >= 0 else 'more'
print(f"\nOn average, females earn {abs(average_pct_diff):.2f}% {direction} than males across matched roles.\n")
On average, females earn -3.62% less than males across matched roles.
# Collect the % pay gap of every education level whose male/female BasePay
# difference is statistically significant (p < 0.05).
significant_diffs = []

# Minimum observations required of EACH gender before a group is tested.
# The previous check used different thresholds for men (5) and women (10).
min_obs = 10

# Group by education level only; observed=True skips unused categories
grouped = ladder_df.groupby('Education', observed=True)

for name, group in grouped:
    males = group[group['Gender'] == 'Male']['BasePay']
    females = group[group['Gender'] == 'Female']['BasePay']

    # Only proceed if both genders have at least min_obs observations
    if len(males) < min_obs or len(females) < min_obs:
        continue

    # equal_var=False: do not assume equal variances (Welch's t-test)
    t_stat, p_val = ttest_ind(males, females, equal_var=False)

    if p_val < 0.05:
        male_mean = males.mean()
        female_mean = females.mean()
        # gap as a percentage of the male mean (positive: females earn less)
        # NOTE(review): significant gaps in either direction are collected,
        # while the message below speaks only of females earning less — confirm intent.
        pct_diff = ((male_mean - female_mean) / male_mean) * 100
        significant_diffs.append(pct_diff)

# Final result
if significant_diffs:
    avg_significant_diff = np.mean(significant_diffs)
    print(f"\nAverage % difference where females earn significantly less: {avg_significant_diff:.2f}%")
else:
    print("\nNo statistically significant gender pay differences found in the specified groups.")
Average % difference where females earn significantly less: 10.36%