| | """Training script for salary prediction model.""" |
| |
|
| | import pickle |
| | from pathlib import Path |
| |
|
| | import pandas as pd |
| | import numpy as np |
| | import yaml |
| | from xgboost import XGBRegressor |
| | from sklearn.model_selection import KFold, train_test_split |
| |
|
| | from src.preprocessing import prepare_features, reduce_cardinality |
| |
|
# Survey columns treated as categorical model inputs. These are the columns
# that get unicode-normalized and cardinality-reduced before one-hot encoding,
# and whose valid values are exported for inference-time validation.
CATEGORICAL_FEATURES = [
    "Country",
    "EdLevel",
    "DevType",
    "Industry",
    "Age",
    "ICorPM",
    "OrgSize",
    "Employment",
]
| |
|
| |
|
def filter_salaries(df: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Filter rows by minimum salary and per-country percentile outlier removal.

    Args:
        df: DataFrame with ConvertedCompYearly and Country columns.
        config: Config dict with data.min_salary, data.lower_percentile,
            data.upper_percentile.

    Returns:
        Filtered DataFrame with outliers removed.
    """
    main_label = "ConvertedCompYearly"

    # Drop missing salaries up front. (The original dropped them at the end,
    # but NaN already fails every comparison below, so doing it first is
    # equivalent and makes the intent explicit.)
    df = df.dropna(subset=[main_label])

    min_salary = config["data"]["min_salary"]
    df = df[df[main_label] > min_salary]

    # Per-country percentile bounds: transform broadcasts each country's
    # quantile back onto its rows so the comparison stays row-aligned.
    lower_pct = config["data"]["lower_percentile"] / 100
    upper_pct = config["data"]["upper_percentile"] / 100
    lower_bound = df.groupby("Country")[main_label].transform("quantile", lower_pct)
    upper_bound = df.groupby("Country")[main_label].transform("quantile", upper_pct)
    return df[(df[main_label] > lower_bound) & (df[main_label] < upper_bound)]
| |
|
| |
|
def apply_cardinality_reduction(df: pd.DataFrame) -> pd.DataFrame:
    """Apply cardinality reduction and unicode normalization to categorical columns.

    Args:
        df: DataFrame with categorical feature columns.

    Returns:
        DataFrame with reduced cardinality categories.
    """
    result = df.copy()
    for feature in CATEGORICAL_FEATURES:
        # Normalize the typographic apostrophe (U+2019) before bucketing so
        # visually-identical answers collapse into one category.
        normalized = result[feature].str.replace("\u2019", "'", regex=False)
        result[feature] = reduce_cardinality(normalized)
    return result
| |
|
| |
|
def drop_other_rows(df: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Drop rows where specified features have the 'Other' catch-all category.

    Args:
        df: DataFrame with categorical feature columns (after cardinality reduction).
        config: Config dict with features.cardinality.other_category and
            features.cardinality.drop_other_from.

    Returns:
        DataFrame with 'Other' rows removed from specified features.
    """
    cardinality_cfg = config["features"]["cardinality"]
    other_name = cardinality_cfg.get("other_category", "Other")
    for feature in cardinality_cfg.get("drop_other_from", []):
        # Note: != keeps NaN rows, matching the inequality semantics.
        keep_mask = df[feature] != other_name
        df = df[keep_mask]
    return df
| |
|
| |
|
def extract_valid_categories(df: pd.DataFrame, features=None) -> dict:
    """Extract sorted unique values per categorical feature for inference validation.

    Args:
        df: DataFrame with categorical feature columns (after cardinality reduction).
        features: Optional list of column names to extract. Defaults to
            CATEGORICAL_FEATURES, preserving the original behavior.

    Returns:
        Dict mapping feature name to sorted list of valid category values.
    """
    if features is None:
        features = CATEGORICAL_FEATURES
    # dropna first: NaN is not a valid category and would break sorted().
    return {col: sorted(df[col].dropna().unique().tolist()) for col in features}
| |
|
| |
|
def compute_currency_rates(df: pd.DataFrame, valid_countries: list[str]) -> dict:
    """Compute median currency conversion rates per country.

    Args:
        df: DataFrame with Country, Currency, CompTotal, ConvertedCompYearly columns.
        valid_countries: List of valid country names to compute rates for.

    Returns:
        Dict mapping country to {code, name, rate}.
    """
    main_label = "ConvertedCompYearly"
    currency_df = df[["Country", "Currency", "CompTotal", main_label]].dropna().copy()
    if currency_df.empty:
        return {}

    # Split "CODE Full Name"-style strings once (the original split twice).
    parts = currency_df["Currency"].str.split(r"\s+", n=1, expand=True)
    currency_df["CurrencyCode"] = parts[0]
    # Guard: expand=True yields a single column when no row contains whitespace.
    currency_df["CurrencyName"] = parts[1] if 1 in parts.columns else None

    # Implied local-currency -> converted rate; discard implausible outliers
    # outside a generous sanity window (typos, swapped fields).
    currency_df["rate"] = currency_df["CompTotal"] / currency_df[main_label]
    currency_df = currency_df[
        (currency_df["rate"] > 0.001) & (currency_df["rate"] < 100000)
    ]

    # Partition once instead of re-filtering the whole frame per country.
    grouped = {country: group for country, group in currency_df.groupby("Country")}

    currency_rates = {}
    for country in valid_countries:
        country_data = grouped.get(country)
        if country_data is None or country_data.empty:
            continue
        most_common = country_data["CurrencyCode"].mode()
        if most_common.empty:
            continue
        code = most_common.iloc[0]
        # Rows for the dominant currency code, computed once and reused.
        in_code = country_data[country_data["CurrencyCode"] == code]
        currency_rates[country] = {
            "code": code,
            "name": in_code.iloc[0]["CurrencyName"],
            "rate": round(float(in_code["rate"].median()), 2),
        }

    return currency_rates
| |
|
| |
|
def _build_model(model_config: dict) -> "XGBRegressor":
    """Construct an XGBRegressor from the config's model section.

    Shared by cross-validation and final training so hyperparameters
    cannot silently drift between the two (the original duplicated
    this constructor call verbatim).
    """
    return XGBRegressor(
        n_estimators=model_config["n_estimators"],
        learning_rate=model_config["learning_rate"],
        max_depth=model_config["max_depth"],
        min_child_weight=model_config["min_child_weight"],
        random_state=model_config["random_state"],
        n_jobs=model_config["n_jobs"],
        early_stopping_rounds=model_config["early_stopping_rounds"],
    )


def _print_distribution(df: "pd.DataFrame", column: str, title: str, head=10) -> None:
    """Print value counts and row-share for one raw categorical column.

    Args:
        df: Source DataFrame.
        column: Column to summarize.
        title: Heading (printed with a leading blank line).
        head: Number of top values to show; None shows all values.
    """
    print(f"\n{title}:")
    counts = df[column].value_counts()
    if head is not None:
        counts = counts.head(head)
    for value, count in counts.items():
        print(f"  - {value}: {count:,} ({count / len(df) * 100:.1f}%)")


def _print_numeric_summary(df: "pd.DataFrame", column: str, title: str) -> None:
    """Print min/max/mean/median/quartile summary for one numeric column."""
    print(f"\n{title}:")
    print(f"  - Min: {df[column].min():.1f}")
    print(f"  - Max: {df[column].max():.1f}")
    print(f"  - Mean: {df[column].mean():.1f}")
    print(f"  - Median: {df[column].median():.1f}")
    print(f"  - 25th percentile: {df[column].quantile(0.25):.1f}")
    print(f"  - 75th percentile: {df[column].quantile(0.75):.1f}")


def _print_feature_group(
    categorical_counts: "pd.Series", prefix: str, title: str, top_n: int, total_rows: int
) -> "pd.Series":
    """Print the most common one-hot features for one column prefix.

    Returns the full (unheaded) group so the caller can report its size.
    """
    print(f"\n{title}:")
    group = categorical_counts[categorical_counts.index.str.startswith(prefix)]
    for i, (feature, count) in enumerate(group.head(top_n).items(), 1):
        percentage = (count / total_rows) * 100
        display_name = feature.replace(prefix, "")
        print(
            f"  {i:2d}. {display_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)"
        )
    return group


def _print_feature_analysis(df: "pd.DataFrame", X: "pd.DataFrame") -> None:
    """Print raw-column distributions and one-hot feature counts.

    Purely diagnostic output used to compare training-time feature
    distributions against what inference sees.
    """
    print("\n" + "=" * 60)
    print("FEATURE ANALYSIS (for comparing with inference)")
    print("=" * 60)

    _print_distribution(df, "Country", "📍 Top 10 Countries")
    _print_distribution(df, "EdLevel", "🎓 Top Education Levels")
    _print_distribution(df, "DevType", "👨💻 Top Developer Types")
    _print_distribution(df, "Industry", "🏢 Top Industries")
    _print_distribution(df, "Age", "🎂 Age Distribution")
    _print_distribution(df, "ICorPM", "👥 IC or PM Distribution")
    # Employment has few categories, so show them all.
    _print_distribution(df, "Employment", "💼 Employment Distribution", head=None)

    _print_numeric_summary(df, "YearsCode", "💼 Years of Coding Experience")
    _print_numeric_summary(df, "WorkExp", "💼 Years of Professional Work Experience")

    # Column sums of a one-hot matrix give per-feature occurrence counts.
    feature_counts = X.sum().sort_values(ascending=False)
    categorical_counts = feature_counts[
        ~feature_counts.index.str.startswith("YearsCode")
    ]

    group_sizes = {}
    for prefix, title, top_n in [
        ("Country_", "🌍 Top 15 Country Features (most common)", 15),
        ("EdLevel_", "🎓 Top 10 Education Level Features (most common)", 10),
        ("DevType_", "👨💻 Top 10 Developer Type Features (most common)", 10),
        ("Industry_", "🏢 Top 10 Industry Features (most common)", 10),
        ("Age_", "🎂 Top 10 Age Features (most common)", 10),
        ("ICorPM_", "👥 Top 10 IC/PM Features (most common)", 10),
    ]:
        group = _print_feature_group(categorical_counts, prefix, title, top_n, len(X))
        group_sizes[prefix] = len(group)

    print(f"\n📊 Total one-hot encoded features: {len(X.columns)}")
    print("  - Numeric: 2 (YearsCode, WorkExp)")
    print(f"  - Country: {group_sizes['Country_']}")
    print(f"  - Education: {group_sizes['EdLevel_']}")
    print(f"  - DevType: {group_sizes['DevType_']}")
    print(f"  - Industry: {group_sizes['Industry_']}")
    print(f"  - Age: {group_sizes['Age_']}")
    print(f"  - ICorPM: {group_sizes['ICorPM_']}")

    print("=" * 60 + "\n")


def _run_cross_validation(X: "pd.DataFrame", y: "pd.Series", config: dict) -> None:
    """Run k-fold CV with early stopping; print per-fold and average MAPE."""
    n_splits = config["data"].get("cv_splits", 5)
    random_state = config["data"]["random_state"]
    model_config = config["model"]

    print(f"Running {n_splits}-fold cross-validation...")
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    train_scores = []
    test_scores = []
    best_iterations = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = _build_model(model_config)
        # The held-out fold doubles as the early-stopping eval set.
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

        train_mape = np.mean(np.abs((y_train - model.predict(X_train)) / y_train)) * 100
        test_mape = np.mean(np.abs((y_test - model.predict(X_test)) / y_test)) * 100
        train_scores.append(train_mape)
        test_scores.append(test_mape)
        best_iterations.append(model.best_iteration + 1)
        print(
            f"  Fold {fold}: Train MAPE = {train_mape:.2f}%, Test MAPE = {test_mape:.2f}% (best iter: {model.best_iteration + 1})"
        )

    avg_train = np.mean(train_scores)
    avg_test = np.mean(test_scores)
    std_test = np.std(test_scores)
    avg_best_iter = int(np.mean(best_iterations))
    print(f"\nCV Average Train MAPE: {avg_train:.2f}%")
    print(f"CV Average Test MAPE: {avg_test:.2f}% (+/- {std_test:.2f}%)")
    print(f"CV Average best iteration: {avg_best_iter}")


def _train_final_model(X: "pd.DataFrame", y: "pd.Series", config: dict) -> "XGBRegressor":
    """Train the final model on (almost) all data.

    A 10% slice is held out purely to drive early stopping; the returned
    model is what gets serialized.
    """
    print("\nTraining final model on full dataset...")
    random_state = config["data"]["random_state"]
    X_train_final, X_es, y_train_final, y_es = train_test_split(
        X, y, test_size=0.1, random_state=random_state
    )

    final_model = _build_model(config["model"])
    final_model.fit(
        X_train_final,
        y_train_final,
        eval_set=[(X_es, y_es)],
        verbose=config["training"]["verbose"],
    )
    print(f"Final model best iteration: {final_model.best_iteration + 1}")
    return final_model


def main():
    """Train and save the salary prediction model."""
    print("Loading configuration...")
    config_path = Path("config/model_parameters.yaml")
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)

    print("Loading data...")
    data_path = Path("data/survey_results_public.csv")
    if not data_path.exists():
        print(f"Error: Data file not found at {data_path}")
        print(
            "Please download the Stack Overflow Developer Survey CSV and place it in the data/ directory."
        )
        print("Download from: https://insights.stackoverflow.com/survey")
        return

    df = pd.read_csv(
        data_path,
        usecols=[
            "Country",
            "YearsCode",
            "WorkExp",
            "EdLevel",
            "DevType",
            "Industry",
            "Age",
            "ICorPM",
            "OrgSize",
            "Employment",
            "Currency",
            "CompTotal",
            "ConvertedCompYearly",
        ],
    )
    print(f"Loaded {len(df):,} rows")

    print("Removing null, extremely small and large reported salaries")
    df = filter_salaries(df, config)
    print(f"After filtering: {len(df):,} rows")

    # NOTE: the previous version built a duplicate "df_copy" and ran the
    # exact same cardinality-reduction + 'Other'-dropping pipeline on it,
    # then extracted valid categories from the copy. Both frames were
    # identical at that point, so a single pass suffices.
    df = apply_cardinality_reduction(df)

    before_drop = len(df)
    df = drop_other_rows(df, config)
    drop_other_from = config["features"]["cardinality"].get("drop_other_from", [])
    if drop_other_from:
        print(
            f"Dropped {before_drop - len(df):,} rows with 'Other' in {drop_other_from}"
        )
        print(f"After dropping 'Other': {len(df):,} rows")

    main_label = "ConvertedCompYearly"
    X = prepare_features(df)
    y = df[main_label]

    # Persist the valid category values so inference can validate inputs.
    valid_categories = extract_valid_categories(df)
    valid_categories_path = Path("config/valid_categories.yaml")
    with open(valid_categories_path, "w") as f:
        yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
    print(
        f"\nSaved {len(valid_categories['Country'])} valid countries, {len(valid_categories['EdLevel'])} valid education levels, {len(valid_categories['DevType'])} valid developer types, {len(valid_categories['Industry'])} valid industries, {len(valid_categories['Age'])} valid age ranges, and {len(valid_categories['ICorPM'])} valid IC/PM values to {valid_categories_path}"
    )

    print("\nComputing currency conversion rates per country...")
    currency_rates = compute_currency_rates(df, valid_categories["Country"])
    currency_rates_path = Path("config/currency_rates.yaml")
    with open(currency_rates_path, "w") as f:
        yaml.dump(
            currency_rates,
            f,
            default_flow_style=False,
            sort_keys=True,
            allow_unicode=True,
        )
    print(
        f"Saved currency rates for {len(currency_rates)} countries to {currency_rates_path}"
    )
    for country, info in sorted(currency_rates.items()):
        print(
            f"  {country:45s} -> {info['code']} ({info['name']}, rate: {info['rate']})"
        )

    print(f"\nFeature matrix shape: {X.shape}")
    print(f"Total features: {X.shape[1]}")

    _print_feature_analysis(df, X)

    _run_cross_validation(X, y, config)

    final_model = _train_final_model(X, y, config)

    # Serialize the model together with the exact training-time column
    # order so inference can align its one-hot features.
    model_path = Path(config["training"]["model_path"])
    model_path.parent.mkdir(parents=True, exist_ok=True)
    artifacts = {
        "model": final_model,
        "feature_columns": list(X.columns),
    }
    with open(model_path, "wb") as f:
        pickle.dump(artifacts, f)
    print(f"Model saved to {model_path}")


if __name__ == "__main__":
    main()
| |
|