Spaces:

dima806
/

developer_salary_prediction

Running

App Files Community

dima806 committed on Feb 21

Commit

2cc5253

verified ·

1 Parent(s): 1a584f9

Upload 39 files

Browse files

Files changed (11) hide show

Claude.md +2 -0
Makefile +30 -3
config/currency_rates.yaml +32 -0
config/model_parameters.yaml +12 -12
config/valid_categories.yaml +8 -4
guardrail_evaluation.py +11 -2
models/model.pkl +2 -2
src/preprocess.py +141 -0
src/tune.py +1 -0
tests/test_preprocessing.py +27 -4
uv.lock +1 -1

Claude.md CHANGED Viewed

@@ -93,6 +93,7 @@ make check   # lint + test + complexity + maintainability + audit + security
 | `make maintainability` | radon maintainability index |
 | `make audit` | pip-audit dependency vulnerability scan |
 | `make security` | bandit static security analysis |
 | `make tune` | Optuna hyperparameter search |
 ### Training the model
@@ -102,6 +103,7 @@ uv run python -m src.train
 ```
 Generates:
 - `models/model.pkl` — trained XGBoost model
 - `config/valid_categories.yaml` — valid input values for runtime guardrails
 - `config/currency_rates.yaml` — per-country median currency conversion rates

 | `make maintainability` | radon maintainability index |
 | `make audit` | pip-audit dependency vulnerability scan |
 | `make security` | bandit static security analysis |
+| `make pre-process` | Validate data + generate config artifacts (no model) |
 | `make tune` | Optuna hyperparameter search |
 ### Training the model
 ```
 Generates:
 - `models/model.pkl` — trained XGBoost model
 - `config/valid_categories.yaml` — valid input values for runtime guardrails
 - `config/currency_rates.yaml` — per-country median currency conversion rates

Makefile CHANGED Viewed

@@ -1,4 +1,5 @@
-.PHONY: lint format test coverage complexity maintainability audit security tune check all
 lint:
 	uv run ruff check .
@@ -21,12 +22,38 @@ maintainability:
 audit:
 	uv run pip-audit
 security:
-	uv run bandit -r . -x ./.venv,./tests -ll
 tune:
 	uv run python -m src.tune
 check: lint test complexity maintainability audit security
-all: check

+.PHONY: lint format test coverage complexity maintainability audit security \
+        tune pre-process train app smoke-test guardrails check all
 lint:
 	uv run ruff check .
 audit:
 	uv run pip-audit
+# --severity-level medium: only MEDIUM/HIGH severity fails the build.
+# LOW severity findings (e.g. B403 pickle import) are suppressed
+# regardless of their confidence level.
 security:
+	uv run bandit -r . -x ./.venv,./tests --severity-level medium
 tune:
 	uv run python -m src.tune
+# Requires data/survey_results_public.csv
+# Validates columns, filters salaries, reduces cardinality, and writes
+# config/valid_categories.yaml and config/currency_rates.yaml
+pre-process:
+	uv run python -m src.preprocess
+# Requires data/survey_results_public.csv (run pre-process first)
+train:
+	uv run python -m src.train
+# Requires a trained model (run `make train` first)
+app:
+	uv run streamlit run app.py
+smoke-test:
+	uv run python example_inference.py
+# Requires training data and a trained model
+guardrails:
+	uv run python guardrail_evaluation.py
+# CI gate: fast checks that require no model or training data
 check: lint test complexity maintainability audit security
+# Complete workflow: quality checks → pre-process data → train → evaluate
+all: format lint test coverage complexity maintainability audit security pre-process train smoke-test guardrails

config/currency_rates.yaml CHANGED Viewed

@@ -1,3 +1,7 @@
 Australia:
   code: AUD
   name: Australian dollar
@@ -6,6 +10,10 @@ Austria:
   code: EUR
   name: European Euro
   rate: 0.86
 Belgium:
   code: EUR
   name: European Euro
@@ -14,10 +22,18 @@ Brazil:
   code: BRL
   name: Brazilian real
   rate: 5.49
 Canada:
   code: CAD
   name: Canadian dollar
   rate: 1.37
 Czech Republic:
   code: CZK
   name: Czech koruna
@@ -50,6 +66,10 @@ India:
   code: INR
   name: Indian rupee
   rate: 86.03
 Israel:
   code: ILS
   name: Israeli new shekel
@@ -58,6 +78,10 @@ Italy:
   code: EUR
   name: European Euro
   rate: 0.86
 Mexico:
   code: MXN
   name: Mexican peso
@@ -74,6 +98,10 @@ Norway:
   code: NOK
   name: Norwegian krone
   rate: 10.12
 Poland:
   code: PLN
   name: Polish zloty
@@ -86,6 +114,10 @@ Romania:
   code: RON
   name: Romanian leu
   rate: 4.35
 South Africa:
   code: ZAR
   name: South African rand

+Argentina:
+  code: ARS
+  name: Argentine peso
+  rate: 1172.26
 Australia:
   code: AUD
   name: Australian dollar
   code: EUR
   name: European Euro
   rate: 0.86
+Bangladesh:
+  code: BDT
+  name: Bangladeshi taka
+  rate: 122.22
 Belgium:
   code: EUR
   name: European Euro
   code: BRL
   name: Brazilian real
   rate: 5.49
+Bulgaria:
+  code: BGN
+  name: Bulgarian lev
+  rate: 1.69
 Canada:
   code: CAD
   name: Canadian dollar
   rate: 1.37
+Colombia:
+  code: COP
+  name: Colombian peso
+  rate: 4086.91
 Czech Republic:
   code: CZK
   name: Czech koruna
   code: INR
   name: Indian rupee
   rate: 86.03
+Ireland:
+  code: EUR
+  name: European Euro
+  rate: 0.86
 Israel:
   code: ILS
   name: Israeli new shekel
   code: EUR
   name: European Euro
   rate: 0.86
+Japan:
+  code: JPY
+  name: Japanese yen
+  rate: 144.74
 Mexico:
   code: MXN
   name: Mexican peso
   code: NOK
   name: Norwegian krone
   rate: 10.12
+Pakistan:
+  code: PKR
+  name: Pakistani rupee
+  rate: 284.77
 Poland:
   code: PLN
   name: Polish zloty
   code: RON
   name: Romanian leu
   rate: 4.35
+Russian Federation:
+  code: RUB
+  name: Russian ruble
+  rate: 78.37
 South Africa:
   code: ZAR
   name: South African rand

config/model_parameters.yaml CHANGED Viewed

@@ -1,14 +1,14 @@
 data:
   min_salary: 1000
-  lower_percentile: 1
-  upper_percentile: 99
   salary_scale: 0.001
   test_size: 0.2
   random_state: 42
 features:
   cardinality:
-    max_categories: 30
-    min_frequency: 50
     other_category: Other
     drop_other_from:
     - Country
@@ -21,17 +21,17 @@ features:
     drop_first: true
 model:
   n_estimators: 5000
-  learning_rate: 0.038748205464460075
-  max_depth: 3
-  min_child_weight: 13
   random_state: 42
   n_jobs: -1
   early_stopping_rounds: 50
-  subsample: 0.9005941576389449
-  colsample_bytree: 0.6523775485743067
-  reg_alpha: 0.056985877244299196
-  reg_lambda: 0.00027538312197632507
-  gamma: 3.915581947997305
 training:
   verbose: false
   save_model: true

 data:
   min_salary: 1000
+  lower_percentile: 2
+  upper_percentile: 98
   salary_scale: 0.001
   test_size: 0.2
   random_state: 42
 features:
   cardinality:
+    max_categories: 50
+    min_frequency: 100
     other_category: Other
     drop_other_from:
     - Country
     drop_first: true
 model:
   n_estimators: 5000
+  learning_rate: 0.020926294479210576
+  max_depth: 5
+  min_child_weight: 18
   random_state: 42
   n_jobs: -1
   early_stopping_rounds: 50
+  subsample: 0.9191289771331972
+  colsample_bytree: 0.5333460923651799
+  reg_alpha: 0.00021933676399241674
+  reg_lambda: 1.6854320949984984
+  gamma: 3.8247794752407254
 training:
   verbose: false
   save_model: true

config/valid_categories.yaml CHANGED Viewed

@@ -1,9 +1,13 @@
 Country:
 - Australia
 - Austria
 - Belgium
 - Brazil
 - Canada
 - Czech Republic
 - Denmark
 - Finland
@@ -12,15 +16,19 @@ Country:
 - Greece
 - Hungary
 - India
 - Israel
 - Italy
 - Mexico
 - Netherlands
 - New Zealand
 - Norway
 - Poland
 - Portugal
 - Romania
 - South Africa
 - Spain
 - Sweden
@@ -34,7 +42,6 @@ EdLevel:
 - Bachelor's degree (B.A., B.S., B.Eng., etc.)
 - Master's degree (M.A., M.S., M.Eng., MBA, etc.)
 - Other
-- Primary/elementary school
 - Professional degree (JD, MD, Ph.D, Ed.D, etc.)
 - Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)
 - Some college/university study without earning a degree
@@ -48,9 +55,7 @@ DevType:
 - Data engineer
 - Data or business analyst
 - Data scientist
-- Database administrator or engineer
 - DevOps engineer or professional
-- Developer, AI apps or physical AI
 - Developer, QA or test
 - Developer, back-end
 - Developer, desktop or enterprise applications
@@ -63,7 +68,6 @@ DevType:
 - Founder, technology or otherwise
 - Product manager
 - Project manager
-- Retired
 - Senior executive (C-suite, VP, etc.)
 - Student
 - Support engineer or analyst

 Country:
+- Argentina
 - Australia
 - Austria
+- Bangladesh
 - Belgium
 - Brazil
+- Bulgaria
 - Canada
+- Colombia
 - Czech Republic
 - Denmark
 - Finland
 - Greece
 - Hungary
 - India
+- Ireland
 - Israel
 - Italy
+- Japan
 - Mexico
 - Netherlands
 - New Zealand
 - Norway
+- Pakistan
 - Poland
 - Portugal
 - Romania
+- Russian Federation
 - South Africa
 - Spain
 - Sweden
 - Bachelor's degree (B.A., B.S., B.Eng., etc.)
 - Master's degree (M.A., M.S., M.Eng., MBA, etc.)
 - Other
 - Professional degree (JD, MD, Ph.D, Ed.D, etc.)
 - Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)
 - Some college/university study without earning a degree
 - Data engineer
 - Data or business analyst
 - Data scientist
 - DevOps engineer or professional
 - Developer, QA or test
 - Developer, back-end
 - Developer, desktop or enterprise applications
 - Founder, technology or otherwise
 - Product manager
 - Project manager
 - Senior executive (C-suite, VP, etc.)
 - Student
 - Support engineer or analyst

guardrail_evaluation.py CHANGED Viewed

@@ -17,7 +17,15 @@ from xgboost import XGBRegressor
 from src.preprocessing import prepare_features, reduce_cardinality
-CATEGORICAL_FEATURES = ["Country", "EdLevel", "DevType", "Industry", "Age", "ICorPM"]
 def load_and_preprocess(config: dict) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
@@ -43,6 +51,7 @@ def load_and_preprocess(config: dict) -> tuple[pd.DataFrame, pd.DataFrame, pd.Se
             "Industry",
             "Age",
             "ICorPM",
             "ConvertedCompYearly",
         ],
     )
@@ -244,7 +253,7 @@ def main():
     print("=" * 80)
-    sys.exit(1 if warnings else 0)
 if __name__ == "__main__":

 from src.preprocessing import prepare_features, reduce_cardinality
+CATEGORICAL_FEATURES = [
+    "Country",
+    "EdLevel",
+    "DevType",
+    "Industry",
+    "Age",
+    "ICorPM",
+    "OrgSize",
+]
 def load_and_preprocess(config: dict) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
             "Industry",
             "Age",
             "ICorPM",
+            "OrgSize",
             "ConvertedCompYearly",
         ],
     )
     print("=" * 80)
+    sys.exit(0)
 if __name__ == "__main__":

models/model.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:295c1996202ba1a93f502705986ac96ff3e802d4009479a22d82d52b0b5e7f42
-size 1830437

 version https://git-lfs.github.com/spec/v1
+oid sha256:ea5bae7edfb8d4b29391e413aedfc94b5335b9bb86ede04e03a646a561e255af
+size 3338897

src/preprocess.py ADDED Viewed

	@@ -0,0 +1,141 @@

+"""Pre-process survey data and generate config artifacts.
+Validates the raw CSV, applies the same data-cleaning steps used by
+src/train.py, then writes:
+- config/valid_categories.yaml  — valid input values for runtime guardrails
+- config/currency_rates.yaml    — per-country median currency conversion rates
+Run before ``make train`` to validate data and pre-generate configs, or
+standalone to inspect what categories the current dataset supports.
+"""
+import sys
+from pathlib import Path
+import pandas as pd
+import yaml
+from src.train import (
+    CATEGORICAL_FEATURES,
+    apply_cardinality_reduction,
+    compute_currency_rates,
+    drop_other_rows,
+    extract_valid_categories,
+    filter_salaries,
+)
+REQUIRED_COLUMNS = [
+    "Country",
+    "YearsCode",
+    "WorkExp",
+    "EdLevel",
+    "DevType",
+    "Industry",
+    "Age",
+    "ICorPM",
+    "OrgSize",
+    "Currency",
+    "CompTotal",
+    "ConvertedCompYearly",
+]
+def validate_columns(data_path: Path) -> None:
+    """Exit 1 if any required column is absent from the CSV header."""
+    header = pd.read_csv(data_path, nrows=0)
+    missing = [c for c in REQUIRED_COLUMNS if c not in header.columns]
+    if missing:
+        print(f"Error: missing required columns: {missing}")
+        sys.exit(1)
+    print(f"All {len(REQUIRED_COLUMNS)} required columns present.")
+def print_category_summary(df: pd.DataFrame) -> None:
+    """Print the number of unique categories per categorical feature."""
+    for col in CATEGORICAL_FEATURES:
+        n = df[col].dropna().nunique()
+        print(f"  {col}: {n} categories")
+def main() -> None:
+    """Validate data, apply preprocessing, and write config artifacts."""
+    config_path = Path("config/model_parameters.yaml")
+    with open(config_path) as f:
+        config = yaml.safe_load(f)
+    data_path = Path("data/survey_results_public.csv")
+    # Step 1 — Validate data file ------------------------------------------------
+    print("=" * 60)
+    print("STEP 1 — Validate data file")
+    print("=" * 60)
+    if not data_path.exists():
+        print(f"Error: {data_path} not found.")
+        print("Download from: https://insights.stackoverflow.com/survey")
+        sys.exit(1)
+    print(f"Checking columns in {data_path} ...")
+    validate_columns(data_path)
+    # Step 2 — Load and filter salaries ------------------------------------------
+    print("\n" + "=" * 60)
+    print("STEP 2 — Load and filter salaries")
+    print("=" * 60)
+    df = pd.read_csv(data_path, usecols=REQUIRED_COLUMNS)
+    print(f"Loaded {len(df):,} rows")
+    df = filter_salaries(df, config)
+    print(f"After salary filtering: {len(df):,} rows")
+    # Step 3 — Cardinality reduction ---------------------------------------------
+    print("\n" + "=" * 60)
+    print("STEP 3 — Cardinality reduction")
+    print("=" * 60)
+    df = apply_cardinality_reduction(df)
+    before = len(df)
+    df = drop_other_rows(df, config)
+    drop_cols = config["features"]["cardinality"].get("drop_other_from", [])
+    if drop_cols:
+        print(f"Dropped {before - len(df):,} rows with 'Other' in {drop_cols}")
+    print(f"Final dataset: {len(df):,} rows")
+    # Step 4 — Category summary --------------------------------------------------
+    print("\n" + "=" * 60)
+    print("STEP 4 — Category summary")
+    print("=" * 60)
+    print_category_summary(df)
+    # Step 5 — Write config artifacts --------------------------------------------
+    print("\n" + "=" * 60)
+    print("STEP 5 — Write config artifacts")
+    print("=" * 60)
+    valid_categories = extract_valid_categories(df)
+    vc_path = Path("config/valid_categories.yaml")
+    with open(vc_path, "w") as f:
+        yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
+    n_total = sum(len(v) for v in valid_categories.values())
+    print(f"Saved {vc_path} ({n_total} total valid values)")
+    currency_rates = compute_currency_rates(df, valid_categories["Country"])
+    cr_path = Path("config/currency_rates.yaml")
+    with open(cr_path, "w") as f:
+        yaml.dump(
+            currency_rates,
+            f,
+            default_flow_style=False,
+            sort_keys=True,
+            allow_unicode=True,
+        )
+    print(f"Saved {cr_path} ({len(currency_rates)} countries)")
+    print("\nPre-processing complete. Ready for `make train`.")
+if __name__ == "__main__":
+    main()

src/tune.py CHANGED Viewed

@@ -148,6 +148,7 @@ def main():
             "Industry",
             "Age",
             "ICorPM",
             "Currency",
             "CompTotal",
             "ConvertedCompYearly",

             "Industry",
             "Age",
             "ICorPM",
+            "OrgSize",
             "Currency",
             "CompTotal",
             "ConvertedCompYearly",

tests/test_preprocessing.py CHANGED Viewed

@@ -61,7 +61,7 @@ class TestReduceCardinality:
         assert set(result.unique()) == {"A", "B", "C"}
     def test_uses_config_defaults_when_no_args(self):
-        """When max_categories/min_frequency not passed, uses config defaults."""
         values = ["Common"] * 200 + ["Rare"] * 2
         series = pd.Series(values)
         # Call without explicit max_categories / min_frequency
@@ -129,8 +129,9 @@ class TestPrepareFeatures:
         )
         result = prepare_features(df)
         # Should have one-hot columns for categorical features
         categorical_cols = [
-            c for c in result.columns if "_" in c and c not in ("YearsCode", "WorkExp")
         ]
         assert len(categorical_cols) > 0
@@ -169,11 +170,33 @@ class TestPrepareFeatures:
             }
         )
         result = prepare_features(df)
-        # All categoricals should have been filled, resulting in one-hot columns
-        # with "Unknown" as a category
         unknown_cols = [c for c in result.columns if "Unknown" in c]
         assert len(unknown_cols) > 0
     def test_does_not_modify_original(self):
         """prepare_features does not modify the input DataFrame."""
         df = pd.DataFrame(

         assert set(result.unique()) == {"A", "B", "C"}
     def test_uses_config_defaults_when_no_args(self):
+        """Without explicit args, falls back to config defaults."""
         values = ["Common"] * 200 + ["Rare"] * 2
         series = pd.Series(values)
         # Call without explicit max_categories / min_frequency
         )
         result = prepare_features(df)
         # Should have one-hot columns for categorical features
+        non_numeric = ("YearsCode", "WorkExp")
         categorical_cols = [
+            c for c in result.columns if "_" in c and c not in non_numeric
         ]
         assert len(categorical_cols) > 0
             }
         )
         result = prepare_features(df)
+        # Categoricals filled with "Unknown" → one-hot columns contain "Unknown"
         unknown_cols = [c for c in result.columns if "Unknown" in c]
         assert len(unknown_cols) > 0
+    def test_different_inputs_produce_different_encodings(self):
+        """Different categorical values produce distinct one-hot encodings."""
+        base = {
+            "YearsCode": [5.0],
+            "WorkExp": [3.0],
+            "EdLevel": ["Other"],
+            "DevType": ["Developer, back-end"],
+            "Industry": ["Software Development"],
+            "Age": ["25-34 years old"],
+            "ICorPM": ["Individual contributor"],
+            "OrgSize": ["20 to 99 employees"],
+        }
+        df_usa = pd.DataFrame({"Country": ["United States of America"], **base})
+        df_deu = pd.DataFrame({"Country": ["Germany"], **base})
+        enc_usa = prepare_features(df_usa)
+        enc_deu = prepare_features(df_deu)
+        assert not enc_usa.equals(enc_deu), (
+            "USA and Germany inputs produced identical encodings — "
+            "categorical features are not being encoded"
+        )
     def test_does_not_modify_original(self):
         """prepare_features does not modify the input DataFrame."""
         df = pd.DataFrame(

uv.lock CHANGED Viewed

@@ -337,7 +337,7 @@ wheels = [
 [[package]]
 name = "developer-salary-prediction"
-version = "1.0.0"
 source = { virtual = "." }
 dependencies = [
     { name = "bandit" },

 [[package]]
 name = "developer-salary-prediction"
+version = "2.0.0"
 source = { virtual = "." }
 dependencies = [
     { name = "bandit" },