Upload 25 files
Browse files- .dockerignore +53 -0
- .gitignore +220 -0
- .python-version +1 -0
- Claude.md +226 -0
- Dockerfile +1 -1
- LICENSE +201 -0
- README.md +305 -20
- app.py +41 -11
- config/currency_rates.yaml +84 -0
- config/valid_categories.yaml +17 -0
- debug_prepare_features.py +81 -0
- diagnose_encoding.py +65 -0
- example_inference.py +111 -0
- models/model.pkl +2 -2
- pyproject.toml +15 -0
- src/infer.py +33 -1
- src/preprocessing.py +7 -6
- src/schema.py +7 -3
- src/train.py +134 -31
- test_feature_impact.py +373 -0
- test_fix.py +43 -0
- uv.lock +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
*.egg-info/
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
*.egg
|
| 11 |
+
|
| 12 |
+
# Virtual environments
|
| 13 |
+
.venv/
|
| 14 |
+
venv/
|
| 15 |
+
ENV/
|
| 16 |
+
env/
|
| 17 |
+
|
| 18 |
+
# IDE
|
| 19 |
+
.vscode/
|
| 20 |
+
.idea/
|
| 21 |
+
*.swp
|
| 22 |
+
*.swo
|
| 23 |
+
*~
|
| 24 |
+
|
| 25 |
+
# OS
|
| 26 |
+
.DS_Store
|
| 27 |
+
Thumbs.db
|
| 28 |
+
|
| 29 |
+
# Git
|
| 30 |
+
.git/
|
| 31 |
+
.gitignore
|
| 32 |
+
.gitattributes
|
| 33 |
+
|
| 34 |
+
# Data (don't include raw survey data in Docker image)
|
| 35 |
+
data/
|
| 36 |
+
|
| 37 |
+
# Testing and development
|
| 38 |
+
test*.py
|
| 39 |
+
debug*.py
|
| 40 |
+
diagnose*.py
|
| 41 |
+
example_inference.py
|
| 42 |
+
|
| 43 |
+
# Documentation
|
| 44 |
+
.llm/
|
| 45 |
+
*.md
|
| 46 |
+
!README.md
|
| 47 |
+
|
| 48 |
+
# CI/CD
|
| 49 |
+
.github/
|
| 50 |
+
|
| 51 |
+
# Project specific
|
| 52 |
+
pyproject.toml
|
| 53 |
+
uv.lock
|
.gitignore
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[codz]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py.cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
#poetry.toml
|
| 110 |
+
|
| 111 |
+
# pdm
|
| 112 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 113 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 114 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 115 |
+
#pdm.lock
|
| 116 |
+
#pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# pixi
|
| 121 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 122 |
+
#pixi.lock
|
| 123 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 124 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 125 |
+
.pixi
|
| 126 |
+
|
| 127 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 128 |
+
__pypackages__/
|
| 129 |
+
|
| 130 |
+
# Celery stuff
|
| 131 |
+
celerybeat-schedule
|
| 132 |
+
celerybeat.pid
|
| 133 |
+
|
| 134 |
+
# SageMath parsed files
|
| 135 |
+
*.sage.py
|
| 136 |
+
|
| 137 |
+
# Environments
|
| 138 |
+
.env
|
| 139 |
+
.envrc
|
| 140 |
+
.venv
|
| 141 |
+
env/
|
| 142 |
+
venv/
|
| 143 |
+
ENV/
|
| 144 |
+
env.bak/
|
| 145 |
+
venv.bak/
|
| 146 |
+
|
| 147 |
+
# Spyder project settings
|
| 148 |
+
.spyderproject
|
| 149 |
+
.spyproject
|
| 150 |
+
|
| 151 |
+
# Rope project settings
|
| 152 |
+
.ropeproject
|
| 153 |
+
|
| 154 |
+
# mkdocs documentation
|
| 155 |
+
/site
|
| 156 |
+
|
| 157 |
+
# mypy
|
| 158 |
+
.mypy_cache/
|
| 159 |
+
.dmypy.json
|
| 160 |
+
dmypy.json
|
| 161 |
+
|
| 162 |
+
# Pyre type checker
|
| 163 |
+
.pyre/
|
| 164 |
+
|
| 165 |
+
# pytype static type analyzer
|
| 166 |
+
.pytype/
|
| 167 |
+
|
| 168 |
+
# Cython debug symbols
|
| 169 |
+
cython_debug/
|
| 170 |
+
|
| 171 |
+
# PyCharm
|
| 172 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 173 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 174 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 175 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 176 |
+
#.idea/
|
| 177 |
+
|
| 178 |
+
# Abstra
|
| 179 |
+
# Abstra is an AI-powered process automation framework.
|
| 180 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 181 |
+
# Learn more at https://abstra.io/docs
|
| 182 |
+
.abstra/
|
| 183 |
+
|
| 184 |
+
# Visual Studio Code
|
| 185 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 186 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 187 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 188 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 189 |
+
# .vscode/
|
| 190 |
+
|
| 191 |
+
# Ruff stuff:
|
| 192 |
+
.ruff_cache/
|
| 193 |
+
|
| 194 |
+
# PyPI configuration file
|
| 195 |
+
.pypirc
|
| 196 |
+
|
| 197 |
+
# Cursor
|
| 198 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 199 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 200 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 201 |
+
.cursorignore
|
| 202 |
+
.cursorindexingignore
|
| 203 |
+
|
| 204 |
+
# Marimo
|
| 205 |
+
marimo/_static/
|
| 206 |
+
marimo/_lsp/
|
| 207 |
+
__marimo__/
|
| 208 |
+
|
| 209 |
+
# Project-specific
|
| 210 |
+
# Data files (too large for git)
|
| 211 |
+
data/*.csv
|
| 212 |
+
data/*.zip
|
| 213 |
+
|
| 214 |
+
# Trained model artifacts
|
| 215 |
+
# Note: Model files are included in the repo for deployment
|
| 216 |
+
# models/*.pkl
|
| 217 |
+
# models/*.joblib
|
| 218 |
+
|
| 219 |
+
# LLM
|
| 220 |
+
.llm/
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.12
|
Claude.md
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Claude Development Guide
|
| 2 |
+
|
| 3 |
+
## Project Overview
|
| 4 |
+
This is a minimal, local-first ML application built in Python that predicts developer salaries using Stack Overflow Developer Survey data. The project emphasizes clarity and simplicity over production completeness.
|
| 5 |
+
|
| 6 |
+
## Tech Stack
|
| 7 |
+
- **Python 3.11+**
|
| 8 |
+
- **uv** - Package & virtual environment management
|
| 9 |
+
- **pandas** - Data manipulation
|
| 10 |
+
- **scikit-learn** - ML modeling
|
| 11 |
+
- **pydantic** - Input validation
|
| 12 |
+
- **streamlit** - Web UI
|
| 13 |
+
- **xgboost** - Advanced gradient boosting (optional)
|
| 14 |
+
|
| 15 |
+
## Project Structure
|
| 16 |
+
```
|
| 17 |
+
.
|
| 18 |
+
├── data/
|
| 19 |
+
│ └── survey_results_public.csv # Stack Overflow survey data
|
| 20 |
+
├── models/
|
| 21 |
+
│ └── model.pkl # Serialized trained model
|
| 22 |
+
├── src/
|
| 23 |
+
│ ├── schema.py # Pydantic validation models
|
| 24 |
+
│ ├── train.py # Model training script
|
| 25 |
+
│ └── infer.py # Inference utilities
|
| 26 |
+
├── app.py # Streamlit web application
|
| 27 |
+
├── example_inference.py # Example inference script
|
| 28 |
+
├── pyproject.toml # Project dependencies (uv)
|
| 29 |
+
├── uv.lock # Locked dependencies
|
| 30 |
+
└── README.md # Project documentation
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
## Setup & Installation
|
| 34 |
+
|
| 35 |
+
### Initial Setup
|
| 36 |
+
```bash
|
| 37 |
+
# The virtual environment is already created at .venv/
|
| 38 |
+
# Activate it:
|
| 39 |
+
source .venv/bin/activate # On Linux/Mac
|
| 40 |
+
# or
|
| 41 |
+
.venv\Scripts\activate # On Windows
|
| 42 |
+
|
| 43 |
+
# Install/sync dependencies with uv:
|
| 44 |
+
uv sync
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
### Adding New Dependencies
|
| 48 |
+
```bash
|
| 49 |
+
uv add <package-name>
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
## Key Workflows
|
| 53 |
+
|
| 54 |
+
### Training the Model
|
| 55 |
+
```bash
|
| 56 |
+
python src/train.py
|
| 57 |
+
```
|
| 58 |
+
This will:
|
| 59 |
+
- Load data from `data/survey_results_public.csv`
|
| 60 |
+
- Clean and preprocess features
|
| 61 |
+
- Train the regression model
|
| 62 |
+
- Save model to `models/model.pkl`
|
| 63 |
+
|
| 64 |
+
### Running the Streamlit App
|
| 65 |
+
```bash
|
| 66 |
+
streamlit run app.py
|
| 67 |
+
```
|
| 68 |
+
Opens a browser interface for salary predictions.
|
| 69 |
+
|
| 70 |
+
### Running Inference Programmatically
|
| 71 |
+
```python
|
| 72 |
+
from src.schema import SalaryInput
|
| 73 |
+
from src.infer import predict_salary
|
| 74 |
+
|
| 75 |
+
input_data = SalaryInput(
|
| 76 |
+
country="United States",
|
| 77 |
+
years_code=5.0,
|
| 78 |
+
education_level="Bachelor's degree",
|
| 79 |
+
dev_type="Developer, back-end",
|
| 80 |
+
industry="Software Development"
|
| 81 |
+
)
|
| 82 |
+
salary = predict_salary(input_data)
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
## Key Files
|
| 86 |
+
|
| 87 |
+
### [src/schema.py](src/schema.py)
|
| 88 |
+
Contains Pydantic models for:
|
| 89 |
+
- Input validation (`SalaryInput`)
|
| 90 |
+
- Type safety across the application
|
| 91 |
+
|
| 92 |
+
### [src/train.py](src/train.py)
|
| 93 |
+
Training pipeline:
|
| 94 |
+
- Data loading and cleaning
|
| 95 |
+
- Feature engineering
|
| 96 |
+
- Model training
|
| 97 |
+
- Model persistence
|
| 98 |
+
|
| 99 |
+
### [src/infer.py](src/infer.py)
|
| 100 |
+
Inference utilities:
|
| 101 |
+
- Model loading
|
| 102 |
+
- Prediction logic
|
| 103 |
+
- Validated input processing
|
| 104 |
+
|
| 105 |
+
### [app.py](app.py)
|
| 106 |
+
Streamlit UI:
|
| 107 |
+
- User input forms
|
| 108 |
+
- Real-time predictions
|
| 109 |
+
- Results visualization
|
| 110 |
+
|
| 111 |
+
## Development Guidelines
|
| 112 |
+
|
| 113 |
+
### Code Style
|
| 114 |
+
- Keep code simple and readable
|
| 115 |
+
- Total codebase should remain under ~200 lines
|
| 116 |
+
- Focus on clarity over cleverness
|
| 117 |
+
- Use type hints where helpful
|
| 118 |
+
|
| 119 |
+
### Data Requirements
|
| 120 |
+
The dataset must include these columns:
|
| 121 |
+
- `Country` - Developer location
|
| 122 |
+
- `YearsCode` - Total years of coding (including education)
|
| 123 |
+
- `EdLevel` - Education level
|
| 124 |
+
- `DevType` - Developer type
|
| 125 |
+
- `Industry` - Industry the developer works in
|
| 126 |
+
- `ConvertedCompYearly` - Annual salary (target variable)
|
| 127 |
+
|
| 128 |
+
### Model Expectations
|
| 129 |
+
- Basic regression model (LinearRegression or similar)
|
| 130 |
+
- Simple feature encoding (one-hot for categoricals)
|
| 131 |
+
- No hyperparameter tuning required
|
| 132 |
+
- Focus on working end-to-end pipeline
|
| 133 |
+
|
| 134 |
+
## Common Tasks
|
| 135 |
+
|
| 136 |
+
### Debugging Training Issues
|
| 137 |
+
1. Check if data file exists: `ls -la data/`
|
| 138 |
+
2. Verify CSV columns: `head -1 data/survey_results_public.csv`
|
| 139 |
+
3. Check for missing values in target column
|
| 140 |
+
4. Review data types and encoding
|
| 141 |
+
|
| 142 |
+
### Updating Features
|
| 143 |
+
1. Modify `SalaryInput` schema in [src/schema.py](src/schema.py)
|
| 144 |
+
2. Update feature extraction in [src/train.py](src/train.py)
|
| 145 |
+
3. Update inference logic in [src/infer.py](src/infer.py)
|
| 146 |
+
4. Update UI inputs in [app.py](app.py)
|
| 147 |
+
5. Retrain the model
|
| 148 |
+
|
| 149 |
+
### Testing Predictions
|
| 150 |
+
```python
|
| 151 |
+
# Quick test in Python REPL
|
| 152 |
+
from src.infer import predict_salary
|
| 153 |
+
from src.schema import SalaryInput
|
| 154 |
+
|
| 155 |
+
test_input = SalaryInput(
|
| 156 |
+
country="United States",
|
| 157 |
+
years_code=3.0,
|
| 158 |
+
education_level="Bachelor's degree",
|
| 159 |
+
dev_type="Developer, back-end",
|
| 160 |
+
industry="Software Development"
|
| 161 |
+
)
|
| 162 |
+
print(predict_salary(test_input))
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
## Non-Goals (Intentionally Excluded)
|
| 166 |
+
- Cloud deployment or serving
|
| 167 |
+
- Hyperparameter tuning
|
| 168 |
+
- Model registry or experiment tracking
|
| 169 |
+
- Advanced feature engineering
|
| 170 |
+
- Production monitoring
|
| 171 |
+
- API endpoints (beyond Streamlit)
|
| 172 |
+
|
| 173 |
+
## Useful Commands
|
| 174 |
+
|
| 175 |
+
```bash
|
| 176 |
+
# Check environment
|
| 177 |
+
which python
|
| 178 |
+
python --version
|
| 179 |
+
|
| 180 |
+
# Verify uv installation
|
| 181 |
+
uv --version
|
| 182 |
+
|
| 183 |
+
# List installed packages
|
| 184 |
+
uv pip list
|
| 185 |
+
|
| 186 |
+
# Run with specific Python version
|
| 187 |
+
uv run python src/train.py
|
| 188 |
+
|
| 189 |
+
# Clean generated files
|
| 190 |
+
rm -f models/model.pkl
|
| 191 |
+
|
| 192 |
+
# Check data file size
|
| 193 |
+
du -h data/survey_results_public.csv
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
## Troubleshooting
|
| 197 |
+
|
| 198 |
+
### Model file not found
|
| 199 |
+
- Run training first: `python src/train.py`
|
| 200 |
+
- Check file exists: `ls -la models/model.pkl`
|
| 201 |
+
|
| 202 |
+
### Missing dependencies
|
| 203 |
+
- Sync environment: `uv sync`
|
| 204 |
+
- Verify pyproject.toml has all required packages
|
| 205 |
+
|
| 206 |
+
### Data file issues
|
| 207 |
+
- Ensure CSV is in `data/` directory
|
| 208 |
+
- Check file encoding (should be UTF-8)
|
| 209 |
+
- Verify required columns exist
|
| 210 |
+
|
| 211 |
+
### Streamlit won't start
|
| 212 |
+
- Check port 8501 is available
|
| 213 |
+
- Try specifying port: `streamlit run app.py --server.port 8502`
|
| 214 |
+
|
| 215 |
+
## Additional Resources
|
| 216 |
+
- [PRD](.llm/prd.md) - Full product requirements
|
| 217 |
+
- [README.md](README.md) - Project readme
|
| 218 |
+
- [Stack Overflow Survey](https://insights.stackoverflow.com/survey) - Data source
|
| 219 |
+
|
| 220 |
+
## Working with Claude Code
|
| 221 |
+
When asking Claude to help with this project:
|
| 222 |
+
- Reference specific files using markdown links: [filename](path)
|
| 223 |
+
- Be specific about which component needs changes
|
| 224 |
+
- Mention if you need training, inference, or UI updates
|
| 225 |
+
- Provide error messages in full when debugging
|
| 226 |
+
- Ask for explanations of model choices if unclear
|
Dockerfile
CHANGED
|
@@ -20,4 +20,4 @@ EXPOSE 8501
|
|
| 20 |
|
| 21 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 22 |
|
| 23 |
-
ENTRYPOINT ["streamlit", "run", "
|
|
|
|
| 20 |
|
| 21 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 22 |
|
| 23 |
+
ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright [yyyy] [name of copyright owner]
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
README.md
CHANGED
|
@@ -1,20 +1,305 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
#
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Developer Salary Prediction
|
| 2 |
+
|
| 3 |
+
A minimal, local-first ML application that predicts developer salaries using Stack Overflow Developer Survey data. Built with Python, scikit-learn, Pydantic, and Streamlit.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- 🎯 XGBoost (gradient boosting) model for salary prediction
|
| 8 |
+
- ✅ Input validation with Pydantic
|
| 9 |
+
- 🌐 Interactive web UI with Streamlit
|
| 10 |
+
- 📊 Trained on Stack Overflow Developer Survey data
|
| 11 |
+
- 🔧 Easy setup with `uv` package manager
|
| 12 |
+
|
| 13 |
+
## Quick Start
|
| 14 |
+
|
| 15 |
+
### 1. Install Dependencies
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
uv sync
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
### 2. Download Data
|
| 22 |
+
|
| 23 |
+
Download the Stack Overflow Developer Survey CSV file:
|
| 24 |
+
|
| 25 |
+
1. Visit: https://insights.stackoverflow.com/survey
|
| 26 |
+
2. Download the latest survey results (2024 or 2025)
|
| 27 |
+
3. Extract the `survey_results_public.csv` file
|
| 28 |
+
4. Place it in the `data/` directory:
|
| 29 |
+
```
|
| 30 |
+
data/survey_results_public.csv
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
**Required columns:** `Country`, `YearsCode`, `EdLevel`, `DevType`, `Industry`, `ConvertedCompYearly`
|
| 34 |
+
|
| 35 |
+
### 3. Train the Model
|
| 36 |
+
|
| 37 |
+
```bash
|
| 38 |
+
uv run python -m src.train
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
This will:
|
| 42 |
+
- Load configuration from `config/model_parameters.yaml`
|
| 43 |
+
- Load and preprocess the survey data (with cardinality reduction)
|
| 44 |
+
- Train an XGBoost model with early stopping
|
| 45 |
+
- Save the model to `models/model.pkl`
|
| 46 |
+
- Generate `config/valid_categories.yaml` with valid country, education, developer type, and industry values
|
| 47 |
+
|
| 48 |
+
### 4. Run the Streamlit App
|
| 49 |
+
|
| 50 |
+
```bash
|
| 51 |
+
uv run streamlit run app.py
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
The app will open in your browser at `http://localhost:8501`
|
| 55 |
+
|
| 56 |
+
## Usage
|
| 57 |
+
|
| 58 |
+
### Web Interface
|
| 59 |
+
|
| 60 |
+
Launch the Streamlit app and enter:
|
| 61 |
+
- **Country**: Developer's country
|
| 62 |
+
- **Years of Coding (Total)**: Total years coding including education
|
| 63 |
+
- **Education Level**: Highest degree completed
|
| 64 |
+
- **Developer Type**: Primary developer role
|
| 65 |
+
- **Industry**: Industry the developer works in
|
| 66 |
+
|
| 67 |
+
Click "Predict Salary" to see the estimated annual salary.
|
| 68 |
+
|
| 69 |
+
### Programmatic Usage
|
| 70 |
+
|
| 71 |
+
**Quick example:**
|
| 72 |
+
|
| 73 |
+
```python
|
| 74 |
+
from src.schema import SalaryInput
|
| 75 |
+
from src.infer import predict_salary
|
| 76 |
+
|
| 77 |
+
# Create input
|
| 78 |
+
input_data = SalaryInput(
|
| 79 |
+
country="United States of America",
|
| 80 |
+
years_code=5.0,
|
| 81 |
+
education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 82 |
+
dev_type="Developer, full-stack",
|
| 83 |
+
industry="Software Development"
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
# Get prediction
|
| 87 |
+
salary = predict_salary(input_data)
|
| 88 |
+
print(f"Estimated salary: ${salary:,.0f}")
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
**Run the example script:**
|
| 92 |
+
|
| 93 |
+
```bash
|
| 94 |
+
uv run python example_inference.py
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
This will show predictions for multiple sample scenarios (junior, mid-level, senior developers, different countries).
|
| 98 |
+
|
| 99 |
+
## Input Validation
|
| 100 |
+
|
| 101 |
+
The model validates inputs against actual training data categories:
|
| 102 |
+
|
| 103 |
+
- **Valid Countries**: Only countries from `config/valid_categories.yaml` (~21 countries)
|
| 104 |
+
- **Valid Education Levels**: Only education levels from training data (~9 levels)
|
| 105 |
+
- **Valid Developer Types**: Only developer types from training data (~20 types)
|
| 106 |
+
- **Valid Industries**: Only industries from training data (~15 industries)
|
| 107 |
+
|
| 108 |
+
The Streamlit app uses dropdown menus with only valid options. If you use the programmatic API with invalid values, you'll get a helpful error message pointing to the valid categories file.
|
| 109 |
+
|
| 110 |
+
**Example validation:**
|
| 111 |
+
```python
|
| 112 |
+
from src.infer import predict_salary
|
| 113 |
+
from src.schema import SalaryInput
|
| 114 |
+
|
| 115 |
+
# This will raise ValueError - Japan not in training data after cardinality reduction
|
| 116 |
+
invalid_input = SalaryInput(
|
| 117 |
+
country="Japan", # Invalid!
|
| 118 |
+
years_code=5.0,
|
| 119 |
+
education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 120 |
+
dev_type="Developer, back-end",
|
| 121 |
+
industry="Software Development"
|
| 122 |
+
)
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
**View valid categories:**
|
| 126 |
+
```bash
|
| 127 |
+
cat config/valid_categories.yaml
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
## Configuration
|
| 131 |
+
|
| 132 |
+
Model parameters are centralized in [config/model_parameters.yaml](config/model_parameters.yaml). You can customize:
|
| 133 |
+
|
| 134 |
+
- **Data Processing**: Salary thresholds, percentile bounds, train/test split ratio
|
| 135 |
+
- **Feature Engineering**: Cardinality reduction settings (max categories, min frequency)
|
| 136 |
+
- **Model Hyperparameters**: Learning rate, tree depth, early stopping, etc.
|
| 137 |
+
- **Training Settings**: Verbosity, model save path
|
| 138 |
+
|
| 139 |
+
**To modify parameters:**
|
| 140 |
+
|
| 141 |
+
```bash
|
| 142 |
+
# Edit the config file
|
| 143 |
+
nano config/model_parameters.yaml
|
| 144 |
+
|
| 145 |
+
# Then retrain the model
|
| 146 |
+
uv run python -m src.train
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
**Example parameter changes:**
|
| 150 |
+
```yaml
|
| 151 |
+
# Increase model complexity
|
| 152 |
+
model:
|
| 153 |
+
max_depth: 8 # Default: 6
|
| 154 |
+
n_estimators: 10000 # Default: 5000
|
| 155 |
+
|
| 156 |
+
# Keep more categories
|
| 157 |
+
features:
|
| 158 |
+
cardinality:
|
| 159 |
+
max_categories: 30 # Default: 20
|
| 160 |
+
min_frequency: 100 # Default: 50
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
## Project Structure
|
| 164 |
+
|
| 165 |
+
```
|
| 166 |
+
.
|
| 167 |
+
├── config/
|
| 168 |
+
│ ├── model_parameters.yaml # Model configuration
|
| 169 |
+
│ └── valid_categories.yaml # Valid input categories (generated)
|
| 170 |
+
├── data/
|
| 171 |
+
│ └── survey_results_public.csv # Stack Overflow survey data (download required)
|
| 172 |
+
├── models/
|
| 173 |
+
│ └── model.pkl # Trained model (generated)
|
| 174 |
+
├── src/
|
| 175 |
+
│ ├── __init__.py # Package initialization
|
| 176 |
+
│ ├── schema.py # Pydantic models
|
| 177 |
+
│ ├── preprocessing.py # Feature engineering utilities
|
| 178 |
+
│ ├── train.py # Training script
|
| 179 |
+
│ └── infer.py # Inference utilities
|
| 180 |
+
├── app.py # Streamlit web app
|
| 181 |
+
├── example_inference.py # Example inference script
|
| 182 |
+
├── pyproject.toml # Project dependencies
|
| 183 |
+
└── README.md # This file
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
## Tech Stack
|
| 187 |
+
|
| 188 |
+
- **Python 3.12+**
|
| 189 |
+
- **uv** - Package manager
|
| 190 |
+
- **pandas** - Data manipulation
|
| 191 |
+
- **xgboost** - Gradient boosting model
|
| 192 |
+
- **scikit-learn** - ML utilities (train/test split)
|
| 193 |
+
- **pydantic** - Data validation
|
| 194 |
+
- **streamlit** - Web UI
|
| 195 |
+
|
| 196 |
+
## Development
|
| 197 |
+
|
| 198 |
+
For detailed development information, see [Claude.md](Claude.md).
|
| 199 |
+
|
| 200 |
+
### Re-training the Model
|
| 201 |
+
|
| 202 |
+
If you want to use a different survey year or update the model:
|
| 203 |
+
|
| 204 |
+
```bash
|
| 205 |
+
# Place new CSV in data/ directory
|
| 206 |
+
uv run python -m src.train
|
| 207 |
+
```
|
| 208 |
+
|
| 209 |
+
### Running Tests
|
| 210 |
+
|
| 211 |
+
**Quick one-liner test:**
|
| 212 |
+
```bash
|
| 213 |
+
uv run python -c "from src.schema import SalaryInput; from src.infer import predict_salary; test = SalaryInput(country='United States of America', years_code=5.0, education_level='Bachelor'\''s degree (B.A., B.S., B.Eng., etc.)', dev_type='Developer, full-stack', industry='Software Development'); print(f'Prediction: \${predict_salary(test):,.0f}')"
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
**Or run the full example script:**
|
| 217 |
+
```bash
|
| 218 |
+
uv run python example_inference.py
|
| 219 |
+
```
|
| 220 |
+
|
| 221 |
+
## Deployment
|
| 222 |
+
|
| 223 |
+
### Hugging Face Spaces
|
| 224 |
+
|
| 225 |
+
This application is Docker-ready for deployment on Hugging Face Spaces:
|
| 226 |
+
|
| 227 |
+
**1. Build the Docker image:**
|
| 228 |
+
```bash
|
| 229 |
+
docker build -t developer-salary-predictor .
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
**2. Test locally:**
|
| 233 |
+
```bash
|
| 234 |
+
docker run -p 8501:8501 developer-salary-predictor
|
| 235 |
+
```
|
| 236 |
+
|
| 237 |
+
Then visit `http://localhost:8501`
|
| 238 |
+
|
| 239 |
+
**3. Deploy to Hugging Face:**
|
| 240 |
+
|
| 241 |
+
1. Create a new Space on [Hugging Face](https://huggingface.co/new-space)
|
| 242 |
+
2. Select "Docker" as the SDK
|
| 243 |
+
3. Clone your Space repository
|
| 244 |
+
4. Copy these files to your Space:
|
| 245 |
+
|
| 246 |
+
```text
|
| 247 |
+
Dockerfile
|
| 248 |
+
requirements.txt
|
| 249 |
+
app.py
|
| 250 |
+
src/
|
| 251 |
+
config/
|
| 252 |
+
models/
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
5. Push to your Space:
|
| 256 |
+
```bash
|
| 257 |
+
git add .
|
| 258 |
+
git commit -m "Initial deployment"
|
| 259 |
+
git push
|
| 260 |
+
```
|
| 261 |
+
|
| 262 |
+
**Note:** The pre-trained model (`models/model.pkl`) and configuration (`config/valid_categories.yaml`) are included in the Docker image. If you want to use a different model, retrain locally first, then rebuild the Docker image.
|
| 263 |
+
|
| 264 |
+
### Alternative: Local Deployment
|
| 265 |
+
|
| 266 |
+
**Using uv (recommended for development):**
|
| 267 |
+
```bash
|
| 268 |
+
uv run streamlit run app.py
|
| 269 |
+
```
|
| 270 |
+
|
| 271 |
+
**Using pip:**
|
| 272 |
+
```bash
|
| 273 |
+
pip install -r requirements.txt
|
| 274 |
+
streamlit run app.py
|
| 275 |
+
```
|
| 276 |
+
|
| 277 |
+
## Troubleshooting
|
| 278 |
+
|
| 279 |
+
### "Model file not found"
|
| 280 |
+
- Run `uv run python -m src.train` first to generate the model
|
| 281 |
+
|
| 282 |
+
### "Data file not found"
|
| 283 |
+
- Download the Stack Overflow survey CSV and place it in `data/`
|
| 284 |
+
|
| 285 |
+
### "Configuration file not found"
|
| 286 |
+
- The `config/model_parameters.yaml` file should exist in the project root
|
| 287 |
+
- Check that you're running commands from the project root directory
|
| 288 |
+
|
| 289 |
+
### Dependencies issues
|
| 290 |
+
- Run `uv sync` to ensure all packages are installed
|
| 291 |
+
|
| 292 |
+
## Design Principles
|
| 293 |
+
|
| 294 |
+
- **Simplicity**: Under 200 lines of code total
|
| 295 |
+
- **Clarity**: Easy to understand and modify
|
| 296 |
+
- **Local-first**: No cloud dependencies
|
| 297 |
+
- **Hackable**: Plain Python, no complex frameworks
|
| 298 |
+
|
| 299 |
+
## License
|
| 300 |
+
|
| 301 |
+
Apache 2.0 License - see [LICENSE](LICENSE) file
|
| 302 |
+
|
| 303 |
+
## Acknowledgments
|
| 304 |
+
|
| 305 |
+
Data from [Stack Overflow Developer Survey](https://insights.stackoverflow.com/survey)
|
app.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
|
| 3 |
import streamlit as st
|
| 4 |
|
| 5 |
-
from src.infer import predict_salary, valid_categories
|
| 6 |
from src.schema import SalaryInput
|
| 7 |
|
| 8 |
# Page configuration
|
|
@@ -26,9 +26,10 @@ with st.sidebar:
|
|
| 26 |
This app uses an XGBoost (gradient boosting) model trained on Stack Overflow
|
| 27 |
Developer Survey data to predict annual salaries based on:
|
| 28 |
- Country
|
| 29 |
-
-
|
| 30 |
- Education level
|
| 31 |
- Developer type
|
|
|
|
| 32 |
"""
|
| 33 |
)
|
| 34 |
st.info("💡 Tip: Results are estimates based on survey averages.")
|
|
@@ -38,6 +39,7 @@ with st.sidebar:
|
|
| 38 |
st.write(f"**Countries:** {len(valid_categories['Country'])} available")
|
| 39 |
st.write(f"**Education Levels:** {len(valid_categories['EdLevel'])} available")
|
| 40 |
st.write(f"**Developer Types:** {len(valid_categories['DevType'])} available")
|
|
|
|
| 41 |
st.caption("Only values from the training data are shown in the dropdowns.")
|
| 42 |
|
| 43 |
# Main input form
|
|
@@ -49,11 +51,13 @@ col1, col2 = st.columns(2)
|
|
| 49 |
valid_countries = valid_categories["Country"]
|
| 50 |
valid_education_levels = valid_categories["EdLevel"]
|
| 51 |
valid_dev_types = valid_categories["DevType"]
|
|
|
|
| 52 |
|
| 53 |
# Set default values (if available)
|
| 54 |
default_country = "United States of America" if "United States of America" in valid_countries else valid_countries[0]
|
| 55 |
default_education = "Bachelor's degree (B.A., B.S., B.Eng., etc.)" if "Bachelor's degree (B.A., B.S., B.Eng., etc.)" in valid_education_levels else valid_education_levels[0]
|
| 56 |
default_dev_type = "Developer, back-end" if "Developer, back-end" in valid_dev_types else valid_dev_types[0]
|
|
|
|
| 57 |
|
| 58 |
with col1:
|
| 59 |
country = st.selectbox(
|
|
@@ -64,12 +68,12 @@ with col1:
|
|
| 64 |
)
|
| 65 |
|
| 66 |
years = st.number_input(
|
| 67 |
-
"Years of
|
| 68 |
min_value=0,
|
| 69 |
max_value=50,
|
| 70 |
-
value=
|
| 71 |
step=1,
|
| 72 |
-
help="
|
| 73 |
)
|
| 74 |
|
| 75 |
with col2:
|
|
@@ -87,15 +91,23 @@ with col2:
|
|
| 87 |
help="Primary developer role (only types from training data)",
|
| 88 |
)
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
# Prediction button
|
| 91 |
if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
|
| 92 |
try:
|
| 93 |
# Create input model
|
| 94 |
input_data = SalaryInput(
|
| 95 |
country=country,
|
| 96 |
-
|
| 97 |
education_level=education,
|
| 98 |
dev_type=dev_type,
|
|
|
|
| 99 |
)
|
| 100 |
|
| 101 |
# Make prediction
|
|
@@ -104,11 +116,29 @@ if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
|
|
| 104 |
|
| 105 |
# Display result
|
| 106 |
st.success("Prediction Complete!")
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
except FileNotFoundError:
|
| 114 |
st.error(
|
|
|
|
| 2 |
|
| 3 |
import streamlit as st
|
| 4 |
|
| 5 |
+
from src.infer import predict_salary, get_local_currency, valid_categories
|
| 6 |
from src.schema import SalaryInput
|
| 7 |
|
| 8 |
# Page configuration
|
|
|
|
| 26 |
This app uses an XGBoost (gradient boosting) model trained on Stack Overflow
|
| 27 |
Developer Survey data to predict annual salaries based on:
|
| 28 |
- Country
|
| 29 |
+
- Total years of coding experience (including education)
|
| 30 |
- Education level
|
| 31 |
- Developer type
|
| 32 |
+
- Industry
|
| 33 |
"""
|
| 34 |
)
|
| 35 |
st.info("💡 Tip: Results are estimates based on survey averages.")
|
|
|
|
| 39 |
st.write(f"**Countries:** {len(valid_categories['Country'])} available")
|
| 40 |
st.write(f"**Education Levels:** {len(valid_categories['EdLevel'])} available")
|
| 41 |
st.write(f"**Developer Types:** {len(valid_categories['DevType'])} available")
|
| 42 |
+
st.write(f"**Industries:** {len(valid_categories['Industry'])} available")
|
| 43 |
st.caption("Only values from the training data are shown in the dropdowns.")
|
| 44 |
|
| 45 |
# Main input form
|
|
|
|
| 51 |
valid_countries = valid_categories["Country"]
|
| 52 |
valid_education_levels = valid_categories["EdLevel"]
|
| 53 |
valid_dev_types = valid_categories["DevType"]
|
| 54 |
+
valid_industries = valid_categories["Industry"]
|
| 55 |
|
| 56 |
# Set default values (if available)
|
| 57 |
default_country = "United States of America" if "United States of America" in valid_countries else valid_countries[0]
|
| 58 |
default_education = "Bachelor's degree (B.A., B.S., B.Eng., etc.)" if "Bachelor's degree (B.A., B.S., B.Eng., etc.)" in valid_education_levels else valid_education_levels[0]
|
| 59 |
default_dev_type = "Developer, back-end" if "Developer, back-end" in valid_dev_types else valid_dev_types[0]
|
| 60 |
+
default_industry = "Software Development" if "Software Development" in valid_industries else valid_industries[0]
|
| 61 |
|
| 62 |
with col1:
|
| 63 |
country = st.selectbox(
|
|
|
|
| 68 |
)
|
| 69 |
|
| 70 |
years = st.number_input(
|
| 71 |
+
"Years of Coding (Total)",
|
| 72 |
min_value=0,
|
| 73 |
max_value=50,
|
| 74 |
+
value=15,
|
| 75 |
step=1,
|
| 76 |
+
help="Including any education, how many years have you been coding in total?",
|
| 77 |
)
|
| 78 |
|
| 79 |
with col2:
|
|
|
|
| 91 |
help="Primary developer role (only types from training data)",
|
| 92 |
)
|
| 93 |
|
| 94 |
+
industry = st.selectbox(
|
| 95 |
+
"Industry",
|
| 96 |
+
options=valid_industries,
|
| 97 |
+
index=valid_industries.index(default_industry),
|
| 98 |
+
help="Industry the developer works in (only industries from training data)",
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
# Prediction button
|
| 102 |
if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
|
| 103 |
try:
|
| 104 |
# Create input model
|
| 105 |
input_data = SalaryInput(
|
| 106 |
country=country,
|
| 107 |
+
years_code=years,
|
| 108 |
education_level=education,
|
| 109 |
dev_type=dev_type,
|
| 110 |
+
industry=industry,
|
| 111 |
)
|
| 112 |
|
| 113 |
# Make prediction
|
|
|
|
| 116 |
|
| 117 |
# Display result
|
| 118 |
st.success("Prediction Complete!")
|
| 119 |
+
|
| 120 |
+
# Show USD and local currency side by side
|
| 121 |
+
local = get_local_currency(country, salary)
|
| 122 |
+
if local and local["code"] != "USD":
|
| 123 |
+
col_usd, col_local = st.columns(2)
|
| 124 |
+
with col_usd:
|
| 125 |
+
st.metric(
|
| 126 |
+
label="Estimated Annual Salary (USD)",
|
| 127 |
+
value=f"${salary:,.0f}",
|
| 128 |
+
help="Predicted annual compensation in USD",
|
| 129 |
+
)
|
| 130 |
+
with col_local:
|
| 131 |
+
st.metric(
|
| 132 |
+
label=f"Estimated Annual Salary ({local['code']})",
|
| 133 |
+
value=f"{local['salary_local']:,.0f} {local['code']}",
|
| 134 |
+
help=f"Converted using survey-derived rate: 1 USD = {local['rate']} {local['code']} ({local['name']})",
|
| 135 |
+
)
|
| 136 |
+
else:
|
| 137 |
+
st.metric(
|
| 138 |
+
label="Estimated Annual Salary",
|
| 139 |
+
value=f"${salary:,.0f}",
|
| 140 |
+
help="Predicted annual compensation in USD",
|
| 141 |
+
)
|
| 142 |
|
| 143 |
except FileNotFoundError:
|
| 144 |
st.error(
|
config/currency_rates.yaml
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Australia:
|
| 2 |
+
code: AUD
|
| 3 |
+
name: Australian dollar
|
| 4 |
+
rate: 1.54
|
| 5 |
+
Austria:
|
| 6 |
+
code: EUR
|
| 7 |
+
name: European Euro
|
| 8 |
+
rate: 0.86
|
| 9 |
+
Belgium:
|
| 10 |
+
code: EUR
|
| 11 |
+
name: European Euro
|
| 12 |
+
rate: 0.86
|
| 13 |
+
Brazil:
|
| 14 |
+
code: BRL
|
| 15 |
+
name: Brazilian real
|
| 16 |
+
rate: 5.49
|
| 17 |
+
Canada:
|
| 18 |
+
code: CAD
|
| 19 |
+
name: Canadian dollar
|
| 20 |
+
rate: 1.37
|
| 21 |
+
Czech Republic:
|
| 22 |
+
code: CZK
|
| 23 |
+
name: Czech koruna
|
| 24 |
+
rate: 21.36
|
| 25 |
+
Denmark:
|
| 26 |
+
code: DKK
|
| 27 |
+
name: Danish krone
|
| 28 |
+
rate: 6.43
|
| 29 |
+
France:
|
| 30 |
+
code: EUR
|
| 31 |
+
name: European Euro
|
| 32 |
+
rate: 0.86
|
| 33 |
+
Germany:
|
| 34 |
+
code: EUR
|
| 35 |
+
name: European Euro
|
| 36 |
+
rate: 0.86
|
| 37 |
+
India:
|
| 38 |
+
code: INR
|
| 39 |
+
name: Indian rupee
|
| 40 |
+
rate: 86.03
|
| 41 |
+
Italy:
|
| 42 |
+
code: EUR
|
| 43 |
+
name: European Euro
|
| 44 |
+
rate: 0.86
|
| 45 |
+
Netherlands:
|
| 46 |
+
code: EUR
|
| 47 |
+
name: European Euro
|
| 48 |
+
rate: 0.86
|
| 49 |
+
Other:
|
| 50 |
+
code: EUR
|
| 51 |
+
name: European Euro
|
| 52 |
+
rate: 0.86
|
| 53 |
+
Poland:
|
| 54 |
+
code: PLN
|
| 55 |
+
name: Polish zloty
|
| 56 |
+
rate: 3.66
|
| 57 |
+
Portugal:
|
| 58 |
+
code: EUR
|
| 59 |
+
name: European Euro
|
| 60 |
+
rate: 0.86
|
| 61 |
+
Spain:
|
| 62 |
+
code: EUR
|
| 63 |
+
name: European Euro
|
| 64 |
+
rate: 0.86
|
| 65 |
+
Sweden:
|
| 66 |
+
code: SEK
|
| 67 |
+
name: Swedish krona
|
| 68 |
+
rate: 9.54
|
| 69 |
+
Switzerland:
|
| 70 |
+
code: CHF
|
| 71 |
+
name: Swiss franc
|
| 72 |
+
rate: 0.81
|
| 73 |
+
Ukraine:
|
| 74 |
+
code: UAH
|
| 75 |
+
name: Ukrainian hryvnia
|
| 76 |
+
rate: 41.73
|
| 77 |
+
United Kingdom of Great Britain and Northern Ireland:
|
| 78 |
+
code: GBP
|
| 79 |
+
name: Pound sterling
|
| 80 |
+
rate: 0.73
|
| 81 |
+
United States of America:
|
| 82 |
+
code: USD
|
| 83 |
+
name: United States dollar
|
| 84 |
+
rate: 1.0
|
config/valid_categories.yaml
CHANGED
|
@@ -52,3 +52,20 @@ DevType:
|
|
| 52 |
- Senior executive (C-suite, VP, etc.)
|
| 53 |
- Student
|
| 54 |
- System administrator
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
- Senior executive (C-suite, VP, etc.)
|
| 53 |
- Student
|
| 54 |
- System administrator
|
| 55 |
+
Industry:
|
| 56 |
+
- Banking/Financial Services
|
| 57 |
+
- Computer Systems Design and Services
|
| 58 |
+
- Energy
|
| 59 |
+
- Fintech
|
| 60 |
+
- Government
|
| 61 |
+
- Healthcare
|
| 62 |
+
- Higher Education
|
| 63 |
+
- Insurance
|
| 64 |
+
- Internet, Telecomm or Information Services
|
| 65 |
+
- Manufacturing
|
| 66 |
+
- Media & Advertising Services
|
| 67 |
+
- Other
|
| 68 |
+
- 'Other:'
|
| 69 |
+
- Retail and Consumer Services
|
| 70 |
+
- Software Development
|
| 71 |
+
- Transportation, or Supply Chain
|
debug_prepare_features.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Debug prepare_features step by step."""
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from src.preprocessing import reduce_cardinality
|
| 5 |
+
import yaml
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
# Load config
|
| 9 |
+
config_path = Path("config/model_parameters.yaml")
|
| 10 |
+
with open(config_path, "r") as f:
|
| 11 |
+
config = yaml.safe_load(f)
|
| 12 |
+
|
| 13 |
+
# Create test input
|
| 14 |
+
df = pd.DataFrame({
|
| 15 |
+
'Country': ['United States of America'],
|
| 16 |
+
'YearsCode': [5.0],
|
| 17 |
+
'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
|
| 18 |
+
'DevType': ['Developer, full-stack']
|
| 19 |
+
})
|
| 20 |
+
|
| 21 |
+
print("=" * 70)
|
| 22 |
+
print("STEP-BY-STEP DEBUGGING OF prepare_features()")
|
| 23 |
+
print("=" * 70)
|
| 24 |
+
|
| 25 |
+
print("\n1. Original input:")
|
| 26 |
+
print(f" Columns: {list(df.columns)}")
|
| 27 |
+
print(f" Values: {df.iloc[0].to_dict()}")
|
| 28 |
+
|
| 29 |
+
# Step 2: Copy
|
| 30 |
+
df_processed = df.copy()
|
| 31 |
+
|
| 32 |
+
# Step 3: Unicode normalization
|
| 33 |
+
for col in ["Country", "EdLevel", "DevType"]:
|
| 34 |
+
if col in df_processed.columns:
|
| 35 |
+
df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
|
| 36 |
+
|
| 37 |
+
print("\n2. After unicode normalization:")
|
| 38 |
+
print(f" Columns: {list(df_processed.columns)}")
|
| 39 |
+
|
| 40 |
+
# Step 4: Fill missing values
|
| 41 |
+
df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
|
| 42 |
+
df_processed["Country"] = df_processed["Country"].fillna("Unknown")
|
| 43 |
+
df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
|
| 44 |
+
df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
|
| 45 |
+
|
| 46 |
+
print("\n3. After filling missing values:")
|
| 47 |
+
print(f" Columns: {list(df_processed.columns)}")
|
| 48 |
+
print(f" Country value: '{df_processed['Country'].iloc[0]}'")
|
| 49 |
+
print(f" EdLevel value: '{df_processed['EdLevel'].iloc[0]}'")
|
| 50 |
+
print(f" DevType value: '{df_processed['DevType'].iloc[0]}'")
|
| 51 |
+
|
| 52 |
+
# Step 5: Reduce cardinality
|
| 53 |
+
print("\n4. Before cardinality reduction:")
|
| 54 |
+
print(f" Country value: '{df_processed['Country'].iloc[0]}'")
|
| 55 |
+
df_processed["Country"] = reduce_cardinality(df_processed["Country"])
|
| 56 |
+
print(f" After Country reduction: '{df_processed['Country'].iloc[0]}'")
|
| 57 |
+
|
| 58 |
+
print(f" EdLevel value: '{df_processed['EdLevel'].iloc[0]}'")
|
| 59 |
+
df_processed["EdLevel"] = reduce_cardinality(df_processed["EdLevel"])
|
| 60 |
+
print(f" After EdLevel reduction: '{df_processed['EdLevel'].iloc[0]}'")
|
| 61 |
+
|
| 62 |
+
print(f" DevType value: '{df_processed['DevType'].iloc[0]}'")
|
| 63 |
+
df_processed["DevType"] = reduce_cardinality(df_processed["DevType"])
|
| 64 |
+
print(f" After DevType reduction: '{df_processed['DevType'].iloc[0]}'")
|
| 65 |
+
|
| 66 |
+
# Step 6: Select feature columns
|
| 67 |
+
feature_cols = ["Country", "YearsCode", "EdLevel", "DevType"]
|
| 68 |
+
df_features = df_processed[feature_cols]
|
| 69 |
+
|
| 70 |
+
print("\n5. After selecting feature columns:")
|
| 71 |
+
print(f" Columns: {list(df_features.columns)}")
|
| 72 |
+
print(f" Values: {df_features.iloc[0].to_dict()}")
|
| 73 |
+
|
| 74 |
+
# Step 7: One-hot encode
|
| 75 |
+
drop_first = config['features']['encoding']['drop_first']
|
| 76 |
+
print(f"\n6. One-hot encoding with drop_first={drop_first}:")
|
| 77 |
+
df_encoded = pd.get_dummies(df_features, drop_first=drop_first)
|
| 78 |
+
|
| 79 |
+
print(f" Result shape: {df_encoded.shape}")
|
| 80 |
+
print(f" Result columns: {list(df_encoded.columns)}")
|
| 81 |
+
print(f" Non-zero values: {df_encoded.columns[df_encoded.iloc[0] != 0].tolist()}")
|
diagnose_encoding.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Diagnose why categorical features aren't affecting predictions."""
|
| 2 |
+
|
| 3 |
+
from src.preprocessing import prepare_features
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
# Create two inputs that differ ONLY in Country
|
| 7 |
+
input1 = pd.DataFrame({
|
| 8 |
+
'Country': ['United States of America'],
|
| 9 |
+
'YearsCode': [5.0],
|
| 10 |
+
'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
|
| 11 |
+
'DevType': ['Developer, full-stack']
|
| 12 |
+
})
|
| 13 |
+
|
| 14 |
+
input2 = pd.DataFrame({
|
| 15 |
+
'Country': ['Germany'], # Different!
|
| 16 |
+
'YearsCode': [5.0],
|
| 17 |
+
'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
|
| 18 |
+
'DevType': ['Developer, full-stack']
|
| 19 |
+
})
|
| 20 |
+
|
| 21 |
+
print("=" * 70)
|
| 22 |
+
print("ENCODING DIAGNOSIS")
|
| 23 |
+
print("=" * 70)
|
| 24 |
+
|
| 25 |
+
# Process features
|
| 26 |
+
features1 = prepare_features(input1)
|
| 27 |
+
features2 = prepare_features(input2)
|
| 28 |
+
|
| 29 |
+
print(f"\nInput 1 (USA):")
|
| 30 |
+
print(f" Shape: {features1.shape}")
|
| 31 |
+
print(f" Columns: {list(features1.columns)}")
|
| 32 |
+
non_zero1 = [col for col in features1.columns if features1[col].iloc[0] != 0]
|
| 33 |
+
print(f" Non-zero features ({len(non_zero1)}): {non_zero1}")
|
| 34 |
+
|
| 35 |
+
print(f"\nInput 2 (Germany):")
|
| 36 |
+
print(f" Shape: {features2.shape}")
|
| 37 |
+
non_zero2 = [col for col in features2.columns if features2[col].iloc[0] != 0]
|
| 38 |
+
print(f" Non-zero features ({len(non_zero2)}): {non_zero2}")
|
| 39 |
+
|
| 40 |
+
print(f"\nAre encoded features identical? {features1.equals(features2)}")
|
| 41 |
+
|
| 42 |
+
if features1.equals(features2):
|
| 43 |
+
print("\n❌ PROBLEM: Different countries produce IDENTICAL encodings!")
|
| 44 |
+
print(" This explains why categorical features don't affect predictions.")
|
| 45 |
+
else:
|
| 46 |
+
print("\n✅ Encodings are different - categorical features should work.")
|
| 47 |
+
|
| 48 |
+
# Check what happens with Country specifically
|
| 49 |
+
print("\n" + "=" * 70)
|
| 50 |
+
print("COUNTRY ENCODING CHECK")
|
| 51 |
+
print("=" * 70)
|
| 52 |
+
|
| 53 |
+
# Test just Country encoding
|
| 54 |
+
test_countries = ['United States of America', 'Germany', 'India']
|
| 55 |
+
for country in test_countries:
|
| 56 |
+
test_df = pd.DataFrame({
|
| 57 |
+
'Country': [country],
|
| 58 |
+
'YearsCode': [5.0],
|
| 59 |
+
'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
|
| 60 |
+
'DevType': ['Developer, full-stack']
|
| 61 |
+
})
|
| 62 |
+
encoded = prepare_features(test_df)
|
| 63 |
+
country_cols = [col for col in encoded.columns if col.startswith('Country_')]
|
| 64 |
+
non_zero_countries = [col for col in country_cols if encoded[col].iloc[0] != 0]
|
| 65 |
+
print(f"{country:40s} -> {non_zero_countries}")
|
example_inference.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example script showing how to use the salary prediction model programmatically."""
|
| 2 |
+
|
| 3 |
+
from src.schema import SalaryInput
|
| 4 |
+
from src.infer import predict_salary
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def main():
|
| 8 |
+
"""Run sample predictions with different input parameters."""
|
| 9 |
+
|
| 10 |
+
print("=" * 60)
|
| 11 |
+
print("Developer Salary Prediction - Sample Inference")
|
| 12 |
+
print("=" * 60)
|
| 13 |
+
|
| 14 |
+
# Example 1: Default parameters (same as Streamlit app defaults)
|
| 15 |
+
print("\n📊 Example 1: Default Parameters")
|
| 16 |
+
print("-" * 60)
|
| 17 |
+
|
| 18 |
+
input_data_1 = SalaryInput(
|
| 19 |
+
country="United States of America",
|
| 20 |
+
years_code=5.0,
|
| 21 |
+
education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 22 |
+
dev_type="Developer, full-stack",
|
| 23 |
+
industry="Software Development",
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
print(f"Country: {input_data_1.country}")
|
| 27 |
+
print(f"Years of Coding (Total): {input_data_1.years_code}")
|
| 28 |
+
print(f"Education Level: {input_data_1.education_level}")
|
| 29 |
+
print(f"Developer Type: {input_data_1.dev_type}")
|
| 30 |
+
print(f"Industry: {input_data_1.industry}")
|
| 31 |
+
|
| 32 |
+
salary_1 = predict_salary(input_data_1)
|
| 33 |
+
print(f"💰 Predicted Salary: ${salary_1:,.2f} USD/year")
|
| 34 |
+
|
| 35 |
+
# Example 2: Junior developer
|
| 36 |
+
print("\n📊 Example 2: Junior Developer")
|
| 37 |
+
print("-" * 60)
|
| 38 |
+
|
| 39 |
+
input_data_2 = SalaryInput(
|
| 40 |
+
country="United States of America",
|
| 41 |
+
years_code=2.0,
|
| 42 |
+
education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
|
| 43 |
+
dev_type="Developer, front-end",
|
| 44 |
+
industry="Fintech",
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
print(f"Country: {input_data_2.country}")
|
| 48 |
+
print(f"Years of Coding (Total): {input_data_2.years_code}")
|
| 49 |
+
print(f"Education Level: {input_data_2.education_level}")
|
| 50 |
+
print(f"Developer Type: {input_data_2.dev_type}")
|
| 51 |
+
print(f"Industry: {input_data_2.industry}")
|
| 52 |
+
|
| 53 |
+
salary_2 = predict_salary(input_data_2)
|
| 54 |
+
print(f"💰 Predicted Salary: ${salary_2:,.2f} USD/year")
|
| 55 |
+
|
| 56 |
+
# Example 3: Senior developer with Master's degree
|
| 57 |
+
print("\n📊 Example 3: Senior Developer")
|
| 58 |
+
print("-" * 60)
|
| 59 |
+
|
| 60 |
+
input_data_3 = SalaryInput(
|
| 61 |
+
country="United States of America",
|
| 62 |
+
years_code=10.0,
|
| 63 |
+
education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
|
| 64 |
+
dev_type="Engineering manager",
|
| 65 |
+
industry="Banking/Financial Services",
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
print(f"Country: {input_data_3.country}")
|
| 69 |
+
print(f"Years of Coding (Total): {input_data_3.years_code}")
|
| 70 |
+
print(f"Education Level: {input_data_3.education_level}")
|
| 71 |
+
print(f"Developer Type: {input_data_3.dev_type}")
|
| 72 |
+
print(f"Industry: {input_data_3.industry}")
|
| 73 |
+
|
| 74 |
+
salary_3 = predict_salary(input_data_3)
|
| 75 |
+
print(f"💰 Predicted Salary: ${salary_3:,.2f} USD/year")
|
| 76 |
+
|
| 77 |
+
# Example 4: Different country
|
| 78 |
+
print("\n📊 Example 4: Different Country (Germany)")
|
| 79 |
+
print("-" * 60)
|
| 80 |
+
|
| 81 |
+
input_data_4 = SalaryInput(
|
| 82 |
+
country="Germany",
|
| 83 |
+
years_code=5.0,
|
| 84 |
+
education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
|
| 85 |
+
dev_type="Developer, back-end",
|
| 86 |
+
industry="Manufacturing",
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
print(f"Country: {input_data_4.country}")
|
| 90 |
+
print(f"Years of Coding (Total): {input_data_4.years_code}")
|
| 91 |
+
print(f"Education Level: {input_data_4.education_level}")
|
| 92 |
+
print(f"Developer Type: {input_data_4.dev_type}")
|
| 93 |
+
print(f"Industry: {input_data_4.industry}")
|
| 94 |
+
|
| 95 |
+
salary_4 = predict_salary(input_data_4)
|
| 96 |
+
print(f"💰 Predicted Salary: ${salary_4:,.2f} USD/year")
|
| 97 |
+
|
| 98 |
+
print("\n" + "=" * 60)
|
| 99 |
+
print("✅ All predictions completed successfully!")
|
| 100 |
+
print("=" * 60)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
if __name__ == "__main__":
|
| 104 |
+
try:
|
| 105 |
+
main()
|
| 106 |
+
except FileNotFoundError:
|
| 107 |
+
print("❌ Error: Model file not found!")
|
| 108 |
+
print("Please train the model first by running:")
|
| 109 |
+
print(" uv run python src/train.py")
|
| 110 |
+
except Exception as e:
|
| 111 |
+
print(f"❌ Error occurred: {str(e)}")
|
models/model.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5165f22311d0eb6809380cf4fa5a749b59f0d8e81903462fe7c2c882e09e916f
|
| 3 |
+
size 3192752
|
pyproject.toml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "developer-salary-prediction"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Simple ML app for predicting developer salaries using Stack Overflow survey data"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.12"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"pandas>=2.0.0",
|
| 9 |
+
"scikit-learn>=1.3.0",
|
| 10 |
+
"pydantic>=2.0.0",
|
| 11 |
+
"streamlit>=1.28.0",
|
| 12 |
+
"xgboost>=3.1.0",
|
| 13 |
+
"ruff>=0.15.0",
|
| 14 |
+
"pyyaml>=6.0.0",
|
| 15 |
+
]
|
src/infer.py
CHANGED
|
@@ -33,6 +33,30 @@ if not valid_categories_path.exists():
|
|
| 33 |
with open(valid_categories_path, "r") as f:
|
| 34 |
valid_categories = yaml.safe_load(f)
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
def predict_salary(data: SalaryInput) -> float:
|
| 38 |
"""Predict salary based on input features.
|
|
@@ -68,13 +92,21 @@ def predict_salary(data: SalaryInput) -> float:
|
|
| 68 |
f"Check config/valid_categories.yaml for all valid values."
|
| 69 |
)
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
# Create a DataFrame with the input data
|
| 72 |
input_df = pd.DataFrame(
|
| 73 |
{
|
| 74 |
"Country": [data.country],
|
| 75 |
-
"
|
| 76 |
"EdLevel": [data.education_level],
|
| 77 |
"DevType": [data.dev_type],
|
|
|
|
| 78 |
}
|
| 79 |
)
|
| 80 |
|
|
|
|
| 33 |
with open(valid_categories_path, "r") as f:
|
| 34 |
valid_categories = yaml.safe_load(f)
|
| 35 |
|
| 36 |
+
# Load currency conversion rates
|
| 37 |
+
currency_rates_path = Path("config/currency_rates.yaml")
|
| 38 |
+
currency_rates = {}
|
| 39 |
+
if currency_rates_path.exists():
|
| 40 |
+
with open(currency_rates_path, "r") as f:
|
| 41 |
+
currency_rates = yaml.safe_load(f) or {}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def get_local_currency(country: str, salary_usd: float) -> dict | None:
|
| 45 |
+
"""Convert USD salary to local currency for a given country.
|
| 46 |
+
|
| 47 |
+
Returns:
|
| 48 |
+
Dict with code, name, rate, and salary_local, or None if unavailable.
|
| 49 |
+
"""
|
| 50 |
+
if country not in currency_rates:
|
| 51 |
+
return None
|
| 52 |
+
info = currency_rates[country]
|
| 53 |
+
return {
|
| 54 |
+
"code": info["code"],
|
| 55 |
+
"name": info["name"],
|
| 56 |
+
"rate": info["rate"],
|
| 57 |
+
"salary_local": round(salary_usd * info["rate"], 2),
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
|
| 61 |
def predict_salary(data: SalaryInput) -> float:
|
| 62 |
"""Predict salary based on input features.
|
|
|
|
| 92 |
f"Check config/valid_categories.yaml for all valid values."
|
| 93 |
)
|
| 94 |
|
| 95 |
+
if data.industry not in valid_categories["Industry"]:
|
| 96 |
+
raise ValueError(
|
| 97 |
+
f"Invalid industry: '{data.industry}'. "
|
| 98 |
+
f"Must be one of {len(valid_categories['Industry'])} valid industries. "
|
| 99 |
+
f"Check config/valid_categories.yaml for all valid values."
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
# Create a DataFrame with the input data
|
| 103 |
input_df = pd.DataFrame(
|
| 104 |
{
|
| 105 |
"Country": [data.country],
|
| 106 |
+
"YearsCode": [data.years_code],
|
| 107 |
"EdLevel": [data.education_level],
|
| 108 |
"DevType": [data.dev_type],
|
| 109 |
+
"Industry": [data.industry],
|
| 110 |
}
|
| 111 |
)
|
| 112 |
|
src/preprocessing.py
CHANGED
|
@@ -55,7 +55,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 55 |
during training and inference, preventing data leakage and inconsistencies.
|
| 56 |
|
| 57 |
Args:
|
| 58 |
-
df: DataFrame with columns: Country, YearsCode
|
| 59 |
NOTE: During training, cardinality reduction should be applied to df
|
| 60 |
BEFORE calling this function. During inference, valid_categories.yaml
|
| 61 |
ensures only valid (already-reduced) categories are used.
|
|
@@ -67,7 +67,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 67 |
- Fills missing values with defaults (0 for numeric, "Unknown" for categorical)
|
| 68 |
- Normalizes Unicode apostrophes to regular apostrophes
|
| 69 |
- Applies one-hot encoding with drop_first=True to avoid multicollinearity
|
| 70 |
-
- Column names in output will be like: YearsCode, Country_X, EdLevel_Y, DevType_Z
|
| 71 |
- Does NOT apply cardinality reduction (must be done before calling this)
|
| 72 |
"""
|
| 73 |
# Create a copy to avoid modifying the original
|
|
@@ -75,26 +75,27 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 75 |
|
| 76 |
# Normalize Unicode apostrophes to regular apostrophes for consistency
|
| 77 |
# This handles cases where data has \u2019 (') instead of '
|
| 78 |
-
for col in ["Country", "EdLevel", "DevType"]:
|
| 79 |
if col in df_processed.columns:
|
| 80 |
df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
|
| 81 |
|
| 82 |
-
# Handle column name
|
| 83 |
if "YearsCodePro" in df_processed.columns and "YearsCode" not in df_processed.columns:
|
| 84 |
-
df_processed
|
| 85 |
|
| 86 |
# Fill missing values with defaults
|
| 87 |
df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
|
| 88 |
df_processed["Country"] = df_processed["Country"].fillna("Unknown")
|
| 89 |
df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
|
| 90 |
df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
|
|
|
|
| 91 |
|
| 92 |
# NOTE: Cardinality reduction is NOT applied here
|
| 93 |
# It should be applied during training BEFORE calling this function
|
| 94 |
# During inference, valid_categories.yaml ensures only valid values are used
|
| 95 |
|
| 96 |
# Select only the features we need
|
| 97 |
-
feature_cols = ["Country", "YearsCode", "EdLevel", "DevType"]
|
| 98 |
df_features = df_processed[feature_cols]
|
| 99 |
|
| 100 |
# Apply one-hot encoding for categorical variables
|
|
|
|
| 55 |
during training and inference, preventing data leakage and inconsistencies.
|
| 56 |
|
| 57 |
Args:
|
| 58 |
+
df: DataFrame with columns: Country, YearsCode, EdLevel, DevType, Industry
|
| 59 |
NOTE: During training, cardinality reduction should be applied to df
|
| 60 |
BEFORE calling this function. During inference, valid_categories.yaml
|
| 61 |
ensures only valid (already-reduced) categories are used.
|
|
|
|
| 67 |
- Fills missing values with defaults (0 for numeric, "Unknown" for categorical)
|
| 68 |
- Normalizes Unicode apostrophes to regular apostrophes
|
| 69 |
- Applies one-hot encoding with drop_first=True to avoid multicollinearity
|
| 70 |
+
- Column names in output will be like: YearsCode, Country_X, EdLevel_Y, DevType_Z, Industry_W
|
| 71 |
- Does NOT apply cardinality reduction (must be done before calling this)
|
| 72 |
"""
|
| 73 |
# Create a copy to avoid modifying the original
|
|
|
|
| 75 |
|
| 76 |
# Normalize Unicode apostrophes to regular apostrophes for consistency
|
| 77 |
# This handles cases where data has \u2019 (') instead of '
|
| 78 |
+
for col in ["Country", "EdLevel", "DevType", "Industry"]:
|
| 79 |
if col in df_processed.columns:
|
| 80 |
df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
|
| 81 |
|
| 82 |
+
# Handle legacy column name (YearsCodePro -> YearsCode)
|
| 83 |
if "YearsCodePro" in df_processed.columns and "YearsCode" not in df_processed.columns:
|
| 84 |
+
df_processed.rename(columns={"YearsCodePro": "YearsCode"}, inplace=True)
|
| 85 |
|
| 86 |
# Fill missing values with defaults
|
| 87 |
df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
|
| 88 |
df_processed["Country"] = df_processed["Country"].fillna("Unknown")
|
| 89 |
df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
|
| 90 |
df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
|
| 91 |
+
df_processed["Industry"] = df_processed["Industry"].fillna("Unknown")
|
| 92 |
|
| 93 |
# NOTE: Cardinality reduction is NOT applied here
|
| 94 |
# It should be applied during training BEFORE calling this function
|
| 95 |
# During inference, valid_categories.yaml ensures only valid values are used
|
| 96 |
|
| 97 |
# Select only the features we need
|
| 98 |
+
feature_cols = ["Country", "YearsCode", "EdLevel", "DevType", "Industry"]
|
| 99 |
df_features = df_processed[feature_cols]
|
| 100 |
|
| 101 |
# Apply one-hot encoding for categorical variables
|
src/schema.py
CHANGED
|
@@ -7,11 +7,14 @@ class SalaryInput(BaseModel):
|
|
| 7 |
"""Input model for salary prediction."""
|
| 8 |
|
| 9 |
country: str = Field(..., description="Developer's country")
|
| 10 |
-
|
| 11 |
-
...,
|
|
|
|
|
|
|
| 12 |
)
|
| 13 |
education_level: str = Field(..., description="Education level")
|
| 14 |
dev_type: str = Field(..., description="Developer type")
|
|
|
|
| 15 |
|
| 16 |
class Config:
|
| 17 |
"""Pydantic configuration."""
|
|
@@ -19,8 +22,9 @@ class SalaryInput(BaseModel):
|
|
| 19 |
json_schema_extra = {
|
| 20 |
"example": {
|
| 21 |
"country": "United States",
|
| 22 |
-
"
|
| 23 |
"education_level": "Bachelor's degree",
|
| 24 |
"dev_type": "Developer, back-end",
|
|
|
|
| 25 |
}
|
| 26 |
}
|
|
|
|
| 7 |
"""Input model for salary prediction."""
|
| 8 |
|
| 9 |
country: str = Field(..., description="Developer's country")
|
| 10 |
+
years_code: float = Field(
|
| 11 |
+
...,
|
| 12 |
+
ge=0,
|
| 13 |
+
description="Including any education, how many years have you been coding in total?",
|
| 14 |
)
|
| 15 |
education_level: str = Field(..., description="Education level")
|
| 16 |
dev_type: str = Field(..., description="Developer type")
|
| 17 |
+
industry: str = Field(..., description="Industry the developer works in")
|
| 18 |
|
| 19 |
class Config:
|
| 20 |
"""Pydantic configuration."""
|
|
|
|
| 22 |
json_schema_extra = {
|
| 23 |
"example": {
|
| 24 |
"country": "United States",
|
| 25 |
+
"years_code": 5.0,
|
| 26 |
"education_level": "Bachelor's degree",
|
| 27 |
"dev_type": "Developer, back-end",
|
| 28 |
+
"industry": "Software Development",
|
| 29 |
}
|
| 30 |
}
|
src/train.py
CHANGED
|
@@ -7,7 +7,7 @@ import pandas as pd
|
|
| 7 |
import numpy as np
|
| 8 |
import yaml
|
| 9 |
from xgboost import XGBRegressor
|
| 10 |
-
from sklearn.model_selection import train_test_split
|
| 11 |
|
| 12 |
from src.preprocessing import prepare_features, reduce_cardinality
|
| 13 |
|
|
@@ -32,7 +32,8 @@ def main():
|
|
| 32 |
# Load only required columns to save memory
|
| 33 |
df = pd.read_csv(
|
| 34 |
data_path,
|
| 35 |
-
usecols=["Country", "YearsCode", "EdLevel", "DevType", "
|
|
|
|
| 36 |
)
|
| 37 |
|
| 38 |
print(f"Loaded {len(df):,} rows")
|
|
@@ -43,11 +44,14 @@ def main():
|
|
| 43 |
# select records with main label more than min_salary threshold
|
| 44 |
min_salary = config['data']['min_salary']
|
| 45 |
df = df[df[main_label] > min_salary]
|
| 46 |
-
#
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
print(df.shape)
|
| 53 |
|
|
@@ -62,17 +66,20 @@ def main():
|
|
| 62 |
df_copy["Country"] = df_copy["Country"].str.replace('\u2019', "'", regex=False)
|
| 63 |
df_copy["EdLevel"] = df_copy["EdLevel"].str.replace('\u2019', "'", regex=False)
|
| 64 |
df_copy["DevType"] = df_copy["DevType"].str.replace('\u2019', "'", regex=False)
|
|
|
|
| 65 |
|
| 66 |
# Apply cardinality reduction
|
| 67 |
df_copy["Country"] = reduce_cardinality(df_copy["Country"])
|
| 68 |
df_copy["EdLevel"] = reduce_cardinality(df_copy["EdLevel"])
|
| 69 |
df_copy["DevType"] = reduce_cardinality(df_copy["DevType"])
|
|
|
|
| 70 |
|
| 71 |
# Apply cardinality reduction to the actual training data as well
|
| 72 |
# (prepare_features no longer does this internally)
|
| 73 |
df["Country"] = reduce_cardinality(df["Country"])
|
| 74 |
df["EdLevel"] = reduce_cardinality(df["EdLevel"])
|
| 75 |
df["DevType"] = reduce_cardinality(df["DevType"])
|
|
|
|
| 76 |
|
| 77 |
# Now apply full feature transformations for model training
|
| 78 |
X = prepare_features(df)
|
|
@@ -83,18 +90,64 @@ def main():
|
|
| 83 |
country_values = df_copy["Country"].dropna().unique().tolist()
|
| 84 |
edlevel_values = df_copy["EdLevel"].dropna().unique().tolist()
|
| 85 |
devtype_values = df_copy["DevType"].dropna().unique().tolist()
|
|
|
|
| 86 |
|
| 87 |
valid_categories = {
|
| 88 |
"Country": sorted(country_values),
|
| 89 |
"EdLevel": sorted(edlevel_values),
|
| 90 |
"DevType": sorted(devtype_values),
|
|
|
|
| 91 |
}
|
| 92 |
|
| 93 |
valid_categories_path = Path("config/valid_categories.yaml")
|
| 94 |
with open(valid_categories_path, "w") as f:
|
| 95 |
yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
|
| 96 |
|
| 97 |
-
print(f"\nSaved {len(valid_categories['Country'])} valid countries, {len(valid_categories['EdLevel'])} valid education levels,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
print(f"\nFeature matrix shape: {X.shape}")
|
| 100 |
print(f"Total features: {X.shape[1]}")
|
|
@@ -122,6 +175,12 @@ def main():
|
|
| 122 |
for devtype, count in top_devtype.items():
|
| 123 |
print(f" - {devtype}: {count:,} ({count/len(df)*100:.1f}%)")
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
# Show YearsCode statistics
|
| 126 |
print("\n💼 Years of Coding Experience:")
|
| 127 |
print(f" - Min: {df['YearsCode'].min():.1f}")
|
|
@@ -164,25 +223,77 @@ def main():
|
|
| 164 |
devtype_name = feature.replace('DevType_', '')
|
| 165 |
print(f" {i:2d}. {devtype_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
|
| 166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
print(f"\n📊 Total one-hot encoded features: {len(X.columns)}")
|
| 168 |
print(" - Numeric: 1 (YearsCode)")
|
| 169 |
print(f" - Country: {len(country_features)}")
|
| 170 |
print(f" - Education: {len(edlevel_features)}")
|
| 171 |
print(f" - DevType: {len(devtype_features)}")
|
|
|
|
| 172 |
|
| 173 |
print("=" * 60 + "\n")
|
| 174 |
|
| 175 |
-
#
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
)
|
| 181 |
|
| 182 |
-
|
| 183 |
-
print("Training XGBoost model...")
|
| 184 |
-
model_config = config['model']
|
| 185 |
-
model = XGBRegressor(
|
| 186 |
n_estimators=model_config['n_estimators'],
|
| 187 |
learning_rate=model_config['learning_rate'],
|
| 188 |
max_depth=model_config['max_depth'],
|
|
@@ -191,27 +302,19 @@ def main():
|
|
| 191 |
n_jobs=model_config['n_jobs'],
|
| 192 |
early_stopping_rounds=model_config['early_stopping_rounds'],
|
| 193 |
)
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
eval_set=[(X_test, y_test)],
|
| 198 |
verbose=config['training']['verbose'],
|
| 199 |
)
|
| 200 |
-
|
| 201 |
-
print(f"Best iteration: {model.best_iteration + 1} (early stopping at {model.n_estimators} max)")
|
| 202 |
-
|
| 203 |
-
# Evaluate
|
| 204 |
-
train_score = model.score(X_train, y_train)
|
| 205 |
-
test_score = model.score(X_test, y_test)
|
| 206 |
-
print(f"Training R2 score: {train_score:.4f}")
|
| 207 |
-
print(f"Test R2 score: {test_score:.4f}")
|
| 208 |
|
| 209 |
# Save model and feature columns for inference
|
| 210 |
model_path = Path(config['training']['model_path'])
|
| 211 |
-
model_path.parent.mkdir(parents=True, exist_ok=True)
|
| 212 |
|
| 213 |
artifacts = {
|
| 214 |
-
"model":
|
| 215 |
"feature_columns": list(X.columns),
|
| 216 |
}
|
| 217 |
|
|
|
|
| 7 |
import numpy as np
|
| 8 |
import yaml
|
| 9 |
from xgboost import XGBRegressor
|
| 10 |
+
from sklearn.model_selection import KFold, train_test_split
|
| 11 |
|
| 12 |
from src.preprocessing import prepare_features, reduce_cardinality
|
| 13 |
|
|
|
|
| 32 |
# Load only required columns to save memory
|
| 33 |
df = pd.read_csv(
|
| 34 |
data_path,
|
| 35 |
+
usecols=["Country", "YearsCode", "EdLevel", "DevType", "Industry",
|
| 36 |
+
"Currency", "CompTotal", "ConvertedCompYearly"],
|
| 37 |
)
|
| 38 |
|
| 39 |
print(f"Loaded {len(df):,} rows")
|
|
|
|
| 44 |
# select records with main label more than min_salary threshold
|
| 45 |
min_salary = config['data']['min_salary']
|
| 46 |
df = df[df[main_label] > min_salary]
|
| 47 |
+
# Exclude outliers based on percentile bounds PER COUNTRY
|
| 48 |
+
# This preserves records from lower-paid and higher-paid countries
|
| 49 |
+
# that would otherwise be removed by global percentile filtering
|
| 50 |
+
lower_pct = config['data']['lower_percentile'] / 100
|
| 51 |
+
upper_pct = config['data']['upper_percentile'] / 100
|
| 52 |
+
lower_bound = df.groupby("Country")[main_label].transform("quantile", lower_pct)
|
| 53 |
+
upper_bound = df.groupby("Country")[main_label].transform("quantile", upper_pct)
|
| 54 |
+
df = df[(df[main_label] > lower_bound) & (df[main_label] < upper_bound)]
|
| 55 |
|
| 56 |
print(df.shape)
|
| 57 |
|
|
|
|
| 66 |
df_copy["Country"] = df_copy["Country"].str.replace('\u2019', "'", regex=False)
|
| 67 |
df_copy["EdLevel"] = df_copy["EdLevel"].str.replace('\u2019', "'", regex=False)
|
| 68 |
df_copy["DevType"] = df_copy["DevType"].str.replace('\u2019', "'", regex=False)
|
| 69 |
+
df_copy["Industry"] = df_copy["Industry"].str.replace('\u2019', "'", regex=False)
|
| 70 |
|
| 71 |
# Apply cardinality reduction
|
| 72 |
df_copy["Country"] = reduce_cardinality(df_copy["Country"])
|
| 73 |
df_copy["EdLevel"] = reduce_cardinality(df_copy["EdLevel"])
|
| 74 |
df_copy["DevType"] = reduce_cardinality(df_copy["DevType"])
|
| 75 |
+
df_copy["Industry"] = reduce_cardinality(df_copy["Industry"])
|
| 76 |
|
| 77 |
# Apply cardinality reduction to the actual training data as well
|
| 78 |
# (prepare_features no longer does this internally)
|
| 79 |
df["Country"] = reduce_cardinality(df["Country"])
|
| 80 |
df["EdLevel"] = reduce_cardinality(df["EdLevel"])
|
| 81 |
df["DevType"] = reduce_cardinality(df["DevType"])
|
| 82 |
+
df["Industry"] = reduce_cardinality(df["Industry"])
|
| 83 |
|
| 84 |
# Now apply full feature transformations for model training
|
| 85 |
X = prepare_features(df)
|
|
|
|
| 90 |
country_values = df_copy["Country"].dropna().unique().tolist()
|
| 91 |
edlevel_values = df_copy["EdLevel"].dropna().unique().tolist()
|
| 92 |
devtype_values = df_copy["DevType"].dropna().unique().tolist()
|
| 93 |
+
industry_values = df_copy["Industry"].dropna().unique().tolist()
|
| 94 |
|
| 95 |
valid_categories = {
|
| 96 |
"Country": sorted(country_values),
|
| 97 |
"EdLevel": sorted(edlevel_values),
|
| 98 |
"DevType": sorted(devtype_values),
|
| 99 |
+
"Industry": sorted(industry_values),
|
| 100 |
}
|
| 101 |
|
| 102 |
valid_categories_path = Path("config/valid_categories.yaml")
|
| 103 |
with open(valid_categories_path, "w") as f:
|
| 104 |
yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
|
| 105 |
|
| 106 |
+
print(f"\nSaved {len(valid_categories['Country'])} valid countries, {len(valid_categories['EdLevel'])} valid education levels, {len(valid_categories['DevType'])} valid developer types, and {len(valid_categories['Industry'])} valid industries to {valid_categories_path}")
|
| 107 |
+
|
| 108 |
+
# Compute currency conversion rates per country
|
| 109 |
+
# Use the original data with Currency and CompTotal columns
|
| 110 |
+
print("\nComputing currency conversion rates per country...")
|
| 111 |
+
currency_df = df[["Country", "Currency", "CompTotal", main_label]].dropna()
|
| 112 |
+
# Extract 3-letter currency code from values like "EUR European Euro"
|
| 113 |
+
currency_df = currency_df.copy()
|
| 114 |
+
currency_df["CurrencyCode"] = currency_df["Currency"].str.split(r"\s+", n=1).str[0]
|
| 115 |
+
currency_df["CurrencyName"] = currency_df["Currency"].str.split(r"\s+", n=1).str[1]
|
| 116 |
+
# Compute conversion rate: local currency / USD
|
| 117 |
+
currency_df["rate"] = currency_df["CompTotal"] / currency_df[main_label]
|
| 118 |
+
# Filter out unreasonable rates (negative, zero, or extreme)
|
| 119 |
+
currency_df = currency_df[(currency_df["rate"] > 0.001) & (currency_df["rate"] < 100000)]
|
| 120 |
+
|
| 121 |
+
currency_rates = {}
|
| 122 |
+
for country in valid_categories["Country"]:
|
| 123 |
+
country_data = currency_df[currency_df["Country"] == country]
|
| 124 |
+
if country_data.empty:
|
| 125 |
+
continue
|
| 126 |
+
# Find the most common currency for this country
|
| 127 |
+
most_common = country_data["CurrencyCode"].mode()
|
| 128 |
+
if most_common.empty:
|
| 129 |
+
continue
|
| 130 |
+
code = most_common.iloc[0]
|
| 131 |
+
# Get the full name from the first matching record
|
| 132 |
+
name_row = country_data[country_data["CurrencyCode"] == code].iloc[0]
|
| 133 |
+
full_name = name_row["CurrencyName"]
|
| 134 |
+
# Compute median conversion rate for this country+currency pair
|
| 135 |
+
rates = country_data[country_data["CurrencyCode"] == code]["rate"]
|
| 136 |
+
median_rate = round(float(rates.median()), 2)
|
| 137 |
+
currency_rates[country] = {
|
| 138 |
+
"code": code,
|
| 139 |
+
"name": full_name,
|
| 140 |
+
"rate": median_rate,
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
currency_rates_path = Path("config/currency_rates.yaml")
|
| 144 |
+
with open(currency_rates_path, "w") as f:
|
| 145 |
+
yaml.dump(currency_rates, f, default_flow_style=False, sort_keys=True,
|
| 146 |
+
allow_unicode=True)
|
| 147 |
+
|
| 148 |
+
print(f"Saved currency rates for {len(currency_rates)} countries to {currency_rates_path}")
|
| 149 |
+
for country, info in sorted(currency_rates.items()):
|
| 150 |
+
print(f" {country:45s} -> {info['code']} ({info['name']}, rate: {info['rate']})")
|
| 151 |
|
| 152 |
print(f"\nFeature matrix shape: {X.shape}")
|
| 153 |
print(f"Total features: {X.shape[1]}")
|
|
|
|
| 175 |
for devtype, count in top_devtype.items():
|
| 176 |
print(f" - {devtype}: {count:,} ({count/len(df)*100:.1f}%)")
|
| 177 |
|
| 178 |
+
# Show top industries
|
| 179 |
+
print("\n🏢 Top Industries:")
|
| 180 |
+
top_industry = df["Industry"].value_counts().head(10)
|
| 181 |
+
for industry, count in top_industry.items():
|
| 182 |
+
print(f" - {industry}: {count:,} ({count/len(df)*100:.1f}%)")
|
| 183 |
+
|
| 184 |
# Show YearsCode statistics
|
| 185 |
print("\n💼 Years of Coding Experience:")
|
| 186 |
print(f" - Min: {df['YearsCode'].min():.1f}")
|
|
|
|
| 223 |
devtype_name = feature.replace('DevType_', '')
|
| 224 |
print(f" {i:2d}. {devtype_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
|
| 225 |
|
| 226 |
+
# Industry features
|
| 227 |
+
print("\n🏢 Top 10 Industry Features (most common):")
|
| 228 |
+
industry_features = categorical_features[categorical_features.index.str.startswith('Industry_')]
|
| 229 |
+
for i, (feature, count) in enumerate(industry_features.head(10).items(), 1):
|
| 230 |
+
percentage = (count / len(X)) * 100
|
| 231 |
+
industry_name = feature.replace('Industry_', '')
|
| 232 |
+
print(f" {i:2d}. {industry_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
|
| 233 |
+
|
| 234 |
print(f"\n📊 Total one-hot encoded features: {len(X.columns)}")
|
| 235 |
print(" - Numeric: 1 (YearsCode)")
|
| 236 |
print(f" - Country: {len(country_features)}")
|
| 237 |
print(f" - Education: {len(edlevel_features)}")
|
| 238 |
print(f" - DevType: {len(devtype_features)}")
|
| 239 |
+
print(f" - Industry: {len(industry_features)}")
|
| 240 |
|
| 241 |
print("=" * 60 + "\n")
|
| 242 |
|
| 243 |
+
# Cross-validation for robust evaluation
|
| 244 |
+
n_splits = config['data'].get('cv_splits', 5)
|
| 245 |
+
random_state = config['data']['random_state']
|
| 246 |
+
model_config = config['model']
|
| 247 |
+
|
| 248 |
+
print(f"Running {n_splits}-fold cross-validation...")
|
| 249 |
+
kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
|
| 250 |
+
|
| 251 |
+
train_scores = []
|
| 252 |
+
test_scores = []
|
| 253 |
+
best_iterations = []
|
| 254 |
+
|
| 255 |
+
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
|
| 256 |
+
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
|
| 257 |
+
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
|
| 258 |
+
|
| 259 |
+
model = XGBRegressor(
|
| 260 |
+
n_estimators=model_config['n_estimators'],
|
| 261 |
+
learning_rate=model_config['learning_rate'],
|
| 262 |
+
max_depth=model_config['max_depth'],
|
| 263 |
+
min_child_weight=model_config['min_child_weight'],
|
| 264 |
+
random_state=model_config['random_state'],
|
| 265 |
+
n_jobs=model_config['n_jobs'],
|
| 266 |
+
early_stopping_rounds=model_config['early_stopping_rounds'],
|
| 267 |
+
)
|
| 268 |
+
model.fit(
|
| 269 |
+
X_train, y_train,
|
| 270 |
+
eval_set=[(X_test, y_test)],
|
| 271 |
+
verbose=False,
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
+
train_r2 = model.score(X_train, y_train)
|
| 275 |
+
test_r2 = model.score(X_test, y_test)
|
| 276 |
+
train_scores.append(train_r2)
|
| 277 |
+
test_scores.append(test_r2)
|
| 278 |
+
best_iterations.append(model.best_iteration + 1)
|
| 279 |
+
print(f" Fold {fold}: Train R2 = {train_r2:.4f}, Test R2 = {test_r2:.4f} (best iter: {model.best_iteration + 1})")
|
| 280 |
+
|
| 281 |
+
avg_train = np.mean(train_scores)
|
| 282 |
+
avg_test = np.mean(test_scores)
|
| 283 |
+
std_test = np.std(test_scores)
|
| 284 |
+
avg_best_iter = int(np.mean(best_iterations))
|
| 285 |
+
print(f"\nCV Average Train R2: {avg_train:.4f}")
|
| 286 |
+
print(f"CV Average Test R2: {avg_test:.4f} (+/- {std_test:.4f})")
|
| 287 |
+
print(f"CV Average best iteration: {avg_best_iter}")
|
| 288 |
+
|
| 289 |
+
# Train final model on all data for deployment
|
| 290 |
+
# Use a small held-out split for early stopping only
|
| 291 |
+
print("\nTraining final model on full dataset...")
|
| 292 |
+
X_train_final, X_es, y_train_final, y_es = train_test_split(
|
| 293 |
+
X, y, test_size=0.1, random_state=random_state
|
| 294 |
)
|
| 295 |
|
| 296 |
+
final_model = XGBRegressor(
|
|
|
|
|
|
|
|
|
|
| 297 |
n_estimators=model_config['n_estimators'],
|
| 298 |
learning_rate=model_config['learning_rate'],
|
| 299 |
max_depth=model_config['max_depth'],
|
|
|
|
| 302 |
n_jobs=model_config['n_jobs'],
|
| 303 |
early_stopping_rounds=model_config['early_stopping_rounds'],
|
| 304 |
)
|
| 305 |
+
final_model.fit(
|
| 306 |
+
X_train_final, y_train_final,
|
| 307 |
+
eval_set=[(X_es, y_es)],
|
|
|
|
| 308 |
verbose=config['training']['verbose'],
|
| 309 |
)
|
| 310 |
+
print(f"Final model best iteration: {final_model.best_iteration + 1}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
|
| 312 |
# Save model and feature columns for inference
|
| 313 |
model_path = Path(config['training']['model_path'])
|
| 314 |
+
model_path.parent.mkdir(parents=True, exist_ok=True)
|
| 315 |
|
| 316 |
artifacts = {
|
| 317 |
+
"model": final_model,
|
| 318 |
"feature_columns": list(X.columns),
|
| 319 |
}
|
| 320 |
|
test_feature_impact.py
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Test that changing input features actually changes predictions."""
|
| 2 |
+
|
| 3 |
+
from src.schema import SalaryInput
|
| 4 |
+
from src.infer import predict_salary, valid_categories
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_years_experience_impact():
    """Check that varying years of coding experience changes the prediction.

    Returns:
        bool: True when every tested experience value yields a distinct
        predicted salary, False otherwise.
    """
    print("\n" + "=" * 70)
    print("TEST 1: Total Years of Coding Impact")
    print("=" * 70)

    base_input = {
        "country": "United States of America",
        "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
        "dev_type": "Developer, full-stack",
        "industry": "Software Development",
    }

    # Test with different years of experience
    years_tests = [0, 2, 5, 10, 20]
    predictions = []

    for years in years_tests:
        input_data = SalaryInput(**base_input, years_code=years)
        salary = predict_salary(input_data)
        predictions.append(salary)
        print(f"  Years: {years:2d} -> Salary: ${salary:,.2f}")

    # Report the same three outcomes the sibling tests do (all distinct /
    # all identical / partially duplicated) for consistent diagnostics.
    unique_predictions = len(set(predictions))
    if unique_predictions == len(predictions):
        print(f"\n✅ PASS: All {len(predictions)} predictions are different")
        return True
    elif unique_predictions == 1:
        print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
        print("   This indicates the model is NOT using years of experience as a feature!")
        return False
    else:
        print(f"\n❌ FAIL: Only {unique_predictions}/{len(predictions)} unique predictions")
        print("   Duplicate salaries found - possible feature issue")
        return False
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def test_country_impact():
    """Check that varying the country changes the prediction.

    Returns:
        bool: True when every tested country yields a distinct predicted
        salary, False otherwise.
    """
    print("\n" + "=" * 70)
    print("TEST 2: Country Impact")
    print("=" * 70)

    base_input = {
        "years_code": 5.0,
        "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
        "dev_type": "Developer, full-stack",
        "industry": "Software Development",
    }

    # Test with different countries (select diverse ones)
    test_countries = [
        "United States of America",
        "Germany",
        "India",
        "Brazil",
        "Poland",
    ]

    # Filter to only countries that exist in valid categories
    test_countries = [c for c in test_countries if c in valid_categories["Country"]]

    predictions = []
    for country in test_countries:
        input_data = SalaryInput(**base_input, country=country)
        salary = predict_salary(input_data)
        predictions.append(salary)
        print(f"  Country: {country:40s} -> Salary: ${salary:,.2f}")

    # Check if predictions are different
    unique_predictions = len(set(predictions))
    if unique_predictions == len(predictions):
        print(f"\n✅ PASS: All {len(predictions)} predictions are different")
        return True
    elif unique_predictions == 1:
        print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
        print("   This indicates the model is NOT using country as a feature!")
        return False
    else:
        print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
        # Plain string: no placeholders, so no f-prefix (ruff F541).
        print("   Duplicate salaries found - possible feature issue")
        return False
+
|
| 86 |
+
|
| 87 |
+
def test_education_impact():
    """Check that varying the education level changes the prediction.

    Returns:
        bool: True when every tested education level yields a distinct
        predicted salary, False otherwise.
    """
    print("\n" + "=" * 70)
    print("TEST 3: Education Level Impact")
    print("=" * 70)

    base_input = {
        "country": "United States of America",
        "years_code": 5.0,
        "dev_type": "Developer, full-stack",
        "industry": "Software Development",
    }

    # Test with different education levels
    test_education = [
        "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",
        "Some college/university study without earning a degree",
        "Associate degree (A.A., A.S., etc.)",
        "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
        "Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
        "Professional degree (JD, MD, Ph.D, Ed.D, etc.)",
    ]

    # Filter to only education levels that exist in valid categories
    test_education = [e for e in test_education if e in valid_categories["EdLevel"]]

    predictions = []
    for education in test_education:
        input_data = SalaryInput(**base_input, education_level=education)
        salary = predict_salary(input_data)
        predictions.append(salary)
        print(f"  Education: {education[:50]:50s} -> Salary: ${salary:,.2f}")

    # Check if predictions are different
    unique_predictions = len(set(predictions))
    if unique_predictions == len(predictions):
        print(f"\n✅ PASS: All {len(predictions)} predictions are different")
        return True
    elif unique_predictions == 1:
        print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
        print("   This indicates the model is NOT using education level as a feature!")
        return False
    else:
        print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
        # Plain string: no placeholders, so no f-prefix (ruff F541).
        print("   Duplicate salaries found - possible feature issue")
        return False
+
|
| 134 |
+
|
| 135 |
+
def test_devtype_impact():
    """Check that varying the developer type changes the prediction.

    Returns:
        bool: True when every tested developer type yields a distinct
        predicted salary, False otherwise.
    """
    print("\n" + "=" * 70)
    print("TEST 4: Developer Type Impact")
    print("=" * 70)

    base_input = {
        "country": "United States of America",
        "years_code": 5.0,
        "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
        "industry": "Software Development",
    }

    # Test with different developer types (using actual values from trained model)
    test_devtypes = [
        "Developer, front-end",
        "Developer, back-end",
        "Developer, full-stack",
        "Data scientist",
        "Engineering manager",
        "DevOps engineer or professional",
    ]

    # Filter to only developer types that exist in valid categories
    test_devtypes = [d for d in test_devtypes if d in valid_categories["DevType"]]

    predictions = []
    for devtype in test_devtypes:
        input_data = SalaryInput(**base_input, dev_type=devtype)
        salary = predict_salary(input_data)
        predictions.append(salary)
        print(f"  Dev Type: {devtype[:50]:50s} -> Salary: ${salary:,.2f}")

    # Check if predictions are different
    unique_predictions = len(set(predictions))
    if unique_predictions == len(predictions):
        print(f"\n✅ PASS: All {len(predictions)} predictions are different")
        return True
    elif unique_predictions == 1:
        print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
        print("   This indicates the model is NOT using developer type as a feature!")
        return False
    else:
        print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
        # Plain string: no placeholders, so no f-prefix (ruff F541).
        print("   Duplicate salaries found - possible feature issue")
        return False
+
|
| 182 |
+
|
| 183 |
+
def test_industry_impact():
    """Check that varying the industry changes the prediction.

    Returns:
        bool: True when every tested industry yields a distinct predicted
        salary, False otherwise.
    """
    print("\n" + "=" * 70)
    print("TEST 5: Industry Impact")
    print("=" * 70)

    base_input = {
        "country": "United States of America",
        "years_code": 5.0,
        "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
        "dev_type": "Developer, full-stack",
    }

    # Test with different industries (using actual values from trained model)
    test_industries = [
        "Software Development",
        "Fintech",
        "Banking/Financial Services",
        "Healthcare",
        "Manufacturing",
        "Government",
    ]

    # Filter to only industries that exist in valid categories
    test_industries = [i for i in test_industries if i in valid_categories["Industry"]]

    predictions = []
    for industry in test_industries:
        input_data = SalaryInput(**base_input, industry=industry)
        salary = predict_salary(input_data)
        predictions.append(salary)
        print(f"  Industry: {industry[:50]:50s} -> Salary: ${salary:,.2f}")

    # Check if predictions are different
    unique_predictions = len(set(predictions))
    if unique_predictions == len(predictions):
        print(f"\n✅ PASS: All {len(predictions)} predictions are different")
        return True
    elif unique_predictions == 1:
        print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
        print("   This indicates the model is NOT using industry as a feature!")
        return False
    else:
        print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
        # Plain string: no placeholders, so no f-prefix (ruff F541).
        print("   Duplicate salaries found - possible feature issue")
        return False
+
|
| 230 |
+
|
| 231 |
+
def test_combined_features():
    """Check that diverse feature combinations produce distinct predictions.

    Returns:
        bool: True when every tested combination yields a distinct
        predicted salary, False otherwise.
    """
    print("\n" + "=" * 70)
    print("TEST 6: Combined Feature Variations")
    print("=" * 70)

    # Create diverse combinations (using actual values from trained model)
    test_cases = [
        ("India", 2, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, back-end", "Software Development"),
        ("Germany", 5, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Developer, full-stack", "Manufacturing"),
        ("United States of America", 10, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Engineering manager", "Fintech"),
        ("Poland", 15, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, front-end", "Healthcare"),
        ("Brazil", 5, "Some college/university study without earning a degree", "DevOps engineer or professional", "Government"),
    ]

    predictions = []
    for country, years, education, devtype, industry in test_cases:
        # Skip combinations the trained model has never seen.
        if (country not in valid_categories["Country"]
                or education not in valid_categories["EdLevel"]
                or devtype not in valid_categories["DevType"]
                or industry not in valid_categories["Industry"]):
            continue

        input_data = SalaryInput(
            country=country,
            years_code=years,
            education_level=education,
            dev_type=devtype,
            industry=industry,
        )
        salary = predict_salary(input_data)
        predictions.append(salary)
        print(f"  {country[:15]:15s} | {years:2d}y | {education[:25]:25s} | {devtype[:25]:25s} | {industry[:20]:20s} -> ${salary:,.2f}")

    # Guard against a vacuous pass: with every combination filtered out,
    # 0 unique == 0 total would otherwise report success.
    if not predictions:
        print("\n⚠️ No valid test combinations matched the model's categories")
        return False

    # Check if predictions are different
    unique_predictions = len(set(predictions))
    if unique_predictions == len(predictions):
        print(f"\n✅ PASS: All {len(predictions)} combined predictions are different")
        return True
    else:
        print(f"\n⚠️ Only {unique_predictions}/{len(predictions)} unique predictions")
        # Plain string: no placeholders, so no f-prefix (ruff F541).
        print("   Some combinations produce identical salaries")
        return False
+
|
| 276 |
+
|
| 277 |
+
def print_feature_analysis():
    """Summarize the numeric and one-hot features the trained model exposes."""
    print("\n" + "=" * 70)
    print("FEATURE ANALYSIS")
    print("=" * 70)

    # Local import so the (model-loading) module is only pulled in when used.
    from src.infer import feature_columns

    print(f"\nTotal features in model: {len(feature_columns)}")

    # Count by type
    onehot_prefixes = ('Country_', 'EdLevel_', 'DevType_', 'Industry_')
    country_features = [f for f in feature_columns if f.startswith('Country_')]
    edlevel_features = [f for f in feature_columns if f.startswith('EdLevel_')]
    devtype_features = [f for f in feature_columns if f.startswith('DevType_')]
    industry_features = [f for f in feature_columns if f.startswith('Industry_')]
    numeric_features = [f for f in feature_columns if not f.startswith(onehot_prefixes)]

    print(f"  - Numeric features: {len(numeric_features)} -> {numeric_features}")
    print(f"  - Country features: {len(country_features)}")
    print(f"  - Education features: {len(edlevel_features)}")
    print(f"  - DevType features: {len(devtype_features)}")
    print(f"  - Industry features: {len(industry_features)}")

    # One data-driven loop replaces four copy-pasted sample sections and
    # four copy-pasted warning checks (same output, less duplication).
    feature_groups = [
        ("country", country_features),
        ("education", edlevel_features),
        ("developer type", devtype_features),
        ("industry", industry_features),
    ]

    for label, feats in feature_groups:
        if feats:
            print(f"\nSample {label} features:")
            for feat in feats[:5]:
                print(f"  - {feat}")

    # Check if there are any features at all
    for label, feats in feature_groups:
        if not feats:
            print(f"\n⚠️ WARNING: No {label} features found!")
| 331 |
+
def main():
    """Run the full feature-impact suite and print a pass/fail summary."""
    banner = "=" * 70
    print("\n" + banner)
    print("FEATURE IMPACT TESTS")
    print("Testing if changing inputs actually changes predictions")
    print(banner)

    # Describe the model's feature space before exercising it.
    print_feature_analysis()

    # Each test returns True on pass, False on fail.
    results = {
        "Years of Experience": test_years_experience_impact(),
        "Country": test_country_impact(),
        "Education Level": test_education_impact(),
        "Developer Type": test_devtype_impact(),
        "Industry": test_industry_impact(),
        "Combined Features": test_combined_features(),
    }

    print("\n" + banner)
    print("TEST SUMMARY")
    print(banner)

    for test_name, passed in results.items():
        print(f"  {'✅ PASS' if passed else '❌ FAIL'} - {test_name}")

    passed_count = sum(results.values())
    total_count = len(results)

    print(f"\n{passed_count}/{total_count} tests passed")

    if passed_count == total_count:
        print("\n🎉 All tests passed! The model is using all features correctly.")
    else:
        print("\n⚠️ Some tests failed. The model may not be using all features properly.")
        print("   This indicates potential training-testing skew or feature engineering issues.")


if __name__ == "__main__":
    main()
|
test_fix.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Test that the encoding fix works."""
|
| 2 |
+
|
| 3 |
+
# Force reload of modules
|
| 4 |
+
import sys
|
| 5 |
+
if 'src.preprocessing' in sys.modules:
|
| 6 |
+
del sys.modules['src.preprocessing']
|
| 7 |
+
if 'src.infer' in sys.modules:
|
| 8 |
+
del sys.modules['src.infer']
|
| 9 |
+
|
| 10 |
+
from src.preprocessing import prepare_features
|
| 11 |
+
import pandas as pd
|
| 12 |
+
|
| 13 |
+
# Create test inputs with different countries (values from valid_categories)
|
| 14 |
+
input1 = pd.DataFrame({
|
| 15 |
+
'Country': ['United States of America'],
|
| 16 |
+
'YearsCode': [5.0],
|
| 17 |
+
'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
|
| 18 |
+
'DevType': ['Developer, full-stack']
|
| 19 |
+
})
|
| 20 |
+
|
| 21 |
+
input2 = pd.DataFrame({
|
| 22 |
+
'Country': ['Germany'],
|
| 23 |
+
'YearsCode': [5.0],
|
| 24 |
+
'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
|
| 25 |
+
'DevType': ['Developer, full-stack']
|
| 26 |
+
})
|
| 27 |
+
|
| 28 |
+
print("Testing prepare_features with different countries...")
|
| 29 |
+
features1 = prepare_features(input1)
|
| 30 |
+
features2 = prepare_features(input2)
|
| 31 |
+
|
| 32 |
+
print(f"\nUSA features: {features1.shape}")
|
| 33 |
+
print(f"Columns: {list(features1.columns)[:10]}")
|
| 34 |
+
|
| 35 |
+
print(f"\nGermany features: {features2.shape}")
|
| 36 |
+
print(f"Columns: {list(features2.columns)[:10]}")
|
| 37 |
+
|
| 38 |
+
print(f"\nAre they different? {not features1.equals(features2)}")
|
| 39 |
+
|
| 40 |
+
if features1.shape[1] > 1:
|
| 41 |
+
print("\n✅ SUCCESS: Categorical features are preserved!")
|
| 42 |
+
else:
|
| 43 |
+
print("\n❌ FAIL: Still only has numeric features")
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|