dima806 committed
Commit 55cdb7e · verified · 1 Parent(s): 603808b

Upload 25 files

.dockerignore ADDED
@@ -0,0 +1,53 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ *.egg-info/
+ dist/
+ build/
+ *.egg
+
+ # Virtual environments
+ .venv/
+ venv/
+ ENV/
+ env/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Git
+ .git/
+ .gitignore
+ .gitattributes
+
+ # Data (don't include raw survey data in Docker image)
+ data/
+
+ # Testing and development
+ test*.py
+ debug*.py
+ diagnose*.py
+ example_inference.py
+
+ # Documentation
+ .llm/
+ *.md
+ !README.md
+
+ # CI/CD
+ .github/
+
+ # Project specific
+ pyproject.toml
+ uv.lock
.gitignore ADDED
@@ -0,0 +1,220 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+ #poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ #pdm.lock
+ #pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ #pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Cursor
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+ # refer to https://docs.cursor.com/context/ignore-files
+ .cursorignore
+ .cursorindexingignore
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ # Project-specific
+ # Data files (too large for git)
+ data/*.csv
+ data/*.zip
+
+ # Trained model artifacts
+ # Note: Model files are included in the repo for deployment
+ # models/*.pkl
+ # models/*.joblib
+
+ # LLM
+ .llm/
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
Claude.md ADDED
@@ -0,0 +1,226 @@
+ # Claude Development Guide
+
+ ## Project Overview
+ This is a minimal, local-first ML application built in Python that predicts developer salaries using Stack Overflow Developer Survey data. The project emphasizes clarity and simplicity over production completeness.
+
+ ## Tech Stack
+ - **Python 3.12+**
+ - **uv** - Package & virtual environment management
+ - **pandas** - Data manipulation
+ - **scikit-learn** - ML modeling
+ - **pydantic** - Input validation
+ - **streamlit** - Web UI
+ - **xgboost** - Advanced gradient boosting (optional)
+
+ ## Project Structure
+ ```
+ .
+ ├── data/
+ │   └── survey_results_public.csv    # Stack Overflow survey data
+ ├── models/
+ │   └── model.pkl                    # Serialized trained model
+ ├── src/
+ │   ├── schema.py                    # Pydantic validation models
+ │   ├── train.py                     # Model training script
+ │   └── infer.py                     # Inference utilities
+ ├── app.py                           # Streamlit web application
+ ├── example_inference.py             # Example inference script
+ ├── pyproject.toml                   # Project dependencies (uv)
+ ├── uv.lock                          # Locked dependencies
+ └── README.md                        # Project documentation
+ ```
+
+ ## Setup & Installation
+
+ ### Initial Setup
+ ```bash
+ # The virtual environment is already created at .venv/
+ # Activate it:
+ source .venv/bin/activate  # On Linux/Mac
+ # or
+ .venv\Scripts\activate     # On Windows
+
+ # Install/sync dependencies with uv:
+ uv sync
+ ```
+
+ ### Adding New Dependencies
+ ```bash
+ uv add <package-name>
+ ```
+
+ ## Key Workflows
+
+ ### Training the Model
+ ```bash
+ python src/train.py
+ ```
+ This will:
+ - Load data from `data/survey_results_public.csv`
+ - Clean and preprocess features
+ - Train the regression model
+ - Save the model to `models/model.pkl`
+
+ ### Running the Streamlit App
+ ```bash
+ streamlit run app.py
+ ```
+ Opens a browser interface for salary predictions.
+
+ ### Running Inference Programmatically
+ ```python
+ from src.schema import SalaryInput
+ from src.infer import predict_salary
+
+ input_data = SalaryInput(
+     country="United States",
+     years_code=5.0,
+     education_level="Bachelor's degree",
+     dev_type="Developer, back-end",
+     industry="Software Development"
+ )
+ salary = predict_salary(input_data)
+ ```
+
+ ## Key Files
+
+ ### [src/schema.py](src/schema.py)
+ Contains Pydantic models for:
+ - Input validation (`SalaryInput`)
+ - Type safety across the application
+
+ ### [src/train.py](src/train.py)
+ Training pipeline:
+ - Data loading and cleaning
+ - Feature engineering
+ - Model training
+ - Model persistence
+
+ ### [src/infer.py](src/infer.py)
+ Inference utilities:
+ - Model loading
+ - Prediction logic
+ - Validated input processing
+
+ ### [app.py](app.py)
+ Streamlit UI:
+ - User input forms
+ - Real-time predictions
+ - Results visualization
+
+ ## Development Guidelines
+
+ ### Code Style
+ - Keep code simple and readable
+ - Total codebase should remain under ~200 lines
+ - Focus on clarity over cleverness
+ - Use type hints where helpful
+
+ ### Data Requirements
+ The dataset must include these columns:
+ - `Country` - Developer location
+ - `YearsCode` - Total years of coding (including education)
+ - `EdLevel` - Education level
+ - `DevType` - Developer type
+ - `Industry` - Industry the developer works in
+ - `ConvertedCompYearly` - Annual salary (target variable)
+
+ ### Model Expectations
+ - Basic regression model (LinearRegression or similar)
+ - Simple feature encoding (one-hot for categoricals)
+ - No hyperparameter tuning required
+ - Focus on a working end-to-end pipeline
+
+ ## Common Tasks
+
+ ### Debugging Training Issues
+ 1. Check if the data file exists: `ls -la data/`
+ 2. Verify CSV columns: `head -1 data/survey_results_public.csv`
+ 3. Check for missing values in the target column
+ 4. Review data types and encoding
+
+ ### Updating Features
+ 1. Modify the `SalaryInput` schema in [src/schema.py](src/schema.py)
+ 2. Update feature extraction in [src/train.py](src/train.py)
+ 3. Update inference logic in [src/infer.py](src/infer.py)
+ 4. Update UI inputs in [app.py](app.py)
+ 5. Retrain the model
+
+ ### Testing Predictions
+ ```python
+ # Quick test in Python REPL
+ from src.infer import predict_salary
+ from src.schema import SalaryInput
+
+ test_input = SalaryInput(
+     country="United States",
+     years_code=3.0,
+     education_level="Bachelor's degree",
+     dev_type="Developer, back-end",
+     industry="Software Development"
+ )
+ print(predict_salary(test_input))
+ ```
+
+ ## Non-Goals (Intentionally Excluded)
+ - Cloud deployment or serving
+ - Hyperparameter tuning
+ - Model registry or experiment tracking
+ - Advanced feature engineering
+ - Production monitoring
+ - API endpoints (beyond Streamlit)
+
+ ## Useful Commands
+
+ ```bash
+ # Check environment
+ which python
+ python --version
+
+ # Verify uv installation
+ uv --version
+
+ # List installed packages
+ uv pip list
+
+ # Run with specific Python version
+ uv run python src/train.py
+
+ # Clean generated files
+ rm -f models/model.pkl
+
+ # Check data file size
+ du -h data/survey_results_public.csv
+ ```
+
+ ## Troubleshooting
+
+ ### Model file not found
+ - Run training first: `python src/train.py`
+ - Check the file exists: `ls -la models/model.pkl`
+
+ ### Missing dependencies
+ - Sync the environment: `uv sync`
+ - Verify pyproject.toml has all required packages
+
+ ### Data file issues
+ - Ensure the CSV is in the `data/` directory
+ - Check the file encoding (should be UTF-8)
+ - Verify required columns exist
+
+ ### Streamlit won't start
+ - Check port 8501 is available
+ - Try specifying a port: `streamlit run app.py --server.port 8502`
+
+ ## Additional Resources
+ - [PRD](.llm/prd.md) - Full product requirements
+ - [README.md](README.md) - Project readme
+ - [Stack Overflow Survey](https://insights.stackoverflow.com/survey) - Data source
+
+ ## Working with Claude Code
+ When asking Claude to help with this project:
+ - Reference specific files using markdown links: [filename](path)
+ - Be specific about which component needs changes
+ - Mention if you need training, inference, or UI updates
+ - Provide error messages in full when debugging
+ - Ask for explanations of model choices if unclear
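For quick reference, the following is a minimal sketch of what the `SalaryInput` model described in this guide could look like, inferred from the usage examples above. The `Field` bounds mirror the 0-50 range of the Streamlit number input and are an assumption, not necessarily the repository's actual definition:

```python
# Hypothetical reconstruction of src/schema.py's SalaryInput, based on
# how it is called in Claude.md, example_inference.py, and app.py.
from pydantic import BaseModel, Field


class SalaryInput(BaseModel):
    """Validated features for a single salary prediction."""

    country: str
    years_code: float = Field(ge=0, le=50)  # total years coding, incl. education (assumed bounds)
    education_level: str
    dev_type: str
    industry: str
```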
Dockerfile CHANGED
@@ -20,4 +20,4 @@ EXPOSE 8501
 
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
- ENTRYPOINT ["streamlit", "run", "./app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
LICENSE ADDED
@@ -0,0 +1,201 @@
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!) The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
README.md CHANGED
@@ -1,20 +1,305 @@
- ---
- title: Developer Salary Prediction
- emoji: 🚀
- colorFrom: red
- colorTo: red
- sdk: docker
- app_port: 8501
- tags:
- - streamlit
- pinned: false
- short_description: Developer salary prediction using 2025 Stackoverflow survey
- license: apache-2.0
- ---
-
- # Welcome to Streamlit!
-
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
+ # Developer Salary Prediction
+
+ A minimal, local-first ML application that predicts developer salaries using Stack Overflow Developer Survey data. Built with Python, scikit-learn, Pydantic, and Streamlit.
+
+ ## Features
+
+ - 🎯 XGBoost (gradient boosting) model for salary prediction
+ - ✅ Input validation with Pydantic
+ - 🌐 Interactive web UI with Streamlit
+ - 📊 Trained on Stack Overflow Developer Survey data
+ - 🔧 Easy setup with `uv` package manager
+
+ ## Quick Start
+
+ ### 1. Install Dependencies
+
+ ```bash
+ uv sync
+ ```
+
+ ### 2. Download Data
+
+ Download the Stack Overflow Developer Survey CSV file:
+
+ 1. Visit: https://insights.stackoverflow.com/survey
+ 2. Download the latest survey results (2024 or 2025)
+ 3. Extract the `survey_results_public.csv` file
+ 4. Place it in the `data/` directory:
+    ```
+    data/survey_results_public.csv
+    ```
+
+ **Required columns:** `Country`, `YearsCode`, `EdLevel`, `DevType`, `Industry`, `ConvertedCompYearly`
+
+ ### 3. Train the Model
+
+ ```bash
+ uv run python -m src.train
+ ```
+
+ This will:
+ - Load configuration from `config/model_parameters.yaml`
+ - Load and preprocess the survey data (with cardinality reduction)
+ - Train an XGBoost model with early stopping
+ - Save the model to `models/model.pkl`
+ - Generate `config/valid_categories.yaml` with valid country, education, developer type, and industry values
+
+ ### 4. Run the Streamlit App
+
+ ```bash
+ uv run streamlit run app.py
+ ```
+
+ The app will open in your browser at `http://localhost:8501`.
+
+ ## Usage
+
+ ### Web Interface
+
+ Launch the Streamlit app and enter:
+ - **Country**: Developer's country
+ - **Years of Coding (Total)**: Total years coding, including education
+ - **Education Level**: Highest degree completed
+ - **Developer Type**: Primary developer role
+ - **Industry**: Industry the developer works in
+
+ Click "Predict Salary" to see the estimated annual salary.
+
+ ### Programmatic Usage
+
+ **Quick example:**
+
+ ```python
+ from src.schema import SalaryInput
+ from src.infer import predict_salary
+
+ # Create input
+ input_data = SalaryInput(
+     country="United States of America",
+     years_code=5.0,
+     education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
+     dev_type="Developer, full-stack",
+     industry="Software Development"
+ )
+
+ # Get prediction
+ salary = predict_salary(input_data)
+ print(f"Estimated salary: ${salary:,.0f}")
+ ```
+
+ **Run the example script:**
+
+ ```bash
+ uv run python example_inference.py
+ ```
+
+ This will show predictions for multiple sample scenarios (junior, mid-level, and senior developers, different countries).
+
+ ## Input Validation
+
+ The model validates inputs against the actual training data categories:
+
+ - **Valid Countries**: Only countries from `config/valid_categories.yaml` (~21 countries)
+ - **Valid Education Levels**: Only education levels from the training data (~9 levels)
+ - **Valid Developer Types**: Only developer types from the training data (~20 types)
+ - **Valid Industries**: Only industries from the training data (~15 industries)
+
+ The Streamlit app uses dropdown menus with only valid options. If you use the programmatic API with invalid values, you'll get a helpful error message pointing to the valid categories file.
+
+ **Example validation:**
+ ```python
+ from src.infer import predict_salary
+ from src.schema import SalaryInput
+
+ # This will raise ValueError - Japan is not in the training data after cardinality reduction
+ invalid_input = SalaryInput(
+     country="Japan",  # Invalid!
+     years_code=5.0,
+     education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
+     dev_type="Developer, back-end",
+     industry="Software Development"
+ )
+ ```
+
+ **View valid categories:**
+ ```bash
+ cat config/valid_categories.yaml
+ ```
+
+ ## Configuration
+
+ Model parameters are centralized in [config/model_parameters.yaml](config/model_parameters.yaml). You can customize:
+
+ - **Data Processing**: Salary thresholds, percentile bounds, train/test split ratio
+ - **Feature Engineering**: Cardinality reduction settings (max categories, min frequency)
+ - **Model Hyperparameters**: Learning rate, tree depth, early stopping, etc.
+ - **Training Settings**: Verbosity, model save path
+
+ **To modify parameters:**
+
+ ```bash
+ # Edit the config file
+ nano config/model_parameters.yaml
+
+ # Then retrain the model
+ uv run python -m src.train
+ ```
+
+ **Example parameter changes:**
+ ```yaml
+ # Increase model complexity
+ model:
+   max_depth: 8          # Default: 6
+   n_estimators: 10000   # Default: 5000
+
+ # Keep more categories
+ features:
+   cardinality:
+     max_categories: 30  # Default: 20
+     min_frequency: 100  # Default: 50
+ ```
+
+ ## Project Structure
+
+ ```
+ .
+ ├── config/
+ │   ├── model_parameters.yaml        # Model configuration
+ │   └── valid_categories.yaml        # Valid input categories (generated)
+ ├── data/
+ │   └── survey_results_public.csv    # Stack Overflow survey data (download required)
+ ├── models/
+ │   └── model.pkl                    # Trained model (generated)
+ ├── src/
+ │   ├── __init__.py                  # Package initialization
+ │   ├── schema.py                    # Pydantic models
+ │   ├── preprocessing.py             # Feature engineering utilities
+ │   ├── train.py                     # Training script
+ │   └── infer.py                     # Inference utilities
+ ├── app.py                           # Streamlit web app
+ ├── example_inference.py             # Example inference script
+ ├── pyproject.toml                   # Project dependencies
+ └── README.md                        # This file
+ ```
+
+ ## Tech Stack
+
+ - **Python 3.12+**
+ - **uv** - Package manager
+ - **pandas** - Data manipulation
+ - **xgboost** - Gradient boosting model
+ - **scikit-learn** - ML utilities (train/test split)
+ - **pydantic** - Data validation
+ - **streamlit** - Web UI
+
+ ## Development
+
+ For detailed development information, see [Claude.md](Claude.md).
+
+ ### Re-training the Model
+
+ If you want to use a different survey year or update the model:
+
+ ```bash
+ # Place new CSV in data/ directory
+ uv run python -m src.train
+ ```
+
+ ### Running Tests
+
+ **Quick one-liner test:**
+ ```bash
+ uv run python -c "from src.schema import SalaryInput; from src.infer import predict_salary; test = SalaryInput(country='United States of America', years_code=5.0, education_level='Bachelor'\''s degree (B.A., B.S., B.Eng., etc.)', dev_type='Developer, full-stack', industry='Software Development'); print(f'Prediction: \${predict_salary(test):,.0f}')"
+ ```
+
+ **Or run the full example script:**
+ ```bash
+ uv run python example_inference.py
+ ```
+
+ ## Deployment
+
+ ### Hugging Face Spaces
+
+ This application is Docker-ready for deployment on Hugging Face Spaces:
+
+ **1. Build the Docker image:**
+ ```bash
+ docker build -t developer-salary-predictor .
+ ```
+
+ **2. Test locally:**
+ ```bash
+ docker run -p 8501:8501 developer-salary-predictor
+ ```
+
+ Then visit `http://localhost:8501`.
+
+ **3. Deploy to Hugging Face:**
+
+ 1. Create a new Space on [Hugging Face](https://huggingface.co/new-space)
+ 2. Select "Docker" as the SDK
+ 3. Clone your Space repository
+ 4. Copy these files to your Space:
+
+    ```text
+    Dockerfile
+    requirements.txt
+    app.py
+    src/
+    config/
+    models/
+    ```
+
+ 5. Push to your Space:
+    ```bash
+    git add .
+    git commit -m "Initial deployment"
+    git push
+    ```
+
+ **Note:** The pre-trained model (`models/model.pkl`) and configuration (`config/valid_categories.yaml`) are included in the Docker image. If you want to use a different model, retrain locally first, then rebuild the Docker image.
+
+ ### Alternative: Local Deployment
+
+ **Using uv (recommended for development):**
+ ```bash
+ uv run streamlit run app.py
+ ```
+
+ **Using pip:**
+ ```bash
+ pip install -r requirements.txt
+ streamlit run app.py
+ ```
+
+ ## Troubleshooting
+
+ ### "Model file not found"
+ - Run `uv run python -m src.train` first to generate the model
+
+ ### "Data file not found"
+ - Download the Stack Overflow survey CSV and place it in `data/`
+
+ ### "Configuration file not found"
+ - The `config/model_parameters.yaml` file should exist in the project root
+ - Check that you're running commands from the project root directory
+
+ ### Dependency issues
+ - Run `uv sync` to ensure all packages are installed
+
+ ## Design Principles
+
+ - **Simplicity**: Under 200 lines of code total
+ - **Clarity**: Easy to understand and modify
+ - **Local-first**: No cloud dependencies
+ - **Hackable**: Plain Python, no complex frameworks
+
+ ## License
+
+ Apache 2.0 License - see the [LICENSE](LICENSE) file.
+
+ ## Acknowledgments
+
+ Data from the [Stack Overflow Developer Survey](https://insights.stackoverflow.com/survey).
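Supplementing the validation notes in the README above, here is a short sketch of handling the documented `ValueError` for out-of-vocabulary inputs. It assumes the check happens inside `predict_salary`, as in this commit's `src/infer.py`; the category strings are illustrative:

```python
from src.schema import SalaryInput
from src.infer import predict_salary

request = SalaryInput(
    country="Japan",  # not in the reduced training categories
    years_code=5.0,
    education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
    dev_type="Developer, back-end",
    industry="Software Development",
)

try:
    salary = predict_salary(request)
    print(f"Estimated salary: ${salary:,.0f}")
except ValueError as err:
    # predict_salary rejects categories absent from config/valid_categories.yaml
    print(f"Invalid input: {err}")
```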
app.py CHANGED
@@ -2,7 +2,7 @@
 
 import streamlit as st
 
- from src.infer import predict_salary, valid_categories
+ from src.infer import predict_salary, get_local_currency, valid_categories
 from src.schema import SalaryInput
 
 # Page configuration
@@ -26,9 +26,10 @@ with st.sidebar:
     This app uses an XGBoost (gradient boosting) model trained on Stack Overflow
     Developer Survey data to predict annual salaries based on:
     - Country
-     - Years of professional coding experience
+     - Total years of coding experience (including education)
     - Education level
     - Developer type
+     - Industry
     """
     )
     st.info("💡 Tip: Results are estimates based on survey averages.")
@@ -38,6 +39,7 @@ with st.sidebar:
     st.write(f"**Countries:** {len(valid_categories['Country'])} available")
     st.write(f"**Education Levels:** {len(valid_categories['EdLevel'])} available")
     st.write(f"**Developer Types:** {len(valid_categories['DevType'])} available")
+     st.write(f"**Industries:** {len(valid_categories['Industry'])} available")
     st.caption("Only values from the training data are shown in the dropdowns.")
 
 # Main input form
@@ -49,11 +51,13 @@ col1, col2 = st.columns(2)
 valid_countries = valid_categories["Country"]
 valid_education_levels = valid_categories["EdLevel"]
 valid_dev_types = valid_categories["DevType"]
+ valid_industries = valid_categories["Industry"]
 
 # Set default values (if available)
 default_country = "United States of America" if "United States of America" in valid_countries else valid_countries[0]
 default_education = "Bachelor's degree (B.A., B.S., B.Eng., etc.)" if "Bachelor's degree (B.A., B.S., B.Eng., etc.)" in valid_education_levels else valid_education_levels[0]
 default_dev_type = "Developer, back-end" if "Developer, back-end" in valid_dev_types else valid_dev_types[0]
+ default_industry = "Software Development" if "Software Development" in valid_industries else valid_industries[0]
 
 with col1:
     country = st.selectbox(
@@ -64,12 +68,12 @@ with col1:
     )
 
     years = st.number_input(
-         "Years of Professional Coding",
+         "Years of Coding (Total)",
         min_value=0,
        max_value=50,
-         value=5,
+         value=15,
         step=1,
-         help="Years of professional coding experience",
+         help="Including any education, how many years have you been coding in total?",
     )
 
 with col2:
@@ -87,15 +91,23 @@ with col2:
         help="Primary developer role (only types from training data)",
     )
 
+     industry = st.selectbox(
+         "Industry",
+         options=valid_industries,
+         index=valid_industries.index(default_industry),
+         help="Industry the developer works in (only industries from training data)",
+     )
+
 # Prediction button
 if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
     try:
         # Create input model
         input_data = SalaryInput(
             country=country,
-             years_code_pro=years,
+             years_code=years,
             education_level=education,
             dev_type=dev_type,
+             industry=industry,
         )
 
         # Make prediction
@@ -104,11 +116,29 @@ if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
 
         # Display result
         st.success("Prediction Complete!")
-         st.metric(
-             label="Estimated Annual Salary",
-             value=f"${salary:,.0f}",
-             help="Predicted annual compensation in USD",
-         )
+
+         # Show USD and local currency side by side
+         local = get_local_currency(country, salary)
+         if local and local["code"] != "USD":
+             col_usd, col_local = st.columns(2)
+             with col_usd:
+                 st.metric(
+                     label="Estimated Annual Salary (USD)",
+                     value=f"${salary:,.0f}",
+                     help="Predicted annual compensation in USD",
+                 )
+             with col_local:
+                 st.metric(
+                     label=f"Estimated Annual Salary ({local['code']})",
+                     value=f"{local['salary_local']:,.0f} {local['code']}",
+                     help=f"Converted using survey-derived rate: 1 USD = {local['rate']} {local['code']} ({local['name']})",
+                 )
+         else:
+             st.metric(
+                 label="Estimated Annual Salary",
+                 value=f"${salary:,.0f}",
+                 help="Predicted annual compensation in USD",
+             )
 
     except FileNotFoundError:
         st.error(
config/currency_rates.yaml ADDED
@@ -0,0 +1,84 @@
+ Australia:
+   code: AUD
+   name: Australian dollar
+   rate: 1.54
+ Austria:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Belgium:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Brazil:
+   code: BRL
+   name: Brazilian real
+   rate: 5.49
+ Canada:
+   code: CAD
+   name: Canadian dollar
+   rate: 1.37
+ Czech Republic:
+   code: CZK
+   name: Czech koruna
+   rate: 21.36
+ Denmark:
+   code: DKK
+   name: Danish krone
+   rate: 6.43
+ France:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Germany:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ India:
+   code: INR
+   name: Indian rupee
+   rate: 86.03
+ Italy:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Netherlands:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Other:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Poland:
+   code: PLN
+   name: Polish zloty
+   rate: 3.66
+ Portugal:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Spain:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Sweden:
+   code: SEK
+   name: Swedish krona
+   rate: 9.54
+ Switzerland:
+   code: CHF
+   name: Swiss franc
+   rate: 0.81
+ Ukraine:
+   code: UAH
+   name: Ukrainian hryvnia
+   rate: 41.73
+ United Kingdom of Great Britain and Northern Ireland:
+   code: GBP
+   name: Pound sterling
+   rate: 0.73
+ United States of America:
+   code: USD
+   name: United States dollar
+   rate: 1.0
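To illustrate how this table is consumed (mirroring the `get_local_currency` helper added to `src/infer.py` below), a minimal standalone sketch; the 100,000 USD figure is arbitrary:

```python
import yaml

# Load the survey-derived conversion table added in this commit.
with open("config/currency_rates.yaml", "r") as f:
    rates = yaml.safe_load(f)

salary_usd = 100_000
info = rates["Germany"]  # {'code': 'EUR', 'name': 'European Euro', 'rate': 0.86}
print(f"{salary_usd * info['rate']:,.0f} {info['code']}")  # 86,000 EUR
```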
config/valid_categories.yaml CHANGED
@@ -52,3 +52,20 @@ DevType:
 - Senior executive (C-suite, VP, etc.)
 - Student
 - System administrator
+ Industry:
+ - Banking/Financial Services
+ - Computer Systems Design and Services
+ - Energy
+ - Fintech
+ - Government
+ - Healthcare
+ - Higher Education
+ - Insurance
+ - Internet, Telecomm or Information Services
+ - Manufacturing
+ - Media & Advertising Services
+ - Other
+ - 'Other:'
+ - Retail and Consumer Services
+ - Software Development
+ - Transportation, or Supply Chain
debug_prepare_features.py ADDED
@@ -0,0 +1,81 @@
+ """Debug prepare_features step by step."""
+
+ import pandas as pd
+ from src.preprocessing import reduce_cardinality
+ import yaml
+ from pathlib import Path
+
+ # Load config
+ config_path = Path("config/model_parameters.yaml")
+ with open(config_path, "r") as f:
+     config = yaml.safe_load(f)
+
+ # Create test input
+ df = pd.DataFrame({
+     'Country': ['United States of America'],
+     'YearsCode': [5.0],
+     'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
+     'DevType': ['Developer, full-stack']
+ })
+
+ print("=" * 70)
+ print("STEP-BY-STEP DEBUGGING OF prepare_features()")
+ print("=" * 70)
+
+ print("\n1. Original input:")
+ print(f"   Columns: {list(df.columns)}")
+ print(f"   Values: {df.iloc[0].to_dict()}")
+
+ # Step 2: Copy
+ df_processed = df.copy()
+
+ # Step 3: Unicode normalization
+ for col in ["Country", "EdLevel", "DevType"]:
+     if col in df_processed.columns:
+         df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
+
+ print("\n2. After unicode normalization:")
+ print(f"   Columns: {list(df_processed.columns)}")
+
+ # Step 4: Fill missing values
+ df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
+ df_processed["Country"] = df_processed["Country"].fillna("Unknown")
+ df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
+ df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
+
+ print("\n3. After filling missing values:")
+ print(f"   Columns: {list(df_processed.columns)}")
+ print(f"   Country value: '{df_processed['Country'].iloc[0]}'")
+ print(f"   EdLevel value: '{df_processed['EdLevel'].iloc[0]}'")
+ print(f"   DevType value: '{df_processed['DevType'].iloc[0]}'")
+
+ # Step 5: Reduce cardinality
+ print("\n4. Before cardinality reduction:")
+ print(f"   Country value: '{df_processed['Country'].iloc[0]}'")
+ df_processed["Country"] = reduce_cardinality(df_processed["Country"])
+ print(f"   After Country reduction: '{df_processed['Country'].iloc[0]}'")
+
+ print(f"   EdLevel value: '{df_processed['EdLevel'].iloc[0]}'")
+ df_processed["EdLevel"] = reduce_cardinality(df_processed["EdLevel"])
+ print(f"   After EdLevel reduction: '{df_processed['EdLevel'].iloc[0]}'")
+
+ print(f"   DevType value: '{df_processed['DevType'].iloc[0]}'")
+ df_processed["DevType"] = reduce_cardinality(df_processed["DevType"])
+ print(f"   After DevType reduction: '{df_processed['DevType'].iloc[0]}'")
+
+ # Step 6: Select feature columns
+ feature_cols = ["Country", "YearsCode", "EdLevel", "DevType"]
+ df_features = df_processed[feature_cols]
+
+ print("\n5. After selecting feature columns:")
+ print(f"   Columns: {list(df_features.columns)}")
+ print(f"   Values: {df_features.iloc[0].to_dict()}")
+
+ # Step 7: One-hot encode
+ drop_first = config['features']['encoding']['drop_first']
+ print(f"\n6. One-hot encoding with drop_first={drop_first}:")
+ df_encoded = pd.get_dummies(df_features, drop_first=drop_first)
+
+ print(f"   Result shape: {df_encoded.shape}")
+ print(f"   Result columns: {list(df_encoded.columns)}")
+ print(f"   Non-zero values: {df_encoded.columns[df_encoded.iloc[0] != 0].tolist()}")
diagnose_encoding.py ADDED
@@ -0,0 +1,65 @@
+ """Diagnose why categorical features aren't affecting predictions."""
+
+ from src.preprocessing import prepare_features
+ import pandas as pd
+
+ # Create two inputs that differ ONLY in Country
+ input1 = pd.DataFrame({
+     'Country': ['United States of America'],
+     'YearsCode': [5.0],
+     'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
+     'DevType': ['Developer, full-stack']
+ })
+
+ input2 = pd.DataFrame({
+     'Country': ['Germany'],  # Different!
+     'YearsCode': [5.0],
+     'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
+     'DevType': ['Developer, full-stack']
+ })
+
+ print("=" * 70)
+ print("ENCODING DIAGNOSIS")
+ print("=" * 70)
+
+ # Process features
+ features1 = prepare_features(input1)
+ features2 = prepare_features(input2)
+
+ print("\nInput 1 (USA):")
+ print(f"  Shape: {features1.shape}")
+ print(f"  Columns: {list(features1.columns)}")
+ non_zero1 = [col for col in features1.columns if features1[col].iloc[0] != 0]
+ print(f"  Non-zero features ({len(non_zero1)}): {non_zero1}")
+
+ print("\nInput 2 (Germany):")
+ print(f"  Shape: {features2.shape}")
+ non_zero2 = [col for col in features2.columns if features2[col].iloc[0] != 0]
+ print(f"  Non-zero features ({len(non_zero2)}): {non_zero2}")
+
+ print(f"\nAre encoded features identical? {features1.equals(features2)}")
+
+ if features1.equals(features2):
+     print("\n❌ PROBLEM: Different countries produce IDENTICAL encodings!")
+     print("   This explains why categorical features don't affect predictions.")
+ else:
+     print("\n✅ Encodings are different - categorical features should work.")
+
+ # Check what happens with Country specifically
+ print("\n" + "=" * 70)
+ print("COUNTRY ENCODING CHECK")
+ print("=" * 70)
+
+ # Test just Country encoding
+ test_countries = ['United States of America', 'Germany', 'India']
+ for country in test_countries:
+     test_df = pd.DataFrame({
+         'Country': [country],
+         'YearsCode': [5.0],
+         'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
+         'DevType': ['Developer, full-stack']
+     })
+     encoded = prepare_features(test_df)
+     country_cols = [col for col in encoded.columns if col.startswith('Country_')]
+     non_zero_countries = [col for col in country_cols if encoded[col].iloc[0] != 0]
+     print(f"{country:40s} -> {non_zero_countries}")
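The diagnosis above targets a common pitfall: `pd.get_dummies` on a single-row input only creates columns for the categories present in that row, so two different countries can produce identical encodings. A minimal sketch of the usual remedy, reindexing against the column set seen at training time; `training_columns` here is a hypothetical stand-in for however the repository persists that list:

```python
import pandas as pd

# Hypothetical: the exact column order the model was trained on,
# e.g. persisted alongside the model at training time.
training_columns = ["YearsCode", "Country_Germany",
                    "Country_United States of America",
                    "DevType_Developer, full-stack"]

def align_features(encoded: pd.DataFrame) -> pd.DataFrame:
    """Force inference features into the training column space.

    Categories absent from this input become explicit zero columns,
    and columns unseen at training time are dropped.
    """
    return encoded.reindex(columns=training_columns, fill_value=0)

raw = pd.DataFrame({"Country": ["Germany"], "YearsCode": [5.0],
                    "DevType": ["Developer, full-stack"]})
aligned = align_features(pd.get_dummies(raw))
print(aligned.columns.tolist())  # matches training_columns exactly
```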
example_inference.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example script showing how to use the salary prediction model programmatically."""
2
+
3
+ from src.schema import SalaryInput
4
+ from src.infer import predict_salary
5
+
6
+
7
+ def main():
8
+ """Run sample predictions with different input parameters."""
9
+
10
+ print("=" * 60)
11
+ print("Developer Salary Prediction - Sample Inference")
12
+ print("=" * 60)
13
+
14
+ # Example 1: Default parameters (same as Streamlit app defaults)
15
+ print("\n📊 Example 1: Default Parameters")
16
+ print("-" * 60)
17
+
18
+ input_data_1 = SalaryInput(
19
+ country="United States of America",
20
+ years_code=5.0,
21
+ education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
22
+ dev_type="Developer, full-stack",
23
+ industry="Software Development",
24
+ )
25
+
26
+ print(f"Country: {input_data_1.country}")
27
+ print(f"Years of Coding (Total): {input_data_1.years_code}")
28
+ print(f"Education Level: {input_data_1.education_level}")
29
+ print(f"Developer Type: {input_data_1.dev_type}")
30
+ print(f"Industry: {input_data_1.industry}")
31
+
32
+ salary_1 = predict_salary(input_data_1)
33
+ print(f"💰 Predicted Salary: ${salary_1:,.2f} USD/year")
34
+
35
+ # Example 2: Junior developer
36
+ print("\n📊 Example 2: Junior Developer")
37
+ print("-" * 60)
38
+
39
+ input_data_2 = SalaryInput(
40
+ country="United States of America",
41
+ years_code=2.0,
42
+ education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
43
+ dev_type="Developer, front-end",
44
+ industry="Fintech",
45
+ )
46
+
47
+ print(f"Country: {input_data_2.country}")
48
+ print(f"Years of Coding (Total): {input_data_2.years_code}")
49
+ print(f"Education Level: {input_data_2.education_level}")
50
+ print(f"Developer Type: {input_data_2.dev_type}")
51
+ print(f"Industry: {input_data_2.industry}")
52
+
53
+ salary_2 = predict_salary(input_data_2)
54
+ print(f"💰 Predicted Salary: ${salary_2:,.2f} USD/year")
55
+
56
+ # Example 3: Senior developer with Master's degree
57
+ print("\n📊 Example 3: Senior Developer")
58
+ print("-" * 60)
59
+
60
+ input_data_3 = SalaryInput(
61
+ country="United States of America",
62
+ years_code=10.0,
63
+ education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
64
+ dev_type="Engineering manager",
65
+ industry="Banking/Financial Services",
66
+ )
67
+
68
+ print(f"Country: {input_data_3.country}")
69
+ print(f"Years of Coding (Total): {input_data_3.years_code}")
70
+ print(f"Education Level: {input_data_3.education_level}")
71
+ print(f"Developer Type: {input_data_3.dev_type}")
72
+ print(f"Industry: {input_data_3.industry}")
73
+
74
+ salary_3 = predict_salary(input_data_3)
75
+ print(f"💰 Predicted Salary: ${salary_3:,.2f} USD/year")
76
+
77
+ # Example 4: Different country
78
+ print("\n📊 Example 4: Different Country (Germany)")
79
+ print("-" * 60)
80
+
81
+ input_data_4 = SalaryInput(
82
+ country="Germany",
83
+ years_code=5.0,
84
+ education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
85
+ dev_type="Developer, back-end",
86
+ industry="Manufacturing",
87
+ )
88
+
89
+ print(f"Country: {input_data_4.country}")
90
+ print(f"Years of Coding (Total): {input_data_4.years_code}")
91
+ print(f"Education Level: {input_data_4.education_level}")
92
+ print(f"Developer Type: {input_data_4.dev_type}")
93
+ print(f"Industry: {input_data_4.industry}")
94
+
95
+ salary_4 = predict_salary(input_data_4)
96
+ print(f"💰 Predicted Salary: ${salary_4:,.2f} USD/year")
97
+
98
+ print("\n" + "=" * 60)
99
+ print("✅ All predictions completed successfully!")
100
+ print("=" * 60)
101
+
102
+
103
+ if __name__ == "__main__":
104
+ try:
105
+ main()
106
+ except FileNotFoundError:
107
+ print("❌ Error: Model file not found!")
108
+ print("Please train the model first by running:")
109
+ print(" uv run python src/train.py")
110
+ except Exception as e:
111
+ print(f"❌ Error occurred: {str(e)}")
models/model.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10f6724fde852ab09e30230b712a1fb0c7ecc24cd5da7cd6896f17a8c6619e84
3
- size 2697578
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5165f22311d0eb6809380cf4fa5a749b59f0d8e81903462fe7c2c882e09e916f
3
+ size 3192752
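Both stanzas above are Git LFS pointers rather than the pickle itself; the retrained artifact grows from roughly 2.7 MB to 3.2 MB. Loading it for ad-hoc use is short — a sketch assuming the artifacts dict from src/train.py is serialized with pickle (the dump call sits outside this diff):

```python
import pickle
from pathlib import Path

# "models/model.pkl" matches config['training']['model_path'] only by assumption;
# src/train.py reads the actual path from its YAML config.
with Path("models/model.pkl").open("rb") as f:
    artifacts = pickle.load(f)

model = artifacts["model"]                      # the final XGBRegressor
feature_columns = artifacts["feature_columns"]  # one-hot column order from training
print(type(model).__name__, len(feature_columns))
```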
pyproject.toml ADDED
@@ -0,0 +1,15 @@
1
+ [project]
2
+ name = "developer-salary-prediction"
3
+ version = "0.1.0"
4
+ description = "Simple ML app for predicting developer salaries using Stack Overflow survey data"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "pandas>=2.0.0",
9
+ "scikit-learn>=1.3.0",
10
+ "pydantic>=2.0.0",
11
+ "streamlit>=1.28.0",
12
+ "xgboost>=3.1.0",
13
+ "ruff>=0.15.0",
14
+ "pyyaml>=6.0.0",
15
+ ]
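With requires-python pinned at >=3.12, the standard-library tomllib can read this manifest directly — a small sketch for sanity-checking the declared dependency floors:

```python
import tomllib

with open("pyproject.toml", "rb") as f:
    project = tomllib.load(f)["project"]

print(f"{project['name']} {project['version']} (requires {project['requires-python']})")
for dep in project["dependencies"]:
    print(" -", dep)
```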
src/infer.py CHANGED
@@ -33,6 +33,30 @@ if not valid_categories_path.exists():
33
  with open(valid_categories_path, "r") as f:
34
  valid_categories = yaml.safe_load(f)
35
36
 
37
  def predict_salary(data: SalaryInput) -> float:
38
  """Predict salary based on input features.
@@ -68,13 +92,21 @@ def predict_salary(data: SalaryInput) -> float:
68
  f"Check config/valid_categories.yaml for all valid values."
69
  )
70
 
 
 
 
 
 
 
 
71
  # Create a DataFrame with the input data
72
  input_df = pd.DataFrame(
73
  {
74
  "Country": [data.country],
75
- "YearsCodePro": [data.years_code_pro],
76
  "EdLevel": [data.education_level],
77
  "DevType": [data.dev_type],
 
78
  }
79
  )
80
 
 
33
  with open(valid_categories_path, "r") as f:
34
  valid_categories = yaml.safe_load(f)
35
 
36
+ # Load currency conversion rates
37
+ currency_rates_path = Path("config/currency_rates.yaml")
38
+ currency_rates = {}
39
+ if currency_rates_path.exists():
40
+ with open(currency_rates_path, "r") as f:
41
+ currency_rates = yaml.safe_load(f) or {}
42
+
43
+
44
+ def get_local_currency(country: str, salary_usd: float) -> dict | None:
45
+ """Convert USD salary to local currency for a given country.
46
+
47
+ Returns:
48
+ Dict with code, name, rate, and salary_local, or None if unavailable.
49
+ """
50
+ if country not in currency_rates:
51
+ return None
52
+ info = currency_rates[country]
53
+ return {
54
+ "code": info["code"],
55
+ "name": info["name"],
56
+ "rate": info["rate"],
57
+ "salary_local": round(salary_usd * info["rate"], 2),
58
+ }
59
+
60
 
61
  def predict_salary(data: SalaryInput) -> float:
62
  """Predict salary based on input features.
 
92
  f"Check config/valid_categories.yaml for all valid values."
93
  )
94
 
95
+ if data.industry not in valid_categories["Industry"]:
96
+ raise ValueError(
97
+ f"Invalid industry: '{data.industry}'. "
98
+ f"Must be one of {len(valid_categories['Industry'])} valid industries. "
99
+ f"Check config/valid_categories.yaml for all valid values."
100
+ )
101
+
102
  # Create a DataFrame with the input data
103
  input_df = pd.DataFrame(
104
  {
105
  "Country": [data.country],
106
+ "YearsCode": [data.years_code],
107
  "EdLevel": [data.education_level],
108
  "DevType": [data.dev_type],
109
+ "Industry": [data.industry],
110
  }
111
  )
112
 
src/preprocessing.py CHANGED
@@ -55,7 +55,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
55
  during training and inference, preventing data leakage and inconsistencies.
56
 
57
  Args:
58
- df: DataFrame with columns: Country, YearsCode (or YearsCodePro), EdLevel, DevType
59
  NOTE: During training, cardinality reduction should be applied to df
60
  BEFORE calling this function. During inference, valid_categories.yaml
61
  ensures only valid (already-reduced) categories are used.
@@ -67,7 +67,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
67
  - Fills missing values with defaults (0 for numeric, "Unknown" for categorical)
68
  - Normalizes Unicode apostrophes to regular apostrophes
69
  - Applies one-hot encoding with drop_first=True to avoid multicollinearity
70
- - Column names in output will be like: YearsCode, Country_X, EdLevel_Y, DevType_Z
71
  - Does NOT apply cardinality reduction (must be done before calling this)
72
  """
73
  # Create a copy to avoid modifying the original
@@ -75,26 +75,27 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
75
 
76
  # Normalize Unicode apostrophes to regular apostrophes for consistency
77
  # This handles cases where data has \u2019 (') instead of '
78
- for col in ["Country", "EdLevel", "DevType"]:
79
  if col in df_processed.columns:
80
  df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
81
 
82
- # Handle column name variations (YearsCode vs YearsCodePro)
83
  if "YearsCodePro" in df_processed.columns and "YearsCode" not in df_processed.columns:
84
- df_processed["YearsCode"] = df_processed["YearsCodePro"]
85
 
86
  # Fill missing values with defaults
87
  df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
88
  df_processed["Country"] = df_processed["Country"].fillna("Unknown")
89
  df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
90
  df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
 
91
 
92
  # NOTE: Cardinality reduction is NOT applied here
93
  # It should be applied during training BEFORE calling this function
94
  # During inference, valid_categories.yaml ensures only valid values are used
95
 
96
  # Select only the features we need
97
- feature_cols = ["Country", "YearsCode", "EdLevel", "DevType"]
98
  df_features = df_processed[feature_cols]
99
 
100
  # Apply one-hot encoding for categorical variables
 
55
  during training and inference, preventing data leakage and inconsistencies.
56
 
57
  Args:
58
+ df: DataFrame with columns: Country, YearsCode, EdLevel, DevType, Industry
59
  NOTE: During training, cardinality reduction should be applied to df
60
  BEFORE calling this function. During inference, valid_categories.yaml
61
  ensures only valid (already-reduced) categories are used.
 
67
  - Fills missing values with defaults (0 for numeric, "Unknown" for categorical)
68
  - Normalizes Unicode apostrophes to regular apostrophes
69
  - Applies one-hot encoding with drop_first=True to avoid multicollinearity
70
+ - Column names in output will be like: YearsCode, Country_X, EdLevel_Y, DevType_Z, Industry_W
71
  - Does NOT apply cardinality reduction (must be done before calling this)
72
  """
73
  # Create a copy to avoid modifying the original
 
75
 
76
  # Normalize Unicode apostrophes to regular apostrophes for consistency
77
  # This handles cases where data has \u2019 (') instead of '
78
+ for col in ["Country", "EdLevel", "DevType", "Industry"]:
79
  if col in df_processed.columns:
80
  df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
81
 
82
+ # Handle legacy column name (YearsCodePro -> YearsCode)
83
  if "YearsCodePro" in df_processed.columns and "YearsCode" not in df_processed.columns:
84
+ df_processed.rename(columns={"YearsCodePro": "YearsCode"}, inplace=True)
85
 
86
  # Fill missing values with defaults
87
  df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
88
  df_processed["Country"] = df_processed["Country"].fillna("Unknown")
89
  df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
90
  df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
91
+ df_processed["Industry"] = df_processed["Industry"].fillna("Unknown")
92
 
93
  # NOTE: Cardinality reduction is NOT applied here
94
  # It should be applied during training BEFORE calling this function
95
  # During inference, valid_categories.yaml ensures only valid values are used
96
 
97
  # Select only the features we need
98
+ feature_cols = ["Country", "YearsCode", "EdLevel", "DevType", "Industry"]
99
  df_features = df_processed[feature_cols]
100
 
101
  # Apply one-hot encoding for categorical variables
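For readers unfamiliar with the drop_first convention the docstring describes, a toy illustration of the column names it produces (one level per categorical is dropped as the baseline):

```python
import pandas as pd

df = pd.DataFrame({
    "YearsCode": [2.0, 5.0, 10.0],
    "Country": ["Germany", "India", "Germany"],
    "Industry": ["Fintech", "Healthcare", "Fintech"],
})
# drop_first=True drops one level per categorical (the baseline), leaving
# columns such as Country_India and Industry_Healthcare.
X = pd.get_dummies(df, columns=["Country", "Industry"], drop_first=True)
print(X.columns.tolist())  # ['YearsCode', 'Country_India', 'Industry_Healthcare']
```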
src/schema.py CHANGED
@@ -7,11 +7,14 @@ class SalaryInput(BaseModel):
7
  """Input model for salary prediction."""
8
 
9
  country: str = Field(..., description="Developer's country")
10
- years_code_pro: float = Field(
11
- ..., ge=0, description="Years of professional coding experience"
 
 
12
  )
13
  education_level: str = Field(..., description="Education level")
14
  dev_type: str = Field(..., description="Developer type")
 
15
 
16
  class Config:
17
  """Pydantic configuration."""
@@ -19,8 +22,9 @@ class SalaryInput(BaseModel):
19
  json_schema_extra = {
20
  "example": {
21
  "country": "United States",
22
- "years_code_pro": 5.0,
23
  "education_level": "Bachelor's degree",
24
  "dev_type": "Developer, back-end",
 
25
  }
26
  }
 
7
  """Input model for salary prediction."""
8
 
9
  country: str = Field(..., description="Developer's country")
10
+ years_code: float = Field(
11
+ ...,
12
+ ge=0,
13
+ description="Including any education, how many years have you been coding in total?",
14
  )
15
  education_level: str = Field(..., description="Education level")
16
  dev_type: str = Field(..., description="Developer type")
17
+ industry: str = Field(..., description="Industry the developer works in")
18
 
19
  class Config:
20
  """Pydantic configuration."""
 
22
  json_schema_extra = {
23
  "example": {
24
  "country": "United States",
25
+ "years_code": 5.0,
26
  "education_level": "Bachelor's degree",
27
  "dev_type": "Developer, back-end",
28
+ "industry": "Software Development",
29
  }
30
  }
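Because the renamed years_code field keeps the ge=0 constraint, invalid experience values are rejected at construction time — a quick sketch of the validation behavior:

```python
from pydantic import ValidationError
from src.schema import SalaryInput

try:
    SalaryInput(
        country="Germany",
        years_code=-1.0,  # violates the ge=0 constraint on the renamed field
        education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
        dev_type="Developer, back-end",
        industry="Manufacturing",
    )
except ValidationError as exc:
    err = exc.errors()[0]
    print(err["loc"], err["msg"])  # points at 'years_code'
```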
src/train.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
7
  import numpy as np
8
  import yaml
9
  from xgboost import XGBRegressor
10
- from sklearn.model_selection import train_test_split
11
 
12
  from src.preprocessing import prepare_features, reduce_cardinality
13
 
@@ -32,7 +32,8 @@ def main():
32
  # Load only required columns to save memory
33
  df = pd.read_csv(
34
  data_path,
35
- usecols=["Country", "YearsCode", "EdLevel", "DevType", "ConvertedCompYearly"],
 
36
  )
37
 
38
  print(f"Loaded {len(df):,} rows")
@@ -43,11 +44,14 @@ def main():
43
  # select records with main label more than min_salary threshold
44
  min_salary = config['data']['min_salary']
45
  df = df[df[main_label] > min_salary]
46
- # further exclude outliers based on percentile bounds
47
- lower_pct = config['data']['lower_percentile']
48
- upper_pct = config['data']['upper_percentile']
49
- P = np.percentile(df[main_label], [lower_pct, upper_pct])
50
- df = df[(df[main_label] > P[0]) & (df[main_label] < P[1])]
51
 
52
  print(df.shape)
53
 
@@ -62,17 +66,20 @@ def main():
62
  df_copy["Country"] = df_copy["Country"].str.replace('\u2019', "'", regex=False)
63
  df_copy["EdLevel"] = df_copy["EdLevel"].str.replace('\u2019', "'", regex=False)
64
  df_copy["DevType"] = df_copy["DevType"].str.replace('\u2019', "'", regex=False)
 
65
 
66
  # Apply cardinality reduction
67
  df_copy["Country"] = reduce_cardinality(df_copy["Country"])
68
  df_copy["EdLevel"] = reduce_cardinality(df_copy["EdLevel"])
69
  df_copy["DevType"] = reduce_cardinality(df_copy["DevType"])
 
70
 
71
  # Apply cardinality reduction to the actual training data as well
72
  # (prepare_features no longer does this internally)
73
  df["Country"] = reduce_cardinality(df["Country"])
74
  df["EdLevel"] = reduce_cardinality(df["EdLevel"])
75
  df["DevType"] = reduce_cardinality(df["DevType"])
 
76
 
77
  # Now apply full feature transformations for model training
78
  X = prepare_features(df)
@@ -83,18 +90,64 @@ def main():
83
  country_values = df_copy["Country"].dropna().unique().tolist()
84
  edlevel_values = df_copy["EdLevel"].dropna().unique().tolist()
85
  devtype_values = df_copy["DevType"].dropna().unique().tolist()
 
86
 
87
  valid_categories = {
88
  "Country": sorted(country_values),
89
  "EdLevel": sorted(edlevel_values),
90
  "DevType": sorted(devtype_values),
 
91
  }
92
 
93
  valid_categories_path = Path("config/valid_categories.yaml")
94
  with open(valid_categories_path, "w") as f:
95
  yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
96
 
97
- print(f"\nSaved {len(valid_categories['Country'])} valid countries, {len(valid_categories['EdLevel'])} valid education levels, and {len(valid_categories['DevType'])} valid developer types to {valid_categories_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  print(f"\nFeature matrix shape: {X.shape}")
100
  print(f"Total features: {X.shape[1]}")
@@ -122,6 +175,12 @@ def main():
122
  for devtype, count in top_devtype.items():
123
  print(f" - {devtype}: {count:,} ({count/len(df)*100:.1f}%)")
124
125
  # Show YearsCode statistics
126
  print("\n💼 Years of Coding Experience:")
127
  print(f" - Min: {df['YearsCode'].min():.1f}")
@@ -164,25 +223,77 @@ def main():
164
  devtype_name = feature.replace('DevType_', '')
165
  print(f" {i:2d}. {devtype_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
166
 
 
 
 
 
 
 
 
 
167
  print(f"\n📊 Total one-hot encoded features: {len(X.columns)}")
168
  print(" - Numeric: 1 (YearsCode)")
169
  print(f" - Country: {len(country_features)}")
170
  print(f" - Education: {len(edlevel_features)}")
171
  print(f" - DevType: {len(devtype_features)}")
 
172
 
173
  print("=" * 60 + "\n")
174
 
175
- # Split data
176
- X_train, X_test, y_train, y_test = train_test_split(
177
- X, y,
178
- test_size=config['data']['test_size'],
179
- random_state=config['data']['random_state']
180
  )
181
 
182
- # Train model
183
- print("Training XGBoost model...")
184
- model_config = config['model']
185
- model = XGBRegressor(
186
  n_estimators=model_config['n_estimators'],
187
  learning_rate=model_config['learning_rate'],
188
  max_depth=model_config['max_depth'],
@@ -191,27 +302,19 @@ def main():
191
  n_jobs=model_config['n_jobs'],
192
  early_stopping_rounds=model_config['early_stopping_rounds'],
193
  )
194
- model.fit(
195
- X_train,
196
- y_train,
197
- eval_set=[(X_test, y_test)],
198
  verbose=config['training']['verbose'],
199
  )
200
-
201
- print(f"Best iteration: {model.best_iteration + 1} (early stopping at {model.n_estimators} max)")
202
-
203
- # Evaluate
204
- train_score = model.score(X_train, y_train)
205
- test_score = model.score(X_test, y_test)
206
- print(f"Training R2 score: {train_score:.4f}")
207
- print(f"Test R2 score: {test_score:.4f}")
208
 
209
  # Save model and feature columns for inference
210
  model_path = Path(config['training']['model_path'])
211
- model_path.parent.mkdir(parents=True, exist_ok=True) # Ensure directory exists
212
 
213
  artifacts = {
214
- "model": model,
215
  "feature_columns": list(X.columns),
216
  }
217
 
 
7
  import numpy as np
8
  import yaml
9
  from xgboost import XGBRegressor
10
+ from sklearn.model_selection import KFold, train_test_split
11
 
12
  from src.preprocessing import prepare_features, reduce_cardinality
13
 
 
32
  # Load only required columns to save memory
33
  df = pd.read_csv(
34
  data_path,
35
+ usecols=["Country", "YearsCode", "EdLevel", "DevType", "Industry",
36
+ "Currency", "CompTotal", "ConvertedCompYearly"],
37
  )
38
 
39
  print(f"Loaded {len(df):,} rows")
 
44
  # select records with main label more than min_salary threshold
45
  min_salary = config['data']['min_salary']
46
  df = df[df[main_label] > min_salary]
47
+ # Exclude outliers based on percentile bounds PER COUNTRY
48
+ # This preserves records from lower-paid and higher-paid countries
49
+ # that would otherwise be removed by global percentile filtering
50
+ lower_pct = config['data']['lower_percentile'] / 100
51
+ upper_pct = config['data']['upper_percentile'] / 100
52
+ lower_bound = df.groupby("Country")[main_label].transform("quantile", lower_pct)
53
+ upper_bound = df.groupby("Country")[main_label].transform("quantile", upper_pct)
54
+ df = df[(df[main_label] > lower_bound) & (df[main_label] < upper_bound)]
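groupby().transform("quantile", q) broadcasts each country's own bounds back to row shape, so the mask above compares every salary against its country's percentiles rather than global ones — a toy check of the idiom:

```python
import pandas as pd

toy = pd.DataFrame({
    "Country": ["IN", "IN", "IN", "US", "US", "US"],
    "ConvertedCompYearly": [5_000, 20_000, 400_000, 60_000, 150_000, 2_000_000],
})
lo = toy.groupby("Country")["ConvertedCompYearly"].transform("quantile", 0.10)
hi = toy.groupby("Country")["ConvertedCompYearly"].transform("quantile", 0.90)
# Each row is judged against its own country's bounds, so India's 20k survives
# while both countries' extreme tails are dropped.
print(toy[(toy["ConvertedCompYearly"] > lo) & (toy["ConvertedCompYearly"] < hi)])
```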
55
 
56
  print(df.shape)
57
 
 
66
  df_copy["Country"] = df_copy["Country"].str.replace('\u2019', "'", regex=False)
67
  df_copy["EdLevel"] = df_copy["EdLevel"].str.replace('\u2019', "'", regex=False)
68
  df_copy["DevType"] = df_copy["DevType"].str.replace('\u2019', "'", regex=False)
69
+ df_copy["Industry"] = df_copy["Industry"].str.replace('\u2019', "'", regex=False)
70
 
71
  # Apply cardinality reduction
72
  df_copy["Country"] = reduce_cardinality(df_copy["Country"])
73
  df_copy["EdLevel"] = reduce_cardinality(df_copy["EdLevel"])
74
  df_copy["DevType"] = reduce_cardinality(df_copy["DevType"])
75
+ df_copy["Industry"] = reduce_cardinality(df_copy["Industry"])
76
 
77
  # Apply cardinality reduction to the actual training data as well
78
  # (prepare_features no longer does this internally)
79
  df["Country"] = reduce_cardinality(df["Country"])
80
  df["EdLevel"] = reduce_cardinality(df["EdLevel"])
81
  df["DevType"] = reduce_cardinality(df["DevType"])
82
+ df["Industry"] = reduce_cardinality(df["Industry"])
83
 
84
  # Now apply full feature transformations for model training
85
  X = prepare_features(df)
 
90
  country_values = df_copy["Country"].dropna().unique().tolist()
91
  edlevel_values = df_copy["EdLevel"].dropna().unique().tolist()
92
  devtype_values = df_copy["DevType"].dropna().unique().tolist()
93
+ industry_values = df_copy["Industry"].dropna().unique().tolist()
94
 
95
  valid_categories = {
96
  "Country": sorted(country_values),
97
  "EdLevel": sorted(edlevel_values),
98
  "DevType": sorted(devtype_values),
99
+ "Industry": sorted(industry_values),
100
  }
101
 
102
  valid_categories_path = Path("config/valid_categories.yaml")
103
  with open(valid_categories_path, "w") as f:
104
  yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
105
 
106
+ print(f"\nSaved {len(valid_categories['Country'])} valid countries, {len(valid_categories['EdLevel'])} valid education levels, {len(valid_categories['DevType'])} valid developer types, and {len(valid_categories['Industry'])} valid industries to {valid_categories_path}")
107
+
108
+ # Compute currency conversion rates per country
109
+ # Use the original data with Currency and CompTotal columns
110
+ print("\nComputing currency conversion rates per country...")
111
+ currency_df = df[["Country", "Currency", "CompTotal", main_label]].dropna()
112
+ # Extract 3-letter currency code from values like "EUR European Euro"
113
+ currency_df = currency_df.copy()
114
+ currency_df["CurrencyCode"] = currency_df["Currency"].str.split(r"\s+", n=1).str[0]
115
+ currency_df["CurrencyName"] = currency_df["Currency"].str.split(r"\s+", n=1).str[1]
116
+ # Compute conversion rate: local currency / USD
117
+ currency_df["rate"] = currency_df["CompTotal"] / currency_df[main_label]
118
+ # Filter out unreasonable rates (negative, zero, or extreme)
119
+ currency_df = currency_df[(currency_df["rate"] > 0.001) & (currency_df["rate"] < 100000)]
120
+
121
+ currency_rates = {}
122
+ for country in valid_categories["Country"]:
123
+ country_data = currency_df[currency_df["Country"] == country]
124
+ if country_data.empty:
125
+ continue
126
+ # Find the most common currency for this country
127
+ most_common = country_data["CurrencyCode"].mode()
128
+ if most_common.empty:
129
+ continue
130
+ code = most_common.iloc[0]
131
+ # Get the full name from the first matching record
132
+ name_row = country_data[country_data["CurrencyCode"] == code].iloc[0]
133
+ full_name = name_row["CurrencyName"]
134
+ # Compute median conversion rate for this country+currency pair
135
+ rates = country_data[country_data["CurrencyCode"] == code]["rate"]
136
+ median_rate = round(float(rates.median()), 2)
137
+ currency_rates[country] = {
138
+ "code": code,
139
+ "name": full_name,
140
+ "rate": median_rate,
141
+ }
142
+
143
+ currency_rates_path = Path("config/currency_rates.yaml")
144
+ with open(currency_rates_path, "w") as f:
145
+ yaml.dump(currency_rates, f, default_flow_style=False, sort_keys=True,
146
+ allow_unicode=True)
147
+
148
+ print(f"Saved currency rates for {len(currency_rates)} countries to {currency_rates_path}")
149
+ for country, info in sorted(currency_rates.items()):
150
+ print(f" {country:45s} -> {info['code']} ({info['name']}, rate: {info['rate']})")
151
 
152
  print(f"\nFeature matrix shape: {X.shape}")
153
  print(f"Total features: {X.shape[1]}")
 
175
  for devtype, count in top_devtype.items():
176
  print(f" - {devtype}: {count:,} ({count/len(df)*100:.1f}%)")
177
 
178
+ # Show top industries
179
+ print("\n🏢 Top Industries:")
180
+ top_industry = df["Industry"].value_counts().head(10)
181
+ for industry, count in top_industry.items():
182
+ print(f" - {industry}: {count:,} ({count/len(df)*100:.1f}%)")
183
+
184
  # Show YearsCode statistics
185
  print("\n💼 Years of Coding Experience:")
186
  print(f" - Min: {df['YearsCode'].min():.1f}")
 
223
  devtype_name = feature.replace('DevType_', '')
224
  print(f" {i:2d}. {devtype_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
225
 
226
+ # Industry features
227
+ print("\n🏢 Top 10 Industry Features (most common):")
228
+ industry_features = categorical_features[categorical_features.index.str.startswith('Industry_')]
229
+ for i, (feature, count) in enumerate(industry_features.head(10).items(), 1):
230
+ percentage = (count / len(X)) * 100
231
+ industry_name = feature.replace('Industry_', '')
232
+ print(f" {i:2d}. {industry_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
233
+
234
  print(f"\n📊 Total one-hot encoded features: {len(X.columns)}")
235
  print(" - Numeric: 1 (YearsCode)")
236
  print(f" - Country: {len(country_features)}")
237
  print(f" - Education: {len(edlevel_features)}")
238
  print(f" - DevType: {len(devtype_features)}")
239
+ print(f" - Industry: {len(industry_features)}")
240
 
241
  print("=" * 60 + "\n")
242
 
243
+ # Cross-validation for robust evaluation
244
+ n_splits = config['data'].get('cv_splits', 5)
245
+ random_state = config['data']['random_state']
246
+ model_config = config['model']
247
+
248
+ print(f"Running {n_splits}-fold cross-validation...")
249
+ kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
250
+
251
+ train_scores = []
252
+ test_scores = []
253
+ best_iterations = []
254
+
255
+ for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
256
+ X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
257
+ y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
258
+
259
+ model = XGBRegressor(
260
+ n_estimators=model_config['n_estimators'],
261
+ learning_rate=model_config['learning_rate'],
262
+ max_depth=model_config['max_depth'],
263
+ min_child_weight=model_config['min_child_weight'],
264
+ random_state=model_config['random_state'],
265
+ n_jobs=model_config['n_jobs'],
266
+ early_stopping_rounds=model_config['early_stopping_rounds'],
267
+ )
268
+ model.fit(
269
+ X_train, y_train,
270
+ eval_set=[(X_test, y_test)],
271
+ verbose=False,
272
+ )
273
+
274
+ train_r2 = model.score(X_train, y_train)
275
+ test_r2 = model.score(X_test, y_test)
276
+ train_scores.append(train_r2)
277
+ test_scores.append(test_r2)
278
+ best_iterations.append(model.best_iteration + 1)
279
+ print(f" Fold {fold}: Train R2 = {train_r2:.4f}, Test R2 = {test_r2:.4f} (best iter: {model.best_iteration + 1})")
280
+
281
+ avg_train = np.mean(train_scores)
282
+ avg_test = np.mean(test_scores)
283
+ std_test = np.std(test_scores)
284
+ avg_best_iter = int(np.mean(best_iterations))
285
+ print(f"\nCV Average Train R2: {avg_train:.4f}")
286
+ print(f"CV Average Test R2: {avg_test:.4f} (+/- {std_test:.4f})")
287
+ print(f"CV Average best iteration: {avg_best_iter}")
288
+
289
+ # Train final model on all data for deployment
290
+ # Use a small held-out split for early stopping only
291
+ print("\nTraining final model on full dataset...")
292
+ X_train_final, X_es, y_train_final, y_es = train_test_split(
293
+ X, y, test_size=0.1, random_state=random_state
294
  )
295
 
296
+ final_model = XGBRegressor(
297
  n_estimators=model_config['n_estimators'],
298
  learning_rate=model_config['learning_rate'],
299
  max_depth=model_config['max_depth'],
 
302
  n_jobs=model_config['n_jobs'],
303
  early_stopping_rounds=model_config['early_stopping_rounds'],
304
  )
305
+ final_model.fit(
306
+ X_train_final, y_train_final,
307
+ eval_set=[(X_es, y_es)],
 
308
  verbose=config['training']['verbose'],
309
  )
310
+ print(f"Final model best iteration: {final_model.best_iteration + 1}")
 
 
 
 
 
 
 
311
 
312
  # Save model and feature columns for inference
313
  model_path = Path(config['training']['model_path'])
314
+ model_path.parent.mkdir(parents=True, exist_ok=True)
315
 
316
  artifacts = {
317
+ "model": final_model,
318
  "feature_columns": list(X.columns),
319
  }
320
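All hyperparameters flow in from a YAML config; the file's path and layout are outside this diff, so the sketch below only exercises the keys the script demonstrably reads (config/config.yaml is an assumption):

```python
import yaml

# Path is an assumption; only the key names below appear in src/train.py.
with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

print(config["data"]["min_salary"], config["data"].get("cv_splits", 5))
print(config["model"]["n_estimators"], config["model"]["early_stopping_rounds"])
print(config["training"]["model_path"], config["training"]["verbose"])
```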
 
test_feature_impact.py ADDED
@@ -0,0 +1,373 @@
1
+ """Test that changing input features actually changes predictions."""
2
+
3
+ from src.schema import SalaryInput
4
+ from src.infer import predict_salary, valid_categories
5
+
6
+
7
+ def test_years_experience_impact():
8
+ """Test that changing years of experience changes prediction."""
9
+ print("\n" + "=" * 70)
10
+ print("TEST 1: Total Years of Coding Impact")
11
+ print("=" * 70)
12
+
13
+ base_input = {
14
+ "country": "United States of America",
15
+ "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
16
+ "dev_type": "Developer, full-stack",
17
+ "industry": "Software Development",
18
+ }
19
+
20
+ # Test with different years of experience
21
+ years_tests = [0, 2, 5, 10, 20]
22
+ predictions = []
23
+
24
+ for years in years_tests:
25
+ input_data = SalaryInput(**base_input, years_code=years)
26
+ salary = predict_salary(input_data)
27
+ predictions.append(salary)
28
+ print(f" Years: {years:2d} -> Salary: ${salary:,.2f}")
29
+
30
+ # Check if predictions are different
31
+ unique_predictions = len(set(predictions))
32
+ if unique_predictions == len(predictions):
33
+ print(f"\n✅ PASS: All {len(predictions)} predictions are different")
34
+ return True
35
+ else:
36
+ print(f"\n❌ FAIL: Only {unique_predictions}/{len(predictions)} unique predictions")
37
+ return False
38
+
39
+
40
+ def test_country_impact():
41
+ """Test that changing country changes prediction."""
42
+ print("\n" + "=" * 70)
43
+ print("TEST 2: Country Impact")
44
+ print("=" * 70)
45
+
46
+ base_input = {
47
+ "years_code": 5.0,
48
+ "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
49
+ "dev_type": "Developer, full-stack",
50
+ "industry": "Software Development",
51
+ }
52
+
53
+ # Test with different countries (select diverse ones)
54
+ test_countries = [
55
+ "United States of America",
56
+ "Germany",
57
+ "India",
58
+ "Brazil",
59
+ "Poland"
60
+ ]
61
+
62
+ # Filter to only countries that exist in valid categories
63
+ test_countries = [c for c in test_countries if c in valid_categories["Country"]]
64
+
65
+ predictions = []
66
+ for country in test_countries:
67
+ input_data = SalaryInput(**base_input, country=country)
68
+ salary = predict_salary(input_data)
69
+ predictions.append(salary)
70
+ print(f" Country: {country:40s} -> Salary: ${salary:,.2f}")
71
+
72
+ # Check if predictions are different
73
+ unique_predictions = len(set(predictions))
74
+ if unique_predictions == len(predictions):
75
+ print(f"\n✅ PASS: All {len(predictions)} predictions are different")
76
+ return True
77
+ elif unique_predictions == 1:
78
+ print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
79
+ print(" This indicates the model is NOT using country as a feature!")
80
+ return False
81
+ else:
82
+ print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
83
+ print(f" Duplicate salaries found - possible feature issue")
84
+ return False
85
+
86
+
87
+ def test_education_impact():
88
+ """Test that changing education level changes prediction."""
89
+ print("\n" + "=" * 70)
90
+ print("TEST 3: Education Level Impact")
91
+ print("=" * 70)
92
+
93
+ base_input = {
94
+ "country": "United States of America",
95
+ "years_code": 5.0,
96
+ "dev_type": "Developer, full-stack",
97
+ "industry": "Software Development",
98
+ }
99
+
100
+ # Test with different education levels
101
+ test_education = [
102
+ "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",
103
+ "Some college/university study without earning a degree",
104
+ "Associate degree (A.A., A.S., etc.)",
105
+ "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
106
+ "Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
107
+ "Professional degree (JD, MD, Ph.D, Ed.D, etc.)",
108
+ ]
109
+
110
+ # Filter to only education levels that exist in valid categories
111
+ test_education = [e for e in test_education if e in valid_categories["EdLevel"]]
112
+
113
+ predictions = []
114
+ for education in test_education:
115
+ input_data = SalaryInput(**base_input, education_level=education)
116
+ salary = predict_salary(input_data)
117
+ predictions.append(salary)
118
+ print(f" Education: {education[:50]:50s} -> Salary: ${salary:,.2f}")
119
+
120
+ # Check if predictions are different
121
+ unique_predictions = len(set(predictions))
122
+ if unique_predictions == len(predictions):
123
+ print(f"\n✅ PASS: All {len(predictions)} predictions are different")
124
+ return True
125
+ elif unique_predictions == 1:
126
+ print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
127
+ print(" This indicates the model is NOT using education level as a feature!")
128
+ return False
129
+ else:
130
+ print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
131
+ print(f" Duplicate salaries found - possible feature issue")
132
+ return False
133
+
134
+
135
+ def test_devtype_impact():
136
+ """Test that changing developer type changes prediction."""
137
+ print("\n" + "=" * 70)
138
+ print("TEST 4: Developer Type Impact")
139
+ print("=" * 70)
140
+
141
+ base_input = {
142
+ "country": "United States of America",
143
+ "years_code": 5.0,
144
+ "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
145
+ "industry": "Software Development",
146
+ }
147
+
148
+ # Test with different developer types (using actual values from trained model)
149
+ test_devtypes = [
150
+ "Developer, front-end",
151
+ "Developer, back-end",
152
+ "Developer, full-stack",
153
+ "Data scientist",
154
+ "Engineering manager",
155
+ "DevOps engineer or professional",
156
+ ]
157
+
158
+ # Filter to only developer types that exist in valid categories
159
+ test_devtypes = [d for d in test_devtypes if d in valid_categories["DevType"]]
160
+
161
+ predictions = []
162
+ for devtype in test_devtypes:
163
+ input_data = SalaryInput(**base_input, dev_type=devtype)
164
+ salary = predict_salary(input_data)
165
+ predictions.append(salary)
166
+ print(f" Dev Type: {devtype[:50]:50s} -> Salary: ${salary:,.2f}")
167
+
168
+ # Check if predictions are different
169
+ unique_predictions = len(set(predictions))
170
+ if unique_predictions == len(predictions):
171
+ print(f"\n✅ PASS: All {len(predictions)} predictions are different")
172
+ return True
173
+ elif unique_predictions == 1:
174
+ print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
175
+ print(" This indicates the model is NOT using developer type as a feature!")
176
+ return False
177
+ else:
178
+ print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
179
+ print(f" Duplicate salaries found - possible feature issue")
180
+ return False
181
+
182
+
183
+ def test_industry_impact():
184
+ """Test that changing industry changes prediction."""
185
+ print("\n" + "=" * 70)
186
+ print("TEST 5: Industry Impact")
187
+ print("=" * 70)
188
+
189
+ base_input = {
190
+ "country": "United States of America",
191
+ "years_code": 5.0,
192
+ "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
193
+ "dev_type": "Developer, full-stack",
194
+ }
195
+
196
+ # Test with different industries (using actual values from trained model)
197
+ test_industries = [
198
+ "Software Development",
199
+ "Fintech",
200
+ "Banking/Financial Services",
201
+ "Healthcare",
202
+ "Manufacturing",
203
+ "Government",
204
+ ]
205
+
206
+ # Filter to only industries that exist in valid categories
207
+ test_industries = [i for i in test_industries if i in valid_categories["Industry"]]
208
+
209
+ predictions = []
210
+ for industry in test_industries:
211
+ input_data = SalaryInput(**base_input, industry=industry)
212
+ salary = predict_salary(input_data)
213
+ predictions.append(salary)
214
+ print(f" Industry: {industry[:50]:50s} -> Salary: ${salary:,.2f}")
215
+
216
+ # Check if predictions are different
217
+ unique_predictions = len(set(predictions))
218
+ if unique_predictions == len(predictions):
219
+ print(f"\n✅ PASS: All {len(predictions)} predictions are different")
220
+ return True
221
+ elif unique_predictions == 1:
222
+ print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
223
+ print(" This indicates the model is NOT using industry as a feature!")
224
+ return False
225
+ else:
226
+ print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
227
+ print(f" Duplicate salaries found - possible feature issue")
228
+ return False
229
+
230
+
231
+ def test_combined_features():
232
+ """Test that combining different features produces expected variations."""
233
+ print("\n" + "=" * 70)
234
+ print("TEST 6: Combined Feature Variations")
235
+ print("=" * 70)
236
+
237
+ # Create diverse combinations (using actual values from trained model)
238
+ test_cases = [
239
+ ("India", 2, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, back-end", "Software Development"),
240
+ ("Germany", 5, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Developer, full-stack", "Manufacturing"),
241
+ ("United States of America", 10, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Engineering manager", "Fintech"),
242
+ ("Poland", 15, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, front-end", "Healthcare"),
243
+ ("Brazil", 5, "Some college/university study without earning a degree", "DevOps engineer or professional", "Government"),
244
+ ]
245
+
246
+ predictions = []
247
+ for country, years, education, devtype, industry in test_cases:
248
+ # Skip if not in valid categories
249
+ if (country not in valid_categories["Country"]
250
+ or education not in valid_categories["EdLevel"]
251
+ or devtype not in valid_categories["DevType"]
252
+ or industry not in valid_categories["Industry"]):
253
+ continue
254
+
255
+ input_data = SalaryInput(
256
+ country=country,
257
+ years_code=years,
258
+ education_level=education,
259
+ dev_type=devtype,
260
+ industry=industry,
261
+ )
262
+ salary = predict_salary(input_data)
263
+ predictions.append(salary)
264
+ print(f" {country[:15]:15s} | {years:2d}y | {education[:25]:25s} | {devtype[:25]:25s} | {industry[:20]:20s} -> ${salary:,.2f}")
265
+
266
+ # Check if predictions are different
267
+ unique_predictions = len(set(predictions))
268
+ if unique_predictions == len(predictions):
269
+ print(f"\n✅ PASS: All {len(predictions)} combined predictions are different")
270
+ return True
271
+ else:
272
+ print(f"\n⚠️ Only {unique_predictions}/{len(predictions)} unique predictions")
273
+ print(f" Some combinations produce identical salaries")
274
+ return False
275
+
276
+
277
+ def print_feature_analysis():
278
+ """Analyze which features the model is actually using."""
279
+ print("\n" + "=" * 70)
280
+ print("FEATURE ANALYSIS")
281
+ print("=" * 70)
282
+
283
+ from src.infer import feature_columns
284
+
285
+ print(f"\nTotal features in model: {len(feature_columns)}")
286
+
287
+ # Count by type
288
+ country_features = [f for f in feature_columns if f.startswith('Country_')]
289
+ edlevel_features = [f for f in feature_columns if f.startswith('EdLevel_')]
290
+ devtype_features = [f for f in feature_columns if f.startswith('DevType_')]
291
+ industry_features = [f for f in feature_columns if f.startswith('Industry_')]
292
+ numeric_features = [f for f in feature_columns if not f.startswith(('Country_', 'EdLevel_', 'DevType_', 'Industry_'))]
293
+
294
+ print(f" - Numeric features: {len(numeric_features)} -> {numeric_features}")
295
+ print(f" - Country features: {len(country_features)}")
296
+ print(f" - Education features: {len(edlevel_features)}")
297
+ print(f" - DevType features: {len(devtype_features)}")
298
+ print(f" - Industry features: {len(industry_features)}")
299
+
300
+ if len(country_features) > 0:
301
+ print(f"\nSample country features:")
302
+ for feat in country_features[:5]:
303
+ print(f" - {feat}")
304
+
305
+ if len(edlevel_features) > 0:
306
+ print(f"\nSample education features:")
307
+ for feat in edlevel_features[:5]:
308
+ print(f" - {feat}")
309
+
310
+ if len(devtype_features) > 0:
311
+ print(f"\nSample developer type features:")
312
+ for feat in devtype_features[:5]:
313
+ print(f" - {feat}")
314
+
315
+ if len(industry_features) > 0:
316
+ print(f"\nSample industry features:")
317
+ for feat in industry_features[:5]:
318
+ print(f" - {feat}")
319
+
320
+ # Check if there are any features at all
321
+ if len(country_features) == 0:
322
+ print("\n⚠️ WARNING: No country features found!")
323
+ if len(edlevel_features) == 0:
324
+ print("\n⚠️ WARNING: No education features found!")
325
+ if len(devtype_features) == 0:
326
+ print("\n⚠️ WARNING: No developer type features found!")
327
+ if len(industry_features) == 0:
328
+ print("\n⚠️ WARNING: No industry features found!")
329
+
330
+
331
+ def main():
332
+ """Run all tests."""
333
+ print("\n" + "=" * 70)
334
+ print("FEATURE IMPACT TESTS")
335
+ print("Testing if changing inputs actually changes predictions")
336
+ print("=" * 70)
337
+
338
+ # First, analyze what features exist
339
+ print_feature_analysis()
340
+
341
+ # Run all tests
342
+ results = {
343
+ "Years of Experience": test_years_experience_impact(),
344
+ "Country": test_country_impact(),
345
+ "Education Level": test_education_impact(),
346
+ "Developer Type": test_devtype_impact(),
347
+ "Industry": test_industry_impact(),
348
+ "Combined Features": test_combined_features(),
349
+ }
350
+
351
+ # Summary
352
+ print("\n" + "=" * 70)
353
+ print("TEST SUMMARY")
354
+ print("=" * 70)
355
+
356
+ for test_name, passed in results.items():
357
+ status = "✅ PASS" if passed else "❌ FAIL"
358
+ print(f" {status} - {test_name}")
359
+
360
+ passed_count = sum(results.values())
361
+ total_count = len(results)
362
+
363
+ print(f"\n{passed_count}/{total_count} tests passed")
364
+
365
+ if passed_count == total_count:
366
+ print("\n🎉 All tests passed! The model is using all features correctly.")
367
+ else:
368
+ print("\n⚠️ Some tests failed. The model may not be using all features properly.")
369
+ print(" This indicates potential training-testing skew or feature engineering issues.")
370
+
371
+
372
+ if __name__ == "__main__":
373
+ main()
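These checks print and return booleans rather than assert, so a runner like pytest would not actually fail on them. A hypothetical assert-style variant of the country check, should the suite ever migrate:

```python
# Hypothetical pytest-style rewrite: assert instead of returning booleans,
# so identical predictions across countries fail loudly under a test runner.
from src.schema import SalaryInput
from src.infer import predict_salary

def test_country_changes_prediction():
    base = dict(
        years_code=5.0,
        education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
        dev_type="Developer, full-stack",
        industry="Software Development",
    )
    salaries = {
        country: predict_salary(SalaryInput(**base, country=country))
        for country in ("United States of America", "Germany", "India")
    }
    assert len(set(salaries.values())) == len(salaries), salaries
```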
test_fix.py ADDED
@@ -0,0 +1,43 @@
1
+ """Test that the encoding fix works."""
2
+
3
+ # Force reload of modules
4
+ import sys
5
+ if 'src.preprocessing' in sys.modules:
6
+ del sys.modules['src.preprocessing']
7
+ if 'src.infer' in sys.modules:
8
+ del sys.modules['src.infer']
9
+
10
+ from src.preprocessing import prepare_features
11
+ import pandas as pd
12
+
13
+ # Create test inputs with different countries (values from valid_categories)
14
+ input1 = pd.DataFrame({
15
+ 'Country': ['United States of America'],
16
+ 'YearsCode': [5.0],
17
+ 'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
18
+ 'DevType': ['Developer, full-stack'], 'Industry': ['Software Development']
19
+ })
20
+
21
+ input2 = pd.DataFrame({
22
+ 'Country': ['Germany'],
23
+ 'YearsCode': [5.0],
24
+ 'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
25
+ 'DevType': ['Developer, full-stack'], 'Industry': ['Software Development']
26
+ })
27
+
28
+ print("Testing prepare_features with different countries...")
29
+ features1 = prepare_features(input1)
30
+ features2 = prepare_features(input2)
31
+
32
+ print(f"\nUSA features: {features1.shape}")
33
+ print(f"Columns: {list(features1.columns)[:10]}")
34
+
35
+ print(f"\nGermany features: {features2.shape}")
36
+ print(f"Columns: {list(features2.columns)[:10]}")
37
+
38
+ print(f"\nAre they different? {not features1.equals(features2)}")
39
+
40
+ if features1.shape[1] > 1:
41
+ print("\n✅ SUCCESS: Categorical features are preserved!")
42
+ else:
43
+ print("\n❌ FAIL: Still only has numeric features")
uv.lock ADDED
The diff for this file is too large to render. See raw diff