dima806 committed
Commit 55cdb7e · verified · 1 Parent(s): 603808b

Upload 25 files

.dockerignore ADDED
@@ -0,0 +1,53 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ *.egg-info/
+ dist/
+ build/
+ *.egg
+
+ # Virtual environments
+ .venv/
+ venv/
+ ENV/
+ env/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Git
+ .git/
+ .gitignore
+ .gitattributes
+
+ # Data (don't include raw survey data in Docker image)
+ data/
+
+ # Testing and development
+ test*.py
+ debug*.py
+ diagnose*.py
+ example_inference.py
+
+ # Documentation
+ .llm/
+ *.md
+ !README.md
+
+ # CI/CD
+ .github/
+
+ # Project specific
+ pyproject.toml
+ uv.lock
.gitignore ADDED
@@ -0,0 +1,220 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+ #poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ #pdm.lock
+ #pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ #pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Cursor
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+ # refer to https://docs.cursor.com/context/ignore-files
+ .cursorignore
+ .cursorindexingignore
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ # Project-specific
+ # Data files (too large for git)
+ data/*.csv
+ data/*.zip
+
+ # Trained model artifacts
+ # Note: Model files are included in the repo for deployment
+ # models/*.pkl
+ # models/*.joblib
+
+ # LLM
+ .llm/
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
Claude.md ADDED
@@ -0,0 +1,226 @@
+ # Claude Development Guide
+
+ ## Project Overview
+ This is a minimal, local-first ML application built in Python that predicts developer salaries using Stack Overflow Developer Survey data. The project emphasizes clarity and simplicity over production completeness.
+
+ ## Tech Stack
+ - **Python 3.12+**
+ - **uv** - Package & virtual environment management
+ - **pandas** - Data manipulation
+ - **scikit-learn** - ML modeling
+ - **pydantic** - Input validation
+ - **streamlit** - Web UI
+ - **xgboost** - Advanced gradient boosting (optional)
+
+ ## Project Structure
+ ```
+ .
+ ├── data/
+ │   └── survey_results_public.csv    # Stack Overflow survey data
+ ├── models/
+ │   └── model.pkl                    # Serialized trained model
+ ├── src/
+ │   ├── schema.py                    # Pydantic validation models
+ │   ├── train.py                     # Model training script
+ │   └── infer.py                     # Inference utilities
+ ├── app.py                           # Streamlit web application
+ ├── example_inference.py             # Example inference script
+ ├── pyproject.toml                   # Project dependencies (uv)
+ ├── uv.lock                          # Locked dependencies
+ └── README.md                        # Project documentation
+ ```
+
+ ## Setup & Installation
+
+ ### Initial Setup
+ ```bash
+ # The virtual environment is already created at .venv/
+ # Activate it:
+ source .venv/bin/activate  # On Linux/Mac
+ # or
+ .venv\Scripts\activate     # On Windows
+
+ # Install/sync dependencies with uv:
+ uv sync
+ ```
+
+ ### Adding New Dependencies
+ ```bash
+ uv add <package-name>
+ ```
+
+ ## Key Workflows
+
+ ### Training the Model
+ ```bash
+ python src/train.py
+ ```
+ This will:
+ - Load data from `data/survey_results_public.csv`
+ - Clean and preprocess features
+ - Train the regression model
+ - Save the model to `models/model.pkl`
+
+ ### Running the Streamlit App
+ ```bash
+ streamlit run app.py
+ ```
+ Opens a browser interface for salary predictions.
+
+ ### Running Inference Programmatically
+ ```python
+ from src.schema import SalaryInput
+ from src.infer import predict_salary
+
+ input_data = SalaryInput(
+     country="United States",
+     years_code=5.0,
+     education_level="Bachelor's degree",
+     dev_type="Developer, back-end",
+     industry="Software Development"
+ )
+ salary = predict_salary(input_data)
+ ```
+
+ ## Key Files
+
+ ### [src/schema.py](src/schema.py)
+ Contains Pydantic models for:
+ - Input validation (`SalaryInput`)
+ - Type safety across the application
+
+ ### [src/train.py](src/train.py)
+ Training pipeline:
+ - Data loading and cleaning
+ - Feature engineering
+ - Model training
+ - Model persistence
+
+ ### [src/infer.py](src/infer.py)
+ Inference utilities:
+ - Model loading
+ - Prediction logic
+ - Validated input processing
+
+ ### [app.py](app.py)
+ Streamlit UI:
+ - User input forms
+ - Real-time predictions
+ - Results visualization
+
+ ## Development Guidelines
+
+ ### Code Style
+ - Keep code simple and readable
+ - Total codebase should remain under ~200 lines
+ - Focus on clarity over cleverness
+ - Use type hints where helpful
+
+ ### Data Requirements
+ The dataset must include these columns:
+ - `Country` - Developer location
+ - `YearsCode` - Total years of coding (including education)
+ - `EdLevel` - Education level
+ - `DevType` - Developer type
+ - `Industry` - Industry the developer works in
+ - `ConvertedCompYearly` - Annual salary (target variable)
+
+ ### Model Expectations
+ - Basic regression model (LinearRegression or similar)
+ - Simple feature encoding (one-hot for categoricals)
+ - No hyperparameter tuning required
+ - Focus on a working end-to-end pipeline
+
+ ## Common Tasks
+
+ ### Debugging Training Issues
+ 1. Check if the data file exists: `ls -la data/`
+ 2. Verify CSV columns: `head -1 data/survey_results_public.csv`
+ 3. Check for missing values in the target column
+ 4. Review data types and encoding
+
+ ### Updating Features
+ 1. Modify the `SalaryInput` schema in [src/schema.py](src/schema.py)
+ 2. Update feature extraction in [src/train.py](src/train.py)
+ 3. Update inference logic in [src/infer.py](src/infer.py)
+ 4. Update UI inputs in [app.py](app.py)
+ 5. Retrain the model
+
+ ### Testing Predictions
+ ```python
+ # Quick test in Python REPL
+ from src.infer import predict_salary
+ from src.schema import SalaryInput
+
+ test_input = SalaryInput(
+     country="United States",
+     years_code=3.0,
+     education_level="Bachelor's degree",
+     dev_type="Developer, back-end",
+     industry="Software Development"
+ )
+ print(predict_salary(test_input))
+ ```
+
+ ## Non-Goals (Intentionally Excluded)
+ - Cloud deployment or serving
+ - Hyperparameter tuning
+ - Model registry or experiment tracking
+ - Advanced feature engineering
+ - Production monitoring
+ - API endpoints (beyond Streamlit)
+
+ ## Useful Commands
+
+ ```bash
+ # Check environment
+ which python
+ python --version
+
+ # Verify uv installation
+ uv --version
+
+ # List installed packages
+ uv pip list
+
+ # Run with specific Python version
+ uv run python src/train.py
+
+ # Clean generated files
+ rm -f models/model.pkl
+
+ # Check data file size
+ du -h data/survey_results_public.csv
+ ```
+
+ ## Troubleshooting
+
+ ### Model file not found
+ - Run training first: `python src/train.py`
+ - Check the file exists: `ls -la models/model.pkl`
+
+ ### Missing dependencies
+ - Sync the environment: `uv sync`
+ - Verify pyproject.toml has all required packages
+
+ ### Data file issues
+ - Ensure the CSV is in the `data/` directory
+ - Check the file encoding (should be UTF-8)
+ - Verify required columns exist
+
+ ### Streamlit won't start
+ - Check port 8501 is available
+ - Try specifying a port: `streamlit run app.py --server.port 8502`
+
+ ## Additional Resources
+ - [PRD](.llm/prd.md) - Full product requirements
+ - [README.md](README.md) - Project readme
+ - [Stack Overflow Survey](https://insights.stackoverflow.com/survey) - Data source
+
+ ## Working with Claude Code
+ When asking Claude to help with this project:
+ - Reference specific files using markdown links: [filename](path)
+ - Be specific about which component needs changes
+ - Mention if you need training, inference, or UI updates
+ - Provide error messages in full when debugging
+ - Ask for explanations of model choices if unclear
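For quick reference, the following is a minimal sketch of what the `SalaryInput` model described in this guide could look like, inferred from the usage examples above. The `Field` bounds mirror the 0-50 range of the Streamlit number input and are an assumption, not necessarily the repository's actual definition:

```python
# Hypothetical reconstruction of src/schema.py's SalaryInput, based on
# how it is called in Claude.md, example_inference.py, and app.py.
from pydantic import BaseModel, Field


class SalaryInput(BaseModel):
    """Validated features for a single salary prediction."""

    country: str
    years_code: float = Field(ge=0, le=50)  # total years coding, incl. education (assumed bounds)
    education_level: str
    dev_type: str
    industry: str
```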
Dockerfile CHANGED
@@ -20,4 +20,4 @@ EXPOSE 8501
 
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
- ENTRYPOINT ["streamlit", "run", "./app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
LICENSE ADDED
@@ -0,0 +1,201 @@
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!) The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
README.md CHANGED
@@ -1,20 +1,305 @@
- ---
- title: Developer Salary Prediction
- emoji: 🚀
- colorFrom: red
- colorTo: red
- sdk: docker
- app_port: 8501
- tags:
- - streamlit
- pinned: false
- short_description: Developer salary prediction using 2025 Stackoverflow survey
- license: apache-2.0
- ---
-
- # Welcome to Streamlit!
-
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
+ # Developer Salary Prediction
+
+ A minimal, local-first ML application that predicts developer salaries using Stack Overflow Developer Survey data. Built with Python, scikit-learn, Pydantic, and Streamlit.
+
+ ## Features
+
+ - 🎯 XGBoost (gradient boosting) model for salary prediction
+ - ✅ Input validation with Pydantic
+ - 🌐 Interactive web UI with Streamlit
+ - 📊 Trained on Stack Overflow Developer Survey data
+ - 🔧 Easy setup with `uv` package manager
+
+ ## Quick Start
+
+ ### 1. Install Dependencies
+
+ ```bash
+ uv sync
+ ```
+
+ ### 2. Download Data
+
+ Download the Stack Overflow Developer Survey CSV file:
+
+ 1. Visit: https://insights.stackoverflow.com/survey
+ 2. Download the latest survey results (2024 or 2025)
+ 3. Extract the `survey_results_public.csv` file
+ 4. Place it in the `data/` directory:
+    ```
+    data/survey_results_public.csv
+    ```
+
+ **Required columns:** `Country`, `YearsCode`, `EdLevel`, `DevType`, `Industry`, `ConvertedCompYearly`
+
+ ### 3. Train the Model
+
+ ```bash
+ uv run python -m src.train
+ ```
+
+ This will:
+ - Load configuration from `config/model_parameters.yaml`
+ - Load and preprocess the survey data (with cardinality reduction)
+ - Train an XGBoost model with early stopping
+ - Save the model to `models/model.pkl`
+ - Generate `config/valid_categories.yaml` with valid country, education, developer type, and industry values
+
+ ### 4. Run the Streamlit App
+
+ ```bash
+ uv run streamlit run app.py
+ ```
+
+ The app will open in your browser at `http://localhost:8501`.
+
+ ## Usage
+
+ ### Web Interface
+
+ Launch the Streamlit app and enter:
+ - **Country**: Developer's country
+ - **Years of Coding (Total)**: Total years coding, including education
+ - **Education Level**: Highest degree completed
+ - **Developer Type**: Primary developer role
+ - **Industry**: Industry the developer works in
+
+ Click "Predict Salary" to see the estimated annual salary.
+
+ ### Programmatic Usage
+
+ **Quick example:**
+
+ ```python
+ from src.schema import SalaryInput
+ from src.infer import predict_salary
+
+ # Create input
+ input_data = SalaryInput(
+     country="United States of America",
+     years_code=5.0,
+     education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
+     dev_type="Developer, full-stack",
+     industry="Software Development"
+ )
+
+ # Get prediction
+ salary = predict_salary(input_data)
+ print(f"Estimated salary: ${salary:,.0f}")
+ ```
+
+ **Run the example script:**
+
+ ```bash
+ uv run python example_inference.py
+ ```
+
+ This will show predictions for multiple sample scenarios (junior, mid-level, and senior developers, different countries).
+
+ ## Input Validation
+
+ The model validates inputs against the actual training data categories:
+
+ - **Valid Countries**: Only countries from `config/valid_categories.yaml` (~21 countries)
+ - **Valid Education Levels**: Only education levels from the training data (~9 levels)
+ - **Valid Developer Types**: Only developer types from the training data (~20 types)
+ - **Valid Industries**: Only industries from the training data (~15 industries)
+
+ The Streamlit app uses dropdown menus with only valid options. If you use the programmatic API with invalid values, you'll get a helpful error message pointing to the valid categories file.
+
+ **Example validation:**
+ ```python
+ from src.infer import predict_salary
+ from src.schema import SalaryInput
+
+ # This will raise ValueError - Japan is not in the training data after cardinality reduction
+ invalid_input = SalaryInput(
+     country="Japan",  # Invalid!
+     years_code=5.0,
+     education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
+     dev_type="Developer, back-end",
+     industry="Software Development"
+ )
+ ```
+
+ **View valid categories:**
+ ```bash
+ cat config/valid_categories.yaml
+ ```
+
+ ## Configuration
+
+ Model parameters are centralized in [config/model_parameters.yaml](config/model_parameters.yaml). You can customize:
+
+ - **Data Processing**: Salary thresholds, percentile bounds, train/test split ratio
+ - **Feature Engineering**: Cardinality reduction settings (max categories, min frequency)
+ - **Model Hyperparameters**: Learning rate, tree depth, early stopping, etc.
+ - **Training Settings**: Verbosity, model save path
+
+ **To modify parameters:**
+
+ ```bash
+ # Edit the config file
+ nano config/model_parameters.yaml
+
+ # Then retrain the model
+ uv run python -m src.train
+ ```
+
+ **Example parameter changes:**
+ ```yaml
+ # Increase model complexity
+ model:
+   max_depth: 8          # Default: 6
+   n_estimators: 10000   # Default: 5000
+
+ # Keep more categories
+ features:
+   cardinality:
+     max_categories: 30  # Default: 20
+     min_frequency: 100  # Default: 50
+ ```
+
+ ## Project Structure
+
+ ```
+ .
+ ├── config/
+ │   ├── model_parameters.yaml        # Model configuration
+ │   └── valid_categories.yaml        # Valid input categories (generated)
+ ├── data/
+ │   └── survey_results_public.csv    # Stack Overflow survey data (download required)
+ ├── models/
+ │   └── model.pkl                    # Trained model (generated)
+ ├── src/
+ │   ├── __init__.py                  # Package initialization
+ │   ├── schema.py                    # Pydantic models
+ │   ├── preprocessing.py             # Feature engineering utilities
+ │   ├── train.py                     # Training script
+ │   └── infer.py                     # Inference utilities
+ ├── app.py                           # Streamlit web app
+ ├── example_inference.py             # Example inference script
+ ├── pyproject.toml                   # Project dependencies
+ └── README.md                        # This file
+ ```
+
+ ## Tech Stack
+
+ - **Python 3.12+**
+ - **uv** - Package manager
+ - **pandas** - Data manipulation
+ - **xgboost** - Gradient boosting model
+ - **scikit-learn** - ML utilities (train/test split)
+ - **pydantic** - Data validation
+ - **streamlit** - Web UI
+
+ ## Development
+
+ For detailed development information, see [Claude.md](Claude.md).
+
+ ### Re-training the Model
+
+ If you want to use a different survey year or update the model:
+
+ ```bash
+ # Place new CSV in data/ directory
+ uv run python -m src.train
+ ```
+
+ ### Running Tests
+
+ **Quick one-liner test:**
+ ```bash
+ uv run python -c "from src.schema import SalaryInput; from src.infer import predict_salary; test = SalaryInput(country='United States of America', years_code=5.0, education_level='Bachelor'\''s degree (B.A., B.S., B.Eng., etc.)', dev_type='Developer, full-stack', industry='Software Development'); print(f'Prediction: \${predict_salary(test):,.0f}')"
+ ```
+
+ **Or run the full example script:**
+ ```bash
+ uv run python example_inference.py
+ ```
+
+ ## Deployment
+
+ ### Hugging Face Spaces
+
+ This application is Docker-ready for deployment on Hugging Face Spaces:
+
+ **1. Build the Docker image:**
+ ```bash
+ docker build -t developer-salary-predictor .
+ ```
+
+ **2. Test locally:**
+ ```bash
+ docker run -p 8501:8501 developer-salary-predictor
+ ```
+
+ Then visit `http://localhost:8501`.
+
+ **3. Deploy to Hugging Face:**
+
+ 1. Create a new Space on [Hugging Face](https://huggingface.co/new-space)
+ 2. Select "Docker" as the SDK
+ 3. Clone your Space repository
+ 4. Copy these files to your Space:
+
+    ```text
+    Dockerfile
+    requirements.txt
+    app.py
+    src/
+    config/
+    models/
+    ```
+
+ 5. Push to your Space:
+    ```bash
+    git add .
+    git commit -m "Initial deployment"
+    git push
+    ```
+
+ **Note:** The pre-trained model (`models/model.pkl`) and configuration (`config/valid_categories.yaml`) are included in the Docker image. If you want to use a different model, retrain locally first, then rebuild the Docker image.
+
+ ### Alternative: Local Deployment
+
+ **Using uv (recommended for development):**
+ ```bash
+ uv run streamlit run app.py
+ ```
+
+ **Using pip:**
+ ```bash
+ pip install -r requirements.txt
+ streamlit run app.py
+ ```
+
+ ## Troubleshooting
+
+ ### "Model file not found"
+ - Run `uv run python -m src.train` first to generate the model
+
+ ### "Data file not found"
+ - Download the Stack Overflow survey CSV and place it in `data/`
+
+ ### "Configuration file not found"
+ - The `config/model_parameters.yaml` file should exist in the project root
+ - Check that you're running commands from the project root directory
+
+ ### Dependency issues
+ - Run `uv sync` to ensure all packages are installed
+
+ ## Design Principles
+
+ - **Simplicity**: Under 200 lines of code total
+ - **Clarity**: Easy to understand and modify
+ - **Local-first**: No cloud dependencies
+ - **Hackable**: Plain Python, no complex frameworks
+
+ ## License
+
+ Apache 2.0 License - see the [LICENSE](LICENSE) file.
+
+ ## Acknowledgments
+
+ Data from the [Stack Overflow Developer Survey](https://insights.stackoverflow.com/survey).
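Supplementing the validation notes in the README above, here is a short sketch of handling the documented `ValueError` for out-of-vocabulary inputs. It assumes the check happens inside `predict_salary`, as in this commit's `src/infer.py`; the category strings are illustrative:

```python
from src.schema import SalaryInput
from src.infer import predict_salary

request = SalaryInput(
    country="Japan",  # not in the reduced training categories
    years_code=5.0,
    education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
    dev_type="Developer, back-end",
    industry="Software Development",
)

try:
    salary = predict_salary(request)
    print(f"Estimated salary: ${salary:,.0f}")
except ValueError as err:
    # predict_salary rejects categories absent from config/valid_categories.yaml
    print(f"Invalid input: {err}")
```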
app.py CHANGED
@@ -2,7 +2,7 @@
 
 import streamlit as st
 
- from src.infer import predict_salary, valid_categories
+ from src.infer import predict_salary, get_local_currency, valid_categories
 from src.schema import SalaryInput
 
 # Page configuration
@@ -26,9 +26,10 @@ with st.sidebar:
     This app uses an XGBoost (gradient boosting) model trained on Stack Overflow
     Developer Survey data to predict annual salaries based on:
     - Country
-     - Years of professional coding experience
+     - Total years of coding experience (including education)
     - Education level
     - Developer type
+     - Industry
     """
     )
     st.info("💡 Tip: Results are estimates based on survey averages.")
@@ -38,6 +39,7 @@ with st.sidebar:
     st.write(f"**Countries:** {len(valid_categories['Country'])} available")
     st.write(f"**Education Levels:** {len(valid_categories['EdLevel'])} available")
     st.write(f"**Developer Types:** {len(valid_categories['DevType'])} available")
+     st.write(f"**Industries:** {len(valid_categories['Industry'])} available")
     st.caption("Only values from the training data are shown in the dropdowns.")
 
 # Main input form
@@ -49,11 +51,13 @@ col1, col2 = st.columns(2)
 valid_countries = valid_categories["Country"]
 valid_education_levels = valid_categories["EdLevel"]
 valid_dev_types = valid_categories["DevType"]
+ valid_industries = valid_categories["Industry"]
 
 # Set default values (if available)
 default_country = "United States of America" if "United States of America" in valid_countries else valid_countries[0]
 default_education = "Bachelor's degree (B.A., B.S., B.Eng., etc.)" if "Bachelor's degree (B.A., B.S., B.Eng., etc.)" in valid_education_levels else valid_education_levels[0]
 default_dev_type = "Developer, back-end" if "Developer, back-end" in valid_dev_types else valid_dev_types[0]
+ default_industry = "Software Development" if "Software Development" in valid_industries else valid_industries[0]
 
 with col1:
     country = st.selectbox(
@@ -64,12 +68,12 @@ with col1:
     )
 
     years = st.number_input(
-         "Years of Professional Coding",
+         "Years of Coding (Total)",
         min_value=0,
        max_value=50,
-         value=5,
+         value=15,
         step=1,
-         help="Years of professional coding experience",
+         help="Including any education, how many years have you been coding in total?",
     )
 
 with col2:
@@ -87,15 +91,23 @@ with col2:
         help="Primary developer role (only types from training data)",
     )
 
+     industry = st.selectbox(
+         "Industry",
+         options=valid_industries,
+         index=valid_industries.index(default_industry),
+         help="Industry the developer works in (only industries from training data)",
+     )
+
 # Prediction button
 if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
     try:
         # Create input model
         input_data = SalaryInput(
             country=country,
-             years_code_pro=years,
+             years_code=years,
             education_level=education,
             dev_type=dev_type,
+             industry=industry,
         )
 
         # Make prediction
@@ -104,11 +116,29 @@ if st.button("🔮 Predict Salary", type="primary", use_container_width=True):
 
         # Display result
         st.success("Prediction Complete!")
-         st.metric(
-             label="Estimated Annual Salary",
-             value=f"${salary:,.0f}",
-             help="Predicted annual compensation in USD",
-         )
+
+         # Show USD and local currency side by side
+         local = get_local_currency(country, salary)
+         if local and local["code"] != "USD":
+             col_usd, col_local = st.columns(2)
+             with col_usd:
+                 st.metric(
+                     label="Estimated Annual Salary (USD)",
+                     value=f"${salary:,.0f}",
+                     help="Predicted annual compensation in USD",
+                 )
+             with col_local:
+                 st.metric(
+                     label=f"Estimated Annual Salary ({local['code']})",
+                     value=f"{local['salary_local']:,.0f} {local['code']}",
+                     help=f"Converted using survey-derived rate: 1 USD = {local['rate']} {local['code']} ({local['name']})",
+                 )
+         else:
+             st.metric(
+                 label="Estimated Annual Salary",
+                 value=f"${salary:,.0f}",
+                 help="Predicted annual compensation in USD",
+             )
 
     except FileNotFoundError:
         st.error(
config/currency_rates.yaml ADDED
@@ -0,0 +1,84 @@
+ Australia:
+   code: AUD
+   name: Australian dollar
+   rate: 1.54
+ Austria:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Belgium:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Brazil:
+   code: BRL
+   name: Brazilian real
+   rate: 5.49
+ Canada:
+   code: CAD
+   name: Canadian dollar
+   rate: 1.37
+ Czech Republic:
+   code: CZK
+   name: Czech koruna
+   rate: 21.36
+ Denmark:
+   code: DKK
+   name: Danish krone
+   rate: 6.43
+ France:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Germany:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ India:
+   code: INR
+   name: Indian rupee
+   rate: 86.03
+ Italy:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Netherlands:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Other:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Poland:
+   code: PLN
+   name: Polish zloty
+   rate: 3.66
+ Portugal:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Spain:
+   code: EUR
+   name: European Euro
+   rate: 0.86
+ Sweden:
+   code: SEK
+   name: Swedish krona
+   rate: 9.54
+ Switzerland:
+   code: CHF
+   name: Swiss franc
+   rate: 0.81
+ Ukraine:
+   code: UAH
+   name: Ukrainian hryvnia
+   rate: 41.73
+ United Kingdom of Great Britain and Northern Ireland:
+   code: GBP
+   name: Pound sterling
+   rate: 0.73
+ United States of America:
+   code: USD
+   name: United States dollar
+   rate: 1.0
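To illustrate how this table is consumed (mirroring the `get_local_currency` helper added to `src/infer.py` below), a minimal standalone sketch; the 100,000 USD figure is arbitrary:

```python
import yaml

# Load the survey-derived conversion table added in this commit.
with open("config/currency_rates.yaml", "r") as f:
    rates = yaml.safe_load(f)

salary_usd = 100_000
info = rates["Germany"]  # {'code': 'EUR', 'name': 'European Euro', 'rate': 0.86}
print(f"{salary_usd * info['rate']:,.0f} {info['code']}")  # 86,000 EUR
```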
config/valid_categories.yaml CHANGED
@@ -52,3 +52,20 @@ DevType:
 - Senior executive (C-suite, VP, etc.)
 - Student
 - System administrator
+ Industry:
+ - Banking/Financial Services
+ - Computer Systems Design and Services
+ - Energy
+ - Fintech
+ - Government
+ - Healthcare
+ - Higher Education
+ - Insurance
+ - Internet, Telecomm or Information Services
+ - Manufacturing
+ - Media & Advertising Services
+ - Other
+ - 'Other:'
+ - Retail and Consumer Services
+ - Software Development
+ - Transportation, or Supply Chain
debug_prepare_features.py ADDED
@@ -0,0 +1,81 @@
+ """Debug prepare_features step by step."""
+
+ import pandas as pd
+ from src.preprocessing import reduce_cardinality
+ import yaml
+ from pathlib import Path
+
+ # Load config
+ config_path = Path("config/model_parameters.yaml")
+ with open(config_path, "r") as f:
+     config = yaml.safe_load(f)
+
+ # Create test input
+ df = pd.DataFrame({
+     'Country': ['United States of America'],
+     'YearsCode': [5.0],
+     'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
+     'DevType': ['Developer, full-stack']
+ })
+
+ print("=" * 70)
+ print("STEP-BY-STEP DEBUGGING OF prepare_features()")
+ print("=" * 70)
+
+ print("\n1. Original input:")
+ print(f"   Columns: {list(df.columns)}")
+ print(f"   Values: {df.iloc[0].to_dict()}")
+
+ # Step 2: Copy
+ df_processed = df.copy()
+
+ # Step 3: Unicode normalization
+ for col in ["Country", "EdLevel", "DevType"]:
+     if col in df_processed.columns:
+         df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
+
+ print("\n2. After unicode normalization:")
+ print(f"   Columns: {list(df_processed.columns)}")
+
+ # Step 4: Fill missing values
+ df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
+ df_processed["Country"] = df_processed["Country"].fillna("Unknown")
+ df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
+ df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
+
+ print("\n3. After filling missing values:")
+ print(f"   Columns: {list(df_processed.columns)}")
+ print(f"   Country value: '{df_processed['Country'].iloc[0]}'")
+ print(f"   EdLevel value: '{df_processed['EdLevel'].iloc[0]}'")
+ print(f"   DevType value: '{df_processed['DevType'].iloc[0]}'")
+
+ # Step 5: Reduce cardinality
+ print("\n4. Before cardinality reduction:")
+ print(f"   Country value: '{df_processed['Country'].iloc[0]}'")
+ df_processed["Country"] = reduce_cardinality(df_processed["Country"])
+ print(f"   After Country reduction: '{df_processed['Country'].iloc[0]}'")
+
+ print(f"   EdLevel value: '{df_processed['EdLevel'].iloc[0]}'")
+ df_processed["EdLevel"] = reduce_cardinality(df_processed["EdLevel"])
+ print(f"   After EdLevel reduction: '{df_processed['EdLevel'].iloc[0]}'")
+
+ print(f"   DevType value: '{df_processed['DevType'].iloc[0]}'")
+ df_processed["DevType"] = reduce_cardinality(df_processed["DevType"])
+ print(f"   After DevType reduction: '{df_processed['DevType'].iloc[0]}'")
+
+ # Step 6: Select feature columns
+ feature_cols = ["Country", "YearsCode", "EdLevel", "DevType"]
+ df_features = df_processed[feature_cols]
+
+ print("\n5. After selecting feature columns:")
+ print(f"   Columns: {list(df_features.columns)}")
+ print(f"   Values: {df_features.iloc[0].to_dict()}")
+
+ # Step 7: One-hot encode
+ drop_first = config['features']['encoding']['drop_first']
+ print(f"\n6. One-hot encoding with drop_first={drop_first}:")
+ df_encoded = pd.get_dummies(df_features, drop_first=drop_first)
+
+ print(f"   Result shape: {df_encoded.shape}")
+ print(f"   Result columns: {list(df_encoded.columns)}")
+ print(f"   Non-zero values: {df_encoded.columns[df_encoded.iloc[0] != 0].tolist()}")
diagnose_encoding.py ADDED
@@ -0,0 +1,65 @@
+ """Diagnose why categorical features aren't affecting predictions."""
+
+ from src.preprocessing import prepare_features
+ import pandas as pd
+
+ # Create two inputs that differ ONLY in Country
+ input1 = pd.DataFrame({
+     'Country': ['United States of America'],
+     'YearsCode': [5.0],
+     'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
+     'DevType': ['Developer, full-stack']
+ })
+
+ input2 = pd.DataFrame({
+     'Country': ['Germany'],  # Different!
+     'YearsCode': [5.0],
+     'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
+     'DevType': ['Developer, full-stack']
+ })
+
+ print("=" * 70)
+ print("ENCODING DIAGNOSIS")
+ print("=" * 70)
+
+ # Process features
+ features1 = prepare_features(input1)
+ features2 = prepare_features(input2)
+
+ print("\nInput 1 (USA):")
+ print(f"  Shape: {features1.shape}")
+ print(f"  Columns: {list(features1.columns)}")
+ non_zero1 = [col for col in features1.columns if features1[col].iloc[0] != 0]
+ print(f"  Non-zero features ({len(non_zero1)}): {non_zero1}")
+
+ print("\nInput 2 (Germany):")
+ print(f"  Shape: {features2.shape}")
+ non_zero2 = [col for col in features2.columns if features2[col].iloc[0] != 0]
+ print(f"  Non-zero features ({len(non_zero2)}): {non_zero2}")
+
+ print(f"\nAre encoded features identical? {features1.equals(features2)}")
+
+ if features1.equals(features2):
+     print("\n❌ PROBLEM: Different countries produce IDENTICAL encodings!")
+     print("   This explains why categorical features don't affect predictions.")
+ else:
+     print("\n✅ Encodings are different - categorical features should work.")
+
+ # Check what happens with Country specifically
+ print("\n" + "=" * 70)
+ print("COUNTRY ENCODING CHECK")
+ print("=" * 70)
+
+ # Test just Country encoding
+ test_countries = ['United States of America', 'Germany', 'India']
+ for country in test_countries:
+     test_df = pd.DataFrame({
+         'Country': [country],
+         'YearsCode': [5.0],
+         'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
+         'DevType': ['Developer, full-stack']
+     })
+     encoded = prepare_features(test_df)
+     country_cols = [col for col in encoded.columns if col.startswith('Country_')]
+     non_zero_countries = [col for col in country_cols if encoded[col].iloc[0] != 0]
+     print(f"{country:40s} -> {non_zero_countries}")
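The diagnosis above targets a common pitfall: `pd.get_dummies` on a single-row input only creates columns for the categories present in that row, so two different countries can produce identical encodings. A minimal sketch of the usual remedy, reindexing against the column set seen at training time; `training_columns` here is a hypothetical stand-in for however the repository persists that list:

```python
import pandas as pd

# Hypothetical: the exact column order the model was trained on,
# e.g. persisted alongside the model at training time.
training_columns = ["YearsCode", "Country_Germany",
                    "Country_United States of America",
                    "DevType_Developer, full-stack"]

def align_features(encoded: pd.DataFrame) -> pd.DataFrame:
    """Force inference features into the training column space.

    Categories absent from this input become explicit zero columns,
    and columns unseen at training time are dropped.
    """
    return encoded.reindex(columns=training_columns, fill_value=0)

raw = pd.DataFrame({"Country": ["Germany"], "YearsCode": [5.0],
                    "DevType": ["Developer, full-stack"]})
aligned = align_features(pd.get_dummies(raw))
print(aligned.columns.tolist())  # matches training_columns exactly
```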
example_inference.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example script showing how to use the salary prediction model programmatically."""
2
+
3
+ from src.schema import SalaryInput
4
+ from src.infer import predict_salary
5
+
6
+
7
+ def main():
8
+ """Run sample predictions with different input parameters."""
9
+
10
+ print("=" * 60)
11
+ print("Developer Salary Prediction - Sample Inference")
12
+ print("=" * 60)
13
+
14
+ # Example 1: Default parameters (same as Streamlit app defaults)
15
+ print("\n📊 Example 1: Default Parameters")
16
+ print("-" * 60)
17
+
18
+ input_data_1 = SalaryInput(
19
+ country="United States of America",
20
+ years_code=5.0,
21
+ education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
22
+ dev_type="Developer, full-stack",
23
+ industry="Software Development",
24
+ )
25
+
26
+ print(f"Country: {input_data_1.country}")
27
+ print(f"Years of Coding (Total): {input_data_1.years_code}")
28
+ print(f"Education Level: {input_data_1.education_level}")
29
+ print(f"Developer Type: {input_data_1.dev_type}")
30
+ print(f"Industry: {input_data_1.industry}")
31
+
32
+ salary_1 = predict_salary(input_data_1)
33
+ print(f"💰 Predicted Salary: ${salary_1:,.2f} USD/year")
34
+
35
+ # Example 2: Junior developer
36
+ print("\n📊 Example 2: Junior Developer")
37
+ print("-" * 60)
38
+
39
+ input_data_2 = SalaryInput(
40
+ country="United States of America",
41
+ years_code=2.0,
42
+ education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
43
+ dev_type="Developer, front-end",
44
+ industry="Fintech",
45
+ )
46
+
47
+ print(f"Country: {input_data_2.country}")
48
+ print(f"Years of Coding (Total): {input_data_2.years_code}")
49
+ print(f"Education Level: {input_data_2.education_level}")
50
+ print(f"Developer Type: {input_data_2.dev_type}")
51
+ print(f"Industry: {input_data_2.industry}")
52
+
53
+ salary_2 = predict_salary(input_data_2)
54
+ print(f"💰 Predicted Salary: ${salary_2:,.2f} USD/year")
55
+
56
+ # Example 3: Senior developer with Master's degree
57
+ print("\n📊 Example 3: Senior Developer")
58
+ print("-" * 60)
59
+
60
+ input_data_3 = SalaryInput(
61
+ country="United States of America",
62
+ years_code=10.0,
63
+ education_level="Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
64
+ dev_type="Engineering manager",
65
+ industry="Banking/Financial Services",
66
+ )
67
+
68
+ print(f"Country: {input_data_3.country}")
69
+ print(f"Years of Coding (Total): {input_data_3.years_code}")
70
+ print(f"Education Level: {input_data_3.education_level}")
71
+ print(f"Developer Type: {input_data_3.dev_type}")
72
+ print(f"Industry: {input_data_3.industry}")
73
+
74
+ salary_3 = predict_salary(input_data_3)
75
+ print(f"💰 Predicted Salary: ${salary_3:,.2f} USD/year")
76
+
77
+ # Example 4: Different country
78
+ print("\n📊 Example 4: Different Country (Germany)")
79
+ print("-" * 60)
80
+
81
+ input_data_4 = SalaryInput(
82
+ country="Germany",
83
+ years_code=5.0,
84
+ education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
85
+ dev_type="Developer, back-end",
86
+ industry="Manufacturing",
87
+ )
88
+
89
+ print(f"Country: {input_data_4.country}")
90
+ print(f"Years of Coding (Total): {input_data_4.years_code}")
91
+ print(f"Education Level: {input_data_4.education_level}")
92
+ print(f"Developer Type: {input_data_4.dev_type}")
93
+ print(f"Industry: {input_data_4.industry}")
94
+
95
+ salary_4 = predict_salary(input_data_4)
96
+ print(f"💰 Predicted Salary: ${salary_4:,.2f} USD/year")
97
+
98
+ print("\n" + "=" * 60)
99
+ print("✅ All predictions completed successfully!")
100
+ print("=" * 60)
101
+
102
+
103
+ if __name__ == "__main__":
104
+ try:
105
+ main()
106
+ except FileNotFoundError:
107
+ print("❌ Error: Model file not found!")
108
+ print("Please train the model first by running:")
109
+ print(" uv run python src/train.py")
110
+ except Exception as e:
111
+ print(f"❌ Error occurred: {str(e)}")
models/model.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10f6724fde852ab09e30230b712a1fb0c7ecc24cd5da7cd6896f17a8c6619e84
3
- size 2697578
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5165f22311d0eb6809380cf4fa5a749b59f0d8e81903462fe7c2c882e09e916f
3
+ size 3192752
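Both stanzas above are Git LFS pointers rather than the pickle itself; the retrained artifact grows from roughly 2.7 MB to 3.2 MB. Loading it for ad-hoc use is short — a sketch assuming the artifacts dict from src/train.py is serialized with pickle (the dump call sits outside this diff):

```python
import pickle
from pathlib import Path

# "models/model.pkl" matches config['training']['model_path'] only by assumption;
# src/train.py reads the actual path from its YAML config.
with Path("models/model.pkl").open("rb") as f:
    artifacts = pickle.load(f)

model = artifacts["model"]                      # the final XGBRegressor
feature_columns = artifacts["feature_columns"]  # one-hot column order from training
print(type(model).__name__, len(feature_columns))
```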
pyproject.toml ADDED
@@ -0,0 +1,15 @@
1
+ [project]
2
+ name = "developer-salary-prediction"
3
+ version = "0.1.0"
4
+ description = "Simple ML app for predicting developer salaries using Stack Overflow survey data"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "pandas>=2.0.0",
9
+ "scikit-learn>=1.3.0",
10
+ "pydantic>=2.0.0",
11
+ "streamlit>=1.28.0",
12
+ "xgboost>=3.1.0",
13
+ "ruff>=0.15.0",
14
+ "pyyaml>=6.0.0",
15
+ ]
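With requires-python pinned at >=3.12, the standard-library tomllib can read this manifest directly — a small sketch for sanity-checking the declared dependency floors:

```python
import tomllib

with open("pyproject.toml", "rb") as f:
    project = tomllib.load(f)["project"]

print(f"{project['name']} {project['version']} (requires {project['requires-python']})")
for dep in project["dependencies"]:
    print(" -", dep)
```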
src/infer.py CHANGED
@@ -33,6 +33,30 @@ if not valid_categories_path.exists():
33
  with open(valid_categories_path, "r") as f:
34
  valid_categories = yaml.safe_load(f)
35
36
 
37
  def predict_salary(data: SalaryInput) -> float:
38
  """Predict salary based on input features.
@@ -68,13 +92,21 @@ def predict_salary(data: SalaryInput) -> float:
68
  f"Check config/valid_categories.yaml for all valid values."
69
  )
70
 
 
 
 
 
 
 
 
71
  # Create a DataFrame with the input data
72
  input_df = pd.DataFrame(
73
  {
74
  "Country": [data.country],
75
- "YearsCodePro": [data.years_code_pro],
76
  "EdLevel": [data.education_level],
77
  "DevType": [data.dev_type],
 
78
  }
79
  )
80
 
 
33
  with open(valid_categories_path, "r") as f:
34
  valid_categories = yaml.safe_load(f)
35
 
36
+ # Load currency conversion rates
37
+ currency_rates_path = Path("config/currency_rates.yaml")
38
+ currency_rates = {}
39
+ if currency_rates_path.exists():
40
+ with open(currency_rates_path, "r") as f:
41
+ currency_rates = yaml.safe_load(f) or {}
42
+
43
+
44
+ def get_local_currency(country: str, salary_usd: float) -> dict | None:
45
+ """Convert USD salary to local currency for a given country.
46
+
47
+ Returns:
48
+ Dict with code, name, rate, and salary_local, or None if unavailable.
49
+ """
50
+ if country not in currency_rates:
51
+ return None
52
+ info = currency_rates[country]
53
+ return {
54
+ "code": info["code"],
55
+ "name": info["name"],
56
+ "rate": info["rate"],
57
+ "salary_local": round(salary_usd * info["rate"], 2),
58
+ }
59
+
60
 
61
  def predict_salary(data: SalaryInput) -> float:
62
  """Predict salary based on input features.
 
92
  f"Check config/valid_categories.yaml for all valid values."
93
  )
94
 
95
+ if data.industry not in valid_categories["Industry"]:
96
+ raise ValueError(
97
+ f"Invalid industry: '{data.industry}'. "
98
+ f"Must be one of {len(valid_categories['Industry'])} valid industries. "
99
+ f"Check config/valid_categories.yaml for all valid values."
100
+ )
101
+
102
  # Create a DataFrame with the input data
103
  input_df = pd.DataFrame(
104
  {
105
  "Country": [data.country],
106
+ "YearsCode": [data.years_code],
107
  "EdLevel": [data.education_level],
108
  "DevType": [data.dev_type],
109
+ "Industry": [data.industry],
110
  }
111
  )
112
 
src/preprocessing.py CHANGED
@@ -55,7 +55,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
55
  during training and inference, preventing data leakage and inconsistencies.
56
 
57
  Args:
58
- df: DataFrame with columns: Country, YearsCode (or YearsCodePro), EdLevel, DevType
59
  NOTE: During training, cardinality reduction should be applied to df
60
  BEFORE calling this function. During inference, valid_categories.yaml
61
  ensures only valid (already-reduced) categories are used.
@@ -67,7 +67,7 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
67
  - Fills missing values with defaults (0 for numeric, "Unknown" for categorical)
68
  - Normalizes Unicode apostrophes to regular apostrophes
69
  - Applies one-hot encoding with drop_first=True to avoid multicollinearity
70
- - Column names in output will be like: YearsCode, Country_X, EdLevel_Y, DevType_Z
71
  - Does NOT apply cardinality reduction (must be done before calling this)
72
  """
73
  # Create a copy to avoid modifying the original
@@ -75,26 +75,27 @@ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
75
 
76
  # Normalize Unicode apostrophes to regular apostrophes for consistency
77
  # This handles cases where data has \u2019 (') instead of '
78
- for col in ["Country", "EdLevel", "DevType"]:
79
  if col in df_processed.columns:
80
  df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
81
 
82
- # Handle column name variations (YearsCode vs YearsCodePro)
83
  if "YearsCodePro" in df_processed.columns and "YearsCode" not in df_processed.columns:
84
- df_processed["YearsCode"] = df_processed["YearsCodePro"]
85
 
86
  # Fill missing values with defaults
87
  df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
88
  df_processed["Country"] = df_processed["Country"].fillna("Unknown")
89
  df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
90
  df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
 
91
 
92
  # NOTE: Cardinality reduction is NOT applied here
93
  # It should be applied during training BEFORE calling this function
94
  # During inference, valid_categories.yaml ensures only valid values are used
95
 
96
  # Select only the features we need
97
- feature_cols = ["Country", "YearsCode", "EdLevel", "DevType"]
98
  df_features = df_processed[feature_cols]
99
 
100
  # Apply one-hot encoding for categorical variables
 
55
  during training and inference, preventing data leakage and inconsistencies.
56
 
57
  Args:
58
+ df: DataFrame with columns: Country, YearsCode, EdLevel, DevType, Industry
59
  NOTE: During training, cardinality reduction should be applied to df
60
  BEFORE calling this function. During inference, valid_categories.yaml
61
  ensures only valid (already-reduced) categories are used.
 
67
  - Fills missing values with defaults (0 for numeric, "Unknown" for categorical)
68
  - Normalizes Unicode apostrophes to regular apostrophes
69
  - Applies one-hot encoding with drop_first=True to avoid multicollinearity
70
+ - Column names in output will be like: YearsCode, Country_X, EdLevel_Y, DevType_Z, Industry_W
71
  - Does NOT apply cardinality reduction (must be done before calling this)
72
  """
73
  # Create a copy to avoid modifying the original
 
75
 
76
  # Normalize Unicode apostrophes to regular apostrophes for consistency
77
  # This handles cases where data has \u2019 (') instead of '
78
+ for col in ["Country", "EdLevel", "DevType", "Industry"]:
79
  if col in df_processed.columns:
80
  df_processed[col] = df_processed[col].str.replace('\u2019', "'", regex=False)
81
 
82
+ # Handle legacy column name (YearsCodePro -> YearsCode)
83
  if "YearsCodePro" in df_processed.columns and "YearsCode" not in df_processed.columns:
84
+ df_processed.rename(columns={"YearsCodePro": "YearsCode"}, inplace=True)
85
 
86
  # Fill missing values with defaults
87
  df_processed["YearsCode"] = df_processed["YearsCode"].fillna(0)
88
  df_processed["Country"] = df_processed["Country"].fillna("Unknown")
89
  df_processed["EdLevel"] = df_processed["EdLevel"].fillna("Unknown")
90
  df_processed["DevType"] = df_processed["DevType"].fillna("Unknown")
91
+ df_processed["Industry"] = df_processed["Industry"].fillna("Unknown")
92
 
93
  # NOTE: Cardinality reduction is NOT applied here
94
  # It should be applied during training BEFORE calling this function
95
  # During inference, valid_categories.yaml ensures only valid values are used
96
 
97
  # Select only the features we need
98
+ feature_cols = ["Country", "YearsCode", "EdLevel", "DevType", "Industry"]
99
  df_features = df_processed[feature_cols]
100
 
101
  # Apply one-hot encoding for categorical variables
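For readers unfamiliar with the drop_first convention the docstring describes, a toy illustration of the column names it produces (one level per categorical is dropped as the baseline):

```python
import pandas as pd

df = pd.DataFrame({
    "YearsCode": [2.0, 5.0, 10.0],
    "Country": ["Germany", "India", "Germany"],
    "Industry": ["Fintech", "Healthcare", "Fintech"],
})
# drop_first=True drops one level per categorical (the baseline), leaving
# columns such as Country_India and Industry_Healthcare.
X = pd.get_dummies(df, columns=["Country", "Industry"], drop_first=True)
print(X.columns.tolist())  # ['YearsCode', 'Country_India', 'Industry_Healthcare']
```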
src/schema.py CHANGED
@@ -7,11 +7,14 @@ class SalaryInput(BaseModel):
7
  """Input model for salary prediction."""
8
 
9
  country: str = Field(..., description="Developer's country")
10
- years_code_pro: float = Field(
11
- ..., ge=0, description="Years of professional coding experience"
 
 
12
  )
13
  education_level: str = Field(..., description="Education level")
14
  dev_type: str = Field(..., description="Developer type")
 
15
 
16
  class Config:
17
  """Pydantic configuration."""
@@ -19,8 +22,9 @@ class SalaryInput(BaseModel):
19
  json_schema_extra = {
20
  "example": {
21
  "country": "United States",
22
- "years_code_pro": 5.0,
23
  "education_level": "Bachelor's degree",
24
  "dev_type": "Developer, back-end",
 
25
  }
26
  }
 
7
  """Input model for salary prediction."""
8
 
9
  country: str = Field(..., description="Developer's country")
10
+ years_code: float = Field(
11
+ ...,
12
+ ge=0,
13
+ description="Including any education, how many years have you been coding in total?",
14
  )
15
  education_level: str = Field(..., description="Education level")
16
  dev_type: str = Field(..., description="Developer type")
17
+ industry: str = Field(..., description="Industry the developer works in")
18
 
19
  class Config:
20
  """Pydantic configuration."""
 
22
  json_schema_extra = {
23
  "example": {
24
  "country": "United States",
25
+ "years_code": 5.0,
26
  "education_level": "Bachelor's degree",
27
  "dev_type": "Developer, back-end",
28
+ "industry": "Software Development",
29
  }
30
  }
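Because the renamed years_code field keeps the ge=0 constraint, invalid experience values are rejected at construction time — a quick sketch of the validation behavior:

```python
from pydantic import ValidationError
from src.schema import SalaryInput

try:
    SalaryInput(
        country="Germany",
        years_code=-1.0,  # violates the ge=0 constraint on the renamed field
        education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
        dev_type="Developer, back-end",
        industry="Manufacturing",
    )
except ValidationError as exc:
    err = exc.errors()[0]
    print(err["loc"], err["msg"])  # points at 'years_code'
```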
src/train.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
7
  import numpy as np
8
  import yaml
9
  from xgboost import XGBRegressor
10
- from sklearn.model_selection import train_test_split
11
 
12
  from src.preprocessing import prepare_features, reduce_cardinality
13
 
@@ -32,7 +32,8 @@ def main():
32
  # Load only required columns to save memory
33
  df = pd.read_csv(
34
  data_path,
35
- usecols=["Country", "YearsCode", "EdLevel", "DevType", "ConvertedCompYearly"],
 
36
  )
37
 
38
  print(f"Loaded {len(df):,} rows")
@@ -43,11 +44,14 @@ def main():
43
  # select records with main label more than min_salary threshold
44
  min_salary = config['data']['min_salary']
45
  df = df[df[main_label] > min_salary]
46
- # further exclude outliers based on percentile bounds
47
- lower_pct = config['data']['lower_percentile']
48
- upper_pct = config['data']['upper_percentile']
49
- P = np.percentile(df[main_label], [lower_pct, upper_pct])
50
- df = df[(df[main_label] > P[0]) & (df[main_label] < P[1])]
51
 
52
  print(df.shape)
53
 
@@ -62,17 +66,20 @@ def main():
62
  df_copy["Country"] = df_copy["Country"].str.replace('\u2019', "'", regex=False)
63
  df_copy["EdLevel"] = df_copy["EdLevel"].str.replace('\u2019', "'", regex=False)
64
  df_copy["DevType"] = df_copy["DevType"].str.replace('\u2019', "'", regex=False)
 
65
 
66
  # Apply cardinality reduction
67
  df_copy["Country"] = reduce_cardinality(df_copy["Country"])
68
  df_copy["EdLevel"] = reduce_cardinality(df_copy["EdLevel"])
69
  df_copy["DevType"] = reduce_cardinality(df_copy["DevType"])
 
70
 
71
  # Apply cardinality reduction to the actual training data as well
72
  # (prepare_features no longer does this internally)
73
  df["Country"] = reduce_cardinality(df["Country"])
74
  df["EdLevel"] = reduce_cardinality(df["EdLevel"])
75
  df["DevType"] = reduce_cardinality(df["DevType"])
 
76
 
77
  # Now apply full feature transformations for model training
78
  X = prepare_features(df)
@@ -83,18 +90,64 @@ def main():
83
  country_values = df_copy["Country"].dropna().unique().tolist()
84
  edlevel_values = df_copy["EdLevel"].dropna().unique().tolist()
85
  devtype_values = df_copy["DevType"].dropna().unique().tolist()
 
86
 
87
  valid_categories = {
88
  "Country": sorted(country_values),
89
  "EdLevel": sorted(edlevel_values),
90
  "DevType": sorted(devtype_values),
 
91
  }
92
 
93
  valid_categories_path = Path("config/valid_categories.yaml")
94
  with open(valid_categories_path, "w") as f:
95
  yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
96
 
97
- print(f"\nSaved {len(valid_categories['Country'])} valid countries, {len(valid_categories['EdLevel'])} valid education levels, and {len(valid_categories['DevType'])} valid developer types to {valid_categories_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  print(f"\nFeature matrix shape: {X.shape}")
100
  print(f"Total features: {X.shape[1]}")
@@ -122,6 +175,12 @@ def main():
122
  for devtype, count in top_devtype.items():
123
  print(f" - {devtype}: {count:,} ({count/len(df)*100:.1f}%)")
124
125
  # Show YearsCode statistics
126
  print("\n💼 Years of Coding Experience:")
127
  print(f" - Min: {df['YearsCode'].min():.1f}")
@@ -164,25 +223,77 @@ def main():
164
  devtype_name = feature.replace('DevType_', '')
165
  print(f" {i:2d}. {devtype_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
166
 
 
 
 
 
 
 
 
 
167
  print(f"\n📊 Total one-hot encoded features: {len(X.columns)}")
168
  print(" - Numeric: 1 (YearsCode)")
169
  print(f" - Country: {len(country_features)}")
170
  print(f" - Education: {len(edlevel_features)}")
171
  print(f" - DevType: {len(devtype_features)}")
 
172
 
173
  print("=" * 60 + "\n")
174
 
175
- # Split data
176
- X_train, X_test, y_train, y_test = train_test_split(
177
- X, y,
178
- test_size=config['data']['test_size'],
179
- random_state=config['data']['random_state']
180
  )
181
 
182
- # Train model
183
- print("Training XGBoost model...")
184
- model_config = config['model']
185
- model = XGBRegressor(
186
  n_estimators=model_config['n_estimators'],
187
  learning_rate=model_config['learning_rate'],
188
  max_depth=model_config['max_depth'],
@@ -191,27 +302,19 @@ def main():
191
  n_jobs=model_config['n_jobs'],
192
  early_stopping_rounds=model_config['early_stopping_rounds'],
193
  )
194
- model.fit(
195
- X_train,
196
- y_train,
197
- eval_set=[(X_test, y_test)],
198
  verbose=config['training']['verbose'],
199
  )
200
-
201
- print(f"Best iteration: {model.best_iteration + 1} (early stopping at {model.n_estimators} max)")
202
-
203
- # Evaluate
204
- train_score = model.score(X_train, y_train)
205
- test_score = model.score(X_test, y_test)
206
- print(f"Training R2 score: {train_score:.4f}")
207
- print(f"Test R2 score: {test_score:.4f}")
208
 
209
  # Save model and feature columns for inference
210
  model_path = Path(config['training']['model_path'])
211
- model_path.parent.mkdir(parents=True, exist_ok=True) # Ensure directory exists
212
 
213
  artifacts = {
214
- "model": model,
215
  "feature_columns": list(X.columns),
216
  }
217
 
 
7
  import numpy as np
8
  import yaml
9
  from xgboost import XGBRegressor
10
+ from sklearn.model_selection import KFold, train_test_split
11
 
12
  from src.preprocessing import prepare_features, reduce_cardinality
13
 
 
32
  # Load only required columns to save memory
33
  df = pd.read_csv(
34
  data_path,
35
+ usecols=["Country", "YearsCode", "EdLevel", "DevType", "Industry",
36
+ "Currency", "CompTotal", "ConvertedCompYearly"],
37
  )
38
 
39
  print(f"Loaded {len(df):,} rows")
 
44
  # select records with main label more than min_salary threshold
45
  min_salary = config['data']['min_salary']
46
  df = df[df[main_label] > min_salary]
47
+ # Exclude outliers based on percentile bounds PER COUNTRY
48
+ # This preserves records from lower-paid and higher-paid countries
49
+ # that would otherwise be removed by global percentile filtering
50
+ lower_pct = config['data']['lower_percentile'] / 100
51
+ upper_pct = config['data']['upper_percentile'] / 100
52
+ lower_bound = df.groupby("Country")[main_label].transform("quantile", lower_pct)
53
+ upper_bound = df.groupby("Country")[main_label].transform("quantile", upper_pct)
54
+ df = df[(df[main_label] > lower_bound) & (df[main_label] < upper_bound)]
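groupby().transform("quantile", q) broadcasts each country's own bounds back to row shape, so the mask above compares every salary against its country's percentiles rather than global ones — a toy check of the idiom:

```python
import pandas as pd

toy = pd.DataFrame({
    "Country": ["IN", "IN", "IN", "US", "US", "US"],
    "ConvertedCompYearly": [5_000, 20_000, 400_000, 60_000, 150_000, 2_000_000],
})
lo = toy.groupby("Country")["ConvertedCompYearly"].transform("quantile", 0.10)
hi = toy.groupby("Country")["ConvertedCompYearly"].transform("quantile", 0.90)
# Each row is judged against its own country's bounds, so India's 20k survives
# while both countries' extreme tails are dropped.
print(toy[(toy["ConvertedCompYearly"] > lo) & (toy["ConvertedCompYearly"] < hi)])
```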
55
 
56
  print(df.shape)
57
 
 
66
  df_copy["Country"] = df_copy["Country"].str.replace('\u2019', "'", regex=False)
67
  df_copy["EdLevel"] = df_copy["EdLevel"].str.replace('\u2019', "'", regex=False)
68
  df_copy["DevType"] = df_copy["DevType"].str.replace('\u2019', "'", regex=False)
69
+ df_copy["Industry"] = df_copy["Industry"].str.replace('\u2019', "'", regex=False)
70
 
71
  # Apply cardinality reduction
72
  df_copy["Country"] = reduce_cardinality(df_copy["Country"])
73
  df_copy["EdLevel"] = reduce_cardinality(df_copy["EdLevel"])
74
  df_copy["DevType"] = reduce_cardinality(df_copy["DevType"])
75
+ df_copy["Industry"] = reduce_cardinality(df_copy["Industry"])
76
 
77
  # Apply cardinality reduction to the actual training data as well
78
  # (prepare_features no longer does this internally)
79
  df["Country"] = reduce_cardinality(df["Country"])
80
  df["EdLevel"] = reduce_cardinality(df["EdLevel"])
81
  df["DevType"] = reduce_cardinality(df["DevType"])
82
+ df["Industry"] = reduce_cardinality(df["Industry"])
83
 
84
  # Now apply full feature transformations for model training
85
  X = prepare_features(df)
 
90
  country_values = df_copy["Country"].dropna().unique().tolist()
91
  edlevel_values = df_copy["EdLevel"].dropna().unique().tolist()
92
  devtype_values = df_copy["DevType"].dropna().unique().tolist()
93
+ industry_values = df_copy["Industry"].dropna().unique().tolist()
94
 
95
  valid_categories = {
96
  "Country": sorted(country_values),
97
  "EdLevel": sorted(edlevel_values),
98
  "DevType": sorted(devtype_values),
99
+ "Industry": sorted(industry_values),
100
  }
101
 
102
  valid_categories_path = Path("config/valid_categories.yaml")
103
  with open(valid_categories_path, "w") as f:
104
  yaml.dump(valid_categories, f, default_flow_style=False, sort_keys=False)
105
 
106
+ print(f"\nSaved {len(valid_categories['Country'])} valid countries, {len(valid_categories['EdLevel'])} valid education levels, {len(valid_categories['DevType'])} valid developer types, and {len(valid_categories['Industry'])} valid industries to {valid_categories_path}")
107
+
108
+ # Compute currency conversion rates per country
109
+ # Use the original data with Currency and CompTotal columns
110
+ print("\nComputing currency conversion rates per country...")
111
+ currency_df = df[["Country", "Currency", "CompTotal", main_label]].dropna()
112
+ # Extract 3-letter currency code from values like "EUR European Euro"
113
+ currency_df = currency_df.copy()
114
+ currency_df["CurrencyCode"] = currency_df["Currency"].str.split(r"\s+", n=1).str[0]
115
+ currency_df["CurrencyName"] = currency_df["Currency"].str.split(r"\s+", n=1).str[1]
116
+ # Compute conversion rate: local currency / USD
117
+ currency_df["rate"] = currency_df["CompTotal"] / currency_df[main_label]
118
+ # Filter out unreasonable rates (negative, zero, or extreme)
119
+ currency_df = currency_df[(currency_df["rate"] > 0.001) & (currency_df["rate"] < 100000)]
120
+
121
+ currency_rates = {}
122
+ for country in valid_categories["Country"]:
123
+ country_data = currency_df[currency_df["Country"] == country]
124
+ if country_data.empty:
125
+ continue
126
+ # Find the most common currency for this country
127
+ most_common = country_data["CurrencyCode"].mode()
128
+ if most_common.empty:
129
+ continue
130
+ code = most_common.iloc[0]
131
+ # Get the full name from the first matching record
132
+ name_row = country_data[country_data["CurrencyCode"] == code].iloc[0]
133
+ full_name = name_row["CurrencyName"]
134
+ # Compute median conversion rate for this country+currency pair
135
+ rates = country_data[country_data["CurrencyCode"] == code]["rate"]
136
+ median_rate = round(float(rates.median()), 2)
137
+ currency_rates[country] = {
138
+ "code": code,
139
+ "name": full_name,
140
+ "rate": median_rate,
141
+ }
142
+
143
+ currency_rates_path = Path("config/currency_rates.yaml")
144
+ with open(currency_rates_path, "w") as f:
145
+ yaml.dump(currency_rates, f, default_flow_style=False, sort_keys=True,
146
+ allow_unicode=True)
147
+
148
+ print(f"Saved currency rates for {len(currency_rates)} countries to {currency_rates_path}")
149
+ for country, info in sorted(currency_rates.items()):
150
+ print(f" {country:45s} -> {info['code']} ({info['name']}, rate: {info['rate']})")
151
 
152
  print(f"\nFeature matrix shape: {X.shape}")
153
  print(f"Total features: {X.shape[1]}")
 
175
  for devtype, count in top_devtype.items():
176
  print(f" - {devtype}: {count:,} ({count/len(df)*100:.1f}%)")
177
 
178
+ # Show top industries
179
+ print("\n🏢 Top Industries:")
180
+ top_industry = df["Industry"].value_counts().head(10)
181
+ for industry, count in top_industry.items():
182
+ print(f" - {industry}: {count:,} ({count/len(df)*100:.1f}%)")
183
+
184
  # Show YearsCode statistics
185
  print("\n💼 Years of Coding Experience:")
186
  print(f" - Min: {df['YearsCode'].min():.1f}")
 
223
  devtype_name = feature.replace('DevType_', '')
224
  print(f" {i:2d}. {devtype_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
225
 
226
+ # Industry features
227
+ print("\n🏢 Top 10 Industry Features (most common):")
228
+ industry_features = categorical_features[categorical_features.index.str.startswith('Industry_')]
229
+ for i, (feature, count) in enumerate(industry_features.head(10).items(), 1):
230
+ percentage = (count / len(X)) * 100
231
+ industry_name = feature.replace('Industry_', '')
232
+ print(f" {i:2d}. {industry_name:45s} - {count:6.0f} occurrences ({percentage:5.1f}%)")
233
+
234
  print(f"\n📊 Total one-hot encoded features: {len(X.columns)}")
235
  print(" - Numeric: 1 (YearsCode)")
236
  print(f" - Country: {len(country_features)}")
237
  print(f" - Education: {len(edlevel_features)}")
238
  print(f" - DevType: {len(devtype_features)}")
239
+ print(f" - Industry: {len(industry_features)}")
240
 
241
  print("=" * 60 + "\n")
242
 
243
+ # Cross-validation for robust evaluation
244
+ n_splits = config['data'].get('cv_splits', 5)
245
+ random_state = config['data']['random_state']
246
+ model_config = config['model']
247
+
248
+ print(f"Running {n_splits}-fold cross-validation...")
249
+ kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
250
+
251
+ train_scores = []
252
+ test_scores = []
253
+ best_iterations = []
254
+
255
+ for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
256
+ X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
257
+ y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
258
+
259
+ model = XGBRegressor(
260
+ n_estimators=model_config['n_estimators'],
261
+ learning_rate=model_config['learning_rate'],
262
+ max_depth=model_config['max_depth'],
263
+ min_child_weight=model_config['min_child_weight'],
264
+ random_state=model_config['random_state'],
265
+ n_jobs=model_config['n_jobs'],
266
+ early_stopping_rounds=model_config['early_stopping_rounds'],
267
+ )
268
+ model.fit(
269
+ X_train, y_train,
270
+ eval_set=[(X_test, y_test)],
271
+ verbose=False,
272
+ )
273
+
274
+ train_r2 = model.score(X_train, y_train)
275
+ test_r2 = model.score(X_test, y_test)
276
+ train_scores.append(train_r2)
277
+ test_scores.append(test_r2)
278
+ best_iterations.append(model.best_iteration + 1)
279
+ print(f" Fold {fold}: Train R2 = {train_r2:.4f}, Test R2 = {test_r2:.4f} (best iter: {model.best_iteration + 1})")
280
+
281
+ avg_train = np.mean(train_scores)
282
+ avg_test = np.mean(test_scores)
283
+ std_test = np.std(test_scores)
284
+ avg_best_iter = int(np.mean(best_iterations))
285
+ print(f"\nCV Average Train R2: {avg_train:.4f}")
286
+ print(f"CV Average Test R2: {avg_test:.4f} (+/- {std_test:.4f})")
287
+ print(f"CV Average best iteration: {avg_best_iter}")
288
+
289
+ # Train final model on all data for deployment
290
+ # Use a small held-out split for early stopping only
291
+ print("\nTraining final model on full dataset...")
292
+ X_train_final, X_es, y_train_final, y_es = train_test_split(
293
+ X, y, test_size=0.1, random_state=random_state
294
  )
295
 
296
+ final_model = XGBRegressor(
297
  n_estimators=model_config['n_estimators'],
298
  learning_rate=model_config['learning_rate'],
299
  max_depth=model_config['max_depth'],
 
302
  n_jobs=model_config['n_jobs'],
303
  early_stopping_rounds=model_config['early_stopping_rounds'],
304
  )
305
+ final_model.fit(
306
+ X_train_final, y_train_final,
307
+ eval_set=[(X_es, y_es)],
 
308
  verbose=config['training']['verbose'],
309
  )
310
+ print(f"Final model best iteration: {final_model.best_iteration + 1}")
 
 
 
 
 
 
 
311
 
312
  # Save model and feature columns for inference
313
  model_path = Path(config['training']['model_path'])
314
+ model_path.parent.mkdir(parents=True, exist_ok=True)
315
 
316
  artifacts = {
317
+ "model": final_model,
318
  "feature_columns": list(X.columns),
319
  }
320
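All hyperparameters flow in from a YAML config; the file's path and layout are outside this diff, so the sketch below only exercises the keys the script demonstrably reads (config/config.yaml is an assumption):

```python
import yaml

# Path is an assumption; only the key names below appear in src/train.py.
with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

print(config["data"]["min_salary"], config["data"].get("cv_splits", 5))
print(config["model"]["n_estimators"], config["model"]["early_stopping_rounds"])
print(config["training"]["model_path"], config["training"]["verbose"])
```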
 
test_feature_impact.py ADDED
@@ -0,0 +1,373 @@
1
+ """Test that changing input features actually changes predictions."""
2
+
3
+ from src.schema import SalaryInput
4
+ from src.infer import predict_salary, valid_categories
5
+
6
+
7
+ def test_years_experience_impact():
8
+ """Test that changing years of experience changes prediction."""
9
+ print("\n" + "=" * 70)
10
+ print("TEST 1: Total Years of Coding Impact")
11
+ print("=" * 70)
12
+
13
+ base_input = {
14
+ "country": "United States of America",
15
+ "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
16
+ "dev_type": "Developer, full-stack",
17
+ "industry": "Software Development",
18
+ }
19
+
20
+ # Test with different years of experience
21
+ years_tests = [0, 2, 5, 10, 20]
22
+ predictions = []
23
+
24
+ for years in years_tests:
25
+ input_data = SalaryInput(**base_input, years_code=years)
26
+ salary = predict_salary(input_data)
27
+ predictions.append(salary)
28
+ print(f" Years: {years:2d} -> Salary: ${salary:,.2f}")
29
+
30
+ # Check if predictions are different
31
+ unique_predictions = len(set(predictions))
32
+ if unique_predictions == len(predictions):
33
+ print(f"\n✅ PASS: All {len(predictions)} predictions are different")
34
+ return True
35
+ else:
36
+ print(f"\n❌ FAIL: Only {unique_predictions}/{len(predictions)} unique predictions")
37
+ return False
38
+
39
+
40
+ def test_country_impact():
41
+ """Test that changing country changes prediction."""
42
+ print("\n" + "=" * 70)
43
+ print("TEST 2: Country Impact")
44
+ print("=" * 70)
45
+
46
+ base_input = {
47
+ "years_code": 5.0,
48
+ "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
49
+ "dev_type": "Developer, full-stack",
50
+ "industry": "Software Development",
51
+ }
52
+
53
+ # Test with different countries (select diverse ones)
54
+ test_countries = [
55
+ "United States of America",
56
+ "Germany",
57
+ "India",
58
+ "Brazil",
59
+ "Poland"
60
+ ]
61
+
62
+ # Filter to only countries that exist in valid categories
63
+ test_countries = [c for c in test_countries if c in valid_categories["Country"]]
64
+
65
+ predictions = []
66
+ for country in test_countries:
67
+ input_data = SalaryInput(**base_input, country=country)
68
+ salary = predict_salary(input_data)
69
+ predictions.append(salary)
70
+ print(f" Country: {country:40s} -> Salary: ${salary:,.2f}")
71
+
72
+ # Check if predictions are different
73
+ unique_predictions = len(set(predictions))
74
+ if unique_predictions == len(predictions):
75
+ print(f"\n✅ PASS: All {len(predictions)} predictions are different")
76
+ return True
77
+ elif unique_predictions == 1:
78
+ print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
79
+ print(" This indicates the model is NOT using country as a feature!")
80
+ return False
81
+ else:
82
+ print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
83
+ print(f" Duplicate salaries found - possible feature issue")
84
+ return False
85
+
86
+
87
+ def test_education_impact():
88
+ """Test that changing education level changes prediction."""
89
+ print("\n" + "=" * 70)
90
+ print("TEST 3: Education Level Impact")
91
+ print("=" * 70)
92
+
93
+ base_input = {
94
+ "country": "United States of America",
95
+ "years_code": 5.0,
96
+ "dev_type": "Developer, full-stack",
97
+ "industry": "Software Development",
98
+ }
99
+
100
+ # Test with different education levels
101
+ test_education = [
102
+ "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",
103
+ "Some college/university study without earning a degree",
104
+ "Associate degree (A.A., A.S., etc.)",
105
+ "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
106
+ "Master's degree (M.A., M.S., M.Eng., MBA, etc.)",
107
+ "Professional degree (JD, MD, Ph.D, Ed.D, etc.)",
108
+ ]
109
+
110
+ # Filter to only education levels that exist in valid categories
111
+ test_education = [e for e in test_education if e in valid_categories["EdLevel"]]
112
+
113
+ predictions = []
114
+ for education in test_education:
115
+ input_data = SalaryInput(**base_input, education_level=education)
116
+ salary = predict_salary(input_data)
117
+ predictions.append(salary)
118
+ print(f" Education: {education[:50]:50s} -> Salary: ${salary:,.2f}")
119
+
120
+ # Check if predictions are different
121
+ unique_predictions = len(set(predictions))
122
+ if unique_predictions == len(predictions):
123
+ print(f"\n✅ PASS: All {len(predictions)} predictions are different")
124
+ return True
125
+ elif unique_predictions == 1:
126
+ print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
127
+ print(" This indicates the model is NOT using education level as a feature!")
128
+ return False
129
+ else:
130
+ print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
131
+ print(f" Duplicate salaries found - possible feature issue")
132
+ return False
133
+
134
+
135
+ def test_devtype_impact():
136
+ """Test that changing developer type changes prediction."""
137
+ print("\n" + "=" * 70)
138
+ print("TEST 4: Developer Type Impact")
139
+ print("=" * 70)
140
+
141
+ base_input = {
142
+ "country": "United States of America",
143
+ "years_code": 5.0,
144
+ "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
145
+ "industry": "Software Development",
146
+ }
147
+
148
+ # Test with different developer types (using actual values from trained model)
149
+ test_devtypes = [
150
+ "Developer, front-end",
151
+ "Developer, back-end",
152
+ "Developer, full-stack",
153
+ "Data scientist",
154
+ "Engineering manager",
155
+ "DevOps engineer or professional",
156
+ ]
157
+
158
+ # Filter to only developer types that exist in valid categories
159
+ test_devtypes = [d for d in test_devtypes if d in valid_categories["DevType"]]
160
+
161
+ predictions = []
162
+ for devtype in test_devtypes:
163
+ input_data = SalaryInput(**base_input, dev_type=devtype)
164
+ salary = predict_salary(input_data)
165
+ predictions.append(salary)
166
+ print(f" Dev Type: {devtype[:50]:50s} -> Salary: ${salary:,.2f}")
167
+
168
+ # Check if predictions are different
169
+ unique_predictions = len(set(predictions))
170
+ if unique_predictions == len(predictions):
171
+ print(f"\n✅ PASS: All {len(predictions)} predictions are different")
172
+ return True
173
+ elif unique_predictions == 1:
174
+ print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
175
+ print(" This indicates the model is NOT using developer type as a feature!")
176
+ return False
177
+ else:
178
+ print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
179
+ print(f" Duplicate salaries found - possible feature issue")
180
+ return False
181
+
182
+
183
+ def test_industry_impact():
184
+ """Test that changing industry changes prediction."""
185
+ print("\n" + "=" * 70)
186
+ print("TEST 5: Industry Impact")
187
+ print("=" * 70)
188
+
189
+ base_input = {
190
+ "country": "United States of America",
191
+ "years_code": 5.0,
192
+ "education_level": "Bachelor's degree (B.A., B.S., B.Eng., etc.)",
193
+ "dev_type": "Developer, full-stack",
194
+ }
195
+
196
+ # Test with different industries (using actual values from trained model)
197
+ test_industries = [
198
+ "Software Development",
199
+ "Fintech",
200
+ "Banking/Financial Services",
201
+ "Healthcare",
202
+ "Manufacturing",
203
+ "Government",
204
+ ]
205
+
206
+ # Filter to only industries that exist in valid categories
207
+ test_industries = [i for i in test_industries if i in valid_categories["Industry"]]
208
+
209
+ predictions = []
210
+ for industry in test_industries:
211
+ input_data = SalaryInput(**base_input, industry=industry)
212
+ salary = predict_salary(input_data)
213
+ predictions.append(salary)
214
+ print(f" Industry: {industry[:50]:50s} -> Salary: ${salary:,.2f}")
215
+
216
+ # Check if predictions are different
217
+ unique_predictions = len(set(predictions))
218
+ if unique_predictions == len(predictions):
219
+ print(f"\n✅ PASS: All {len(predictions)} predictions are different")
220
+ return True
221
+ elif unique_predictions == 1:
222
+ print(f"\n❌ FAIL: All predictions are IDENTICAL (${predictions[0]:,.2f})")
223
+ print(" This indicates the model is NOT using industry as a feature!")
224
+ return False
225
+ else:
226
+ print(f"\n⚠️ PARTIAL: Only {unique_predictions}/{len(predictions)} unique predictions")
227
+ print(f" Duplicate salaries found - possible feature issue")
228
+ return False
229
+
230
+
231
+ def test_combined_features():
232
+ """Test that combining different features produces expected variations."""
233
+ print("\n" + "=" * 70)
234
+ print("TEST 6: Combined Feature Variations")
235
+ print("=" * 70)
236
+
237
+ # Create diverse combinations (using actual values from trained model)
238
+ test_cases = [
239
+ ("India", 2, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, back-end", "Software Development"),
240
+ ("Germany", 5, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Developer, full-stack", "Manufacturing"),
241
+ ("United States of America", 10, "Master's degree (M.A., M.S., M.Eng., MBA, etc.)", "Engineering manager", "Fintech"),
242
+ ("Poland", 15, "Bachelor's degree (B.A., B.S., B.Eng., etc.)", "Developer, front-end", "Healthcare"),
243
+ ("Brazil", 5, "Some college/university study without earning a degree", "DevOps engineer or professional", "Government"),
244
+ ]
245
+
246
+ predictions = []
247
+ for country, years, education, devtype, industry in test_cases:
248
+ # Skip if not in valid categories
249
+ if (country not in valid_categories["Country"]
250
+ or education not in valid_categories["EdLevel"]
251
+ or devtype not in valid_categories["DevType"]
252
+ or industry not in valid_categories["Industry"]):
253
+ continue
254
+
255
+ input_data = SalaryInput(
256
+ country=country,
257
+ years_code=years,
258
+ education_level=education,
259
+ dev_type=devtype,
260
+ industry=industry,
261
+ )
262
+ salary = predict_salary(input_data)
263
+ predictions.append(salary)
264
+ print(f" {country[:15]:15s} | {years:2d}y | {education[:25]:25s} | {devtype[:25]:25s} | {industry[:20]:20s} -> ${salary:,.2f}")
265
+
266
+ # Check if predictions are different
267
+ unique_predictions = len(set(predictions))
268
+ if unique_predictions == len(predictions):
269
+ print(f"\n✅ PASS: All {len(predictions)} combined predictions are different")
270
+ return True
271
+ else:
272
+ print(f"\n⚠️ Only {unique_predictions}/{len(predictions)} unique predictions")
273
+ print(f" Some combinations produce identical salaries")
274
+ return False
275
+
276
+
277
+ def print_feature_analysis():
278
+ """Analyze which features the model is actually using."""
279
+ print("\n" + "=" * 70)
280
+ print("FEATURE ANALYSIS")
281
+ print("=" * 70)
282
+
283
+ from src.infer import feature_columns
284
+
285
+ print(f"\nTotal features in model: {len(feature_columns)}")
286
+
287
+ # Count by type
288
+ country_features = [f for f in feature_columns if f.startswith('Country_')]
289
+ edlevel_features = [f for f in feature_columns if f.startswith('EdLevel_')]
290
+ devtype_features = [f for f in feature_columns if f.startswith('DevType_')]
291
+ industry_features = [f for f in feature_columns if f.startswith('Industry_')]
292
+ numeric_features = [f for f in feature_columns if not f.startswith(('Country_', 'EdLevel_', 'DevType_', 'Industry_'))]
293
+
294
+ print(f" - Numeric features: {len(numeric_features)} -> {numeric_features}")
295
+ print(f" - Country features: {len(country_features)}")
296
+ print(f" - Education features: {len(edlevel_features)}")
297
+ print(f" - DevType features: {len(devtype_features)}")
298
+ print(f" - Industry features: {len(industry_features)}")
299
+
300
+ if len(country_features) > 0:
301
+ print(f"\nSample country features:")
302
+ for feat in country_features[:5]:
303
+ print(f" - {feat}")
304
+
305
+ if len(edlevel_features) > 0:
306
+ print(f"\nSample education features:")
307
+ for feat in edlevel_features[:5]:
308
+ print(f" - {feat}")
309
+
310
+ if len(devtype_features) > 0:
311
+ print(f"\nSample developer type features:")
312
+ for feat in devtype_features[:5]:
313
+ print(f" - {feat}")
314
+
315
+ if len(industry_features) > 0:
316
+ print(f"\nSample industry features:")
317
+ for feat in industry_features[:5]:
318
+ print(f" - {feat}")
319
+
320
+ # Check if there are any features at all
321
+ if len(country_features) == 0:
322
+ print("\n⚠️ WARNING: No country features found!")
323
+ if len(edlevel_features) == 0:
324
+ print("\n⚠️ WARNING: No education features found!")
325
+ if len(devtype_features) == 0:
326
+ print("\n⚠️ WARNING: No developer type features found!")
327
+ if len(industry_features) == 0:
328
+ print("\n⚠️ WARNING: No industry features found!")
329
+
330
+
331
+ def main():
332
+ """Run all tests."""
333
+ print("\n" + "=" * 70)
334
+ print("FEATURE IMPACT TESTS")
335
+ print("Testing if changing inputs actually changes predictions")
336
+ print("=" * 70)
337
+
338
+ # First, analyze what features exist
339
+ print_feature_analysis()
340
+
341
+ # Run all tests
342
+ results = {
343
+ "Years of Experience": test_years_experience_impact(),
344
+ "Country": test_country_impact(),
345
+ "Education Level": test_education_impact(),
346
+ "Developer Type": test_devtype_impact(),
347
+ "Industry": test_industry_impact(),
348
+ "Combined Features": test_combined_features(),
349
+ }
350
+
351
+ # Summary
352
+ print("\n" + "=" * 70)
353
+ print("TEST SUMMARY")
354
+ print("=" * 70)
355
+
356
+ for test_name, passed in results.items():
357
+ status = "✅ PASS" if passed else "❌ FAIL"
358
+ print(f" {status} - {test_name}")
359
+
360
+ passed_count = sum(results.values())
361
+ total_count = len(results)
362
+
363
+ print(f"\n{passed_count}/{total_count} tests passed")
364
+
365
+ if passed_count == total_count:
366
+ print("\n🎉 All tests passed! The model is using all features correctly.")
367
+ else:
368
+ print("\n⚠️ Some tests failed. The model may not be using all features properly.")
369
+ print(" This indicates potential training-testing skew or feature engineering issues.")
370
+
371
+
372
+ if __name__ == "__main__":
373
+ main()
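These checks print and return booleans rather than assert, so a runner like pytest would not actually fail on them. A hypothetical assert-style variant of the country check, should the suite ever migrate:

```python
# Hypothetical pytest-style rewrite: assert instead of returning booleans,
# so identical predictions across countries fail loudly under a test runner.
from src.schema import SalaryInput
from src.infer import predict_salary

def test_country_changes_prediction():
    base = dict(
        years_code=5.0,
        education_level="Bachelor's degree (B.A., B.S., B.Eng., etc.)",
        dev_type="Developer, full-stack",
        industry="Software Development",
    )
    salaries = {
        country: predict_salary(SalaryInput(**base, country=country))
        for country in ("United States of America", "Germany", "India")
    }
    assert len(set(salaries.values())) == len(salaries), salaries
```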
test_fix.py ADDED
@@ -0,0 +1,43 @@
1
+ """Test that the encoding fix works."""
2
+
3
+ # Force reload of modules
4
+ import sys
5
+ if 'src.preprocessing' in sys.modules:
6
+ del sys.modules['src.preprocessing']
7
+ if 'src.infer' in sys.modules:
8
+ del sys.modules['src.infer']
9
+
10
+ from src.preprocessing import prepare_features
11
+ import pandas as pd
12
+
13
+ # Create test inputs with different countries (values from valid_categories)
14
+ input1 = pd.DataFrame({
15
+ 'Country': ['United States of America'],
16
+ 'YearsCode': [5.0],
17
+ 'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
18
+ 'DevType': ['Developer, full-stack'], 'Industry': ['Software Development']
19
+ })
20
+
21
+ input2 = pd.DataFrame({
22
+ 'Country': ['Germany'],
23
+ 'YearsCode': [5.0],
24
+ 'EdLevel': ["Bachelor's degree (B.A., B.S., B.Eng., etc.)"],
25
+ 'DevType': ['Developer, full-stack'], 'Industry': ['Software Development']
26
+ })
27
+
28
+ print("Testing prepare_features with different countries...")
29
+ features1 = prepare_features(input1)
30
+ features2 = prepare_features(input2)
31
+
32
+ print(f"\nUSA features: {features1.shape}")
33
+ print(f"Columns: {list(features1.columns)[:10]}")
34
+
35
+ print(f"\nGermany features: {features2.shape}")
36
+ print(f"Columns: {list(features2.columns)[:10]}")
37
+
38
+ print(f"\nAre they different? {not features1.equals(features2)}")
39
+
40
+ if features1.shape[1] > 1:
41
+ print("\n✅ SUCCESS: Categorical features are preserved!")
42
+ else:
43
+ print("\n❌ FAIL: Still only has numeric features")
uv.lock ADDED
The diff for this file is too large to render. See raw diff