DaCrow13 commited on
Commit
39d224b
·
0 Parent(s):

Deploy to HF Spaces (Clean)

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .dockerignore +31 -0
  2. .dvc/.gitignore +3 -0
  3. .dvc/config +6 -0
  4. .dvcignore +3 -0
  5. .env.example +19 -0
  6. .github/workflows/ci.yml +68 -0
  7. .gitignore +192 -0
  8. Dockerfile +40 -0
  9. Dockerfile.streamlit +29 -0
  10. Makefile +189 -0
  11. README.md +576 -0
  12. data/.gitignore +1 -0
  13. data/README.md +83 -0
  14. data/processed/.gitignore +2 -0
  15. data/processed/embedding.dvc +6 -0
  16. data/processed/tfidf.dvc +6 -0
  17. data/raw.dvc +6 -0
  18. docker-compose.yml +56 -0
  19. docs/.gitkeep +0 -0
  20. docs/ML Canvas.md +39 -0
  21. docs/README.md +12 -0
  22. docs/docs/getting-started.md +6 -0
  23. docs/docs/index.md +10 -0
  24. docs/mkdocs.yml +4 -0
  25. docs/testing_and_validation.md +208 -0
  26. hopcroft_skill_classification_tool_competition/__init__.py +0 -0
  27. hopcroft_skill_classification_tool_competition/api_models.py +221 -0
  28. hopcroft_skill_classification_tool_competition/config.py +137 -0
  29. hopcroft_skill_classification_tool_competition/data_cleaning.py +559 -0
  30. hopcroft_skill_classification_tool_competition/dataset.py +99 -0
  31. hopcroft_skill_classification_tool_competition/features.py +492 -0
  32. hopcroft_skill_classification_tool_competition/main.py +434 -0
  33. hopcroft_skill_classification_tool_competition/mlsmote.py +157 -0
  34. hopcroft_skill_classification_tool_competition/modeling/predict.py +198 -0
  35. hopcroft_skill_classification_tool_competition/modeling/train.py +858 -0
  36. hopcroft_skill_classification_tool_competition/streamlit_app.py +322 -0
  37. hopcroft_skill_classification_tool_competition/threshold_optimization.py +295 -0
  38. models/.gitignore +11 -0
  39. models/.gitkeep +0 -0
  40. models/README.md +206 -0
  41. models/kept_label_indices.npy +0 -0
  42. models/label_names.pkl.dvc +5 -0
  43. models/random_forest_embedding_gridsearch.pkl.dvc +5 -0
  44. models/random_forest_embedding_gridsearch_smote.pkl.dvc +5 -0
  45. models/random_forest_tfidf_gridsearch.pkl.dvc +5 -0
  46. models/random_forest_tfidf_gridsearch_adasyn_pca.pkl.dvc +5 -0
  47. models/random_forest_tfidf_gridsearch_ros.pkl.dvc +5 -0
  48. models/random_forest_tfidf_gridsearch_smote.pkl.dvc +5 -0
  49. models/tfidf_vectorizer.pkl.dvc +5 -0
  50. notebooks/.gitkeep +0 -0
.dockerignore ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ env/
7
+ venv/
8
+ pip-log.txt
9
+ pip-delete-this-directory.txt
10
+ .tox/
11
+ .coverage
12
+ .coverage.*
13
+ .cache
14
+ nosetests.xml
15
+ coverage.xml
16
+ *.cover
17
+ *.log
18
+ .git
19
+ .gitignore
20
+ .mypy_cache
21
+ .pytest_cache
22
+ .hydra
23
+ .dvc/
24
+ data/
25
+ mlruns/
26
+ notebooks/
27
+ reports/
28
+ docs/
29
+ tests/
30
+ scripts/
31
+ !scripts/start_space.sh
.dvc/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /config.local
2
+ /tmp
3
+ /cache
.dvc/config ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [cache]
2
+ type = copy
3
+ [core]
4
+ remote = origin
5
+ ['remote "origin"']
6
+ url = https://dagshub.com/se4ai2526-uniba/Hopcroft.dvc
.dvcignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Add patterns of files dvc should ignore, which could improve
2
+ # the performance. Learn more at
3
+ # https://dvc.org/doc/user-guide/dvcignore
.env.example ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================
2
+ # Hopcroft API Environment Configuration
3
+ # ============================================
4
+ # Copy this file to .env and update with your values
5
+ # Command: cp .env.example .env
6
+ # IMPORTANT: Never commit .env to version control!
7
+
8
+ # MLflow Configuration
9
+ MLFLOW_TRACKING_URI=https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow
10
+ MLFLOW_TRACKING_USERNAME=your_username
11
+ MLFLOW_TRACKING_PASSWORD=your_token
12
+
13
+ # API Configuration
14
+ API_HOST=0.0.0.0
15
+ API_PORT=8080
16
+ LOG_LEVEL=info
17
+
18
+ # Model Configuration
19
+ MODEL_PATH=/app/models/random_forest_embedding_gridsearch.pkl
.github/workflows/ci.yml ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI Pipeline
2
+
3
+ on:
4
+ push:
5
+ branches: [ "main", "feature/*" ]
6
+ pull_request:
7
+ branches: [ "main" ]
8
+
9
+ jobs:
10
+ build-and-test:
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - name: Checkout code
15
+ uses: actions/checkout@v3
16
+
17
+ - name: Free Disk Space
18
+ run: |
19
+ sudo rm -rf /usr/share/dotnet
20
+ sudo rm -rf /usr/local/lib/android
21
+ sudo rm -rf /opt/ghc
22
+ sudo rm -rf /opt/hostedtoolcache/CodeQL
23
+ sudo docker image prune --all --force
24
+
25
+ - name: Set up Python 3.10
26
+ uses: actions/setup-python@v4
27
+ with:
28
+ python-version: "3.10"
29
+ cache: 'pip' # Enable caching for pip
30
+
31
+ - name: Install dependencies
32
+ run: |
33
+ python -m pip install --upgrade pip
34
+ # Install CPU-only PyTorch to save space (we don't need CUDA for tests)
35
+ pip install torch --index-url https://download.pytorch.org/whl/cpu
36
+ # Install other dependencies
37
+ pip install -r requirements.txt --no-cache-dir
38
+
39
+ - name: Lint with Ruff
40
+ run: |
41
+ # Using make lint as defined in Makefile
42
+ make lint
43
+
44
+ - name: Run Unit Tests
45
+ run: |
46
+ # Run tests and generate HTML report
47
+ pytest tests/unit/ -v -m unit --html=report.html --self-contained-html
48
+
49
+ - name: Upload Test Report
50
+ if: always() # Upload report even if tests fail
51
+ uses: actions/upload-artifact@v4
52
+ with:
53
+ name: test-report
54
+ path: report.html
55
+
56
+ - name: Configure DVC
57
+ run: |
58
+ dvc remote modify origin --local auth basic
59
+ dvc remote modify origin --local user ${{ secrets.DAGSHUB_USERNAME }}
60
+ dvc remote modify origin --local password ${{ secrets.DAGSHUB_TOKEN }}
61
+
62
+ - name: Pull Models with DVC
63
+ run: |
64
+ dvc pull models/random_forest_embedding_gridsearch.pkl models/label_names.pkl
65
+
66
+ - name: Build Docker Image
67
+ run: |
68
+ docker build . -t hopcroft-app:latest
.gitignore ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Mac OS-specific storage files
3
+ .DS_Store
4
+
5
+ # vim
6
+ *.swp
7
+ *.swo
8
+
9
+ ## https://github.com/github/gitignore/blob/e8554d85bf62e38d6db966a50d2064ac025fd82a/Python.gitignore
10
+
11
+ # Byte-compiled / optimized / DLL files
12
+ __pycache__/
13
+ *.py[cod]
14
+ *$py.class
15
+
16
+ # C extensions
17
+ *.so
18
+
19
+ # Distribution / packaging
20
+ .Python
21
+ build/
22
+ develop-eggs/
23
+ dist/
24
+ downloads/
25
+ eggs/
26
+ .eggs/
27
+ lib/
28
+ lib64/
29
+ parts/
30
+ sdist/
31
+ var/
32
+ wheels/
33
+ share/python-wheels/
34
+ *.egg-info/
35
+ .installed.cfg
36
+ *.egg
37
+ MANIFEST
38
+
39
+ # PyInstaller
40
+ # Usually these files are written by a python script from a template
41
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
42
+ *.manifest
43
+ *.spec
44
+
45
+ # Installer logs
46
+ pip-log.txt
47
+ pip-delete-this-directory.txt
48
+
49
+ # Unit test / coverage reports
50
+ htmlcov/
51
+ .tox/
52
+ .nox/
53
+ .coverage
54
+ .coverage.*
55
+ .cache
56
+ nosetests.xml
57
+ coverage.xml
58
+ *.cover
59
+ *.py,cover
60
+ .hypothesis/
61
+ .pytest_cache/
62
+ cover/
63
+
64
+ # Translations
65
+ *.mo
66
+ *.pot
67
+
68
+ # Django stuff:
69
+ *.log
70
+ local_settings.py
71
+ db.sqlite3
72
+ db.sqlite3-journal
73
+
74
+ # Flask stuff:
75
+ instance/
76
+ .webassets-cache
77
+
78
+ # Scrapy stuff:
79
+ .scrapy
80
+
81
+ # MkDocs documentation
82
+ docs/site/
83
+
84
+ # PyBuilder
85
+ .pybuilder/
86
+ target/
87
+
88
+ # Jupyter Notebook
89
+ .ipynb_checkpoints
90
+
91
+ # IPython
92
+ profile_default/
93
+ ipython_config.py
94
+
95
+ # pyenv
96
+ # For a library or package, you might want to ignore these files since the code is
97
+ # intended to run in multiple environments; otherwise, check them in:
98
+ # .python-version
99
+
100
+ # pipenv
101
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
102
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
103
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
104
+ # install all needed dependencies.
105
+ #Pipfile.lock
106
+
107
+ # UV
108
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
109
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
110
+ # commonly ignored for libraries.
111
+ #uv.lock
112
+
113
+ # poetry
114
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
115
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
116
+ # commonly ignored for libraries.
117
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
118
+ #poetry.lock
119
+
120
+ # pdm
121
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
122
+ #pdm.lock
123
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
124
+ # in version control.
125
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
126
+ .pdm.toml
127
+ .pdm-python
128
+ .pdm-build/
129
+
130
+ # pixi
131
+ # pixi.lock should be committed to version control for reproducibility
132
+ # .pixi/ contains the environments and should not be committed
133
+ .pixi/
134
+
135
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
136
+ __pypackages__/
137
+
138
+ # Celery stuff
139
+ celerybeat-schedule
140
+ celerybeat.pid
141
+
142
+ # SageMath parsed files
143
+ *.sage.py
144
+
145
+ # Environments
146
+ .env
147
+ .venv
148
+ env/
149
+ venv/
150
+ ENV/
151
+ env.bak/
152
+ venv.bak/
153
+
154
+ # Spyder project settings
155
+ .spyderproject
156
+ .spyproject
157
+
158
+ # Rope project settings
159
+ .ropeproject
160
+
161
+ # mkdocs documentation
162
+ /site
163
+
164
+ # mypy
165
+ .mypy_cache/
166
+ .dmypy.json
167
+ dmypy.json
168
+
169
+ # Pyre type checker
170
+ .pyre/
171
+
172
+ # pytype static type analyzer
173
+ .pytype/
174
+
175
+ # Cython debug symbols
176
+ cython_debug/
177
+
178
+ # PyCharm
179
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
180
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
181
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
182
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
183
+ #.idea/
184
+
185
+ # Ruff stuff:
186
+ .ruff_cache/
187
+
188
+ # PyPI configuration file
189
+ .pypirc
190
+ .github/copilot-instructions.md
191
+
192
+ docs/img/
Dockerfile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ # Set environment variables
4
+ ENV PYTHONDONTWRITEBYTECODE=1 \
5
+ PYTHONUNBUFFERED=1 \
6
+ PIP_NO_CACHE_DIR=off \
7
+ PIP_DISABLE_PIP_VERSION_CHECK=on \
8
+ PIP_DEFAULT_TIMEOUT=100
9
+
10
+ # Install system dependencies
11
+ RUN apt-get update && apt-get install -y \
12
+ git \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ # Create a non-root user
16
+ RUN useradd -m -u 1000 user
17
+
18
+ # Set working directory
19
+ WORKDIR /app
20
+
21
+ # Copy requirements first for caching
22
+ COPY requirements.txt .
23
+
24
+ # Install dependencies
25
+ RUN pip install --no-cache-dir -r requirements.txt
26
+
27
+ # Copy the rest of the application
28
+ COPY --chown=user:user . .
29
+
30
+ # Make start script executable
31
+ RUN chmod +x scripts/start_space.sh
32
+
33
+ # Switch to non-root user
34
+ USER user
35
+
36
+ # Expose the port
37
+ EXPOSE 7860
38
+
39
+ # Command to run the application
40
+ CMD ["./scripts/start_space.sh"]
Dockerfile.streamlit ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ ENV PYTHONUNBUFFERED=1 \
6
+ PYTHONDONTWRITEBYTECODE=1 \
7
+ PIP_NO_CACHE_DIR=1 \
8
+ PIP_DISABLE_PIP_VERSION_CHECK=1
9
+
10
+ # Create non-root user
11
+ RUN useradd -m -u 1000 appuser
12
+
13
+ # Install only Streamlit dependencies
14
+ RUN pip install --no-cache-dir \
15
+ streamlit>=1.28.0 \
16
+ requests>=2.31.0 \
17
+ pandas>=2.0.0
18
+
19
+ # Copy only the Streamlit app
20
+ COPY --chown=appuser:appuser hopcroft_skill_classification_tool_competition/streamlit_app.py ./
21
+
22
+ EXPOSE 8501
23
+
24
+ USER appuser
25
+
26
+ # Set API URL to point to the API service
27
+ ENV API_BASE_URL=http://hopcroft-api:8080
28
+
29
+ CMD ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
Makefile ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################
2
+ # GLOBALS #
3
+ #################################################################################
4
+
5
+ PROJECT_NAME = Hopcroft
6
+ PYTHON_VERSION = 3.10
7
+ PYTHON_INTERPRETER = python
8
+
9
+ #################################################################################
10
+ # COMMANDS #
11
+ #################################################################################
12
+
13
+ ## Install Python dependencies
14
+ .PHONY: requirements
15
+ requirements:
16
+ $(PYTHON_INTERPRETER) -m pip install -U pip
17
+ $(PYTHON_INTERPRETER) -m pip install -r requirements.txt
18
+
19
+ ## Delete all compiled Python files
20
+ .PHONY: clean
21
+ clean:
22
+ find . -type f -name "*.py[co]" -delete
23
+ find . -type d -name "__pycache__" -delete
24
+
25
+ ## Lint using ruff
26
+ .PHONY: lint
27
+ lint:
28
+ ruff format --check
29
+ ruff check
30
+
31
+ ## Format source code with ruff
32
+ .PHONY: format
33
+ format:
34
+ ruff check --fix
35
+ ruff format
36
+
37
+ #################################################################################
38
+ # PROJECT RULES #
39
+ #################################################################################
40
+
41
+ ## Download dataset from Hugging Face
42
+ .PHONY: data
43
+ data:
44
+ $(PYTHON_INTERPRETER) -m hopcroft_skill_classification_tool_competition.dataset
45
+
46
+ ## Extract features from raw data
47
+ .PHONY: features
48
+ features:
49
+ $(PYTHON_INTERPRETER) -m hopcroft_skill_classification_tool_competition.features
50
+
51
+ #################################################################################
52
+ # TRAINING RULES #
53
+ #################################################################################
54
+
55
+ ## Train Random Forest baseline with TF-IDF features (cleaned data)
56
+ .PHONY: train-baseline-tfidf
57
+ train-baseline-tfidf:
58
+ $(PYTHON_INTERPRETER) -m hopcroft_skill_classification_tool_competition.modeling.train baseline
59
+
60
+ ## Train Random Forest baseline with Embedding features (cleaned data)
61
+ .PHONY: train-baseline-embeddings
62
+ train-baseline-embeddings:
63
+ $(PYTHON_INTERPRETER) -c "from hopcroft_skill_classification_tool_competition.modeling.train import run_baseline_train; run_baseline_train(feature_type='embedding', use_cleaned=True)"
64
+
65
+ ## Train Random Forest with SMOTE and TF-IDF features (cleaned data)
66
+ .PHONY: train-smote-tfidf
67
+ train-smote-tfidf:
68
+ $(PYTHON_INTERPRETER) -c "from hopcroft_skill_classification_tool_competition.modeling.train import run_smote_experiment, load_data; X, Y = load_data(feature_type='tfidf', use_cleaned=True); run_smote_experiment(X, Y, feature_type='tfidf')"
69
+
70
+ ## Train Random Forest with SMOTE and Embedding features (cleaned data)
71
+ .PHONY: train-smote-embeddings
72
+ train-smote-embeddings:
73
+ $(PYTHON_INTERPRETER) -c "from hopcroft_skill_classification_tool_competition.modeling.train import run_smote_experiment, load_data; X, Y = load_data(feature_type='embedding', use_cleaned=True); run_smote_experiment(X, Y, feature_type='embedding')"
74
+
75
+ #################################################################################
76
+ # TESTING RULES #
77
+ #################################################################################
78
+
79
+ ## Run all unit tests
80
+ .PHONY: test-unit
81
+ test-unit:
82
+ pytest tests/unit/ -v -m unit
83
+
84
+ ## Run all integration tests
85
+ .PHONY: test-integration
86
+ test-integration:
87
+ pytest tests/integration/ -v -m integration
88
+
89
+ ## Run all system tests
90
+ .PHONY: test-system
91
+ test-system:
92
+ pytest tests/system/ -v -m system
93
+
94
+ ## Run all tests (unit, integration, system)
95
+ .PHONY: test-all
96
+ test-all:
97
+ pytest tests/ -v --ignore=tests/behavioral --ignore=tests/deepchecks
98
+
99
+ ## Run tests with coverage report
100
+ .PHONY: test-coverage
101
+ test-coverage:
102
+ pytest tests/ --cov=hopcroft_skill_classification_tool_competition --cov-report=html --cov-report=term
103
+
104
+ ## Run fast tests only (exclude slow tests)
105
+ .PHONY: test-fast
106
+ test-fast:
107
+ pytest tests/ -v -m "not slow" --ignore=tests/behavioral --ignore=tests/deepchecks
108
+
109
+ ## Run behavioral tests
110
+ .PHONY: test-behavioral
111
+ test-behavioral:
112
+ pytest tests/behavioral/ -v --ignore=tests/behavioral/test_model_training.py
113
+
114
+ ## Run Great Expectations validation
115
+ .PHONY: validate-gx
116
+ validate-gx:
117
+ $(PYTHON_INTERPRETER) -m hopcroft_skill_classification_tool_competition.tests.test_gx
118
+
119
+ ## Run Deepchecks validation
120
+ .PHONY: validate-deepchecks
121
+ validate-deepchecks:
122
+ $(PYTHON_INTERPRETER) tests/deepchecks/run_all_deepchecks.py
123
+
124
+ ## Run all validation and tests
125
+ .PHONY: test-complete
126
+ test-complete: test-all validate-gx validate-deepchecks test-behavioral
127
+
128
+ #################################################################################
129
+ # Self Documenting Commands #
130
+ #################################################################################
131
+
132
+ .DEFAULT_GOAL := help
133
+
134
+ define PRINT_HELP_PYSCRIPT
135
+ import re, sys; \
136
+ lines = '\n'.join([line for line in sys.stdin]); \
137
+ matches = re.findall(r'\n## (.*)\n[\s\S]+?\n([a-zA-Z_-]+):', lines); \
138
+ print('Available rules:\n'); \
139
+ print('\n'.join(['{:25}{}'.format(*reversed(match)) for match in matches]))
140
+ endef
141
+ export PRINT_HELP_PYSCRIPT
142
+
143
+ help:
144
+ @$(PYTHON_INTERPRETER) -c "${PRINT_HELP_PYSCRIPT}" < $(MAKEFILE_LIST)
145
+
146
+ ################################################################################
147
+ # API COMMANDS #
148
+ ################################################################################
149
+
150
+ ## Run API in development mode
151
+ .PHONY: api-dev
152
+ api-dev:
153
+ fastapi dev hopcroft_skill_classification_tool_competition/main.py
154
+
155
+ ## Run API in production mode
156
+ .PHONY: api-run
157
+ api-run:
158
+ fastapi run hopcroft_skill_classification_tool_competition/main.py
159
+
160
+ ## Test API health check (requires running API)
161
+ .PHONY: test-api-health
162
+ test-api-health:
163
+ @echo "Testing API health endpoint..."
164
+ curl -X GET "http://127.0.0.1:8000/health"
165
+
166
+ ## Test API POST /predict (requires running API)
167
+ .PHONY: test-api-predict
168
+ test-api-predict:
169
+ @echo "Testing prediction endpoint..."
170
+ curl -X POST "http://127.0.0.1:8000/predict" -H "Content-Type: application/json" -d '{"issue_text": "Fix critical bug in authentication and login flow with OAuth2", "repo_name": "my-repo"}'
171
+
172
+ ## Test API GET /predictions (requires running API)
173
+ .PHONY: test-api-list
174
+ test-api-list:
175
+ @echo "Testing list predictions endpoint..."
176
+ curl "http://127.0.0.1:8000/predictions?limit=5"
177
+
178
+ ## Test API GET /predictions/{run_id} (requires running API and valid run_id)
179
+ .PHONY: test-api-get-prediction
180
+ test-api-get-prediction:
181
+ @echo "Testing get specific prediction endpoint..."
182
+ @echo "Usage: make test-api-get-prediction RUN_ID=<your_run_id>"
183
+ @if [ -z "$(RUN_ID)" ]; then echo "Error: RUN_ID not set. Example: make test-api-get-prediction RUN_ID=abc123"; exit 1; fi
184
+ curl "http://127.0.0.1:8000/predictions/$(RUN_ID)"
185
+
186
+ ## Run all API tests (requires running API)
187
+ .PHONY: test-api-all
188
+ test-api-all: test-api-health test-api-predict test-api-list
189
+ @echo "\n All API tests completed!"
README.md ADDED
@@ -0,0 +1,576 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Hopcroft Skill Classification
3
+ emoji: 🧠
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 7860
8
+ ---
9
+
10
+ # Hopcroft_Skill-Classification-Tool-Competition
11
+
12
+ The task involves analyzing the relationship between issue characteristics and required skills, developing effective feature extraction methods that combine textual and code-context information, and implementing sophisticated multi-label classification approaches. Students may incorporate additional GitHub metadata to enhance model inputs, but must avoid using third-party classification engines or direct outputs from the provided database. The work requires careful attention to the multi-label nature of the problem, where each issue may require multiple different skills for resolution.
13
+
14
+ ## Project Organization
15
+
16
+ ```
17
+ ├── LICENSE <- Open-source license if one is chosen
18
+ ├── Makefile <- Makefile with convenience commands like `make data` or `make train`
19
+ ├── README.md <- The top-level README for developers using this project.
20
+ ├── data
21
+ │ ├── external <- Data from third party sources.
22
+ │ ├── interim <- Intermediate data that has been transformed.
23
+ │ ├── processed <- The final, canonical data sets for modeling.
24
+ │ └── raw <- The original, immutable data dump.
25
+
26
+ ├── docs <- A default mkdocs project; see www.mkdocs.org for details
27
+
28
+ ├── models <- Trained and serialized models, model predictions, or model summaries
29
+
30
+ ├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering),
31
+ │ the creator's initials, and a short `-` delimited description, e.g.
32
+ │ `1.0-jqp-initial-data-exploration`.
33
+
34
+ ├── pyproject.toml <- Project configuration file with package metadata for
35
+ │ hopcroft_skill_classification_tool_competition and configuration for tools like black
36
+
37
+ ├── references <- Data dictionaries, manuals, and all other explanatory materials.
38
+
39
+ ├── reports <- Generated analysis as HTML, PDF, LaTeX, etc.
40
+ │ └── figures <- Generated graphics and figures to be used in reporting
41
+
42
+ ├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g.
43
+ │ generated with `pip freeze > requirements.txt`
44
+
45
+ ├── setup.cfg <- Configuration file for flake8
46
+
47
+ └── hopcroft_skill_classification_tool_competition <- Source code for use in this project.
48
+
49
+ ├── __init__.py <- Makes hopcroft_skill_classification_tool_competition a Python module
50
+
51
+ ├── config.py <- Store useful variables and configuration
52
+
53
+ ├── dataset.py <- Scripts to download or generate data
54
+
55
+ ├── features.py <- Code to create features for modeling
56
+
57
+ ├── modeling
58
+ │ ├── __init__.py
59
+ │ ├── predict.py <- Code to run model inference with trained models
60
+ │ └── train.py <- Code to train models
61
+
62
+ └── plots.py <- Code to create visualizations
63
+ ```
64
+
65
+ --------
66
+
67
+ ## Setup
68
+
69
+ ### MLflow Credentials Configuration
70
+
71
+ Set up DagsHub credentials for MLflow tracking.
72
+
73
+ **Get your token:** [DagsHub](https://dagshub.com) → Profile → Settings → Tokens
74
+
75
+ #### Option 1: Using `.env` file (Recommended for local development)
76
+
77
+ ```bash
78
+ # Copy the template
79
+ cp .env.example .env
80
+
81
+ # Edit .env with your credentials
82
+ ```
83
+
84
+ Your `.env` file should contain:
85
+ ```
86
+ MLFLOW_TRACKING_URI=https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow
87
+ MLFLOW_TRACKING_USERNAME=your_username
88
+ MLFLOW_TRACKING_PASSWORD=your_token
89
+ ```
90
+
91
+ > [!NOTE]
92
+ > The `.env` file is git-ignored for security. Never commit credentials to version control.
93
+
94
+ #### Option 2: Using Docker Compose
95
+
96
+ When using Docker Compose, the `.env` file is automatically loaded via `env_file` directive in `docker-compose.yml`.
97
+
98
+ ```bash
99
+ # Start the service (credentials loaded from .env)
100
+ docker compose up --build
101
+ ```
102
+
103
+ --------
104
+
105
+ ## CI Configuration
106
+
107
+ [![CI Pipeline](https://github.com/se4ai2526-uniba/Hopcroft/actions/workflows/ci.yml/badge.svg)](https://github.com/se4ai2526-uniba/Hopcroft/actions/workflows/ci.yml)
108
+
109
+ This project uses GitHub Actions workflows, triggered automatically on pushes and pull requests, for Continuous Integration.
110
+
111
+ ### Secrets
112
+
113
+ To enable DVC model pulling, configure these Repository Secrets:
114
+
115
+ - `DAGSHUB_USERNAME`: DagsHub username.
116
+ - `DAGSHUB_TOKEN`: DagsHub access token.
117
+
118
+ --------
119
+
120
+ ## Milestone Summary
121
+
122
+ ### Milestone 1
123
+ We compiled the ML Canvas and defined:
124
+ - Problem: multi-label classification of skills for PR/issues.
125
+ - Stakeholders and business/research goals.
126
+ - Data sources (SkillScope DB) and constraints (no external classifiers).
127
+ - Success metrics (micro-F1, imbalance handling, experiment tracking).
128
+ - Risks (label imbalance, text noise, multi-label complexity) and mitigations.
129
+
130
+ ### Milestone 2
131
+ We implemented the essential end-to-end infrastructure to go from data to tracked modeling experiments:
132
+
133
+ 1. Data Management
134
+ - DVC setup (raw dataset and TF-IDF features tracked) with DagsHub remote; dedicated gitignores for data/models.
135
+
136
+ 2. Data Ingestion & EDA
137
+ - `dataset.py` to download/extract SkillScope from Hugging Face (zip → SQLite) with cleanup.
138
+ - Initial exploration notebook `notebooks/1.0-initial-data-exploration.ipynb` (schema, text stats, label distribution).
139
+
140
+ 3. Feature Engineering
141
+ - `features.py`: GitHub text cleaning (URL/HTML/markdown removal, normalization, Porter stemming) and TF-IDF (uni+bi-grams) saved as NumPy (`features_tfidf.npy`, `labels_tfidf.npy`).
142
+
143
+ 4. Central Config
144
+ - `config.py` with project paths, training settings, RF param grid, MLflow URI/experiments, PCA/ADASYN, feature constants.
145
+
146
+ 5. Modeling & Experiments
147
+ - Unified `modeling/train.py` with actions: baseline RF, MLSMOTE, ROS, ADASYN+PCA, LightGBM, LightGBM+MLSMOTE, and inference.
148
+ - GridSearchCV (micro-F1), MLflow logging, removal of all-zero labels, multilabel-stratified splits (with fallback).
149
+
150
+ 6. Imbalance Handling
151
+ - Local `mlsmote.py` (multi-label oversampling) with fallback to `RandomOverSampler`; dedicated ADASYN+PCA pipeline.
152
+
153
+ 7. Tracking & Reproducibility
154
+ - Remote MLflow (DagsHub) with README credential setup; DVC-tracked models and auxiliary artifacts (e.g., PCA, kept label indices).
155
+
156
+ 8. Tooling
157
+ - Updated `requirements.txt` (lightgbm, imbalanced-learn, iterative-stratification, huggingface-hub, dvc, mlflow, nltk, seaborn, etc.) and extended Makefile targets (`data`, `features`).
158
+
159
+ ### Milestone 3 (QA)
160
+ We implemented a comprehensive testing and validation framework to ensure data quality and model robustness:
161
+
162
+ 1. **Data Cleaning Pipeline**
163
+ - `data_cleaning.py`: Removes duplicates (481 samples), resolves label conflicts via majority voting (640 samples), filters sparse samples incompatible with SMOTE, and ensures train-test separation without leakage.
164
+ - Final cleaned dataset: 6,673 samples (from 7,154 original), 80/20 stratified split.
165
+
166
+ 2. **Great Expectations Validation** (10 tests)
167
+ - Database integrity, feature matrix validation (no NaN/Inf, sparsity checks), label format validation (binary {0,1}), feature-label consistency.
168
+ - Label distribution for stratification (min 5 occurrences), SMOTE compatibility (min 10 non-zero features), duplicate detection, train-test separation, label consistency.
169
+ - All 10 tests pass on cleaned data; comprehensive JSON reports in `reports/great_expectations/`.
170
+
171
+ 3. **Deepchecks Validation** (24 checks across 2 suites)
172
+ - Data Integrity Suite (92% score): validates duplicates, label conflicts, nulls, data types, feature correlation.
173
+ - Train-Test Validation Suite (100% score): **zero data leakage**, proper train/test split, feature/label drift analysis.
174
+ - Cleaned data achieved production-ready status (96% overall score).
175
+
176
+ 4. **Behavioral Testing** (36 tests)
177
+ - Invariance tests (9): typo robustness, synonym substitution, case insensitivity, punctuation/URL noise tolerance.
178
+ - Directional tests (10): keyword addition effects, technical detail impact on predictions.
179
+ - Minimum Functionality Tests (17): basic skill predictions on clear examples (bug fixes, database work, API development, testing, DevOps).
180
+ - All tests passed; comprehensive report in `reports/behavioral/`.
181
+
182
+ 5. **Code Quality Analysis**
183
+ - Ruff static analysis: 28 minor issues identified (unsorted imports, unused variables, f-strings), 100% fixable.
184
+ - PEP 8 compliant, Black compatible (line length 88).
185
+
186
+ 6. **Documentation**
187
+ - Comprehensive `docs/testing_and_validation.md` with detailed test descriptions, execution commands, and analysis results.
188
+ - Behavioral testing README with test categories, usage examples, and extension guide.
189
+
190
+ 7. **Tooling**
191
+ - Makefile targets: `validate-gx`, `validate-deepchecks`, `test-behavioral`, `test-complete`.
192
+ - Automated test execution and report generation.
193
+
194
+ ### Milestone 4 (API)
195
+ We implemented a production-ready FastAPI service for skill prediction with MLflow integration:
196
+
197
+ #### Features
198
+ - **REST API Endpoints**:
199
+ - `POST /predict` - Predict skills for a GitHub issue (logs to MLflow)
200
+ - `GET /predictions/{run_id}` - Retrieve prediction by MLflow run ID
201
+ - `GET /predictions` - List recent predictions with pagination
202
+ - `GET /health` - Health check endpoint
203
+ - **Model Management**: Loads trained Random Forest + TF-IDF vectorizer from `models/`
204
+ - **MLflow Tracking**: All predictions logged with metadata, probabilities, and timestamps
205
+ - **Input Validation**: Pydantic models for request/response validation
206
+ - **Interactive Docs**: Auto-generated Swagger UI and ReDoc
207
+
208
+ #### API Usage
209
+
210
+ **1. Start the API Server**
211
+ ```bash
212
+ # Development mode (auto-reload)
213
+ make api-dev
214
+
215
+ # Production mode
216
+ make api-run
217
+ ```
218
+ Server starts at: [http://127.0.0.1:8000](http://127.0.0.1:8000)
219
+
220
+ **2. Test Endpoints**
221
+
222
+ **Option A: Swagger UI (Recommended)**
223
+ - Navigate to: [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs)
224
+ - Interactive interface to test all endpoints
225
+ - View request/response schemas
226
+
227
+ **Option B: Make Commands**
228
+ ```bash
229
+ # Test all endpoints
230
+ make test-api-all
231
+
232
+ # Individual endpoints
233
+ make test-api-health # Health check
234
+ make test-api-predict # Single prediction
235
+ make test-api-list # List predictions
236
+ ```
237
+
238
+ #### Prerequisites
239
+ - Trained model: `models/random_forest_tfidf_gridsearch.pkl`
240
+ - TF-IDF vectorizer: `models/tfidf_vectorizer.pkl` (auto-saved during feature creation)
241
+ - Label names: `models/label_names.pkl` (auto-saved during feature creation)
242
+
243
+ #### MLflow Integration
244
+ - All predictions logged to: `https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow`
245
+ - Experiment: `skill_prediction_api`
246
+ - Tracked: input text, predictions, probabilities, metadata
247
+
248
+ #### Docker
249
+ Build and run the API in a container:
250
+ ```bash
251
+ docker build -t hopcroft-api .
252
+ docker run --rm --name hopcroft-api -p 8080:8080 hopcroft-api
253
+ ```
254
+
255
+ Endpoints:
256
+ - Swagger UI: [http://localhost:8080/docs](http://localhost:8080/docs)
257
+ - Health check: [http://localhost:8080/health](http://localhost:8080/health)
258
+
259
+ ---
260
+
261
+ ## Docker Compose Usage
262
+
263
+ Docker Compose orchestrates both the **API backend** and **Streamlit GUI** services with proper networking and configuration.
264
+
265
+ ### Prerequisites
266
+
267
+ 1. **Create your environment file:**
268
+ ```bash
269
+ cp .env.example .env
270
+ ```
271
+
272
+ 2. **Edit `.env`** with your actual credentials:
273
+ ```
274
+ MLFLOW_TRACKING_USERNAME=your_dagshub_username
275
+ MLFLOW_TRACKING_PASSWORD=your_dagshub_token
276
+ ```
277
+
278
+ Get your token from: [https://dagshub.com/user/settings/tokens](https://dagshub.com/user/settings/tokens)
279
+
280
+ ### Quick Start
281
+
282
+ #### 1. Build and Start All Services
283
+ Build both images and start the containers:
284
+ ```bash
285
+ docker-compose up -d --build
286
+ ```
287
+
288
+ | Flag | Description |
289
+ |------|-------------|
290
+ | `-d` | Run in detached mode (background) |
291
+ | `--build` | Rebuild images before starting (use when code/Dockerfile changes) |
292
+
293
+ **Available Services:**
294
+ - **API (FastAPI):** [http://localhost:8080/docs](http://localhost:8080/docs)
295
+ - **GUI (Streamlit):** [http://localhost:8501](http://localhost:8501)
296
+ - **Health Check:** [http://localhost:8080/health](http://localhost:8080/health)
297
+
298
+ #### 2. Stop All Services
299
+ Stop and remove containers and networks:
300
+ ```bash
301
+ docker-compose down
302
+ ```
303
+
304
+ | Flag | Description |
305
+ |------|-------------|
306
+ | `-v` | Also remove named volumes (e.g., `hopcroft-logs`): `docker-compose down -v` |
307
+ | `--rmi all` | Also remove images: `docker-compose down --rmi all` |
308
+
309
+ #### 3. Restart Services
310
+ After updating `.env` or configuration files:
311
+ ```bash
312
+ docker-compose restart
313
+ ```
314
+
315
+ Or for a full restart with environment reload:
316
+ ```bash
317
+ docker-compose down
318
+ docker-compose up -d
319
+ ```
320
+
321
+ #### 4. Check Status
322
+ View the status of all running services:
323
+ ```bash
324
+ docker-compose ps
325
+ ```
326
+
327
+ Or use Docker commands:
328
+ ```bash
329
+ docker ps
330
+ ```
331
+
332
+ #### 5. View Logs
333
+ Tail logs from both services in real-time:
334
+ ```bash
335
+ docker-compose logs -f
336
+ ```
337
+
338
+ View logs from a specific service:
339
+ ```bash
340
+ docker-compose logs -f hopcroft-api
341
+ docker-compose logs -f hopcroft-gui
342
+ ```
343
+
344
+ | Flag | Description |
345
+ |------|-------------|
346
+ | `-f` | Follow log output (stream new logs) |
347
+ | `--tail 100` | Show only last 100 lines: `docker-compose logs --tail 100` |
348
+
349
+ #### 6. Execute Commands in Container
350
+ Open an interactive shell inside a running container:
351
+ ```bash
352
+ docker-compose exec hopcroft-api /bin/bash
353
+ docker-compose exec hopcroft-gui /bin/bash
354
+ ```
355
+
356
+ Examples of useful commands inside the API container:
357
+ ```bash
358
+ # Check installed packages
359
+ pip list
360
+
361
+ # Run Python interactively
362
+ python
363
+
364
+ # Check model file exists
365
+ ls -la /app/models/
366
+
367
+ # Verify environment variables
368
+ printenv | grep MLFLOW
369
+ ```
370
+
371
+
372
+ ### Architecture Overview
373
+
374
+ **Docker Compose orchestrates two services:**
375
+
376
+ ```
377
+ docker-compose.yml
378
+ ├── hopcroft-api (FastAPI Backend)
379
+ │ ├── Build: ./Dockerfile
380
+ │ ├── Port: 8080:8080
381
+ │ ├── Network: hopcroft-net
382
+ │ ├── Environment: .env (MLflow credentials)
383
+ │ ├── Volumes:
384
+ │ │ ├── ./hopcroft_skill_classification_tool_competition (hot reload)
385
+ │ │ └── hopcroft-logs:/app/logs (persistent logs)
386
+ │ └── Health Check: /health endpoint
387
+
388
+ ├── hopcroft-gui (Streamlit Frontend)
389
+ │ ├── Build: ./Dockerfile.streamlit
390
+ │ ├── Port: 8501:8501
391
+ │ ├── Network: hopcroft-net
392
+ │ ├── Environment: API_BASE_URL=http://hopcroft-api:8080
393
+ │ ├── Volumes:
394
+ │ │ └── ./hopcroft_skill_classification_tool_competition/streamlit_app.py (hot reload)
395
+ │ └── Depends on: hopcroft-api (waits for health check)
396
+
397
+ └── hopcroft-net (bridge network)
398
+ ```
399
+
400
+ **External Access:**
401
+ - API: http://localhost:8080
402
+ - GUI: http://localhost:8501
403
+
404
+ **Internal Communication:**
405
+ - GUI → API: http://hopcroft-api:8080 (via Docker network)
406
+
407
+ ### Services Description
408
+
409
+ **hopcroft-api (FastAPI Backend)**
410
+ - Purpose: FastAPI backend serving the ML model for skill classification
411
+ - Image: Built from `Dockerfile`
412
+ - Port: 8080 (maps to host 8080)
413
+ - Features:
414
+ - Random Forest model with embedding features
415
+ - MLflow experiment tracking
416
+ - Auto-reload in development mode
417
+ - Health check endpoint
418
+
419
+ **hopcroft-gui (Streamlit Frontend)**
420
+ - Purpose: Streamlit web interface for interactive predictions
421
+ - Image: Built from `Dockerfile.streamlit`
422
+ - Port: 8501 (maps to host 8501)
423
+ - Features:
424
+ - User-friendly interface for skill prediction
425
+ - Real-time communication with API
426
+ - Automatic reconnection on API restart
427
+ - Depends on API health before starting
428
+
429
+ ### Development vs Production
430
+
431
+ **Development (default):**
432
+ - Auto-reload enabled (`--reload`)
433
+ - Source code mounted with bind mounts
434
+ - Custom command with hot reload
435
+ - GUI → API via Docker network
436
+
437
+ **Production:**
438
+ - Auto-reload disabled
439
+ - Use built image only
440
+ - Use Dockerfile's CMD
441
+ - GUI → API via Docker network
442
+
443
+ For **production deployment**, modify `docker-compose.yml` to remove bind mounts and disable reload.
444
+
445
+ ### Troubleshooting
446
+
447
+ #### Issue: GUI shows "API is not available"
448
+ **Solution:**
449
+ 1. Wait 30-60 seconds for API to fully initialize and become healthy
450
+ 2. Refresh the GUI page (F5)
451
+ 3. Check API health: `curl http://localhost:8080/health`
452
+ 4. Check logs: `docker-compose logs hopcroft-api`
453
+
454
+ #### Issue: "500 Internal Server Error" on predictions
455
+ **Solution:**
456
+ 1. Verify MLflow credentials in `.env` are correct
457
+ 2. Restart services: `docker-compose down && docker-compose up -d`
458
+ 3. Check environment variables: `docker exec hopcroft-api printenv | grep MLFLOW`
459
+
460
+ #### Issue: Changes to code not reflected
461
+ **Solution:**
462
+ - For Python code changes: Auto-reload is enabled, wait a few seconds
463
+ - For Dockerfile changes: Rebuild with `docker-compose up -d --build`
464
+ - For `.env` changes: Restart with `docker-compose down && docker-compose up -d`
465
+
466
+ #### Issue: Port already in use
467
+ **Solution:**
468
+ ```bash
469
+ # Check what's using the port
470
+ netstat -ano | findstr :8080
471
+ netstat -ano | findstr :8501
472
+
473
+ # Stop existing containers
474
+ docker-compose down
475
+
476
+ # Or change ports in docker-compose.yml
477
+ ```
478
+
479
+
480
+ ## Demo UI (Streamlit)
481
+
482
+ The Streamlit GUI provides an interactive web interface for the skill classification API.
483
+
484
+ ### Features
485
+ - Real-time skill prediction from GitHub issue text
486
+ - Top-5 predicted skills with confidence scores
487
+ - Full predictions table with all skills
488
+ - API connection status indicator
489
+ - Responsive design
490
+
491
+ ### Usage
492
+ 1. Ensure both services are running: `docker-compose up -d`
493
+ 2. Open the GUI in your browser: [http://localhost:8501](http://localhost:8501)
494
+ 3. Enter a GitHub issue description in the text area
495
+ 4. Click "Predict Skills" to get predictions
496
+ 5. View results in the predictions table
497
+
498
+ ### Architecture
499
+ - **Frontend**: Streamlit (Python web framework)
500
+ - **Communication**: HTTP requests to FastAPI backend via Docker network
501
+ - **Independence**: GUI and API run in separate containers
502
+ - **Auto-reload**: GUI code changes are reflected immediately (bind mount)
503
+ > Both must run **simultaneously** in different terminals/containers.
504
+
505
+ ### Quick Start
506
+
507
+ 1. **Start the FastAPI backend:**
508
+ ```bash
509
+ fastapi dev hopcroft_skill_classification_tool_competition/main.py
510
+ ```
511
+
512
+ 2. **In a new terminal, start Streamlit:**
513
+ ```bash
514
+ streamlit run streamlit_app.py
515
+ ```
516
+
517
+ 3. **Open your browser:**
518
+ - Streamlit UI: http://localhost:8501
519
+ - FastAPI Docs: http://localhost:8000/docs
520
+
521
+ ### Features
522
+
523
+ - Interactive web interface for skill prediction
524
+ - Real-time predictions with confidence scores
525
+ - Adjustable confidence threshold
526
+ - Multiple input modes (quick/detailed/examples)
527
+ - Visual result display
528
+ - API health monitoring
529
+
530
+ ### Demo Walkthrough
531
+
532
+ #### Main Dashboard
533
+
534
+ ![gui_main_dashboard](docs/img/gui_main_dashboard.png)
535
+
536
+ The main interface provides:
537
+ - **Sidebar**: API health status, confidence threshold slider, model info
538
+ - **Three input modes**: Quick Input, Detailed Input, Examples
539
+ #### Quick Input Mode
540
+
541
+ ![gui_quick_input](docs/img/gui_quick_input.png)
542
+ Simply paste your GitHub issue text and click "Predict Skills"!
543
+
544
+ #### Prediction Results
545
+ ![gui_detailed](docs/img/gui_detailed.png)
546
+ View:
547
+ - **Top predictions** with confidence scores
548
+ - **Full predictions table** with filtering
549
+ - **Processing metrics** (time, model version)
550
+ - **Raw JSON response** (expandable)
551
+
552
+ #### Detailed Input Mode
553
+
554
+ ![gui_detailed_input](docs/img/gui_detailed_input.png)
555
+ Add optional metadata:
556
+ - Repository name
557
+ - PR number
558
+ - Detailed description
559
+
560
+ #### Example Gallery
561
+ ![gui_ex](docs/img/gui_ex.png)
562
+
563
+ Test with pre-loaded examples:
564
+ - Authentication bugs
565
+ - ML features
566
+ - Database issues
567
+ - UI enhancements
568
+
569
+
570
+ ### Usage
571
+
572
+ 1. Enter GitHub issue/PR text in the input area
573
+ 2. (Optional) Add description, repo name, PR number
574
+ 3. Click "Predict Skills"
575
+ 4. View results with confidence scores
576
+ 5. Adjust threshold slider to filter predictions
data/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ /raw
data/README.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ tags:
5
+ - software-engineering
6
+ - multi-label-classification
7
+ - pull-requests
8
+ - skills
9
+ license: mit
10
+ ---
11
+
12
+ # Dataset Card for SkillScope Dataset
13
+
14
+ ## Dataset Details
15
+
16
+ - **Name:** SkillScope Dataset (NLBSE Tool Competition)
17
+ - **Repository:** [NLBSE/SkillCompetition](https://huggingface.co/datasets/NLBSE/SkillCompetition)
18
+ - **Version:** 1.0 (Processed for Hopcroft Project)
19
+ - **Type:** Tabular / Text / Code
20
+ - **Task:** Multi-label Classification
21
+ - **Maintainers:** se4ai2526-uniba (Hopcroft Project Team)
22
+
23
+ ## Intended Use
24
+
25
+ ### Primary Intended Uses
26
+ - Training and evaluating multi-label classification models to predict required skills for resolving GitHub issues/Pull Requests.
27
+ - Analyzing the relationship between issue characteristics (title, body, code changes) and developer skills.
28
+ - Benchmarking feature extraction techniques (TF-IDF vs. Embeddings) in Software Engineering contexts.
29
+
30
+ ### Out-of-Scope Use Cases
31
+ - Profiling individual developers (the dataset focuses on issues/PRs, not user profiling).
32
+ - General purpose code generation.
33
+
34
+ ## Dataset Contents
35
+
36
+ The dataset consists of merged Pull Requests from 11 Java repositories.
37
+
38
+ - **Total Samples (Raw):** 7,245 merged PRs
39
+ - **Source Files:** 57,206
40
+ - **Methods:** 59,644
41
+ - **Classes:** 13,097
42
+ - **Labels:** 217 distinct skill labels (domain/sub-domain pairs)
43
+
44
+ ### Schema
45
+ The data is stored in a SQLite database (`skillscope_data.db`) with the following main structures:
46
+ - `nlbse_tool_competition_data_by_issue`: Main table containing PR features (title, description, file paths) and skill labels.
47
+ - `vw_nlbse_tool_competition_data_by_file`: View providing file-level granularity.
48
+
49
+ ## Context and Motivation
50
+
51
+ ### Motivation
52
+ This dataset was created for the NLBSE (Natural Language-based Software Engineering) Tool Competition to foster research in automating skill identification in software maintenance. Accurately identifying required skills for an issue can help in automatic expert recommendation and task assignment.
53
+
54
+ ### Context
55
+ The data is derived from open-source Java projects on GitHub. It represents real-world development scenarios where developers describe issues and implement fixes.
56
+
57
+ ## Dataset Creation and Preprocessing
58
+
59
+ ### Source Data
60
+ The raw data is downloaded from the Hugging Face Hub (`NLBSE/SkillCompetition`).
61
+
62
+ ### Preprocessing Steps (Hopcroft Project)
63
+ To ensure data quality for modeling, the following preprocessing steps are applied (via `data_cleaning.py`):
64
+
65
+ 1. **Duplicate Removal:** ~6.5% of samples were identified as duplicates and removed.
66
+ 2. **Conflict Resolution:** ~8.9% of samples had conflicting labels for identical features; resolved using majority voting.
67
+ 3. **Rare Label Removal:** Labels with fewer than 5 occurrences were removed to ensure valid cross-validation.
68
+ 4. **Feature Extraction:**
69
+ - **Text Cleaning:** Removal of URLs, HTML, Markdown, and normalization.
70
+ - **TF-IDF:** Uni-grams and bi-grams (max 5000 features).
71
+ - **Embeddings:** Sentence embeddings using `all-MiniLM-L6-v2`.
72
+ 5. **Splitting:** 80/20 Train/Test split using `MultilabelStratifiedShuffleSplit` to maintain label distribution and prevent data leakage.
73
+
74
+ ## Considerations
75
+
76
+ ### Ethical Considerations
77
+ - **Privacy:** The data comes from public GitHub repositories. No private or sensitive personal information is explicitly included, though developer names/IDs might be present in metadata.
78
+ - **Bias:** The dataset is limited to Java repositories, so models may not generalize to other programming languages or ecosystems.
79
+
80
+ ### Caveats and Recommendations
81
+ - **Label Imbalance:** The dataset is highly imbalanced (long-tail distribution of skills). Techniques like MLSMOTE or ADASYN are recommended.
82
+ - **Multi-label Nature:** Most samples have multiple labels; evaluation metrics should account for this (e.g., Micro-F1).
83
+ - **Text Noise:** PR descriptions can be noisy or sparse; robust preprocessing is essential.
data/processed/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ /tfidf
2
+ /embedding
data/processed/embedding.dvc ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ outs:
2
+ - md5: d388f4e3ebe391bf4393cd327a16a1bb.dir
3
+ size: 64320416
4
+ nfiles: 12
5
+ hash: md5
6
+ path: embedding
data/processed/tfidf.dvc ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 038f64e03853a832a891a854146c429d.dir
3
+ nfiles: 11
4
+ hash: md5
5
+ path: tfidf
6
+ size: 199262804
data/raw.dvc ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 9a91536f747b03a232c8cfd354393541.dir
3
+ size: 440922112
4
+ nfiles: 1
5
+ hash: md5
6
+ path: raw
docker-compose.yml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ hopcroft-api:
3
+ build:
4
+ context: .
5
+ dockerfile: Dockerfile
6
+ container_name: hopcroft-api
7
+ ports:
8
+ - "8080:8080"
9
+ env_file:
10
+ - .env
11
+ environment:
12
+ - PROJECT_NAME=Hopcroft
13
+ volumes:
14
+ # Bind mount: enables live code reloading for development
15
+ - ./hopcroft_skill_classification_tool_competition:/app/hopcroft_skill_classification_tool_competition
16
+ # Named volume: persistent storage for application logs
17
+ - hopcroft-logs:/app/logs
18
+ networks:
19
+ - hopcroft-net
20
+ # Override CMD for development with auto-reload
21
+ command: >
22
+ uvicorn hopcroft_skill_classification_tool_competition.main:app --host 0.0.0.0 --port 8080 --reload
23
+ restart: unless-stopped
24
+ healthcheck:
25
+ test: [ "CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/health', timeout=5)" ]
26
+ interval: 30s
27
+ timeout: 10s
28
+ retries: 3
29
+ start_period: 60s
30
+
31
+ hopcroft-gui:
32
+ build:
33
+ context: .
34
+ dockerfile: Dockerfile.streamlit
35
+ container_name: hopcroft-gui
36
+ ports:
37
+ - "8501:8501"
38
+ environment:
39
+ - API_BASE_URL=http://hopcroft-api:8080
40
+ volumes:
41
+ # Bind mount for development hot-reload
42
+ - ./hopcroft_skill_classification_tool_competition/streamlit_app.py:/app/streamlit_app.py
43
+ networks:
44
+ - hopcroft-net
45
+ depends_on:
46
+ hopcroft-api:
47
+ condition: service_healthy
48
+ restart: unless-stopped
49
+
50
+ networks:
51
+ hopcroft-net:
52
+ driver: bridge
53
+
54
+ volumes:
55
+ hopcroft-logs:
56
+ driver: local
docs/.gitkeep ADDED
File without changes
docs/ML Canvas.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Machine Learning Canvas
2
+
3
+ | Designed for | Designed by | Date | Iteration |
4
+ |---|---|---|---|
5
+ | NLBSE 2026 | Team Hopcroft | 13/10/2025 | 1 |
6
+
7
+ ## PREDICTION TASK
8
+ The prediction task is a multi-label classification aimed at identifying the technical skills required to resolve a specific software issue. The input for the model is a dataset extracted from a GitHub pull request, which includes textual features (like the issue description), code-context information, and other metadata. The output is a set of one or more skill labels, chosen from a predefined set of 217 skills, representing the technical domains and sub-domains (e.g., "database," "security," "UI") needed for the resolution.
9
+
10
+ ## DECISIONS
11
+ The predictions are used to make crucial operational decisions in software project management. The value for the end-user, such as a project manager or team lead, lies in the ability to automatically assign new issues to the most suitable developers—those who possess the skills identified by the model. This optimizes resource allocation, accelerates resolution times, and improves the overall efficiency of the development team.
12
+
13
+ ## VALUE PROPOSITION
14
+ The Machine Learning system is designed for project managers and developers, aiming to optimize task assignment. By automatically predicting the technical skills (domains and sub-domains) required to resolve GitHub issues, the system ensures that each task is assigned to the most qualified developer.
15
+ The primary value lies in a significant increase in the efficiency of the development process, leading to reduced resolution times and improved software quality.
16
+
17
+ ## DATA COLLECTION
18
+ The core data was collected by the competition organizers through a mining process on historical GitHub pull requests. This process involved sourcing the issue text and associated source code from tasks that were already completed and merged. Each issue in the dataset then underwent a rigorous, automated labeling protocol, where skill labels (domains and sub-domains) were annotated based on the specific API calls detected within the source code. Due to the nature of software development tasks, the resulting dataset faces a significant class imbalance issue, with certain skill labels appearing far more frequently than others.
19
+
20
+ ## DATA SOURCES
21
+ The ML system will leverage the official NLBSE’26 Skill Classification dataset, a comprehensive corpus released by the competition organizers. This dataset is sourced from 11 popular Java repositories and comprises 7,245 merged pull requests annotated with 217 distinct skill labels. /
22
+ All foundational data is provided in a SQLite database (`skillscope_data.db`), with the `nlbse_tool_competition_data_by_issue` table serving as the primary source for model training. The competition framework also permits the use of external GitHub APIs for supplementary data.
23
+
24
+ ## IMPACT SIMULATION
25
+ The model's impact is validated by outperforming the specific "SkillScope Random Forest + TF-IDF" baseline on precision, recall, or micro-F1 scores. This evaluation is performed using the provided SQLite database of labeled pull requests as the ground truth to ensure measurable and superior performance.
26
+
27
+ ## MAKING PREDICTIONS
28
+ As soon as a new issue is created, the system analyzes it in real-time to understand which technical skills are needed. Instead of waiting for a manual assignment, the system sends the task directly to the most suitable developer. This automated process is so fast that it ensures the right expert can start working on the problem without any delay.
29
+
30
+ ## BUILDING MODELS
31
+ The ML system will start with the competition’s baseline multi-label classifier, which predicts the domains and sub-domains representing the skills needed for each issue. Model development will focus on iterative improvements to enhance the specified performance metrics.
32
+ A new model will be trained until it achieves a statistically significant improvement in precision, recall, or micro-F1 score over the initial baseline, without degradation in the other metrics.
33
+ Training will occur offline, with computational needs scaling by model complexity and data volume.
34
+
35
+ ## FEATURES
36
+ Only the most important, non-null, and directly functional features will be selected. Textual data, such as the issue title and description, will be represented using established NLP techniques. We will also utilize numerical features, including the pull request number and the calculated issue duration. Skills will be encoded as binary multi-label vectors, and all features will be normalized to optimize model performance throughout iterative development cycles.
37
+
38
+ ## MONITORING
39
+ System quality will be assessed by comparing the model's skill predictions with the actual skills used by developers to resolve issues. Performance will be continuously monitored using key metrics (precision, recall, micro-F1 score). To detect data drift, the model will be periodically evaluated on new, recent data; a significant drop in these metrics will indicate the need for retraining. The system's value is measured according to the competition's criteria: the primary value is the increase in the micro-F1 score (∆micro-F1) over the baseline, without worsening precision and recall. Computational efficiency (runtime) serves as a secondary value metric.
docs/README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Generating the docs
2
+ ----------
3
+
4
+ Use [mkdocs](http://www.mkdocs.org/) structure to update the documentation.
5
+
6
+ Build locally with:
7
+
8
+ mkdocs build
9
+
10
+ Serve locally with:
11
+
12
+ mkdocs serve
docs/docs/getting-started.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Getting started
2
+ ===============
3
+
4
+ This is where you describe how to get set up on a clean install, including the
5
+ commands necessary to get the raw data (using the `sync_data_from_s3` command,
6
+ for example), and then how to make the cleaned, final data sets.
docs/docs/index.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hopcroft_Skill-Classification-Tool-Competition documentation!
2
+
3
+ ## Description
4
+
5
+ The task involves analyzing the relationship between issue characteristics and required skills, developing effective feature extraction methods that combine textual and code-context information, and implementing sophisticated multi-label classification approaches. Students may incorporate additional GitHub metadata to enhance model inputs, but must avoid using third-party classification engines or direct outputs from the provided database. The work requires careful attention to the multi-label nature of the problem, where each issue may require multiple different skills for resolution.
6
+
7
+ ## Commands
8
+
9
+ The Makefile contains the central entry points for common tasks related to this project.
10
+
docs/mkdocs.yml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ site_name: Hopcroft_Skill-Classification-Tool-Competition
2
+ #
3
+ site_author: Team Hopcroft
4
+ #
docs/testing_and_validation.md ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Testing and Validation Documentation
2
+
3
+ This document provides a comprehensive and detailed overview of the testing and validation strategies employed in the Hopcroft project. It consolidates all technical details, execution commands, and analysis reports from Behavioral Testing, Deepchecks, Great Expectations, and Ruff.
4
+
5
+ ---
6
+
7
+ ## 1. Behavioral Testing
8
+
9
+ **Report Source:** `reports/behavioral/`
10
+ **Status:** All Tests Passed (36/36)
11
+ **Last Run:** November 15, 2025
12
+ **Model:** Random Forest + TF-IDF (SMOTE oversampling)
13
+ **Execution Time:** ~8 minutes
14
+
15
+ Behavioral testing evaluates the model's capabilities and robustness beyond simple accuracy metrics.
16
+
17
+ ### Test Categories & Results
18
+
19
+ | Category | Tests | Status | Description |
20
+ |----------|-------|--------|-------------|
21
+ | **Invariance Tests** | 9 | **Passed** | Ensure model predictions remain stable under perturbations that shouldn't affect the outcome (e.g., changing variable names, minor typos). |
22
+ | **Directional Tests** | 10 | **Passed** | Verify that specific changes to the input cause expected changes in the output (e.g., adding specific keywords should increase probability of related skills). |
23
+ | **Minimum Functionality Tests** | 17 | **Passed** | Check basic capabilities and sanity checks (e.g., simple inputs produce valid outputs). |
24
+
25
+ ### Technical Notes
26
+ - **Training Tests Excluded:** `test_model_training.py` was excluded from the run due to a missing PyTorch dependency in the environment, but the inference tests cover the model's behavior fully.
27
+ - **Robustness:** The model demonstrates excellent consistency across all 36 behavioral scenarios.
28
+
29
+ ### How to Regenerate
30
+ To run the behavioral tests and generate the JSON report:
31
+
32
+ ```bash
33
+ python -m pytest tests/behavioral/ \
34
+ --ignore=tests/behavioral/test_model_training.py \
35
+ --json-report \
36
+ --json-report-file=reports/behavioral/behavioral_tests_report.json \
37
+ -v
38
+ ```
39
+
40
+ ---
41
+
42
+ ## 2. Deepchecks Validation
43
+
44
+ **Report Source:** `reports/deepchecks/`
45
+ **Status:** Cleaned Data is Production-Ready (Score: 96%)
46
+ **Last Run:** November 16, 2025
47
+
48
+ Deepchecks was used to validate the integrity of the dataset before and after cleaning. The validation process confirmed that the `data_cleaning.py` pipeline successfully resolved critical data quality issues.
49
+
50
+ ### Dataset Statistics: Before vs. After Cleaning
51
+
52
+ | Metric | Before Cleaning | After Cleaning | Difference |
53
+ |--------|-----------------|----------------|------------|
54
+ | **Total Samples** | 7,154 | 6,673 | -481 duplicates (6.72%) |
55
+ | **Duplicates** | 481 | **0** | **RESOLVED** |
56
+ | **Data Leakage** | Present | **0 samples** | **RESOLVED** |
57
+ | **Label Conflicts** | Present | **0** | **RESOLVED** |
58
+ | **Train/Test Split** | N/A | 5,338 / 1,335 | 80/20 Stratified |
59
+
60
+ ### Validation Suites Detailed Results
61
+
62
+ #### A. Data Integrity Suite (12 checks)
63
+ **Score:** 92% (7 Passed, 2 Non-Critical Failures, 2 Null)
64
+
65
+ * **PASSED:** Data Duplicates (0), Conflicting Labels (0), Mixed Nulls, Mixed Data Types, String Mismatch, String Length, Feature Label Correlation.
66
+ * **FAILED (Non-Critical/Acceptable):**
67
+ 1. **Single Value in Column:** Some TF-IDF features are all zeros.
68
+ 2. **Feature-Feature Correlation:** High correlation between features.
69
+
70
+ #### B. Train-Test Validation Suite (12 checks)
71
+ **Score:** 100% (12 Passed)
72
+
73
+ * **PASSED (CRITICAL):** **Train Test Samples Mix (0 leakage)**.
74
+ * **PASSED:** Datasets Size Comparison (80/20), New Label in Test (0), Feature Drift (< 0.025), Label Drift (0.0), Multivariate Drift.
75
+
76
+ ### Interpretation of Results & Important Notes
77
+
78
+ The validation identified two "failures" that are actually **expected behavior** for this type of data:
79
+
80
+ 1. **Features with Only Zeros (Non-Critical):**
81
+ * *Reason:* TF-IDF creates sparse features. If a specific word (feature) never appears in the specific subset being tested, its column will be all zeros.
82
+ * *Impact:* None. The model simply ignores these features.
83
+
84
+ 2. **High Feature Correlation (Non-Critical):**
85
+ * *Reason:* Linguistic terms naturally co-occur (e.g., "machine" and "learning", "python" and "code").
86
+ * *Impact:* Slight multicollinearity, which Random Forest handles well.
87
+
88
+ ### Recommendations & Next Steps
89
+ 1. **Model Retraining:** Now that the data is cleaned and leakage-free, the models should be retrained to obtain reliable performance metrics.
90
+ 2. **Continuous Monitoring:** Use `run_all_deepchecks.py` in CI/CD pipelines to prevent regression.
91
+
92
+ ### How to Use the Tests
93
+
94
+ **Run Complete Validation (Recommended):**
95
+ ```bash
96
+ python tests/deepchecks/run_all_deepchecks.py
97
+ ```
98
+
99
+ **Run Specific Suites:**
100
+ ```bash
101
+ # Data Integrity Only
102
+ python tests/deepchecks/test_data_integrity.py
103
+
104
+ # Train-Test Validation Only
105
+ python tests/deepchecks/test_train_test_validation.py
106
+
107
+ # Compare Original vs Cleaned
108
+ python tests/deepchecks/run_all_tests_comparison.py
109
+ ```
110
+
111
+ ---
112
+
113
+ ## 3. Great Expectations Data Validation
114
+
115
+ **Report Source:** `tests/great expectations/`
116
+ **Status:** All 10 Tests Passed on Cleaned Data
117
+
118
+ Great Expectations provides a rigorous suite of 10 tests to validate the data pipeline at various stages.
119
+
120
+ ### Detailed Test Descriptions
121
+
122
+ #### TEST 1: Raw Database Validation
123
+ * **Purpose:** Validates integrity/schema of `nlbse_tool_competition_data_by_issue` table. Ensures data source integrity before expensive feature engineering.
124
+ * **Checks:** Row count (7000-10000), Column count (220-230), Required columns present.
125
+ * **Result:** **PASS**. Schema is valid.
126
+
127
+ #### TEST 2: TF-IDF Feature Matrix Validation
128
+ * **Purpose:** Validates statistical properties of TF-IDF features. Ensures feature matrix is suitable for ML algorithms.
129
+ * **Checks:** No NaN/Inf, values >= 0, at least 1 non-zero feature per sample.
130
+ * **Original Data:** **FAIL** (25 samples had 0 features due to empty text).
131
+ * **Cleaned Data:** **PASS** (Sparse samples removed).
132
+
133
+ #### TEST 3: Multi-Label Binary Format Validation
134
+ * **Purpose:** Ensures label matrix is binary {0,1} for MultiOutputClassifier. Missing labels would invalidate training.
135
+ * **Checks:** Values in {0,1}, correct dimensions.
136
+ * **Result:** **PASS**.
137
+
138
+ #### TEST 4: Feature-Label Consistency Validation
139
+ * **Purpose:** Validates alignment between X and Y matrices. Misalignment causes catastrophic training failures.
140
+ * **Checks:** Row counts match, no empty vectors.
141
+ * **Original Data:** **FAIL** (Empty feature vectors present).
142
+ * **Cleaned Data:** **PASS** (Perfect alignment).
143
+
144
+ #### TEST 5: Label Distribution & Stratification
145
+ * **Purpose:** Ensures labels have enough samples for stratified splitting. Labels with insufficient samples cause stratification failures.
146
+ * **Checks:** Min 5 occurrences per label.
147
+ * **Original Data:** **FAIL** (75 labels had 0 occurrences).
148
+ * **Cleaned Data:** **PASS** (Rare labels removed).
149
+
150
+ #### TEST 6: Feature Sparsity & SMOTE Compatibility
151
+ * **Purpose:** Ensures feature density is sufficient for nearest-neighbor algorithms (SMOTE/ADASYN).
152
+ * **Checks:** Min 10 non-zero features per sample.
153
+ * **Original Data:** **FAIL** (31.5% samples < 10 features).
154
+ * **Cleaned Data:** **PASS** (Incompatible samples removed).
155
+
156
+ #### TEST 7: Multi-Output Classifier Compatibility
157
+ * **Purpose:** Validates multi-label structure. Insufficient multi-label samples would indicate inappropriate architecture.
158
+ * **Checks:** >50% samples have multiple labels.
159
+ * **Result:** **PASS** (Strong multi-label characteristics).
160
+
161
+ #### TEST 8: Duplicate Samples Detection
162
+ * **Purpose:** Detects duplicate feature vectors to prevent leakage.
163
+ * **Original Data:** **FAIL** (481 duplicates found).
164
+ * **Cleaned Data:** **PASS** (0 duplicates).
165
+
166
+ #### TEST 9: Train-Test Separation Validation
167
+ * **Purpose:** **CRITICAL**. Validates no data leakage between train and test sets.
168
+ * **Checks:** Intersection of Train and Test sets must be empty.
169
+ * **Result:** **PASS** (Cleaned data only).
170
+
171
+ #### TEST 10: Label Consistency Validation
172
+ * **Purpose:** Ensures identical features have identical labels. Inconsistency indicates ground truth errors.
173
+ * **Original Data:** **FAIL** (640 samples with conflicting labels).
174
+ * **Cleaned Data:** **PASS** (Resolved via majority voting).
175
+
176
+ ### Running the Tests
177
+ ```bash
178
+ python "tests/great expectations/test_gx.py"
179
+ ```
180
+
181
+ ---
182
+
183
+ ## 4. Ruff Code Quality Analysis
184
+
185
+ **Report Source:** `reports/ruff/`
186
+ **Status:** All Issues Resolvable
187
+ **Last Analysis:** November 17, 2025
188
+ **Total Issues:** 28
189
+
190
+ Static code analysis was performed using Ruff to ensure code quality and adherence to PEP 8 standards.
191
+
192
+ ### Issue Breakdown by File
193
+
194
+ | File | Issues | Severity | Key Findings |
195
+ |------|--------|----------|--------------|
196
+ | `data_cleaning.py` | 16 | Low/Med | Unsorted imports (I001), Unused imports (F401), f-strings without placeholders (F541), Comparison to False (E712). |
197
+ | `modeling/train.py` | 7 | Low/Med | Unused `SMOTE` import, Unused variable `n_labels`, f-strings. |
198
+ | `features.py` | 2 | Low | Unused `nltk` import. |
199
+ | `dataset.py` | 2 | Low | Unused `DB_PATH` import. |
200
+ | `mlsmote.py` | 1 | Low | Unsorted imports. |
201
+
202
+ ### Configuration & Compliance
203
+ * **Command:** `ruff check . --output-format json --output-file reports/ruff/ruff_report.json`
204
+ * **Standards:** PEP 8 (Pass), Black compatible (line length 88), isort (Pass).
205
+ * **Fixability:** 100% of issues can be fixed (26 automatically, 2 manually).
206
+
207
+ ### Conclusion
208
+ The project code quality is high, with only minor style and import issues that do not affect functionality but should be cleaned up for maintainability.
hopcroft_skill_classification_tool_competition/__init__.py ADDED
File without changes
hopcroft_skill_classification_tool_competition/api_models.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pydantic models for API data validation.
3
+
4
+ Defines request and response schemas with validation rules.
5
+ """
6
+
7
+ from datetime import datetime
8
+ from typing import Optional
9
+
10
+ from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator
11
+
12
+
13
+ class IssueInput(BaseModel):
14
+ """Input model for GitHub issue or pull request classification."""
15
+
16
+ issue_text: str = Field(
17
+ ...,
18
+ min_length=1,
19
+ description="Issue title text",
20
+ examples=["Fix bug in authentication module"],
21
+ )
22
+ issue_description: Optional[str] = Field(
23
+ default=None,
24
+ description="Issue body text",
25
+ examples=["The authentication module fails when handling expired tokens"],
26
+ )
27
+ repo_name: Optional[str] = Field(
28
+ default=None, description="Repository name", examples=["user/repo-name"]
29
+ )
30
+ pr_number: Optional[int] = Field(
31
+ default=None, ge=1, description="Pull request number", examples=[123]
32
+ )
33
+ created_at: Optional[datetime] = Field(
34
+ default=None, description="Issue creation timestamp", examples=["2024-01-15T10:30:00Z"]
35
+ )
36
+ author_name: Optional[str] = Field(
37
+ default=None, description="Issue author username", examples=["johndoe"]
38
+ )
39
+
40
+ @field_validator("issue_text", "issue_description")
41
+ @classmethod
42
+ def clean_text(cls, v: Optional[str]) -> Optional[str]:
43
+ """Validate and clean text fields."""
44
+ if v is None:
45
+ return v
46
+ v = v.strip()
47
+ if not v:
48
+ raise ValueError("Text cannot be empty or whitespace only")
49
+ return v
50
+
51
+ model_config = ConfigDict(
52
+ json_schema_extra={
53
+ "example": {
54
+ "issue_text": "Add support for OAuth authentication",
55
+ "issue_description": "Implement OAuth 2.0 flow for third-party providers",
56
+ "repo_name": "myorg/myproject",
57
+ "pr_number": 456,
58
+ "author_name": "developer123",
59
+ }
60
+ }
61
+ )
62
+
63
+
64
+ class SkillPrediction(BaseModel):
65
+ """Single skill prediction with confidence score."""
66
+
67
+ skill_name: str = Field(
68
+ ...,
69
+ description="Name of the predicted skill (domain/subdomain)",
70
+ examples=["Language/Java", "DevOps/CI-CD"],
71
+ )
72
+ confidence: float = Field(
73
+ ..., ge=0.0, le=1.0, description="Confidence score (0.0 to 1.0)", examples=[0.85]
74
+ )
75
+
76
+ model_config = ConfigDict(
77
+ json_schema_extra={"example": {"skill_name": "Language/Java", "confidence": 0.92}}
78
+ )
79
+
80
+
81
+ class PredictionResponse(BaseModel):
82
+ """Response model for skill classification predictions."""
83
+
84
+ predictions: list[SkillPrediction] = Field(
85
+ default_factory=list, description="List of predicted skills with confidence scores"
86
+ )
87
+ num_predictions: int = Field(
88
+ ..., ge=0, description="Total number of predicted skills", examples=[5]
89
+ )
90
+ model_version: str = Field(default="1.0.0", description="Model version", examples=["1.0.0"])
91
+ processing_time_ms: Optional[float] = Field(
92
+ default=None, ge=0.0, description="Processing time in milliseconds", examples=[125.5]
93
+ )
94
+
95
+ model_config = ConfigDict(
96
+ json_schema_extra={
97
+ "example": {
98
+ "predictions": [
99
+ {"skill_name": "Language/Java", "confidence": 0.92},
100
+ {"skill_name": "DevOps/CI-CD", "confidence": 0.78},
101
+ ],
102
+ "num_predictions": 2,
103
+ "model_version": "1.0.0",
104
+ "processing_time_ms": 125.5,
105
+ }
106
+ }
107
+ )
108
+
109
+
110
+ class BatchIssueInput(BaseModel):
111
+ """Input model for batch prediction."""
112
+
113
+ issues: list[IssueInput] = Field(
114
+ ...,
115
+ min_length=1,
116
+ max_length=100,
117
+ description="Issues to classify (max 100)",
118
+ )
119
+
120
+ model_config = ConfigDict(
121
+ json_schema_extra={
122
+ "example": {
123
+ "issues": [
124
+ {
125
+ "issue_text": "Fix authentication bug",
126
+ "issue_description": "Users cannot login with OAuth",
127
+ },
128
+ {
129
+ "issue_text": "Add database migration",
130
+ "issue_description": "Create migration for new user table",
131
+ },
132
+ ]
133
+ }
134
+ }
135
+ )
136
+
137
+
138
+ class BatchPredictionResponse(BaseModel):
139
+ """Response model for batch predictions."""
140
+
141
+ results: list[PredictionResponse] = Field(
142
+ default_factory=list, description="Prediction results, one per issue"
143
+ )
144
+ total_issues: int = Field(..., ge=0, description="Number of issues processed", examples=[2])
145
+ total_processing_time_ms: Optional[float] = Field(
146
+ default=None, ge=0.0, description="Processing time in milliseconds", examples=[250.0]
147
+ )
148
+
149
+ model_config = ConfigDict(
150
+ json_schema_extra={
151
+ "example": {
152
+ "results": [
153
+ {
154
+ "predictions": [{"skill_name": "Language/Java", "confidence": 0.92}],
155
+ "num_predictions": 1,
156
+ "model_version": "1.0.0",
157
+ }
158
+ ],
159
+ "total_issues": 2,
160
+ "total_processing_time_ms": 250.0,
161
+ }
162
+ }
163
+ )
164
+
165
+
166
+ class ErrorResponse(BaseModel):
167
+ """Error response model."""
168
+
169
+ error: str = Field(..., description="Error message", examples=["Invalid input"])
170
+ detail: Optional[str] = Field(
171
+ default=None, description="Detailed error", examples=["Field 'issue_text' is required"]
172
+ )
173
+ timestamp: datetime = Field(default_factory=datetime.now, description="Error timestamp")
174
+
175
+ @field_serializer("timestamp")
176
+ def serialize_timestamp(self, value: datetime) -> str:
177
+ return value.isoformat()
178
+
179
+ model_config = ConfigDict(
180
+ json_schema_extra={
181
+ "example": {
182
+ "error": "Validation Error",
183
+ "detail": "issue_text: field required",
184
+ "timestamp": "2024-01-15T10:30:00Z",
185
+ }
186
+ }
187
+ )
188
+
189
+
190
+ class HealthCheckResponse(BaseModel):
191
+ """Health check response model."""
192
+
193
+ status: str = Field(default="healthy", description="Service status", examples=["healthy"])
194
+ model_loaded: bool = Field(..., description="Model ready status", examples=[True])
195
+ version: str = Field(default="1.0.0", description="API version", examples=["1.0.0"])
196
+ timestamp: datetime = Field(default_factory=datetime.now, description="Timestamp")
197
+
198
+
199
+ class PredictionRecord(PredictionResponse):
200
+ """Extended prediction model with metadata from MLflow."""
201
+
202
+ run_id: str = Field(..., description="MLflow Run ID")
203
+ timestamp: datetime = Field(..., description="Prediction timestamp")
204
+ input_text: Optional[str] = Field(default="", description="Input text classified")
205
+
206
+ model_config = ConfigDict(
207
+ json_schema_extra={
208
+ "example": {
209
+ "predictions": [
210
+ {"skill_name": "Language/Java", "confidence": 0.92},
211
+ {"skill_name": "DevOps/CI-CD", "confidence": 0.78},
212
+ ],
213
+ "num_predictions": 2,
214
+ "model_version": "1.0.0",
215
+ "processing_time_ms": 125.5,
216
+ "run_id": "a1b2c3d4e5f6",
217
+ "timestamp": "2024-01-15T10:30:00Z",
218
+ "input_text": "Fix bug in authentication module",
219
+ }
220
+ }
221
+ )
hopcroft_skill_classification_tool_competition/config.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration and constants for the project"""
2
+
3
+ from pathlib import Path
4
+
5
+ # Project paths
6
+ PROJECT_DIR = Path(__file__).resolve().parents[1]
7
+ DATA_DIR = PROJECT_DIR / "data"
8
+ RAW_DATA_DIR = DATA_DIR / "raw"
9
+ PROCESSED_DATA_DIR = DATA_DIR / "processed"
10
+ MODELS_DIR = PROJECT_DIR / "models"
11
+ REPORTS_DIR = PROJECT_DIR / "reports"
12
+
13
+ # Dataset paths
14
+ DB_PATH = RAW_DATA_DIR / "skillscope_data.db"
15
+
16
+ # Data paths configuration for training
17
+ # Updated to use cleaned data (duplicates removed, no data leakage)
18
+ # Now pointing to TF-IDF features for API compatibility
19
+ DATA_PATHS = {
20
+ "features": str(PROCESSED_DATA_DIR / "tfidf" / "features_tfidf.npy"),
21
+ "labels": str(PROCESSED_DATA_DIR / "tfidf" / "labels_tfidf.npy"),
22
+ "features_original": str(PROCESSED_DATA_DIR / "tfidf" / "features_tfidf.npy"),
23
+ "labels_original": str(PROCESSED_DATA_DIR / "tfidf" / "labels_tfidf.npy"),
24
+ "models_dir": str(MODELS_DIR),
25
+ }
26
+
27
+ # Embedding configuration
28
+ EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
29
+
30
+ # API Configuration - which model to use for predictions
31
+ API_CONFIG = {
32
+ # Model file to load (without path, just filename)
33
+ "model_name": "random_forest_embedding_gridsearch.pkl",
34
+ # Feature type: "tfidf" or "embedding"
35
+ # This determines how text is transformed before prediction
36
+ "feature_type": "embedding",
37
+ }
38
+
39
+ # Training configuration
40
+ TRAINING_CONFIG = {
41
+ "random_state": 42,
42
+ "test_size": 0.2,
43
+ "val_size": 0.1,
44
+ "cv_folds": 5,
45
+ }
46
+
47
+ # Model configuration (Random Forest)
48
+ MODEL_CONFIG = {
49
+ "param_grid": {
50
+ "estimator__n_estimators": [50, 100, 200],
51
+ "estimator__max_depth": [10, 20, 30],
52
+ "estimator__min_samples_split": [2, 5],
53
+ }
54
+ }
55
+
56
+ # ADASYN configuration
57
+ ADASYN_CONFIG = {
58
+ "n_neighbors": 5,
59
+ "sampling_strategy": "auto",
60
+ }
61
+
62
+ # PCA configuration
63
+ PCA_CONFIG = {
64
+ "variance_retained": 0.95,
65
+ }
66
+
67
+ # MLflow configuration
68
+ MLFLOW_CONFIG = {
69
+ "uri": "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow",
70
+ "experiments": {
71
+ "baseline": "hopcroft_random_forest_baseline",
72
+ "smote": "hopcroft_random_forest_smote",
73
+ "ros": "hopcroft_random_forest_ros",
74
+ "adasyn_pca": "hopcroft_random_forest_adasyn_pca",
75
+ "lightgbm": "hopcroft_lightgbm",
76
+ "lightgbm_smote": "hopcroft_lightgbm_smote",
77
+ },
78
+ }
79
+
80
+ # Model parameters (legacy - kept for compatibility)
81
+ RANDOM_STATE = 42
82
+ TEST_SIZE = 0.2
83
+ VAL_SIZE = 0.1
84
+
85
+ # Feature engineering
86
+ MAX_TFIDF_FEATURES = 5000
87
+ NGRAM_RANGE = (1, 2)
88
+
89
+ # Model training (legacy)
90
+ N_ESTIMATORS = 100
91
+ MAX_DEPTH = 20
92
+
93
+ # Hugging Face dataset
94
+ HF_REPO_ID = "NLBSE/SkillCompetition"
95
+ HF_FILENAME = "skillscope_data.zip"
96
+
97
+
98
+ def get_feature_paths(feature_type: str = "embedding", use_cleaned: bool = True) -> dict:
99
+ """
100
+ Get data paths for specified feature type.
101
+
102
+ This function allows easy switching between TF-IDF and Embedding features
103
+ for baseline reproduction (TF-IDF) vs improved model (Embeddings).
104
+
105
+ Args:
106
+ feature_type: Type of features - 'tfidf' or 'embedding'
107
+ use_cleaned: If True, use cleaned data (duplicates removed, no leakage).
108
+ If False, use original processed data.
109
+
110
+ Returns:
111
+ Dictionary with paths to features, labels, and models directory
112
+
113
+ Example:
114
+ # For baseline (paper reproduction)
115
+ paths = get_feature_paths(feature_type='tfidf', use_cleaned=True)
116
+
117
+ # For improved model
118
+ paths = get_feature_paths(feature_type='embedding', use_cleaned=True)
119
+ """
120
+ if feature_type not in ["tfidf", "embedding"]:
121
+ raise ValueError(f"Invalid feature_type: {feature_type}. Must be 'tfidf' or 'embedding'")
122
+
123
+ feature_dir = PROCESSED_DATA_DIR / feature_type
124
+
125
+ if use_cleaned:
126
+ suffix = "_clean"
127
+ else:
128
+ suffix = ""
129
+
130
+ return {
131
+ "features": str(feature_dir / f"features_{feature_type}{suffix}.npy"),
132
+ "labels": str(feature_dir / f"labels_{feature_type}{suffix}.npy"),
133
+ "features_test": str(feature_dir / f"X_test_{feature_type}{suffix}.npy"),
134
+ "labels_test": str(feature_dir / f"Y_test_{feature_type}{suffix}.npy"),
135
+ "models_dir": str(MODELS_DIR),
136
+ "feature_type": feature_type,
137
+ }
hopcroft_skill_classification_tool_competition/data_cleaning.py ADDED
@@ -0,0 +1,559 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Cleaning and Quality Assurance Module
3
+
4
+ This module addresses data quality issues identified by Deepchecks validation:
5
+ 1. Removes duplicate samples (6.5% duplicates detected)
6
+ 2. Resolves conflicting labels (8.9% samples with conflicts)
7
+ 3. Ensures proper train/test split without data leakage
8
+ 4. Removes highly correlated features
9
+
10
+ This script should be run BEFORE training to ensure data quality.
11
+ It regenerates the processed data files with cleaned data.
12
+
13
+ Usage:
14
+ python -m hopcroft_skill_classification_tool_competition.data_cleaning
15
+
16
+ Output:
17
+ - data/processed/tfidf/features_tfidf_clean.npy (cleaned training features)
18
+ - data/processed/tfidf/labels_tfidf_clean.npy (cleaned training labels)
19
+ - data/processed/tfidf/X_test_clean.npy (cleaned test features)
20
+ - data/processed/tfidf/Y_test_clean.npy (cleaned test labels)
21
+ """
22
+
23
+ from datetime import datetime
24
+ from pathlib import Path
25
+ from typing import Dict, Optional, Tuple
26
+
27
+ import numpy as np
28
+ import pandas as pd
29
+ from sklearn.model_selection import train_test_split
30
+
31
+ from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR
32
+
33
+
34
+ def remove_duplicates(X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray, Dict]:
35
+ """
36
+ Remove duplicate samples from the dataset.
37
+
38
+ Duplicates are identified by identical feature vectors.
39
+ When duplicates are found with different labels, we keep the first occurrence.
40
+
41
+ Args:
42
+ X: Feature matrix (samples x features)
43
+ y: Label matrix (samples x labels)
44
+
45
+ Returns:
46
+ Tuple of (cleaned_X, cleaned_y, stats_dict)
47
+ """
48
+ print("\n" + "=" * 80)
49
+ print("STEP 1: REMOVING DUPLICATES")
50
+ print("=" * 80)
51
+
52
+ initial_samples = X.shape[0]
53
+
54
+ # Convert to DataFrame for easier duplicate detection
55
+ # Use feature hash to identify duplicates (more memory efficient than full comparison)
56
+ df_features = pd.DataFrame(X)
57
+
58
+ # Find duplicates based on all features
59
+ duplicates_mask = df_features.duplicated(keep="first")
60
+ n_duplicates = duplicates_mask.sum()
61
+
62
+ print(f"Initial samples: {initial_samples:,}")
63
+ print(f"Duplicates found: {n_duplicates:,} ({n_duplicates / initial_samples * 100:.2f}%)")
64
+
65
+ if n_duplicates > 0:
66
+ # Keep only non-duplicate rows
67
+ X_clean = X[~duplicates_mask]
68
+ y_clean = y[~duplicates_mask]
69
+
70
+ print(f"Samples after removing duplicates: {X_clean.shape[0]:,}")
71
+ print(f"Removed: {n_duplicates:,} duplicate samples")
72
+ else:
73
+ X_clean = X
74
+ y_clean = y
75
+ print("No duplicates found")
76
+
77
+ stats = {
78
+ "initial_samples": int(initial_samples),
79
+ "duplicates_found": int(n_duplicates),
80
+ "duplicates_percentage": float(n_duplicates / initial_samples * 100),
81
+ "final_samples": int(X_clean.shape[0]),
82
+ }
83
+
84
+ return X_clean, y_clean, stats
85
+
86
+
87
+ def resolve_conflicting_labels(
88
+ X: np.ndarray, y: np.ndarray
89
+ ) -> Tuple[np.ndarray, np.ndarray, Dict]:
90
+ """
91
+ Resolve samples with conflicting labels.
92
+
93
+ Conflicting labels occur when identical feature vectors have different labels.
94
+ Resolution strategy: Use majority voting for each label across duplicates.
95
+
96
+ Args:
97
+ X: Feature matrix (samples x features)
98
+ y: Label matrix (samples x labels)
99
+
100
+ Returns:
101
+ Tuple of (cleaned_X, cleaned_y, stats_dict)
102
+ """
103
+ print("\n" + "=" * 80)
104
+ print("STEP 2: RESOLVING CONFLICTING LABELS")
105
+ print("=" * 80)
106
+
107
+ initial_samples = X.shape[0]
108
+
109
+ # Create a combined DataFrame
110
+ df_X = pd.DataFrame(X)
111
+ df_y = pd.DataFrame(y)
112
+
113
+ # Add a unique identifier based on features (use hash for efficiency)
114
+ # Create a string representation of each row
115
+ feature_hashes = pd.util.hash_pandas_object(df_X, index=False)
116
+
117
+ # Group by feature hash
118
+ groups = df_y.groupby(feature_hashes)
119
+
120
+ # Count conflicts: groups with size > 1
121
+ conflicts = groups.size()
122
+ n_conflict_groups = (conflicts > 1).sum()
123
+ n_conflict_samples = (conflicts[conflicts > 1]).sum()
124
+
125
+ print(f"Initial samples: {initial_samples:,}")
126
+ print(f"Duplicate feature groups: {n_conflict_groups:,}")
127
+ print(
128
+ f"Samples in conflict groups: {n_conflict_samples:,} ({n_conflict_samples / initial_samples * 100:.2f}%)"
129
+ )
130
+
131
+ if n_conflict_groups > 0:
132
+ # Resolve conflicts using majority voting
133
+ # For each group of duplicates, use the most common label value
134
+ resolved_labels = groups.apply(
135
+ lambda x: x.mode(axis=0).iloc[0] if len(x) > 1 else x.iloc[0]
136
+ )
137
+
138
+ # Keep only one sample per unique feature vector
139
+ unique_indices = ~df_X.duplicated(keep="first")
140
+ X_clean = X[unique_indices]
141
+
142
+ # Map resolved labels back to unique samples
143
+ unique_hashes = feature_hashes[unique_indices]
144
+ y_clean = np.array([resolved_labels.loc[h].values for h in unique_hashes])
145
+
146
+ print(f"Samples after conflict resolution: {X_clean.shape[0]:,}")
147
+ print("Conflicts resolved using majority voting")
148
+ else:
149
+ X_clean = X
150
+ y_clean = y
151
+ print("No conflicting labels found")
152
+
153
+ stats = {
154
+ "initial_samples": int(initial_samples),
155
+ "conflict_groups": int(n_conflict_groups),
156
+ "conflict_samples": int(n_conflict_samples),
157
+ "conflict_percentage": float(n_conflict_samples / initial_samples * 100),
158
+ "final_samples": int(X_clean.shape[0]),
159
+ }
160
+
161
+ return X_clean, y_clean, stats
162
+
163
+
164
+ def remove_sparse_samples(
165
+ X: np.ndarray, y: np.ndarray, min_nnz: int = 10
166
+ ) -> Tuple[np.ndarray, np.ndarray, Dict]:
167
+ """
168
+ Remove samples with too few non-zero features (incompatible with SMOTE).
169
+
170
+ Args:
171
+ X: Feature matrix
172
+ y: Label matrix
173
+ min_nnz: Minimum number of non-zero features required
174
+
175
+ Returns:
176
+ Tuple of (X_filtered, y_filtered, statistics_dict)
177
+ """
178
+ print("\n" + "=" * 80)
179
+ print(f"STEP 3: REMOVING SPARSE SAMPLES (min_nnz={min_nnz})")
180
+ print("=" * 80)
181
+
182
+ n_initial = X.shape[0]
183
+ print(f"Initial samples: {n_initial:,}")
184
+
185
+ nnz_counts = (X != 0).sum(axis=1)
186
+ valid_mask = nnz_counts >= min_nnz
187
+
188
+ X_filtered = X[valid_mask]
189
+ y_filtered = y[valid_mask]
190
+
191
+ n_removed = n_initial - X_filtered.shape[0]
192
+ removal_pct = (n_removed / n_initial * 100) if n_initial > 0 else 0
193
+
194
+ print(f"Sparse samples (< {min_nnz} features): {n_removed:,} ({removal_pct:.2f}%)")
195
+ print(f"Samples after filtering: {X_filtered.shape[0]:,}")
196
+
197
+ stats = {
198
+ "initial_samples": int(n_initial),
199
+ "min_nnz_threshold": min_nnz,
200
+ "sparse_samples_removed": int(n_removed),
201
+ "removal_percentage": float(removal_pct),
202
+ "final_samples": int(X_filtered.shape[0]),
203
+ }
204
+
205
+ return X_filtered, y_filtered, stats
206
+
207
+
208
+ def remove_empty_labels(
209
+ X: np.ndarray, y: np.ndarray, min_count: int = 5
210
+ ) -> Tuple[np.ndarray, np.ndarray, Dict]:
211
+ """
212
+ Remove labels with too few occurrences (cannot be stratified).
213
+
214
+ Args:
215
+ X: Feature matrix
216
+ y: Label matrix
217
+ min_count: Minimum number of occurrences required per label
218
+
219
+ Returns:
220
+ Tuple of (X_same, y_filtered, statistics_dict)
221
+ """
222
+ print("\n" + "=" * 80)
223
+ print(f"STEP 4: REMOVING RARE LABELS (min_count={min_count})")
224
+ print("=" * 80)
225
+
226
+ n_initial_labels = y.shape[1]
227
+ print(f"Initial labels: {n_initial_labels:,}")
228
+
229
+ label_counts = y.sum(axis=0)
230
+ valid_labels = label_counts >= min_count
231
+
232
+ y_filtered = y[:, valid_labels]
233
+
234
+ n_removed = n_initial_labels - y_filtered.shape[1]
235
+ removal_pct = (n_removed / n_initial_labels * 100) if n_initial_labels > 0 else 0
236
+
237
+ print(f"Rare labels (< {min_count} occurrences): {n_removed:,} ({removal_pct:.2f}%)")
238
+ print(f"Labels after filtering: {y_filtered.shape[1]:,}")
239
+
240
+ stats = {
241
+ "initial_labels": int(n_initial_labels),
242
+ "min_count_threshold": min_count,
243
+ "rare_labels_removed": int(n_removed),
244
+ "removal_percentage": float(removal_pct),
245
+ "final_labels": int(y_filtered.shape[1]),
246
+ }
247
+
248
+ return X, y_filtered, stats
249
+
250
+
251
+ def create_clean_train_test_split(
252
+ X: np.ndarray, y: np.ndarray, test_size: float = 0.2, random_state: int = 42
253
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, Dict]:
254
+ """
255
+ Create train/test split with verification of no data leakage.
256
+ Uses MultilabelStratifiedShuffleSplit if available.
257
+
258
+ Args:
259
+ X: Feature matrix
260
+ y: Label matrix
261
+ test_size: Proportion of test set (default: 0.2 = 20%)
262
+ random_state: Random seed for reproducibility
263
+
264
+ Returns:
265
+ Tuple of (X_train, X_test, y_train, y_test, stats_dict)
266
+ """
267
+ print("\n" + "=" * 80)
268
+ print("STEP 5: CREATING CLEAN TRAIN/TEST SPLIT")
269
+ print("=" * 80)
270
+
271
+ print(f"Total samples: {X.shape[0]:,}")
272
+ print(f"Test size: {test_size * 100:.1f}%")
273
+ print(f"Random state: {random_state}")
274
+
275
+ # Try to use iterative-stratification for better multi-label splits
276
+ try:
277
+ from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
278
+
279
+ has_iterstrat = True
280
+ print("Using MultilabelStratifiedShuffleSplit (iterative-stratification)")
281
+ except ImportError:
282
+ has_iterstrat = False
283
+ print(
284
+ "WARNING: iterative-stratification not installed. Using standard stratification (suboptimal for multi-label)."
285
+ )
286
+
287
+ if has_iterstrat:
288
+ msss = MultilabelStratifiedShuffleSplit(
289
+ n_splits=1, test_size=test_size, random_state=random_state
290
+ )
291
+ train_index, test_index = next(msss.split(X, y))
292
+ X_train, X_test = X[train_index], X[test_index]
293
+ y_train, y_test = y[train_index], y[test_index]
294
+ else:
295
+ # Fallback: Perform stratified split based on first label column (approximate stratification)
296
+ stratify_column = y[:, 0] if y.ndim > 1 else y
297
+ X_train, X_test, y_train, y_test = train_test_split(
298
+ X, y, test_size=test_size, random_state=random_state, stratify=stratify_column
299
+ )
300
+
301
+ # Verify no data leakage: check for overlapping samples
302
+ print("\nVerifying no data leakage...")
303
+
304
+ # Convert to sets of row hashes for efficient comparison
305
+ train_hashes = set(pd.util.hash_pandas_object(pd.DataFrame(X_train), index=False))
306
+ test_hashes = set(pd.util.hash_pandas_object(pd.DataFrame(X_test), index=False))
307
+
308
+ overlap = train_hashes & test_hashes
309
+
310
+ if len(overlap) > 0:
311
+ raise ValueError(
312
+ f"DATA LEAKAGE DETECTED: {len(overlap)} samples appear in both train and test!"
313
+ )
314
+
315
+ print("No data leakage detected")
316
+ print(f"Train samples: {X_train.shape[0]:,} ({X_train.shape[0] / X.shape[0] * 100:.1f}%)")
317
+ print(f"Test samples: {X_test.shape[0]:,} ({X_test.shape[0] / X.shape[0] * 100:.1f}%)")
318
+
319
+ # Verify feature dimensions match
320
+ if X_train.shape[1] != X_test.shape[1]:
321
+ raise ValueError(
322
+ f"Feature dimensions don't match: train={X_train.shape[1]}, test={X_test.shape[1]}"
323
+ )
324
+
325
+ print(f"Feature dimensions match: {X_train.shape[1]:,}")
326
+
327
+ stats = {
328
+ "total_samples": int(X.shape[0]),
329
+ "train_samples": int(X_train.shape[0]),
330
+ "test_samples": int(X_test.shape[0]),
331
+ "train_percentage": float(X_train.shape[0] / X.shape[0] * 100),
332
+ "test_percentage": float(X_test.shape[0] / X.shape[0] * 100),
333
+ "features": int(X_train.shape[1]),
334
+ "labels": int(y_train.shape[1]) if y_train.ndim > 1 else 1,
335
+ "data_leakage": False,
336
+ "overlap_samples": 0,
337
+ "stratification_method": "MultilabelStratifiedShuffleSplit"
338
+ if has_iterstrat
339
+ else "Standard StratifiedShuffleSplit",
340
+ }
341
+
342
+ return X_train, X_test, y_train, y_test, stats
343
+
344
+
345
def save_cleaned_data(
    X_train: np.ndarray,
    X_test: np.ndarray,
    y_train: np.ndarray,
    y_test: np.ndarray,
    stats: Dict,
    output_dir: Optional[Path] = None,
    feature_type: str = "tfidf",
) -> None:
    """
    Persist the cleaned train/test split to disk as .npy files.

    Args:
        X_train: Training feature matrix
        X_test: Test feature matrix
        y_train: Training label matrix
        y_test: Test label matrix
        stats: Dictionary with cleaning statistics.
            NOTE(review): currently accepted but not written to disk — confirm
            whether persisting it (e.g. as JSON) is intended.
        output_dir: Output directory (default: data/processed/{feature_type}/)
        feature_type: Type of features ('tfidf' or 'embedding')
    """
    print("\n" + "=" * 80)
    print("STEP 6: SAVING CLEANED DATA")
    print("=" * 80)

    # Fall back to the conventional processed-data location for this feature type.
    target_dir = PROCESSED_DATA_DIR / feature_type if output_dir is None else output_dir
    target_dir.mkdir(parents=True, exist_ok=True)

    # Cleaned artifacts mirror the raw feature files, with a "_clean" suffix.
    files = {
        "features_train": target_dir / f"features_{feature_type}_clean.npy",
        "labels_train": target_dir / f"labels_{feature_type}_clean.npy",
        "features_test": target_dir / f"X_test_{feature_type}_clean.npy",
        "labels_test": target_dir / f"Y_test_{feature_type}_clean.npy",
    }

    # Write each array next to its matching path.
    for key, array in (
        ("features_train", X_train),
        ("labels_train", y_train),
        ("features_test", X_test),
        ("labels_test", y_test),
    ):
        np.save(files[key], array)

    print(f"\nSaved cleaned data to: {target_dir}")
    for path in files.values():
        print(f" - {path.name}")
391
+
392
+
393
def clean_and_split_data(
    test_size: float = 0.2,
    random_state: int = 42,
    regenerate_features: bool = True,
    feature_type: str = "embedding",  # 'tfidf' or 'embedding'
    model_name: str = "all-MiniLM-L6-v2",
    max_features: int = 2000,  # Only for TF-IDF (must match features.py default)
) -> Dict:
    """
    Main function to clean data and create proper train/test split.

    This function:
    1. Loads or regenerates features (TF-IDF or Embeddings)
    2. Removes duplicate samples
    3. Resolves conflicting labels
    4. Creates clean train/test split
    5. Verifies no data leakage
    6. Saves cleaned data

    Args:
        test_size: Proportion of test set (default: 0.2)
        random_state: Random seed for reproducibility (default: 42)
        regenerate_features: If True, regenerate features from database (default: True)
        feature_type: Type of features to extract ('tfidf' or 'embedding')
        model_name: Model name for embeddings
        max_features: Maximum number of TF-IDF features (default: 2000).
            NOTE(review): this value is never forwarded to feature extraction
            below — it is only echoed in the printed configuration. Confirm it
            stays in sync with the features.py default.

    Returns:
        Dictionary with all cleaning statistics
    """
    print("=" * 80)
    print("DATA CLEANING AND QUALITY ASSURANCE PIPELINE")
    print("=" * 80)
    print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Test size: {test_size * 100:.1f}%")
    print(f"Random state: {random_state}")
    print(f"Regenerate features: {regenerate_features}")
    print(f"Feature type: {feature_type}")
    if feature_type == "embedding":
        print(f"Model name: {model_name}")
    else:
        print(f"Max features: {max_features}")

    # Step 0: Load or generate features
    if regenerate_features:
        print("\nRegenerating features from database...")
        # Load data and extract features.
        # Local import avoids a circular dependency at module import time.
        from hopcroft_skill_classification_tool_competition.features import create_feature_dataset

        # Use the unified create_feature_dataset function.
        # NOTE(review): max_features is NOT passed here; create_feature_dataset
        # uses its own default for TF-IDF.
        features, labels, _, _ = create_feature_dataset(
            save_processed=False,  # Don't save intermediate raw features, just return them
            feature_type=feature_type,
            model_name=model_name,
        )

        X = features
        y = labels.values
    else:
        # Reuse previously extracted features from data/processed/{feature_type}/.
        print(f"\nLoading existing features ({feature_type})...")
        data_dir = PROCESSED_DATA_DIR / feature_type
        X = np.load(data_dir / f"features_{feature_type}.npy")
        y = np.load(data_dir / f"labels_{feature_type}.npy")

    print("\nInitial data shape:")
    print(f" Features: {X.shape}")
    print(f" Labels: {y.shape}")

    # Step 1: Remove duplicates
    X_no_dup, y_no_dup, dup_stats = remove_duplicates(X, y)

    # Step 2: Resolve conflicting labels
    X_no_conf, y_no_conf, conflict_stats = resolve_conflicting_labels(X_no_dup, y_no_dup)

    # Step 3: Remove sparse samples
    # For embeddings, we don't have "sparse" features in the same way as TF-IDF (zeros).
    # But we can check for near-zero vectors if needed.
    # For now, we skip sparse check for embeddings or keep it if it checks for all-zeros.
    if feature_type == "tfidf":
        X_no_sparse, y_no_sparse, sparse_stats = remove_sparse_samples(
            X_no_conf, y_no_conf, min_nnz=10
        )
    else:
        # Skip sparse check for embeddings as they are dense
        X_no_sparse, y_no_sparse = X_no_conf, y_no_conf
        # Stub stats so the summary below can print uniformly for both paths.
        sparse_stats = {"sparse_samples_removed": 0, "removal_percentage": 0.0}
        print("\nSkipping sparse sample removal for dense embeddings.")

    # Step 4: Remove rare labels
    X_clean, y_clean, rare_stats = remove_empty_labels(X_no_sparse, y_no_sparse, min_count=5)

    # Step 5: Create clean train/test split
    X_train, X_test, y_train, y_test, split_stats = create_clean_train_test_split(
        X_clean, y_clean, test_size=test_size, random_state=random_state
    )

    # Step 6: Save cleaned data
    all_stats = {
        "duplicates": dup_stats,
        "conflicts": conflict_stats,
        "sparse_samples": sparse_stats,
        "rare_labels": rare_stats,
        "split": split_stats,
        "feature_type": feature_type,
    }

    # Save to specific directory based on feature type
    output_dir = PROCESSED_DATA_DIR / feature_type
    save_cleaned_data(
        X_train,
        X_test,
        y_train,
        y_test,
        all_stats,
        output_dir=output_dir,
        feature_type=feature_type,
    )

    # Print final summary
    print("\n" + "=" * 80)
    print("CLEANING PIPELINE COMPLETED SUCCESSFULLY")
    print("=" * 80)
    print("\nSummary:")
    print(f" Original samples: {X.shape[0]:,}")
    print(f" Original labels: {y.shape[1]:,}")
    print(
        f" Duplicates removed: {dup_stats['duplicates_found']:,} ({dup_stats['duplicates_percentage']:.2f}%)"
    )
    print(
        f" Conflicts resolved: {conflict_stats['conflict_samples']:,} ({conflict_stats['conflict_percentage']:.2f}%)"
    )
    print(
        f" Sparse samples removed: {sparse_stats['sparse_samples_removed']:,} ({sparse_stats['removal_percentage']:.2f}%)"
    )
    print(
        f" Rare labels removed: {rare_stats['rare_labels_removed']:,} ({rare_stats['removal_percentage']:.2f}%)"
    )
    print(f" Final clean samples: {split_stats['total_samples']:,}")
    print(f" Final clean labels: {y_clean.shape[1]:,}")
    print(
        f" Train samples: {split_stats['train_samples']:,} ({split_stats['train_percentage']:.1f}%)"
    )
    print(
        f" Test samples: {split_stats['test_samples']:,} ({split_stats['test_percentage']:.1f}%)"
    )
    print("\nData quality issues resolved:")
    print(" - Duplicates removed")
    print(" - Label conflicts resolved")
    if feature_type == "tfidf":
        print(" - Sparse samples removed")
    print(" - Rare labels removed")
    print(" - Clean train/test split created")
    print(" - No data leakage verified")
    print("=" * 80)

    return all_stats
549
+
550
+
551
if __name__ == "__main__":
    # Script entry point: run the full cleaning pipeline with the default
    # embedding-based configuration (80/20 split, fixed seed for reproducibility).
    pipeline_config = {
        "test_size": 0.2,  # 80/20 split
        "random_state": 42,
        "regenerate_features": True,
        "feature_type": "embedding",
        "model_name": "all-MiniLM-L6-v2",
    }
    stats = clean_and_split_data(**pipeline_config)
hopcroft_skill_classification_tool_competition/dataset.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Scripts to download or generate data"""
2
+
3
+ from pathlib import Path
4
+ import shutil
5
+ import zipfile
6
+
7
+ from huggingface_hub import hf_hub_download
8
+
9
+ from hopcroft_skill_classification_tool_competition.config import (
10
+ HF_FILENAME,
11
+ HF_REPO_ID,
12
+ RAW_DATA_DIR,
13
+ )
14
+
15
+
16
def download_skillscope_dataset(output_dir: Path = None) -> Path:
    """
    Download and extract SkillScope dataset from Hugging Face Hub.

    The dataset contains a SQLite database (skillscope_data.db) with:
    - nlbse_tool_competition_data_by_issue: Main table with PR features and skill labels
    - vw_nlbse_tool_competition_data_by_file: View with file-level labels

    Dataset details:
    - 7,245 merged pull requests from 11 Java repositories
    - 57,206 source files; 59,644 methods; 13,097 classes
    - 217 skill labels (domain/sub-domain pairs)

    Args:
        output_dir: Directory where to save the dataset (default: data/raw)

    Returns:
        Path to the extracted database file

    Raises:
        FileNotFoundError: If skillscope_data.db is not present after
            extracting the downloaded archive.
    """
    if output_dir is None:
        output_dir = RAW_DATA_DIR

    output_dir.mkdir(parents=True, exist_ok=True)
    db_path = output_dir / "skillscope_data.db"

    # Idempotent: skip the download entirely if the database already exists.
    if db_path.exists():
        print(f"Database already exists at: {db_path}")
        return db_path

    print("Downloading SkillScope dataset from Hugging Face...")

    # Download without using cache - use local_dir to avoid .cache folder
    # NOTE(review): local_dir_use_symlinks is deprecated in recent
    # huggingface_hub releases — confirm the pinned version still accepts it.
    zip_path = hf_hub_download(
        repo_id=HF_REPO_ID,
        filename=HF_FILENAME,
        repo_type="dataset",
        local_dir=output_dir,
        local_dir_use_symlinks=False,  # Don't create symlinks, copy directly
    )

    print(f"Downloaded to: {zip_path}")
    print("Extracting database...")

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(output_dir)

    # Fail loudly if the archive did not contain the expected file.
    if not db_path.exists():
        raise FileNotFoundError(f"Database file not found at: {db_path}")

    print(f"Database extracted to: {db_path}")

    # Clean up: remove zip file
    print("Cleaning up temporary files...")
    Path(zip_path).unlink()

    # Clean up: remove .cache folder if exists (left behind by hf_hub_download)
    cache_dir = output_dir / ".cache"
    if cache_dir.exists():
        shutil.rmtree(cache_dir)
        print("Removed .cache folder")

    # Clean up: remove download folder if exists
    download_dir = output_dir / "download"
    if download_dir.exists():
        shutil.rmtree(download_dir)
        print("Removed download folder")

    print("Cleanup completed")

    print("\nDataset info:")
    print(" - Table: nlbse_tool_competition_data_by_issue")
    print(" - View: vw_nlbse_tool_competition_data_by_file")

    return db_path
90
+
91
+
92
if __name__ == "__main__":
    # Script entry point: fetch and extract the dataset, framed by banners.
    banner = "=" * 80
    print(banner)
    print("SKILLSCOPE DATASET DOWNLOAD")
    print(banner)
    download_skillscope_dataset()
    print(banner)
    print("DOWNLOAD COMPLETED")
    print(banner)
hopcroft_skill_classification_tool_competition/features.py ADDED
@@ -0,0 +1,492 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Feature extraction module for skill classification.
3
+
4
+ This module provides functions to extract features from the SkillScope dataset,
5
+ starting with TF-IDF vectorization of textual data from pull request issues.
6
+
7
+ Dataset Information (from nlbse_tool_competition_data_by_issue):
8
+ - 7,154 issues from 11 Java repositories
9
+ - 226 total columns:
10
+ - 2 text columns: 'issue text' (title) and 'issue description' (body)
11
+ - metadata and other columns containing PR/file/context information
12
+ - 217 label columns: domain/subdomain skill labels (142 active labels in this DB)
13
+
14
+ Label Characteristics:
15
+ - Multi-label classification problem
16
+ - Average 32.9 labels per issue (median: 31)
17
+ - Highly imbalanced: some labels appear in all issues, others in very few
18
+ - Top labels: Language, Data Structure, DevOps, Error Handling
19
+ """
20
+
21
+ from pathlib import Path
22
+ import re
23
+ import sqlite3
24
+ from typing import Optional, Tuple
25
+
26
+ import joblib
27
+
28
+ # Import per lo Stemming
29
+ from nltk.stem import PorterStemmer
30
+ import numpy as np
31
+ import pandas as pd
32
+ from sklearn.feature_extraction.text import TfidfVectorizer
33
+
34
+ from hopcroft_skill_classification_tool_competition.config import (
35
+ MODELS_DIR,
36
+ PROCESSED_DATA_DIR,
37
+ RAW_DATA_DIR,
38
+ )
39
+
40
+ # Inizializza lo stemmer una volta per efficienza
41
+ stemmer = PorterStemmer()
42
+
43
+
44
def clean_github_text(text: str, use_stemming: bool = True) -> str:
    """
    Clean GitHub issue text as per SkillScope paper (Aracena et al. process).

    Strips URLs, HTML tags, markdown code blocks, inline code, emojis and
    other non-ASCII characters, and collapses whitespace. Optionally applies
    Porter stemming afterwards.

    Args:
        text: Raw text from GitHub issue (may be None/NaN)
        use_stemming: If True, apply Porter stemming (recommended for TF-IDF).
            If False, keep original words (recommended for Embeddings/LLMs).

    Returns:
        Cleaned text string (stemmed if use_stemming=True); "" for missing input.
    """
    # Treat NaN/None as empty text.
    if pd.isna(text) or text is None:
        return ""

    cleaned = str(text)

    # Noise patterns removed in order: URLs, HTML tags, fenced code blocks,
    # inline code. Each match is replaced by the empty string.
    for pattern in (r"http\S+|www\.\S+", r"<[^>]+>", r"```[\s\S]*?```", r"`[^`]*`"):
        cleaned = re.sub(pattern, "", cleaned)

    # Drop emojis and any other non-ASCII characters.
    cleaned = cleaned.encode("ascii", "ignore").decode("ascii")

    # Collapse runs of whitespace and trim the ends.
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    # Conditional stemming: only for TF-IDF, not for embeddings.
    if not use_stemming:
        return cleaned

    # Porter-stem each token; on failure fall back to the cleaned,
    # unstemmed text rather than raising.
    try:
        return " ".join(stemmer.stem(token) for token in cleaned.split())
    except Exception as e:
        print(f"Warning: Stemming failed for text snippet '{cleaned[:50]}...'. Error: {e}")
        return cleaned.strip()
95
+
96
+
97
def get_dataset_info(df: pd.DataFrame) -> dict:
    """
    Get summary information about the dataset.

    Args:
        df: Input dataframe

    Returns:
        Dictionary containing dataset statistics (issue counts, column lists,
        and per-issue / per-label label-distribution aggregates)
    """
    text_cols = get_text_columns(df)
    label_cols = get_label_columns(df)

    # Binarize label counts (>0 means the skill is present) before aggregating.
    presence = (df[label_cols] > 0).astype(int)
    per_issue = presence.sum(axis=1)  # labels attached to each issue
    per_label = presence.sum(axis=0)  # issues carrying each label

    return {
        "total_issues": len(df),
        "total_columns": len(df.columns),
        "text_columns": text_cols,
        "num_text_columns": len(text_cols),
        "label_columns": label_cols,
        "num_labels": len(label_cols),
        "avg_labels_per_issue": per_issue.mean(),
        "median_labels_per_issue": per_issue.median(),
        "max_labels_per_issue": per_issue.max(),
        "min_labels_per_issue": per_issue.min(),
        "avg_issues_per_label": per_label.mean(),
        "labels_with_no_issues": (per_label == 0).sum(),
    }
131
+
132
+
133
def load_data_from_db(db_path: Optional[Path] = None) -> pd.DataFrame:
    """
    Load data from the SQLite database.

    The connection is always closed, even if the query fails.

    Args:
        db_path: Path to the SQLite database file.
                 If None, uses default path in data/raw/skillscope_data.db

    Returns:
        DataFrame containing the nlbse_tool_competition_data_by_issue table

    Raises:
        Propagates sqlite3/pandas errors if the database cannot be opened
        or the table does not exist.
    """
    if db_path is None:
        db_path = RAW_DATA_DIR / "skillscope_data.db"

    conn = sqlite3.connect(db_path)
    try:
        # Load the main table
        query = "SELECT * FROM nlbse_tool_competition_data_by_issue"
        df = pd.read_sql_query(query, conn)
    finally:
        # Fix: previously the connection leaked if read_sql_query raised.
        conn.close()

    print(f"Loaded {len(df)} records from database")
    return df
157
+
158
+
159
def get_text_columns(df: pd.DataFrame) -> list:
    """
    Identify text columns in the dataframe (typically issue title, body, etc.).

    Args:
        df: Input dataframe

    Returns:
        List of column names containing textual data
    """
    # Known free-text columns in the SkillScope schema: issue title and body.
    # Only those actually present in this dataframe are returned, in this order.
    candidates = ("issue text", "issue description")
    return [name for name in candidates if name in df.columns]
174
+
175
+
176
def get_label_columns(df: pd.DataFrame) -> list:
    """
    Identify label columns (domains/subdomains with API counts).

    A label column is any numeric column that is not one of the known
    metadata columns.

    Args:
        df: Input dataframe

    Returns:
        List of column names containing labels
    """
    # Robust numeric-dtype check (handles all pandas dtype representations).
    from pandas.api.types import is_numeric_dtype

    # Metadata columns to exclude from labels (not skill labels).
    metadata = {
        "Repo Name",
        "PR #",
        "issue text",
        "issue description",
        "created_at",
        "author_name",
    }

    return [c for c in df.columns if c not in metadata and is_numeric_dtype(df[c])]
206
+
207
+
208
def combine_text_fields(
    df: pd.DataFrame, text_columns: list, use_stemming: bool = True
) -> pd.Series:
    """
    Combine multiple text fields into a single text representation per row.

    Each selected column is cleaned with clean_github_text (SkillScope paper
    process) and the cleaned pieces are joined with a single space.

    Args:
        df: Input dataframe
        text_columns: List of column names to combine
        use_stemming: If True, apply stemming (for TF-IDF). If False, keep
            original words (for Embeddings).

    Returns:
        Series containing cleaned and combined text for each row
    """
    def _clean_row(row: pd.Series) -> str:
        # Clean every field of the row, then join the cleaned parts with spaces.
        cleaned_parts = row.map(lambda t: clean_github_text(t, use_stemming=use_stemming))
        return " ".join(cleaned_parts)

    # Missing values become empty strings before cleaning.
    return df[text_columns].fillna("").astype(str).apply(_clean_row, axis=1)
236
+
237
+
238
def extract_tfidf_features(
    df: pd.DataFrame,
    text_columns: Optional[list] = None,
    max_features: Optional[int] = 2000,
    min_df: int = 2,
    max_df: float = 0.95,
    ngram_range: Tuple[int, int] = (1, 2),
) -> Tuple[np.ndarray, TfidfVectorizer]:
    """
    Extract TF-IDF features from textual data.

    Args:
        df: Input dataframe
        text_columns: List of text columns to use. If None, auto-detect.
        max_features: Maximum number of features to extract (default: 2000 for
            balanced sparsity); None keeps every term.
        min_df: Minimum document frequency for a term to be included
        max_df: Maximum document frequency (ignore terms appearing in >max_df of docs)
        ngram_range: Range of n-grams to consider (e.g., (1,2) for unigrams and bigrams)

    Returns:
        Tuple of (dense feature matrix, fitted vectorizer)

    Raises:
        ValueError: If no text columns are found in the dataframe.
    """
    if text_columns is None:
        text_columns = get_text_columns(df)

    if not text_columns:
        raise ValueError("No text columns found in dataframe")

    # Stemming is enabled here: TF-IDF benefits from collapsing word variants.
    print(f"Combining text from columns: {text_columns}")
    corpus = combine_text_fields(df, text_columns, use_stemming=True)

    vectorizer = TfidfVectorizer(
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        ngram_range=ngram_range,
        stop_words="english",
        lowercase=True,
        strip_accents="unicode",
    )

    print(
        f"Extracting TF-IDF features with max_features={max_features if max_features else 'All'}, "
        f"ngram_range={ngram_range}"
    )
    matrix = vectorizer.fit_transform(corpus)

    print(
        f"Extracted {matrix.shape[1]} TF-IDF features from {matrix.shape[0]} samples"
    )

    # Densify for downstream numpy-based tooling.
    return matrix.toarray(), vectorizer
293
+
294
+
295
def extract_embedding_features(
    df: pd.DataFrame,
    text_columns: Optional[list] = None,
    model_name: str = "all-MiniLM-L6-v2",
    batch_size: int = 32,
) -> Tuple[np.ndarray, object]:
    """
    Extract LLM embeddings from textual data using Sentence Transformers.

    Args:
        df: Input dataframe
        text_columns: List of text columns to use. If None, auto-detect.
        model_name: Name of the pre-trained model to use
        batch_size: Batch size for encoding

    Returns:
        Tuple of (feature matrix, model object)

    Raises:
        ImportError: If sentence-transformers is not installed.
        ValueError: If no text columns are found in the dataframe.
    """
    # Optional dependency: only needed for the embedding feature path.
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as e:
        raise ImportError(
            f"sentence-transformers import failed: {e}. Try running: pip install sentence-transformers"
        ) from e

    if text_columns is None:
        text_columns = get_text_columns(df)

    if not text_columns:
        raise ValueError("No text columns found in dataframe")

    # No stemming here: transformer models expect natural, unstemmed words.
    print(f"Combining text from columns: {text_columns}")
    corpus = combine_text_fields(df, text_columns, use_stemming=False)

    print(f"Loading embedding model: {model_name}")
    encoder = SentenceTransformer(model_name)

    print(f"Extracting embeddings for {len(corpus)} samples...")
    embeddings = encoder.encode(
        corpus.tolist(),
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
    )

    print(f"Extracted embeddings shape: {embeddings.shape}")

    return embeddings, encoder
346
+
347
+
348
def prepare_labels(df: pd.DataFrame, label_columns: Optional[list] = None) -> pd.DataFrame:
    """
    Prepare multi-label binary matrix from label columns.

    Args:
        df: Input dataframe
        label_columns: List of label columns. If None, auto-detect.

    Returns:
        DataFrame with binary labels (1 if label present, 0 otherwise)
    """
    if label_columns is None:
        label_columns = get_label_columns(df)

    # Any positive API count means the skill label applies to the issue.
    labels = df[label_columns].gt(0).astype(int)

    print(f"Prepared {len(label_columns)} labels")
    print(f"Label distribution:\n{labels.sum().describe()}")

    return labels
369
+
370
+
371
def create_feature_dataset(
    db_path: Optional[Path] = None,
    save_processed: bool = True,
    feature_type: str = "tfidf",  # 'tfidf' or 'embedding'
    model_name: str = "all-MiniLM-L6-v2",
) -> Tuple[np.ndarray, pd.DataFrame, list, list]:
    """
    Main function to create the complete feature dataset.

    Pipeline: load the issue table from SQLite, print summary statistics,
    extract either TF-IDF or sentence-embedding features from the text
    columns, binarize the label columns, and (optionally) persist features,
    labels, the fitted TF-IDF vectorizer, and the label names to disk.

    Args:
        db_path: Path to SQLite database
        save_processed: Whether to save processed data to disk
        feature_type: Type of features to extract ('tfidf' or 'embedding')
        model_name: Model name for embeddings (ignored if feature_type='tfidf')

    Returns:
        Tuple of (features, labels, feature_names, label_names)

    Raises:
        ValueError: If feature_type is neither 'tfidf' nor 'embedding'.
    """
    # Load data
    df = load_data_from_db(db_path)

    # Get dataset info
    info = get_dataset_info(df)
    print("\n=== Dataset Information ===")
    print(f"Total issues: {info['total_issues']:,}")
    print(f"Text columns: {info['text_columns']}")
    print(f"Number of labels: {info['num_labels']}")
    print(f"Avg labels per issue: {info['avg_labels_per_issue']:.2f}")
    print(f"Labels with no issues: {info['labels_with_no_issues']}")

    # Extract features
    text_columns = get_text_columns(df)
    label_columns = get_label_columns(df)

    feature_names = []

    # Only set for the TF-IDF path; embeddings have no fitted vectorizer.
    vectorizer = None

    if feature_type == "tfidf":
        features, vectorizer = extract_tfidf_features(df, text_columns=text_columns)
        feature_names = vectorizer.get_feature_names_out()
    elif feature_type == "embedding":
        features, _ = extract_embedding_features(
            df, text_columns=text_columns, model_name=model_name
        )
        # Embeddings have no interpretable vocabulary; use positional names.
        feature_names = [f"emb_{i}" for i in range(features.shape[1])]
    else:
        raise ValueError(f"Unknown feature_type: {feature_type}")

    # Prepare labels
    labels = prepare_labels(df, label_columns)

    # Save processed data
    if save_processed:
        # Path: processed/{feature_type}/
        output_dir = PROCESSED_DATA_DIR / feature_type
        output_dir.mkdir(parents=True, exist_ok=True)

        features_path = output_dir / f"features_{feature_type}.npy"
        labels_path = output_dir / f"labels_{feature_type}.npy"

        np.save(features_path, features)
        np.save(labels_path, labels.values)

        print(f"\nSaved processed data to {output_dir}")
        print(f" - {features_path.name}: {features.shape}")
        print(f" - {labels_path.name}: {labels.shape}")

        # Save vectorizer and label names to models/ directory for inference
        MODELS_DIR.mkdir(parents=True, exist_ok=True)

        if feature_type == "tfidf" and vectorizer is not None:
            vectorizer_path = MODELS_DIR / "tfidf_vectorizer.pkl"
            joblib.dump(vectorizer, vectorizer_path)
            print(f" - Saved TF-IDF vectorizer to: {vectorizer_path}")

        # Always save label names (needed for both tfidf and embedding inference)
        label_names_path = MODELS_DIR / "label_names.pkl"
        joblib.dump(label_columns, label_names_path)
        print(f" - Saved {len(label_columns)} label names to: {label_names_path}")

    return features, labels, feature_names, label_columns
453
+
454
+
455
def load_processed_data(
    feature_name: str = "tfidf", data_dir: Optional[Path] = None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Load processed features and labels from disk.

    Args:
        feature_name: Name prefix of the features to load (e.g., 'tfidf', 'bow', 'embeddings')
        data_dir: Path to processed data directory. If None, uses default.

    Returns:
        Tuple of (features, labels)
    """
    base = PROCESSED_DATA_DIR if data_dir is None else data_dir

    # Files follow the naming scheme written by create_feature_dataset.
    features = np.load(base / f"features_{feature_name}.npy")
    labels = np.load(base / f"labels_{feature_name}.npy")

    print(f"Loaded processed data from {base}")
    print(f" - Feature type: {feature_name}")
    print(f" - Features shape: {features.shape}")
    print(f" - Labels shape: {labels.shape}")

    return features, labels
483
+
484
+
485
if __name__ == "__main__":
    # Script entry point: build the embedding feature set and report its shape.
    features, labels, feature_names, label_names = create_feature_dataset(feature_type="embedding")

    summary_lines = [
        "\n=== Feature Extraction Summary ===",
        f"Features shape: {features.shape}",
        f"Labels shape: {labels.shape}",
        f"Number of feature names: {len(feature_names)}",
        f"Number of labels: {len(label_names)}",
    ]
    for line in summary_lines:
        print(line)
hopcroft_skill_classification_tool_competition/main.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application for skill classification service.
3
+
4
+ Provides REST API endpoints for classifying GitHub issues and pull requests
5
+ into skill categories using machine learning models.
6
+
7
+ Usage:
8
+ Development: fastapi dev hopcroft_skill_classification_tool_competition/main.py
9
+ Production: fastapi run hopcroft_skill_classification_tool_competition/main.py
10
+
11
+ Endpoints:
12
+ GET / - API information
13
+ GET /health - Health check
14
+ POST /predict - Single issue classification
15
+ POST /predict/batch - Batch classification
16
+ """
17
+
18
+ from contextlib import asynccontextmanager
19
+ from datetime import datetime
20
+ import json
21
+ import os
22
+ import time
23
+ from typing import List
24
+
25
+ from fastapi import FastAPI, HTTPException, status
26
+ from fastapi.responses import JSONResponse, RedirectResponse
27
+ import mlflow
28
+ from pydantic import ValidationError
29
+
30
+ from hopcroft_skill_classification_tool_competition.api_models import (
31
+ BatchIssueInput,
32
+ BatchPredictionResponse,
33
+ ErrorResponse,
34
+ HealthCheckResponse,
35
+ IssueInput,
36
+ PredictionRecord,
37
+ PredictionResponse,
38
+ SkillPrediction,
39
+ )
40
+ from hopcroft_skill_classification_tool_competition.config import MLFLOW_CONFIG
41
+ from hopcroft_skill_classification_tool_competition.modeling.predict import SkillPredictor
42
+
43
# Global predictor instance; set by lifespan() at startup. Remains None if
# model loading fails (degraded mode — prediction endpoints will error).
predictor = None
# Reported model version string (echoed during startup logging).
model_version = "1.0.0"
45
+
46
+
47
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application startup and shutdown.

    On startup: configures the MLflow tracking URI, then loads the model named
    by the MODEL_NAME environment variable into the module-level `predictor`.
    If loading fails, the API still starts in a degraded mode where prediction
    endpoints will fail but /health and /docs remain available.
    On shutdown: logs a message (no resources need explicit release here).
    """
    global predictor, model_version
    # NOTE(review): model_version is declared global but never reassigned in
    # this function — confirm whether it was meant to be derived from the model.

    print("=" * 80)
    print("Starting Skill Classification API")
    print("=" * 80)

    # Configure MLflow
    mlflow.set_tracking_uri(MLFLOW_CONFIG["uri"])
    print(f"MLflow tracking URI set to: {MLFLOW_CONFIG['uri']}")

    try:
        # Model file is configurable via env var; defaults to the TF-IDF RF model.
        model_name = os.getenv("MODEL_NAME", "random_forest_tfidf_gridsearch.pkl")
        print(f"Loading model: {model_name}")
        predictor = SkillPredictor(model_name=model_name)
        print("Model and artifacts loaded successfully")
    except Exception as e:
        # Deliberate best-effort: keep the API up even without model artifacts.
        print(f"Failed to load model: {e}")
        print("WARNING: API starting in degraded mode (prediction will fail)")

    print(f"Model version {model_version} initialized")
    print("API ready")
    print("=" * 80)

    # Hand control to the running application; code below runs at shutdown.
    yield

    print("Shutting down API")
76
+
77
+
78
# FastAPI application instance; `lifespan` wires model loading into startup.
app = FastAPI(
    title="Skill Classification API",
    description="API for classifying GitHub issues and pull requests into skill categories",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan,
)
86
+
87
+
88
@app.get("/", tags=["Root"])
async def root():
    """Return basic API information and links to the docs, demo, and health endpoints."""
    return {
        "message": "Skill Classification API",
        "version": "1.0.0",
        "documentation": "/docs",
        "demo": "/demo",
        "health": "/health",
    }
98
+
99
+
100
@app.get("/health", response_model=HealthCheckResponse, tags=["Health"])
async def health_check():
    """Check API and model status."""
    model_ready = predictor is not None
    return HealthCheckResponse(
        status="healthy",
        model_loaded=model_ready,
        version="1.0.0",
        timestamp=datetime.now(),
    )
109
+
110
+
111
@app.get("/demo")
async def redirect_to_demo():
    """Redirect to the Streamlit demo UI.

    The target URL is read from the STREAMLIT_URL environment variable so the
    redirect also works in deployed environments (Docker networks, reverse
    proxies) instead of only on a developer machine. Defaults to the previous
    hard-coded local address, so existing setups are unaffected.
    """
    demo_url = os.getenv("STREAMLIT_URL", "http://localhost:8501")
    return RedirectResponse(url=demo_url)
115
+
116
+
117
@app.post(
    "/predict",
    response_model=PredictionRecord,
    status_code=status.HTTP_201_CREATED,
    tags=["Prediction"],
    summary="Classify a single issue",
    response_description="Skill predictions with confidence scores",
)
async def predict_skills(issue: IssueInput) -> PredictionRecord:
    """
    Classify a single GitHub issue or pull request into skill categories.

    Args:
        issue: IssueInput containing issue text and optional metadata

    Returns:
        PredictionRecord with list of predicted skills, confidence scores, and run_id

    Raises:
        HTTPException: 503 if the model is not loaded, 500 if prediction fails
    """
    start_time = time.time()

    # Checked before the generic handler below so a missing model surfaces as
    # 503 (service unavailable). Previously this raise sat inside the try and
    # was swallowed by `except Exception`, rewrapping it as a 500.
    if predictor is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        # The predictor expects a single string; fold optional metadata in.
        full_text = f"{issue.issue_text} {issue.issue_description or ''} {issue.repo_name or ''}"

        predictions_data = predictor.predict(full_text)

        # Convert raw dicts to Pydantic models
        predictions = [
            SkillPrediction(skill_name=p["skill_name"], confidence=p["confidence"])
            for p in predictions_data
        ]

        processing_time = (time.time() - start_time) * 1000

        # Log to MLflow (best-effort: a tracking failure must not fail the request)
        run_id = "local"
        timestamp = datetime.now()

        try:
            experiment_name = MLFLOW_CONFIG["experiments"]["baseline"]
            mlflow.set_experiment(experiment_name)

            with mlflow.start_run() as run:
                run_id = run.info.run_id
                # Log inputs
                mlflow.log_param("issue_text", issue.issue_text)
                if issue.repo_name:
                    mlflow.log_param("repo_name", issue.repo_name)

                # Log the top prediction as param/metric for easy querying
                if predictions:
                    mlflow.log_param("top_skill", predictions[0].skill_name)
                    mlflow.log_metric("top_confidence", predictions[0].confidence)

                # Store the full prediction list as a JSON tag so the
                # /predictions/{run_id} endpoint can reconstruct it later.
                predictions_json = json.dumps([p.model_dump() for p in predictions])
                mlflow.set_tag("predictions_json", predictions_json)
                mlflow.set_tag("model_version", model_version)
        except Exception as e:
            print(f"MLflow logging failed: {e}")

        return PredictionRecord(
            predictions=predictions,
            num_predictions=len(predictions),
            model_version=model_version,
            processing_time_ms=round(processing_time, 2),
            run_id=run_id,
            timestamp=timestamp,
            input_text=issue.issue_text,
        )

    except HTTPException:
        # Preserve deliberate HTTP errors instead of rewrapping them as 500.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Prediction failed: {str(e)}",
        )
201
+
202
+
203
@app.post(
    "/predict/batch",
    response_model=BatchPredictionResponse,
    status_code=status.HTTP_200_OK,
    tags=["Prediction"],
    summary="Classify multiple issues",
    response_description="Batch skill predictions",
)
async def predict_skills_batch(batch: BatchIssueInput) -> BatchPredictionResponse:
    """
    Classify multiple GitHub issues or pull requests in batch.

    Args:
        batch: BatchIssueInput containing list of issues (max 100)

    Returns:
        BatchPredictionResponse with prediction results for each issue

    Raises:
        HTTPException: 503 if the model is not loaded, 500 if batch prediction fails
    """
    start_time = time.time()

    # Checked before the generic handler below so a missing model surfaces as
    # 503. Previously this raise sat inside the try and was swallowed by
    # `except Exception`, rewrapping it as a 500.
    if predictor is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        results = []

        for issue in batch.issues:
            # The predictor expects a single string; fold optional metadata in.
            full_text = (
                f"{issue.issue_text} {issue.issue_description or ''} {issue.repo_name or ''}"
            )
            predictions_data = predictor.predict(full_text)

            predictions = [
                SkillPrediction(skill_name=p["skill_name"], confidence=p["confidence"])
                for p in predictions_data
            ]

            results.append(
                PredictionResponse(
                    predictions=predictions,
                    num_predictions=len(predictions),
                    model_version=model_version,
                )
            )

        total_processing_time = (time.time() - start_time) * 1000

        return BatchPredictionResponse(
            results=results,
            total_issues=len(batch.issues),
            total_processing_time_ms=round(total_processing_time, 2),
        )

    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Batch prediction failed: {str(e)}",
        )
264
+
265
+
266
@app.get(
    "/predictions/{run_id}",
    response_model=PredictionRecord,
    status_code=status.HTTP_200_OK,
    tags=["Prediction"],
    summary="Get a prediction by ID",
    response_description="Prediction details",
)
async def get_prediction(run_id: str) -> PredictionRecord:
    """
    Retrieve a specific prediction by its MLflow Run ID.

    Args:
        run_id: The MLflow Run ID

    Returns:
        PredictionRecord containing the prediction details

    Raises:
        HTTPException: 404 if the run is not found, 500 on any other error
    """
    try:
        run = mlflow.get_run(run_id)
        run_data = run.data

        # Rebuild the prediction list stored as a JSON tag at prediction time.
        stored_json = run_data.tags.get("predictions_json", "[]")
        predictions = [SkillPrediction(**item) for item in json.loads(stored_json)]

        # MLflow reports start_time in epoch milliseconds.
        timestamp = datetime.fromtimestamp(run.info.start_time / 1000.0)

        return PredictionRecord(
            predictions=predictions,
            num_predictions=len(predictions),
            model_version=run_data.tags.get("model_version", "unknown"),
            processing_time_ms=None,  # not persisted with the run
            run_id=run.info.run_id,
            timestamp=timestamp,
            input_text=run_data.params.get("issue_text", ""),
        )

    except mlflow.exceptions.MlflowException:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail=f"Prediction with ID {run_id} not found"
        )
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve prediction: {str(e)}",
        )
318
+
319
+
320
@app.get(
    "/predictions",
    response_model=List[PredictionRecord],
    status_code=status.HTTP_200_OK,
    tags=["Prediction"],
    summary="List predictions",
    response_description="List of recent predictions",
)
async def list_predictions(skip: int = 0, limit: int = 10) -> List[PredictionRecord]:
    """
    Retrieve a list of recent predictions.

    Args:
        skip: Number of records to skip (not supported server-side by MLflow
            search; applied client-side after fetching)
        limit: Maximum number of records to return

    Returns:
        List of PredictionRecord

    Raises:
        HTTPException: 500 if the MLflow query fails
    """

    # Hoisted out of the row loop (was redefined on every iteration).
    def get_val(row, prefix, key, default=None):
        """Safely read a flattened 'tags.*' / 'params.*' column from a search row."""
        col = f"{prefix}.{key}"
        return row[col] if col in row else default

    try:
        experiment_name = MLFLOW_CONFIG["experiments"]["baseline"]
        experiment = mlflow.get_experiment_by_name(experiment_name)

        if not experiment:
            return []

        # mlflow.search_runs returns a pandas DataFrame with flattened columns
        # (tags prefixed 'tags.', params prefixed 'params.').
        runs = mlflow.search_runs(
            experiment_ids=[experiment.experiment_id],
            max_results=limit + skip,
            order_by=["start_time DESC"],
        )

        if runs.empty:
            return []

        # Apply skip client-side
        runs = runs.iloc[skip:]

        results = []
        for _, row in runs.iterrows():
            run_id = row.run_id

            # Missing tags surface as NaN (a float) in the DataFrame, not None,
            # so all fallbacks below check for "is this actually a string".
            predictions_json = get_val(row, "tags", "predictions_json", "[]")
            if not isinstance(predictions_json, str):
                predictions_json = "[]"
            try:
                predictions_data = json.loads(predictions_json)
                predictions = [SkillPrediction(**p) for p in predictions_data]
            except Exception:
                predictions = []

            timestamp = row.start_time  # already a datetime in the DataFrame

            # Named `record_version` to avoid shadowing the module-level
            # `model_version` global.
            record_version = get_val(row, "tags", "model_version")
            if not isinstance(record_version, str) or record_version == "":
                record_version = "unknown"

            input_text = get_val(row, "params", "issue_text")
            if not isinstance(input_text, str):
                input_text = ""

            results.append(
                PredictionRecord(
                    predictions=predictions,
                    num_predictions=len(predictions),
                    model_version=record_version,
                    processing_time_ms=None,
                    run_id=run_id,
                    timestamp=timestamp,
                    input_text=input_text,
                )
            )

        return results

    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to list predictions: {str(e)}",
        )
413
+
414
+
415
@app.exception_handler(ValidationError)
async def validation_exception_handler(request, exc: ValidationError):
    """Handle Pydantic validation errors."""
    payload = ErrorResponse(
        error="Validation Error", detail=str(exc), timestamp=datetime.now()
    ).model_dump()
    return JSONResponse(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, content=payload)
424
+
425
+
426
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc: HTTPException):
    """Handle HTTP exceptions."""
    payload = ErrorResponse(
        error=exc.detail, detail=None, timestamp=datetime.now()
    ).model_dump()
    return JSONResponse(status_code=exc.status_code, content=payload)
hopcroft_skill_classification_tool_competition/mlsmote.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # github: https://github.com/niteshsukhwani/MLSMOTE.git
2
+ # -*- coding: utf-8 -*-
3
+ # Importing required Library
4
+ import random
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from sklearn.datasets import make_classification
9
+ from sklearn.neighbors import NearestNeighbors
10
+
11
+
12
def create_dataset(n_sample=1000):
    """
    Create an unevenly distributed multilabel classification sample
    dataset using sklearn's make_classification.

    Args:
        n_sample: int, number of samples to create

    Returns:
        X: pandas.DataFrame, feature vector dataframe with 10 features
        y: pandas.DataFrame, one-hot target dataframe with 5 labels
    """
    X, y = make_classification(
        n_classes=5,
        class_sep=2,
        weights=[0.1, 0.025, 0.205, 0.008, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=10,
        n_clusters_per_class=1,
        # Bug fix: was hard-coded to 1000, silently ignoring the n_sample
        # argument. Default is unchanged, so existing callers see no change.
        n_samples=n_sample,
        random_state=10,
    )
    # One-hot encode the integer class labels into a multilabel-style frame.
    y = pd.get_dummies(y, prefix="class")
    return pd.DataFrame(X), y
38
+
39
+
40
def get_tail_label(df):
    """
    Identify the tail (minority) label columns of a multi-label target frame.

    A label is a "tail" label when its imbalance ratio (IRLbl = count of the
    most frequent label / count of this label) exceeds the mean imbalance
    ratio (MeanIR) across all labels.

    args
        df: pandas.DataFrame, target label df whose tail labels are identified

    return
        tail_label: list, column names of all tail labels
    """
    labels = list(df.columns)
    # Positive-sample count per label column.
    pos_counts = np.array([df[label].value_counts()[1] for label in labels], dtype=float)
    # Per-label imbalance ratio relative to the most frequent label.
    irpl = pos_counts.max() / pos_counts
    # Mean imbalance ratio: the tail threshold.
    mir = np.average(irpl)
    return [label for label, ratio in zip(labels, irpl) if ratio > mir]
62
+
63
+
64
def get_index(df):
    """
    Collect the row indices of all instances that carry at least one tail label.

    args
        df: pandas.DataFrame, target label df from which tail-label row
            indices are identified

    return
        index: list, index values of all rows carrying a tail label
    """
    rows = set()
    for label in get_tail_label(df):
        # Union of all rows where this tail label is positive.
        rows |= set(df[df[label] == 1].index)
    return list(rows)
79
+
80
+
81
def get_minority_instace(X, y):
    """
    Extract the minority (tail-label) subset of the dataset.

    Note: the function name keeps the original spelling ("instace") for
    backward compatibility with existing imports.

    args
        X: pandas.DataFrame, the feature vector dataframe
        y: pandas.DataFrame, the target vector dataframe

    return
        X_sub: pandas.DataFrame, feature rows carrying a tail label
        y_sub: pandas.DataFrame, matching target rows
    """
    minority_idx = get_index(y)
    X_sub = X[X.index.isin(minority_idx)].reset_index(drop=True)
    y_sub = y[y.index.isin(minority_idx)].reset_index(drop=True)
    return X_sub, y_sub
97
+
98
+
99
def nearest_neighbour(X):
    """
    Return, for every instance, the indices of its 5 nearest neighbours
    (euclidean distance, kd-tree search). Each row's first neighbour is the
    instance itself.

    args
        X: np.array, array whose nearest neighbours are computed

    return
        indices: (n, 5) array of neighbour indices per instance
    """
    knn = NearestNeighbors(n_neighbors=5, metric="euclidean", algorithm="kd_tree")
    knn.fit(X)
    _, indices = knn.kneighbors(X)
    return indices
112
+
113
+
114
def MLSMOTE(X, y, n_sample):
    """
    Give the augmented data using MLSMOTE algorithm.

    Uses module-level `random` (unseeded), so results are not reproducible
    across calls unless the caller seeds `random` beforehand.

    args
        X: pandas.DataFrame, input (feature) vector DataFrame
        y: pandas.DataFrame, target vector dataframe
        n_sample: int, number of newly generated samples

    return
        new_X: pandas.DataFrame, original plus synthetic feature rows
        target: pandas.DataFrame, original plus synthetic target rows
    """
    indices2 = nearest_neighbour(X)
    n = len(indices2)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        # Pick a random reference row and one of its 4 non-self neighbours.
        reference = random.randint(0, n - 1)
        neighbour = random.choice(indices2[reference, 1:])
        all_point = indices2[reference]
        # Majority vote over the 5-neighbourhood: label is set when more than
        # 2 of the 5 neighbours (including the reference itself) carry it.
        nn_df = y[y.index.isin(all_point)]
        ser = nn_df.sum(axis=0, skipna=True)
        target[i] = np.array([1 if val > 2 else 0 for val in ser])
        ratio = random.random()
        # NOTE(review): gap is reference - neighbour, so the synthetic point is
        # extrapolated beyond the reference, away from the neighbour. Canonical
        # SMOTE interpolates between the two (neighbour - reference). Kept
        # as-is to match the upstream reference implementation — confirm intent.
        gap = X.loc[reference, :] - X.loc[neighbour, :]
        new_X[i] = np.array(X.loc[reference, :] + ratio * gap)
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    # Append synthetic rows to the originals (index values are duplicated).
    new_X = pd.concat([X, new_X], axis=0)
    target = pd.concat([y, target], axis=0)
    return new_X, target
146
+
147
+
148
+ # Keep original MLSMOTE function name for direct use
149
+
150
+
151
if __name__ == "__main__":
    """
    main function to use the MLSMOTE
    """
    X, y = create_dataset()  # Create a synthetic imbalanced dataframe
    X_sub, y_sub = get_minority_instace(X, y)  # Extract the minority (tail-label) rows
    X_res, y_res = MLSMOTE(X_sub, y_sub, 100)  # Augment with 100 synthetic samples
hopcroft_skill_classification_tool_competition/modeling/predict.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ import joblib
5
+ import numpy as np
6
+
7
+ from hopcroft_skill_classification_tool_competition.config import (
8
+ API_CONFIG,
9
+ DATA_PATHS,
10
+ EMBEDDING_MODEL_NAME,
11
+ MODELS_DIR,
12
+ )
13
+ from hopcroft_skill_classification_tool_competition.features import clean_github_text
14
+
15
+
16
+ class SkillPredictor:
17
+ """
18
+ Skill prediction class that supports both TF-IDF and Embedding-based models.
19
+
20
+ The feature_type determines how text is transformed:
21
+ - "tfidf": Uses saved TfidfVectorizer
22
+ - "embedding": Uses SentenceTransformer to generate embeddings
23
+ """
24
+
25
+ def __init__(self, model_name: Optional[str] = None, feature_type: Optional[str] = None):
26
+ """
27
+ Initialize the SkillPredictor.
28
+
29
+ Args:
30
+ model_name: Name of the model file. If None, uses API_CONFIG["model_name"]
31
+ feature_type: "tfidf" or "embedding". If None, uses API_CONFIG["feature_type"]
32
+ """
33
+ # Use config defaults if not specified
34
+ self.model_name = model_name or API_CONFIG["model_name"]
35
+ self.feature_type = feature_type or API_CONFIG["feature_type"]
36
+
37
+ self.model_path = MODELS_DIR / self.model_name
38
+ self.labels_path = MODELS_DIR / "label_names.pkl"
39
+
40
+ # Paths for kept indices (may be in different locations)
41
+ self.kept_indices_path_models = MODELS_DIR / "kept_label_indices.npy"
42
+ self.kept_indices_path_tfidf = (
43
+ Path(DATA_PATHS["features"]).parent.parent / "tfidf" / "kept_label_indices.npy"
44
+ )
45
+ self.kept_indices_path_emb = (
46
+ Path(DATA_PATHS["features"]).parent.parent / "embedding" / "kept_label_indices.npy"
47
+ )
48
+
49
+ self.model = None
50
+ self.vectorizer = None # TF-IDF vectorizer or SentenceTransformer
51
+ self.label_names = None
52
+ self.kept_indices = None
53
+
54
+ self._load_artifacts()
55
+
56
+ def _load_artifacts(self):
57
+ """Load model and required artifacts based on feature_type."""
58
+ print(f"Loading model from {self.model_path}...")
59
+ if not self.model_path.exists():
60
+ raise FileNotFoundError(f"Model not found at {self.model_path}")
61
+ self.model = joblib.load(self.model_path)
62
+
63
+ # Load vectorizer/encoder based on feature type
64
+ if self.feature_type == "tfidf":
65
+ self._load_tfidf_vectorizer()
66
+ elif self.feature_type == "embedding":
67
+ self._load_embedding_model()
68
+ else:
69
+ raise ValueError(
70
+ f"Unknown feature_type: {self.feature_type}. Must be 'tfidf' or 'embedding'"
71
+ )
72
+
73
+ # Load label names
74
+ print(f"Loading label names from {self.labels_path}...")
75
+ if not self.labels_path.exists():
76
+ raise FileNotFoundError(f"Label names not found at {self.labels_path}")
77
+ self.label_names = joblib.load(self.labels_path)
78
+
79
+ # Load kept indices if available
80
+ if self.kept_indices_path_models.exists():
81
+ print(f"Loading kept indices from {self.kept_indices_path_models}")
82
+ self.kept_indices = np.load(self.kept_indices_path_models)
83
+ elif self.kept_indices_path_emb.exists():
84
+ print(f"Loading kept indices from {self.kept_indices_path_emb}")
85
+ self.kept_indices = np.load(self.kept_indices_path_emb)
86
+ elif self.kept_indices_path_tfidf.exists():
87
+ print(f"Loading kept indices from {self.kept_indices_path_tfidf}")
88
+ self.kept_indices = np.load(self.kept_indices_path_tfidf)
89
+ else:
90
+ print("No kept_label_indices.npy found. Assuming all labels are used.")
91
+ self.kept_indices = None
92
+
93
+ def _load_tfidf_vectorizer(self):
94
+ """Load the TF-IDF vectorizer."""
95
+ vectorizer_path = MODELS_DIR / "tfidf_vectorizer.pkl"
96
+ print(f"Loading TF-IDF vectorizer from {vectorizer_path}...")
97
+ if not vectorizer_path.exists():
98
+ raise FileNotFoundError(
99
+ f"TF-IDF vectorizer not found at {vectorizer_path}. "
100
+ "Run feature extraction first: python -m hopcroft_skill_classification_tool_competition.features"
101
+ )
102
+ self.vectorizer = joblib.load(vectorizer_path)
103
+
104
+ def _load_embedding_model(self):
105
+ """Load the SentenceTransformer model for embeddings."""
106
+ try:
107
+ from sentence_transformers import SentenceTransformer
108
+ except ImportError as e:
109
+ raise ImportError(
110
+ f"sentence-transformers is required for embedding-based models. "
111
+ f"Install with: pip install sentence-transformers. Error: {e}"
112
+ ) from e
113
+
114
+ print(f"Loading SentenceTransformer model: {EMBEDDING_MODEL_NAME}...")
115
+ self.vectorizer = SentenceTransformer(EMBEDDING_MODEL_NAME)
116
+
117
+ def _transform_text(self, text: str) -> np.ndarray:
118
+ """
119
+ Transform text to features based on feature_type.
120
+
121
+ Args:
122
+ text: Cleaned input text
123
+
124
+ Returns:
125
+ Feature array ready for model prediction
126
+ """
127
+ if self.feature_type == "tfidf":
128
+ # TF-IDF: use stemming, return sparse matrix converted to array
129
+ cleaned = clean_github_text(text, use_stemming=True)
130
+ features = self.vectorizer.transform([cleaned])
131
+ return features
132
+ else:
133
+ # Embedding: no stemming (LLMs need full words)
134
+ cleaned = clean_github_text(text, use_stemming=False)
135
+ features = self.vectorizer.encode([cleaned], convert_to_numpy=True)
136
+ return features
137
+
138
+ def predict(self, text: str, threshold: float = 0.5) -> List[Dict[str, Any]]:
139
+ """
140
+ Predict skills for a given text.
141
+
142
+ Args:
143
+ text: Input text (issue title + body)
144
+ threshold: Confidence threshold for binary classification
145
+
146
+ Returns:
147
+ List of dicts with 'skill_name' and 'confidence'
148
+ """
149
+ # Transform text to features
150
+ features = self._transform_text(text)
151
+
152
+ # Predict
153
+ # MultiOutputClassifier predict_proba returns a list of arrays (one per class)
154
+ # Each array is (n_samples, 2) -> [prob_0, prob_1]
155
+ probas_list = self.model.predict_proba(features)
156
+
157
+ # Extract positive class probabilities
158
+ confidence_scores = []
159
+ for i, prob in enumerate(probas_list):
160
+ if prob.shape[1] >= 2:
161
+ confidence_scores.append(prob[0][1])
162
+ else:
163
+ # Only one class present
164
+ try:
165
+ estimator = self.model.estimators_[i]
166
+ classes = estimator.classes_
167
+ if len(classes) == 1 and classes[0] == 1:
168
+ confidence_scores.append(1.0)
169
+ else:
170
+ confidence_scores.append(0.0)
171
+ except Exception:
172
+ confidence_scores.append(0.0)
173
+
174
+ confidence_scores = np.array(confidence_scores)
175
+
176
+ # Filter by threshold and map to label names
177
+ predictions = []
178
+
179
+ for i, score in enumerate(confidence_scores):
180
+ if score >= threshold:
181
+ if self.kept_indices is not None:
182
+ if i < len(self.kept_indices):
183
+ original_idx = self.kept_indices[i]
184
+ skill_name = self.label_names[original_idx]
185
+ else:
186
+ continue
187
+ else:
188
+ if i < len(self.label_names):
189
+ skill_name = self.label_names[i]
190
+ else:
191
+ skill_name = f"Unknown_Skill_{i}"
192
+
193
+ predictions.append({"skill_name": skill_name, "confidence": float(score)})
194
+
195
+ # Sort by confidence descending
196
+ predictions.sort(key=lambda x: x["confidence"], reverse=True)
197
+
198
+ return predictions
hopcroft_skill_classification_tool_competition/modeling/train.py ADDED
@@ -0,0 +1,858 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from imblearn.over_sampling import ADASYN, RandomOverSampler
6
+ import joblib
7
+ import lightgbm as lgb
8
+ import mlflow
9
+ import mlflow.sklearn
10
+ import numpy as np
11
+ from sklearn.decomposition import PCA
12
+ from sklearn.ensemble import RandomForestClassifier
13
+ from sklearn.metrics import f1_score, precision_score, recall_score
14
+ from sklearn.model_selection import GridSearchCV, KFold, train_test_split
15
+ from sklearn.multioutput import MultiOutputClassifier
16
+
17
+ from hopcroft_skill_classification_tool_competition.config import (
18
+ ADASYN_CONFIG,
19
+ DATA_PATHS,
20
+ MLFLOW_CONFIG,
21
+ MODEL_CONFIG,
22
+ PCA_CONFIG,
23
+ TRAINING_CONFIG,
24
+ get_feature_paths,
25
+ )
26
+
27
# Local MLSMOTE implementation (lightweight multi-label oversampling).
# Imported defensively: training can still run without it as long as the
# MLSMOTE sampler is not requested; callers feature-detect via the flag.
try:
    import pandas as pd

    from hopcroft_skill_classification_tool_competition.mlsmote import MLSMOTE as mlsmote_function
    from hopcroft_skill_classification_tool_competition.mlsmote import get_minority_instace

    _HAS_LOCAL_MLSMOTE = True
except Exception:
    # None sentinels keep attribute access errors explicit at the call site.
    mlsmote_function = None
    get_minority_instace = None
    _HAS_LOCAL_MLSMOTE = False
    print("[warning] Local MLSMOTE not available. Check mlsmote.py exists.")
40
+
41
+
42
# Prefer multilabel stratified splits for imbalanced multi-label data.
# Use `iterative-stratification` package when available.
try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

    _HAS_MLSTRAT = True
except Exception:
    # Optional dependency: the split helpers below fall back to sklearn's
    # train_test_split when this flag is False.
    MultilabelStratifiedShuffleSplit = None
    _HAS_MLSTRAT = False
51
+
52
+
53
# -------------------------------
# MLflow authentication and setup
# Load environment variables from .env file (for local dev)
# In Docker, env vars are set via docker-compose env_file
# -------------------------------
from dotenv import load_dotenv

load_dotenv()

# Environment variable takes precedence over the project config default.
_mlflow_env_uri = os.getenv("MLFLOW_TRACKING_URI")
_configured_uri = MLFLOW_CONFIG.get("uri", "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow")

if _mlflow_env_uri:
    mlflow_uri = _mlflow_env_uri
else:
    mlflow_uri = _configured_uri

# If targeting DagsHub, require username/password; otherwise proceed.
# Failing fast here avoids opaque auth errors mid-training.
if "dagshub.com" in mlflow_uri:
    _username = os.getenv("MLFLOW_TRACKING_USERNAME")
    _password = os.getenv("MLFLOW_TRACKING_PASSWORD")
    if not _username or not _password:
        raise ValueError(
            "Set the environment variables MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD for remote tracking"
        )

mlflow.set_tracking_uri(mlflow_uri)
80
+
81
+
82
+ # =====================================================
83
+ # Common utilities (merged from train_experiments.py)
84
+ # =====================================================
85
def load_data(feature_type="tfidf", use_cleaned=True):
    """Load features and labels using get_feature_paths.

    Args:
        feature_type: 'tfidf' or 'embedding'
        use_cleaned: whether to use the cleaned variant of the data

    Returns:
        X, Y: feature matrix and multi-label target matrix
    """
    paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
    X = np.load(paths["features"])
    Y = np.load(paths["labels"])

    suffix = "_clean" if use_cleaned else ""
    print(f"Dataset loaded successfully: {X.shape} samples, {Y.shape} labels")
    print(f"Using feature type: {feature_type}{suffix}")
    return X, Y
102
+
103
+
104
def stratified_train_test_split(X, Y, test_size=None, random_state=None, fallback=True):
    """Split X, Y using multilabel stratified shuffle split when possible.

    Args:
        X: np.ndarray features
        Y: np.ndarray multi-label binary matrix (n_samples, n_labels)
        test_size: float or int, forwarded to splitter; None uses
            TRAINING_CONFIG["test_size"] (default 0.2)
        random_state: int
        fallback: if True and multilabel splitter unavailable, use sklearn.train_test_split

    Returns:
        X_train, X_test, Y_train, Y_test

    Raises:
        RuntimeError: if the multilabel splitter is unavailable and fallback is False
    """
    if _HAS_MLSTRAT:
        # Bug fix: previously only float test_size was forwarded; an int value
        # (a valid absolute sample count for the splitter) was silently
        # replaced by the config default. Now any explicit value is honored.
        tst = test_size if test_size is not None else TRAINING_CONFIG.get("test_size", 0.2)

        msss = MultilabelStratifiedShuffleSplit(
            n_splits=1, test_size=tst, random_state=random_state
        )
        train_idx, test_idx = next(msss.split(X, Y))
        return X[train_idx], X[test_idx], Y[train_idx], Y[test_idx]

    if fallback:
        print(
            "[warning] iterative-stratification not available; using standard train_test_split (no multilabel stratification). To enable stratified multilabel splitting install 'iterative-stratification'."
        )
        return train_test_split(X, Y, test_size=test_size, random_state=random_state, shuffle=True)

    raise RuntimeError(
        "iterative-stratification is required for multilabel stratified splitting but not installed."
    )
139
+
140
+
141
def stratified_train_val_test_split(
    X, Y, test_size=0.2, val_size=0.1, random_state=None, fallback=True
):
    """Split X, Y into train, val, test with multilabel stratification when possible.

    Args:
        X, Y: arrays
        test_size: proportion for final test set
        val_size: proportion for validation set (relative to whole dataset)
        random_state: seed
        fallback: if True, falls back to sklearn splits

    Returns:
        X_train, X_val, X_test, Y_train, Y_val, Y_test

    Raises:
        ValueError: if the requested fractions are invalid or sum to >= 1
    """
    sizes_ok = 0.0 < test_size < 1.0 and 0.0 <= val_size < 1.0 and (val_size + test_size) < 1.0
    if not sizes_ok:
        raise ValueError("test_size and val_size must be fractions in (0,1) and sum < 1")

    # Carve off the held-out test set first.
    X_rem, X_test, Y_rem, Y_test = stratified_train_test_split(
        X, Y, test_size=test_size, random_state=random_state, fallback=fallback
    )

    # Validation fraction expressed relative to the remaining (non-test) data.
    remaining = 1.0 - test_size
    rel_val = val_size / remaining if remaining > 0 else 0.0

    if rel_val <= 0:
        # No validation requested: return empty val arrays with matching widths.
        empty_X = np.empty((0, X.shape[1]))
        empty_Y = np.empty((0, Y.shape[1]))
        return X_rem, empty_X, X_test, Y_rem, empty_Y, Y_test

    X_train, X_val, Y_train, Y_val = stratified_train_test_split(
        X_rem, Y_rem, test_size=rel_val, random_state=random_state, fallback=fallback
    )

    return X_train, X_val, X_test, Y_train, Y_val, Y_test
180
+
181
+
182
+ def _check_label_coverage(Y_train: np.ndarray, Y_val: np.ndarray, min_train: int = 1):
183
+ """Check that each label appears at least `min_train` times in train and
184
+ at least once in train+val. Prints a warning if some labels are scarce in
185
+ train, and raises an error if some labels are missing entirely from
186
+ train+val (which would make learning impossible for those labels).
187
+
188
+ Args:
189
+ Y_train: (n_train, n_labels) binary matrix
190
+ Y_val: (n_val, n_labels) binary matrix (may be empty)
191
+ min_train: minimum occurrences in train to be considered "covered"
192
+ """
193
+ # Defensive: handle empty val
194
+ if Y_val is None:
195
+ Y_val = np.empty((0, Y_train.shape[1]))
196
+
197
+ counts_train = np.sum(Y_train, axis=0)
198
+ counts_train_val = counts_train + np.sum(Y_val, axis=0)
199
+
200
+ missing_in_train = np.where(counts_train < min_train)[0]
201
+ missing_in_train_val = np.where(counts_train_val == 0)[0]
202
+
203
+ if missing_in_train.size > 0:
204
+ # Small, actionable warning for debugging
205
+ preview = missing_in_train[:10].tolist()
206
+ print(
207
+ f"[warning] {missing_in_train.size} label(s) have <{min_train} occurrences in TRAIN. Example label indices: {preview}."
208
+ )
209
+
210
+ if missing_in_train_val.size > 0:
211
+ preview = missing_in_train_val[:10].tolist()
212
+ raise ValueError(
213
+ f"{missing_in_train_val.size} label(s) have 0 occurrences in TRAIN+VAL (indices example: {preview}). "
214
+ "Reduce test/val size, aggregate labels, or ensure these labels exist in the source DB."
215
+ )
216
+
217
+
218
+ def evaluate_and_log(model, X_test, Y_test, best_params, cv_score, exp_name, extra_params=None):
219
+ Y_pred = model.predict(X_test)
220
+ precision = precision_score(Y_test, Y_pred, average="micro", zero_division=0)
221
+ recall = recall_score(Y_test, Y_pred, average="micro", zero_division=0)
222
+ f1 = f1_score(Y_test, Y_pred, average="micro", zero_division=0)
223
+
224
+ mlflow.log_metrics(
225
+ {
226
+ "cv_best_f1_micro": cv_score,
227
+ "test_precision_micro": precision,
228
+ "test_recall_micro": recall,
229
+ "test_f1_micro": f1,
230
+ }
231
+ )
232
+
233
+ for k, v in best_params.items():
234
+ mlflow.log_param(k, v)
235
+ if extra_params:
236
+ for k, v in extra_params.items():
237
+ mlflow.log_param(k, v)
238
+
239
+ os.makedirs(DATA_PATHS["models_dir"], exist_ok=True)
240
+ model_path = Path(DATA_PATHS["models_dir"]) / f"{exp_name}.pkl"
241
+ joblib.dump(model, model_path)
242
+ mlflow.log_artifact(str(model_path), artifact_path=f"model_{exp_name}")
243
+ print(f"Model saved to {model_path}")
244
+ print(f"{exp_name} completed and logged successfully.\n")
245
+
246
+
247
+ def run_grid_search(X, Y):
248
+ base_rf = RandomForestClassifier(random_state=TRAINING_CONFIG["random_state"], n_jobs=-1)
249
+ multi = MultiOutputClassifier(base_rf)
250
+ cv = KFold(
251
+ n_splits=TRAINING_CONFIG["cv_folds"],
252
+ shuffle=True,
253
+ random_state=TRAINING_CONFIG["random_state"],
254
+ )
255
+ grid = GridSearchCV(
256
+ estimator=multi,
257
+ param_grid=MODEL_CONFIG["param_grid"],
258
+ scoring="f1_micro",
259
+ cv=cv,
260
+ n_jobs=-1,
261
+ verbose=2,
262
+ refit=True,
263
+ )
264
+ return grid
265
+
266
+
267
+ def run_grid_search_lgb(X, Y):
268
+ base_lgb = lgb.LGBMClassifier(
269
+ random_state=TRAINING_CONFIG["random_state"], n_jobs=1, force_row_wise=True, verbose=-1
270
+ )
271
+ multi = MultiOutputClassifier(base_lgb, n_jobs=-1)
272
+ cv = KFold(
273
+ n_splits=TRAINING_CONFIG["cv_folds"],
274
+ shuffle=True,
275
+ random_state=TRAINING_CONFIG["random_state"],
276
+ )
277
+ lgb_param_grid = {
278
+ "estimator__n_estimators": [50, 100, 200],
279
+ "estimator__max_depth": [3, 5, 7],
280
+ "estimator__learning_rate": [0.1],
281
+ "estimator__num_leaves": [15],
282
+ }
283
+ grid = GridSearchCV(
284
+ estimator=multi,
285
+ param_grid=lgb_param_grid,
286
+ scoring="f1_micro",
287
+ cv=cv,
288
+ n_jobs=-1,
289
+ verbose=2,
290
+ refit=True,
291
+ )
292
+ return grid
293
+
294
+
295
+ # =====================================================
296
+ # Experiments (merged)
297
+ # =====================================================
298
+ def run_smote_experiment(X, Y, feature_type="tfidf"):
299
+ mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["smote"])
300
+
301
+ # Split into train / val / test
302
+ X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
303
+ X,
304
+ Y,
305
+ test_size=TRAINING_CONFIG.get("test_size", 0.2),
306
+ val_size=TRAINING_CONFIG.get("val_size", 0.1),
307
+ random_state=TRAINING_CONFIG["random_state"],
308
+ )
309
+ # Check label coverage and fail early if labels are missing from train+val
310
+ _check_label_coverage(Y_train, Y_val)
311
+
312
+ # Apply MLSMOTE (Multi-Label SMOTE) as per paper
313
+ # MLSMOTE handles multi-label classification natively by considering label correlations
314
+ print("Applying MLSMOTE (Multi-Label SMOTE) as per SkillScope paper...")
315
+ print(f" Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")
316
+
317
+ # Use local MLSMOTE implementation directly (function-based)
318
+ if _HAS_LOCAL_MLSMOTE:
319
+ try:
320
+ # Set random seed
321
+ if TRAINING_CONFIG["random_state"] is not None:
322
+ np.random.seed(TRAINING_CONFIG["random_state"])
323
+ import random
324
+
325
+ random.seed(TRAINING_CONFIG["random_state"])
326
+
327
+ # Convert to DataFrame (MLSMOTE function expects DataFrames)
328
+ X_train_df = pd.DataFrame(X_train)
329
+ Y_train_df = pd.DataFrame(Y_train)
330
+
331
+ # Get minority instances
332
+ X_min, Y_min = get_minority_instace(X_train_df, Y_train_df)
333
+
334
+ if len(X_min) == 0:
335
+ print("No minority instances found, using original dataset")
336
+ X_res, Y_res = X_train, Y_train
337
+ oversampling_method = "None (no minority instances)"
338
+ n_new = 0
339
+ else:
340
+ # Calculate number of synthetic samples
341
+ label_counts = Y_train_df.sum(axis=0)
342
+ mean_count = int(label_counts.mean())
343
+ min_count = int(label_counts.min())
344
+ n_synthetic = max(100, int(mean_count - min_count))
345
+ n_synthetic = min(n_synthetic, len(X_min) * 3)
346
+
347
+ print(
348
+ f"Generating {n_synthetic} synthetic samples from {len(X_min)} minority instances"
349
+ )
350
+
351
+ # Apply MLSMOTE function directly
352
+ X_res_df, Y_res_df = mlsmote_function(X_min, Y_min, n_synthetic)
353
+
354
+ # Convert back to numpy
355
+ X_res = X_res_df.values
356
+ Y_res = Y_res_df.values.astype(int)
357
+
358
+ oversampling_method = "MLSMOTE (local implementation)"
359
+ n_new = len(X_res) - len(X_train)
360
+ print(
361
+ f"MLSMOTE completed: {n_new} synthetic samples generated. Total: {len(X_res)} samples"
362
+ )
363
+ except Exception as e:
364
+ print(f"MLSMOTE failed ({e}); falling back to RandomOverSampler")
365
+ Y_train_str = ["".join(map(str, y)) for y in Y_train]
366
+ ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
367
+ X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
368
+ Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
369
+ oversampling_method = "RandomOverSampler (MLSMOTE fallback)"
370
+ n_new = len(X_res) - len(X_train)
371
+ else:
372
+ print("Local MLSMOTE not available; falling back to RandomOverSampler")
373
+ Y_train_str = ["".join(map(str, y)) for y in Y_train]
374
+ ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
375
+ X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
376
+ Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
377
+ oversampling_method = "RandomOverSampler (no MLSMOTE)"
378
+ n_new = len(X_res) - len(X_train)
379
+
380
+ grid = run_grid_search(X_res, Y_res)
381
+ with mlflow.start_run(run_name="random_forest_with_smote"):
382
+ grid.fit(X_res, Y_res)
383
+
384
+ # Refit final model on train + val (use original non-oversampled data for final fit)
385
+ best_params = grid.best_params_
386
+ best_cv = grid.best_score_
387
+ final_model = grid.best_estimator_
388
+ X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
389
+ Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
390
+ final_model.fit(X_comb, Y_comb)
391
+
392
+ evaluate_and_log(
393
+ final_model,
394
+ X_test,
395
+ Y_test,
396
+ best_params,
397
+ best_cv,
398
+ f"random_forest_{feature_type}_gridsearch_smote",
399
+ {
400
+ "oversampling": oversampling_method,
401
+ "synthetic_samples": n_new,
402
+ "n_labels": Y_train.shape[1],
403
+ },
404
+ )
405
+
406
+
407
+ def run_ros_experiment(X, Y):
408
+ mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["ros"])
409
+
410
+ # Split into train / val / test
411
+ X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
412
+ X,
413
+ Y,
414
+ test_size=TRAINING_CONFIG.get("test_size", 0.2),
415
+ val_size=TRAINING_CONFIG.get("val_size", 0.1),
416
+ random_state=TRAINING_CONFIG["random_state"],
417
+ )
418
+
419
+ Y_train_str = ["".join(map(str, y)) for y in Y_train]
420
+ ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
421
+ X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
422
+
423
+ Y.shape[1]
424
+ Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
425
+
426
+ grid = run_grid_search(X_res, Y_res)
427
+ with mlflow.start_run(run_name="random_forest_with_ros"):
428
+ grid.fit(X_res, Y_res)
429
+
430
+ best_params = grid.best_params_
431
+ best_cv = grid.best_score_
432
+ final_model = grid.best_estimator_
433
+ X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
434
+ Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
435
+ final_model.fit(X_comb, Y_comb)
436
+
437
+ evaluate_and_log(
438
+ final_model,
439
+ X_test,
440
+ Y_test,
441
+ best_params,
442
+ best_cv,
443
+ "random_forest_tfidf_gridsearch_ros",
444
+ {"oversampling": "RandomOverSampler"},
445
+ )
446
+
447
+
448
+ def run_adasyn_pca_experiment(X, Y):
449
+ mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["adasyn_pca"])
450
+
451
+ # Split into train / val / test
452
+ X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
453
+ X,
454
+ Y,
455
+ test_size=TRAINING_CONFIG.get("test_size", 0.2),
456
+ val_size=TRAINING_CONFIG.get("val_size", 0.1),
457
+ random_state=TRAINING_CONFIG["random_state"],
458
+ )
459
+
460
+ print("Applying PCA before ADASYN...")
461
+ pca = PCA(
462
+ n_components=PCA_CONFIG["variance_retained"], random_state=TRAINING_CONFIG["random_state"]
463
+ )
464
+ X_train_pca = pca.fit_transform(X_train)
465
+
466
+ adasyn = ADASYN(
467
+ random_state=TRAINING_CONFIG["random_state"],
468
+ n_neighbors=ADASYN_CONFIG["n_neighbors"],
469
+ sampling_strategy=ADASYN_CONFIG["sampling_strategy"],
470
+ )
471
+
472
+ valid_label_idx = next(
473
+ (i for i in range(Y_train.shape[1]) if len(np.unique(Y_train[:, i])) > 1), None
474
+ )
475
+
476
+ if valid_label_idx is None:
477
+ X_res, Y_res = X_train, Y_train
478
+ n_new = 0
479
+ else:
480
+ X_res_pca, _ = adasyn.fit_resample(X_train_pca, Y_train[:, valid_label_idx])
481
+ X_res = pca.inverse_transform(X_res_pca)
482
+ n_new = len(X_res) - len(X_train)
483
+ Y_res = np.vstack([Y_train, Y_train[np.random.randint(0, len(Y_train), n_new)]])
484
+
485
+ grid = run_grid_search(X_res, Y_res)
486
+ with mlflow.start_run(run_name="random_forest_with_adasyn_pca"):
487
+ grid.fit(X_res, Y_res)
488
+
489
+ best_params = grid.best_params_
490
+ best_cv = grid.best_score_
491
+ final_model = grid.best_estimator_
492
+ X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
493
+ Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
494
+ final_model.fit(X_comb, Y_comb)
495
+
496
+ evaluate_and_log(
497
+ final_model,
498
+ X_test,
499
+ Y_test,
500
+ best_params,
501
+ best_cv,
502
+ "random_forest_tfidf_gridsearch_adasyn_pca",
503
+ {
504
+ "oversampling": "ADASYN + PCA",
505
+ "pca_variance": PCA_CONFIG["variance_retained"],
506
+ "synthetic_samples": n_new,
507
+ },
508
+ )
509
+ pca_path = Path(DATA_PATHS["models_dir"]) / "pca_tfidf_adasyn.pkl"
510
+ joblib.dump(pca, pca_path)
511
+ mlflow.log_artifact(str(pca_path), artifact_path="model_adasyn_pca")
512
+
513
+
514
+ def run_lightgbm(X, Y):
515
+ mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm", "LightGBM"))
516
+
517
+ # Split into train / val / test
518
+ X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
519
+ X,
520
+ Y,
521
+ test_size=TRAINING_CONFIG.get("test_size", 0.2),
522
+ val_size=TRAINING_CONFIG.get("val_size", 0.1),
523
+ random_state=TRAINING_CONFIG["random_state"],
524
+ )
525
+
526
+ print("\nTraining LightGBM with GridSearchCV...")
527
+ grid = run_grid_search_lgb(X_train, Y_train)
528
+
529
+ with mlflow.start_run(run_name="lightgbm"):
530
+ grid.fit(X_train, Y_train)
531
+
532
+ best_params = grid.best_params_
533
+ best_cv = grid.best_score_
534
+ final_model = grid.best_estimator_
535
+ X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
536
+ Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
537
+ final_model.fit(X_comb, Y_comb)
538
+
539
+ evaluate_and_log(
540
+ final_model,
541
+ X_test,
542
+ Y_test,
543
+ best_params,
544
+ best_cv,
545
+ "lightgbm_tfidf_gridsearch",
546
+ {"oversampling": "None", "model": "LightGBM"},
547
+ )
548
+
549
+
550
+ def run_lightgbm_smote_experiment(X, Y):
551
+ mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm_smote", "LightGBM_SMOTE"))
552
+
553
+ # Split into train / val / test
554
+ X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
555
+ X,
556
+ Y,
557
+ test_size=TRAINING_CONFIG.get("test_size", 0.2),
558
+ val_size=TRAINING_CONFIG.get("val_size", 0.1),
559
+ random_state=TRAINING_CONFIG["random_state"],
560
+ )
561
+
562
+ # Apply MLSMOTE (Multi-Label SMOTE) as per paper
563
+ print(" Applying MLSMOTE for LightGBM...")
564
+ print(f" Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")
565
+
566
+ # Use local MLSMOTE implementation directly (function-based)
567
+ if _HAS_LOCAL_MLSMOTE:
568
+ try:
569
+ # Set random seed
570
+ if TRAINING_CONFIG["random_state"] is not None:
571
+ np.random.seed(TRAINING_CONFIG["random_state"])
572
+ import random
573
+
574
+ random.seed(TRAINING_CONFIG["random_state"])
575
+
576
+ # Convert to DataFrame (MLSMOTE function expects DataFrames)
577
+ X_train_df = pd.DataFrame(X_train)
578
+ Y_train_df = pd.DataFrame(Y_train)
579
+
580
+ # Get minority instances
581
+ X_min, Y_min = get_minority_instace(X_train_df, Y_train_df)
582
+
583
+ if len(X_min) == 0:
584
+ print("No minority instances found, using original dataset")
585
+ X_res, Y_res = X_train, Y_train
586
+ oversampling_method = "None (no minority instances)"
587
+ n_new = 0
588
+ else:
589
+ # Calculate number of synthetic samples
590
+ label_counts = Y_train_df.sum(axis=0)
591
+ mean_count = int(label_counts.mean())
592
+ min_count = int(label_counts.min())
593
+ n_synthetic = max(100, int(mean_count - min_count))
594
+ n_synthetic = min(n_synthetic, len(X_min) * 3)
595
+
596
+ print(
597
+ f"Generating {n_synthetic} synthetic samples from {len(X_min)} minority instances"
598
+ )
599
+
600
+ # Apply MLSMOTE function directly
601
+ X_res_df, Y_res_df = mlsmote_function(X_min, Y_min, n_synthetic)
602
+
603
+ # Convert back to numpy
604
+ X_res = X_res_df.values
605
+ Y_res = Y_res_df.values.astype(int)
606
+
607
+ oversampling_method = "MLSMOTE (local implementation)"
608
+ n_new = len(X_res) - len(X_train)
609
+ print(
610
+ f"MLSMOTE completed: {n_new} synthetic samples generated. Total: {len(X_res)} samples"
611
+ )
612
+ except Exception as e:
613
+ print(f"MLSMOTE failed ({e}); falling back to RandomOverSampler")
614
+ Y_train_str = ["".join(map(str, y)) for y in Y_train]
615
+ ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
616
+ X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
617
+ Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
618
+ oversampling_method = "RandomOverSampler (MLSMOTE fallback)"
619
+ n_new = len(X_res) - len(X_train)
620
+ else:
621
+ print(" Local MLSMOTE not available; falling back to RandomOverSampler")
622
+ Y_train_str = ["".join(map(str, y)) for y in Y_train]
623
+ ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
624
+ X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
625
+ Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
626
+ oversampling_method = "RandomOverSampler (no MLSMOTE)"
627
+ n_new = len(X_res) - len(X_train)
628
+
629
+ print(f"\n Training LightGBM with {oversampling_method} ({n_new} synthetic samples)...")
630
+ grid = run_grid_search_lgb(X_res, Y_res)
631
+
632
+ with mlflow.start_run(run_name="lightgbm_with_smote"):
633
+ grid.fit(X_res, Y_res)
634
+
635
+ best_params = grid.best_params_
636
+ best_cv = grid.best_score_
637
+ final_model = grid.best_estimator_
638
+ X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
639
+ Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
640
+ final_model.fit(X_comb, Y_comb)
641
+
642
+ evaluate_and_log(
643
+ final_model,
644
+ X_test,
645
+ Y_test,
646
+ best_params,
647
+ best_cv,
648
+ "lightgbm_tfidf_gridsearch_smote",
649
+ {
650
+ "oversampling": oversampling_method,
651
+ "synthetic_samples": n_new,
652
+ "n_labels": Y_train.shape[1],
653
+ "model": "LightGBM",
654
+ },
655
+ )
656
+
657
+
658
+ # =====================================================
659
+ # Baseline training (original train.py behavior)
660
+ # =====================================================
661
+ def run_baseline_train(feature_type="tfidf", use_cleaned=True):
662
+ """Run baseline training with configurable feature type.
663
+
664
+ Args:
665
+ feature_type: 'tfidf' or 'embedding'
666
+ use_cleaned: whether to use cleaned data
667
+ """
668
+ mlflow.set_experiment(
669
+ MLFLOW_CONFIG.get("experiments", {}).get("baseline", "hopcroft_random_forest_baseline")
670
+ )
671
+
672
+ X, Y = load_data(feature_type=feature_type, use_cleaned=use_cleaned)
673
+
674
+ # Use 80/20 split as per SkillScope paper (no validation set for baseline)
675
+ print(" Using 80/20 train/test split as per paper...")
676
+ X_train, X_test, Y_train, Y_test = stratified_train_test_split(
677
+ X,
678
+ Y,
679
+ test_size=TRAINING_CONFIG.get("test_size", 0.2),
680
+ random_state=TRAINING_CONFIG.get("random_state", 42),
681
+ )
682
+
683
+ # Remove labels that have 0 occurrences in training set (after split)
684
+ train_counts = np.sum(Y_train, axis=0).astype(int)
685
+ zero_in_train = np.where(train_counts == 0)[0]
686
+
687
+ if zero_in_train.size > 0:
688
+ kept_idx = np.where(train_counts > 0)[0]
689
+ print(
690
+ f"[warning] Removing {zero_in_train.size} label(s) with 0 occurrences in TRAIN set. Example removed indices: {zero_in_train[:10].tolist()}"
691
+ )
692
+ Y_train = Y_train[:, kept_idx]
693
+ Y_test = Y_test[:, kept_idx]
694
+
695
+ # Save kept indices for inference
696
+ paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
697
+ kept_indices_path = Path(paths["features"]).parent / "kept_label_indices.npy"
698
+ np.save(kept_indices_path, kept_idx)
699
+ print(f"Saved kept label indices to {kept_indices_path}")
700
+
701
+ # Now check label coverage (should pass since we removed zero-occurrence labels)
702
+ _check_label_coverage(Y_train, np.empty((0, Y_train.shape[1])))
703
+
704
+ base_rf = RandomForestClassifier(
705
+ random_state=TRAINING_CONFIG.get("random_state", 42), n_jobs=-1
706
+ )
707
+ multi = MultiOutputClassifier(base_rf)
708
+
709
+ # Use full param_grid from MODEL_CONFIG for optimal results as per paper
710
+ param_grid = MODEL_CONFIG.get(
711
+ "param_grid",
712
+ {
713
+ "estimator__n_estimators": [50, 100, 200],
714
+ "estimator__max_depth": [10, 20, 30],
715
+ "estimator__min_samples_split": [2, 5],
716
+ },
717
+ )
718
+
719
+ cv = KFold(
720
+ n_splits=TRAINING_CONFIG.get("cv_folds", 5),
721
+ shuffle=True,
722
+ random_state=TRAINING_CONFIG.get("random_state", 42),
723
+ )
724
+
725
+ print(
726
+ f" GridSearch with {cv.n_splits} folds and {len(param_grid['estimator__n_estimators']) * len(param_grid['estimator__max_depth']) * len(param_grid['estimator__min_samples_split'])} combinations..."
727
+ )
728
+
729
+ grid = GridSearchCV(
730
+ estimator=multi,
731
+ param_grid=param_grid,
732
+ scoring="f1_micro",
733
+ cv=cv,
734
+ n_jobs=-1,
735
+ verbose=2,
736
+ refit=True,
737
+ )
738
+
739
+ with mlflow.start_run(run_name="random_forest_tfidf_gridsearch"):
740
+ grid.fit(X_train, Y_train)
741
+
742
+ best = grid.best_estimator_
743
+ best_params = grid.best_params_
744
+ best_cv_score = grid.best_score_
745
+
746
+ # No need to refit on combined train+val since we don't have a val set
747
+ # Model is already fitted on full training data
748
+
749
+ Y_pred_test = best.predict(X_test)
750
+
751
+ precision = precision_score(Y_test, Y_pred_test, average="micro", zero_division=0)
752
+ recall = recall_score(Y_test, Y_pred_test, average="micro", zero_division=0)
753
+ f1 = f1_score(Y_test, Y_pred_test, average="micro", zero_division=0)
754
+
755
+ mlflow.log_param("model_type", "RandomForest + MultiOutput")
756
+ for k, v in best_params.items():
757
+ mlflow.log_param(k, v)
758
+ mlflow.log_metric("cv_best_f1_micro", best_cv_score)
759
+
760
+ mlflow.log_metric("test_precision_micro", precision)
761
+ mlflow.log_metric("test_recall_micro", recall)
762
+ mlflow.log_metric("test_f1_micro", f1)
763
+ mlflow.log_param("feature_type", feature_type)
764
+ mlflow.log_param("use_cleaned", use_cleaned)
765
+
766
+ print("\n=== Training Results ===")
767
+ print(f"Test Precision (Micro): {precision:.4f}")
768
+ print(f"Test Recall (Micro): {recall:.4f}")
769
+ print(f"Test F1 Score (Micro): {f1:.4f}")
770
+ print("========================\n")
771
+
772
+ paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
773
+ os.makedirs(paths["models_dir"], exist_ok=True)
774
+
775
+ model_path = Path(paths["models_dir"]) / f"random_forest_{feature_type}_gridsearch.pkl"
776
+ joblib.dump(best, model_path)
777
+
778
+ np.save(Path(paths["features"]).parent / "X_test.npy", X_test)
779
+ np.save(Path(paths["labels"]).parent / "Y_test.npy", Y_test)
780
+
781
+ mlflow.sklearn.log_model(best, "model")
782
+
783
+ print("Grid search training completed and logged successfully.")
784
+
785
+
786
+ # =====================================================
787
+ # Inference utility (merged from predict.py)
788
+ # =====================================================
789
+ def run_inference(model_path: str = None):
790
+ mlflow.set_experiment(
791
+ MLFLOW_CONFIG.get("experiments", {}).get("inference", "hopcroft_random_forest_inference")
792
+ )
793
+
794
+ if model_path is None:
795
+ model_path = Path(DATA_PATHS["models_dir"]) / "random_forest_tfidf_gridsearch.pkl"
796
+ else:
797
+ model_path = Path(model_path)
798
+
799
+ model = joblib.load(str(model_path))
800
+
801
+ X_test = np.load(Path(DATA_PATHS["features"]).parent / "X_test.npy")
802
+ Y_test = np.load(Path(DATA_PATHS["labels"]).parent / "Y_test.npy")
803
+
804
+ with mlflow.start_run(run_name="random_forest_tfidf_inference"):
805
+ Y_pred = model.predict(X_test)
806
+
807
+ precision = precision_score(Y_test, Y_pred, average="micro", zero_division=0)
808
+ recall = recall_score(Y_test, Y_pred, average="micro", zero_division=0)
809
+ f1 = f1_score(Y_test, Y_pred, average="micro", zero_division=0)
810
+
811
+ mlflow.log_metric("test_precision_micro", precision)
812
+ mlflow.log_metric("test_recall_micro", recall)
813
+ mlflow.log_metric("test_f1_micro", f1)
814
+
815
+ print(f"Inference completed — Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
816
+
817
+
818
+ def _parse_args():
819
+ p = argparse.ArgumentParser(description="Unified training & experiments script")
820
+ p.add_argument(
821
+ "action",
822
+ choices=[
823
+ "baseline",
824
+ "smote",
825
+ "ros",
826
+ "adasyn_pca",
827
+ "lightgbm",
828
+ "lightgbm_smote",
829
+ "predict",
830
+ ],
831
+ help="Action to run",
832
+ )
833
+ p.add_argument("--model-path", help="Custom model path for inference")
834
+ return p.parse_args()
835
+
836
+
837
+ if __name__ == "__main__":
838
+ args = _parse_args()
839
+
840
+ # Baseline has its own load_data logic (removes rare labels after split)
841
+ if args.action == "baseline":
842
+ run_baseline_train(feature_type="tfidf", use_cleaned=True)
843
+ else:
844
+ # Other experiments use the original load_data() logic
845
+ X, Y = load_data(feature_type="tfidf", use_cleaned=True)
846
+
847
+ if args.action == "smote":
848
+ run_smote_experiment(X, Y)
849
+ elif args.action == "ros":
850
+ run_ros_experiment(X, Y)
851
+ elif args.action == "adasyn_pca":
852
+ run_adasyn_pca_experiment(X, Y)
853
+ elif args.action == "lightgbm":
854
+ run_lightgbm(X, Y)
855
+ elif args.action == "lightgbm_smote":
856
+ run_lightgbm_smote_experiment(X, Y)
857
+ elif args.action == "predict":
858
+ run_inference(args.model_path)
hopcroft_skill_classification_tool_competition/streamlit_app.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, List
3
+
4
+ import pandas as pd
5
+ import requests
6
+ import streamlit as st
7
+
8
+ API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000")
9
+
10
+ # Page config
11
+ st.set_page_config(
12
+ page_title="GitHub Skill Classifier", layout="wide", initial_sidebar_state="expanded"
13
+ )
14
+
15
+ st.markdown(
16
+ """
17
+ <style>
18
+ .main-header {
19
+ font-size: 2.5rem;
20
+ color: #1f77b4;
21
+ text-align: center;
22
+ margin-bottom: 2rem;
23
+ }
24
+ .skill-card {
25
+ padding: 1rem;
26
+ border-radius: 0.5rem;
27
+ border-left: 4px solid #1f77b4;
28
+ background-color: #f0f2f6;
29
+ margin-bottom: 0.5rem;
30
+ }
31
+ .confidence-high {
32
+ color: #28a745;
33
+ font-weight: bold;
34
+ }
35
+ .confidence-medium {
36
+ color: #ffc107;
37
+ font-weight: bold;
38
+ }
39
+ .confidence-low {
40
+ color: #dc3545;
41
+ font-weight: bold;
42
+ }
43
+ </style>
44
+ """,
45
+ unsafe_allow_html=True,
46
+ )
47
+
48
+
49
+ def check_api_health() -> bool:
50
+ """Check if the API is running and healthy."""
51
+ try:
52
+ response = requests.get(f"{API_BASE_URL}/health", timeout=2)
53
+ return response.status_code == 200
54
+ except Exception:
55
+ return False
56
+
57
+
58
+ def predict_skills(
59
+ issue_text: str, issue_description: str = None, repo_name: str = None, pr_number: int = None
60
+ ) -> Dict:
61
+ """Call the prediction API."""
62
+ payload = {"issue_text": issue_text}
63
+
64
+ if issue_description:
65
+ payload["issue_description"] = issue_description
66
+ if repo_name:
67
+ payload["repo_name"] = repo_name
68
+ if pr_number:
69
+ payload["pr_number"] = pr_number
70
+
71
+ try:
72
+ response = requests.post(f"{API_BASE_URL}/predict", json=payload, timeout=30)
73
+ response.raise_for_status()
74
+ return response.json()
75
+ except requests.exceptions.RequestException as e:
76
+ st.error(f"API Error: {str(e)}")
77
+ return None
78
+
79
+
80
+ def display_predictions(predictions: List[Dict], threshold: float = 0.5):
81
+ """Display predictions with visual formatting."""
82
+
83
+ # Filter by threshold
84
+ filtered = [p for p in predictions if p["confidence"] >= threshold]
85
+
86
+ if not filtered:
87
+ st.warning(f"No predictions above confidence threshold {threshold:.2f}")
88
+ return
89
+
90
+ st.success(f"Found {len(filtered)} skills above threshold {threshold:.2f}")
91
+
92
+ # Create DataFrame for table view
93
+ df = pd.DataFrame(filtered)
94
+ df["confidence"] = df["confidence"].apply(lambda x: f"{x:.2%}")
95
+
96
+ col1, col2 = st.columns([2, 1])
97
+
98
+ with col1:
99
+ st.subheader("Predictions Table")
100
+ st.dataframe(
101
+ df,
102
+ use_container_width=True,
103
+ hide_index=True,
104
+ column_config={
105
+ "skill_name": st.column_config.TextColumn("Skill", width="large"),
106
+ "confidence": st.column_config.TextColumn("Confidence", width="medium"),
107
+ },
108
+ )
109
+
110
+ with col2:
111
+ st.subheader("Top 5 Skills")
112
+ for i, pred in enumerate(filtered[:5], 1):
113
+ confidence = pred["confidence"]
114
+
115
+ if confidence >= 0.8:
116
+ conf_class = "confidence-high"
117
+ elif confidence >= 0.5:
118
+ conf_class = "confidence-medium"
119
+ else:
120
+ conf_class = "confidence-low"
121
+
122
+ st.markdown(
123
+ f"""
124
+ <div class="skill-card">
125
+ <strong>#{i} {pred["skill_name"]}</strong><br>
126
+ <span class="{conf_class}">{confidence:.2%}</span>
127
+ </div>
128
+ """,
129
+ unsafe_allow_html=True,
130
+ )
131
+
132
+
133
+ def main():
134
+ """Main Streamlit app."""
135
+
136
+ if "example_text" not in st.session_state:
137
+ st.session_state.example_text = ""
138
+
139
+ # Header
140
+ st.markdown('<h1 class="main-header"> GitHub Skill Classifier</h1>', unsafe_allow_html=True)
141
+
142
+ st.markdown("""
143
+ This tool uses machine learning to predict the skills required for GitHub issues and pull requests.
144
+ Enter the issue text below to get started!
145
+ """)
146
+
147
+ # Sidebar
148
+ with st.sidebar:
149
+ st.header("Settings")
150
+
151
+ # API Status
152
+ st.subheader("API Status")
153
+ if check_api_health():
154
+ st.success(" API is running")
155
+ else:
156
+ st.error(" API is not available")
157
+ st.info(f"Make sure FastAPI is running at {API_BASE_URL}")
158
+ st.code("fastapi dev hopcroft_skill_classification_tool_competition/main.py")
159
+
160
+ # Confidence threshold
161
+ threshold = st.slider(
162
+ "Confidence Threshold",
163
+ min_value=0.0,
164
+ max_value=1.0,
165
+ value=0.5,
166
+ step=0.05,
167
+ help="Only show predictions above this confidence level",
168
+ )
169
+
170
+ # Model info
171
+ st.subheader("Model Info")
172
+ try:
173
+ health = requests.get(f"{API_BASE_URL}/health", timeout=2).json()
174
+ st.metric("Version", health.get("version", "N/A"))
175
+ st.metric("Model Loaded", "" if health.get("model_loaded") else "")
176
+ except Exception:
177
+ st.info("API not available")
178
+
179
+ # Main
180
+ st.header("Input")
181
+
182
+ # Tabs for different input modes
183
+ tab1, tab2, tab3 = st.tabs(["Quick Input", "Detailed Input", "Examples"])
184
+
185
+ with tab1:
186
+ issue_text = st.text_area(
187
+ "Issue/PR Text",
188
+ height=150,
189
+ placeholder="Enter the issue or pull request text here...",
190
+ help="Required: The main text of the GitHub issue or PR",
191
+ value=st.session_state.example_text,
192
+ )
193
+
194
+ if st.button("Predict Skills", type="primary", use_container_width=True):
195
+ if not issue_text.strip():
196
+ st.error("Please enter some text!")
197
+ else:
198
+ st.session_state.example_text = ""
199
+ with st.spinner("Analyzing issue..."):
200
+ result = predict_skills(issue_text)
201
+
202
+ if result:
203
+ st.header("Results")
204
+
205
+ # Metadata
206
+ col1, col2, col3 = st.columns(3)
207
+ with col1:
208
+ st.metric("Total Predictions", result.get("num_predictions", 0))
209
+ with col2:
210
+ st.metric(
211
+ "Processing Time", f"{result.get('processing_time_ms', 0):.2f} ms"
212
+ )
213
+ with col3:
214
+ st.metric("Model Version", result.get("model_version", "N/A"))
215
+
216
+ # Predictions
217
+ st.divider()
218
+ display_predictions(result.get("predictions", []), threshold)
219
+
220
+ # Raw JSON
221
+ with st.expander("🔍 View Raw Response"):
222
+ st.json(result)
223
+
224
+ with tab2:
225
+ col1, col2 = st.columns(2)
226
+
227
+ with col1:
228
+ issue_text_detailed = st.text_area(
229
+ "Issue Title/Text*",
230
+ height=100,
231
+ placeholder="e.g., Fix authentication bug in login module",
232
+ key="issue_text_detailed",
233
+ )
234
+
235
+ issue_description = st.text_area(
236
+ "Issue Description",
237
+ height=100,
238
+ placeholder="Optional: Detailed description of the issue",
239
+ key="issue_description",
240
+ )
241
+
242
+ with col2:
243
+ repo_name = st.text_input(
244
+ "Repository Name",
245
+ placeholder="e.g., owner/repository",
246
+ help="Optional: GitHub repository name",
247
+ )
248
+
249
+ pr_number = st.number_input(
250
+ "PR Number",
251
+ min_value=0,
252
+ value=0,
253
+ help="Optional: Pull request number (0 = not a PR)",
254
+ )
255
+
256
+ if st.button("Predict Skills (Detailed)", type="primary", use_container_width=True):
257
+ if not issue_text_detailed.strip():
258
+ st.error("Issue text is required!")
259
+ else:
260
+ with st.spinner("Analyzing issue..."):
261
+ result = predict_skills(
262
+ issue_text_detailed,
263
+ issue_description if issue_description else None,
264
+ repo_name if repo_name else None,
265
+ pr_number if pr_number > 0 else None,
266
+ )
267
+
268
+ if result:
269
+ st.header("Results")
270
+
271
+ # Metadata
272
+ col1, col2, col3 = st.columns(3)
273
+ with col1:
274
+ st.metric("Total Predictions", result.get("num_predictions", 0))
275
+ with col2:
276
+ st.metric(
277
+ "Processing Time", f"{result.get('processing_time_ms', 0):.2f} ms"
278
+ )
279
+ with col3:
280
+ st.metric("Model Version", result.get("model_version", "N/A"))
281
+
282
+ st.divider()
283
+ display_predictions(result.get("predictions", []), threshold)
284
+
285
+ with st.expander("🔍 View Raw Response"):
286
+ st.json(result)
287
+
288
+ with tab3:
289
+ st.markdown("### Example Issues")
290
+
291
+ examples = [
292
+ {
293
+ "title": "Authentication Bug",
294
+ "text": "Fix authentication bug in login module. Users cannot login with OAuth providers.",
295
+ },
296
+ {
297
+ "title": "Machine Learning Feature",
298
+ "text": "Implement transfer learning with transformers for text classification using PyTorch and TensorFlow.",
299
+ },
300
+ {
301
+ "title": "Database Issue",
302
+ "text": "Fix database connection pooling issue causing memory leaks in production environment.",
303
+ },
304
+ {
305
+ "title": "UI Enhancement",
306
+ "text": "Add responsive design support for mobile devices with CSS media queries and flexbox layout.",
307
+ },
308
+ ]
309
+
310
+ for i, example in enumerate(examples):
311
+ if st.button(example["title"], use_container_width=True, key=f"example_btn_{i}"):
312
+ st.session_state.example_text = example["text"]
313
+ st.rerun()
314
+
315
+ if st.session_state.example_text:
316
+ st.success(" Example loaded! Switch to 'Quick Input' tab to use it.")
317
+ with st.expander("Preview"):
318
+ st.code(st.session_state.example_text)
319
+
320
+
321
+ if __name__ == "__main__":
322
+ main()
hopcroft_skill_classification_tool_competition/threshold_optimization.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Threshold Optimization for Multi-Label Classification
3
+
4
+ This module provides functions to optimize decision thresholds for multi-label
5
+ classification tasks to maximize F1-score (or other metrics).
6
+
7
+ In multi-label classification, the default threshold of 0.5 for converting
8
+ probabilities to binary predictions is often suboptimal, especially for
9
+ imbalanced classes. This module finds optimal thresholds per-class or globally.
10
+
11
+ Designed to work with Random Forest (baseline and improved models).
12
+
13
+ Usage:
14
+ from threshold_optimization import optimize_thresholds, apply_thresholds
15
+ from sklearn.ensemble import RandomForestClassifier
16
+
17
+ # Train Random Forest
18
+ model = RandomForestClassifier(n_estimators=100)
19
+ model.fit(X_train, y_train)
20
+
21
+ # Get probability predictions
22
+ y_proba = model.predict_proba(X_val)
23
+
24
+ # Find optimal thresholds on validation set
25
+ thresholds = optimize_thresholds(y_val, y_proba, method='per_class')
26
+
27
+ # Apply thresholds to test set
28
+ y_pred = apply_thresholds(model.predict_proba(X_test), thresholds)
29
+ """
30
+
31
+ from typing import Dict, Tuple, Union
32
+ import warnings
33
+
34
+ import numpy as np
35
+ from sklearn.metrics import f1_score
36
+
37
+
38
def optimize_thresholds(
    y_true: np.ndarray,
    y_proba: np.ndarray,
    method: str = "per_class",
    metric: str = "f1_weighted",
    search_range: Tuple[float, float] = (0.1, 0.9),
    n_steps: int = 50,
) -> Union[float, np.ndarray]:
    """
    Search for the decision threshold(s) that maximize the requested metric.

    Converts probability outputs to binary multi-label predictions by
    scanning `n_steps` evenly spaced candidate thresholds inside
    `search_range` and keeping the best-scoring one(s).

    Args:
        y_true: True binary labels, shape (n_samples, n_labels).
        y_proba: Predicted probabilities, shape (n_samples, n_labels).
        method: 'global' for a single shared threshold, or 'per_class'
            (default) for one threshold per label.
        metric: Metric optimized by the global search
            ('f1_weighted', 'f1_macro', 'f1_micro'). Note the per-class
            search scores each label with binary F1 regardless of this value.
        search_range: (min, max) bounds of the threshold scan.
        n_steps: Number of candidate thresholds to evaluate.

    Returns:
        A single float when method='global'; otherwise an array with one
        threshold per class.

    Raises:
        ValueError: If y_true and y_proba shapes differ, or method is unknown.

    Example:
        >>> y_true = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0]])
        >>> y_proba = np.array([[0.9, 0.3, 0.7], [0.2, 0.8, 0.4], [0.85, 0.6, 0.3]])
        >>> thresholds = optimize_thresholds(y_true, y_proba, method='per_class')
    """
    # Both matrices must align sample-by-sample and label-by-label.
    if y_true.shape != y_proba.shape:
        raise ValueError(f"Shape mismatch: y_true {y_true.shape} vs y_proba {y_proba.shape}")

    # Validate the strategy up front, then dispatch to the matching search.
    if method not in ("global", "per_class"):
        raise ValueError(f"Invalid method: {method}. Must be 'global' or 'per_class'")

    search = _optimize_global_threshold if method == "global" else _optimize_per_class_thresholds
    return search(y_true, y_proba, metric, search_range, n_steps)
82
+
83
+
84
def _optimize_global_threshold(
    y_true: np.ndarray,
    y_proba: np.ndarray,
    metric: str,
    search_range: Tuple[float, float],
    n_steps: int,
) -> float:
    """
    Find one optimal threshold shared by every class.

    Faster but less flexible than per-class optimization; most useful when
    the classes have similar probability distributions.
    """
    # Keep the first candidate that achieves the best score (strict '>').
    best_threshold, best_score = 0.5, -np.inf
    for candidate in np.linspace(search_range[0], search_range[1], n_steps):
        trial_score = _compute_score(y_true, (y_proba >= candidate).astype(int), metric)
        if trial_score > best_score:
            best_threshold, best_score = candidate, trial_score

    print(f"Optimal global threshold: {best_threshold:.3f} (score: {best_score:.4f})")
    return best_threshold
111
+
112
+
113
def _optimize_per_class_thresholds(
    y_true: np.ndarray,
    y_proba: np.ndarray,
    metric: str,
    search_range: Tuple[float, float],
    n_steps: int,
) -> np.ndarray:
    """
    Find an optimal threshold for each class independently.

    More flexible than a single global threshold and typically better on
    imbalanced multi-label problems, at the cost of extra computation.
    Note: each class is scored with *binary* F1 here, regardless of `metric`.
    """
    candidates = np.linspace(search_range[0], search_range[1], n_steps)
    n_classes = y_true.shape[1]
    optimal_thresholds = np.zeros(n_classes)

    print(f"Optimizing thresholds for {n_classes} classes...")

    def _best_for_class(truth: np.ndarray, proba: np.ndarray) -> float:
        # Scan candidates, keeping the first one with the highest binary F1.
        winner, winner_score = 0.5, -np.inf
        for candidate in candidates:
            try:
                trial = f1_score(
                    truth, (proba >= candidate).astype(int), average="binary", zero_division=0
                )
            except Exception:
                continue
            if trial > winner_score:
                winner, winner_score = candidate, trial
        return winner

    for class_idx in range(n_classes):
        truth = y_true[:, class_idx]

        # A class with no positive samples cannot be optimized; keep 0.5.
        if truth.sum() == 0:
            optimal_thresholds[class_idx] = 0.5
            warnings.warn(
                f"Class {class_idx} has no positive samples, using default threshold 0.5"
            )
            continue

        optimal_thresholds[class_idx] = _best_for_class(truth, y_proba[:, class_idx])

    print(
        f"Threshold statistics: min={optimal_thresholds.min():.3f}, "
        f"max={optimal_thresholds.max():.3f}, mean={optimal_thresholds.mean():.3f}"
    )

    return optimal_thresholds
168
+
169
+
170
def _compute_score(y_true: np.ndarray, y_pred: np.ndarray, metric: str) -> float:
    """Return the requested F1 variant for binary predictions, or raise ValueError."""
    # Map metric names to sklearn averaging modes.
    averaging = {
        "f1_weighted": "weighted",
        "f1_macro": "macro",
        "f1_micro": "micro",
    }
    if metric not in averaging:
        raise ValueError(f"Unsupported metric: {metric}")
    return f1_score(y_true, y_pred, average=averaging[metric], zero_division=0)
180
+
181
+
182
def apply_thresholds(y_proba: np.ndarray, thresholds: Union[float, np.ndarray]) -> np.ndarray:
    """
    Apply thresholds to probability predictions to get binary predictions.

    Args:
        y_proba: Predicted probabilities, shape (n_samples, n_labels)
        thresholds: Threshold(s) to apply:
            - Single numeric scalar (int, float, or NumPy scalar): same
              threshold for all classes
            - Array-like (ndarray, list, tuple): one threshold per class

    Returns:
        Binary predictions (0/1 ints), shape (n_samples, n_labels)

    Raises:
        ValueError: If the number of per-class thresholds does not match
            the number of classes in y_proba.

    Example:
        >>> y_proba = np.array([[0.9, 0.3, 0.7], [0.2, 0.8, 0.4]])
        >>> thresholds = np.array([0.5, 0.4, 0.6])
        >>> y_pred = apply_thresholds(y_proba, thresholds)
        >>> print(y_pred)
        [[1 0 1]
         [0 1 0]]
    """
    # Accept any numeric scalar (the original `isinstance(..., float)` check
    # rejected ints and 0-d arrays, which then crashed on len()/indexing).
    if np.isscalar(thresholds) or np.ndim(thresholds) == 0:
        return (y_proba >= thresholds).astype(int)

    # Per-class thresholds: coerce lists/tuples to an array so the
    # broadcasting index below works for any array-like input.
    thresholds = np.asarray(thresholds)
    if len(thresholds) != y_proba.shape[1]:
        raise ValueError(
            f"Number of thresholds ({len(thresholds)}) must match "
            f"number of classes ({y_proba.shape[1]})"
        )

    # Broadcasting: compare each column with its threshold
    return (y_proba >= thresholds[np.newaxis, :]).astype(int)
216
+
217
+
218
def evaluate_with_thresholds(
    model,
    X_val: np.ndarray,
    y_val: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    method: str = "per_class",
) -> Dict:
    """
    Complete workflow: optimize thresholds on validation set and evaluate on test set.

    Pipeline:
    1. Get probability predictions on the validation set
    2. Optimize thresholds using validation data
    3. Apply optimized thresholds to the test set
    4. Compare with the default threshold (0.5)

    Args:
        model: Trained model with a predict_proba method
        X_val: Validation features
        y_val: Validation labels (binary)
        X_test: Test features
        y_test: Test labels (binary)
        method: 'global' or 'per_class'

    Returns:
        Dictionary with results:
            - 'thresholds': Optimized thresholds
            - 'f1_default': Weighted F1 with the default threshold (0.5)
            - 'f1_optimized': Weighted F1 with optimized thresholds
            - 'improvement': Absolute improvement in F1-score
            - 'y_pred_optimized': Binary test predictions under the
              optimized thresholds

    Example:
        >>> results = evaluate_with_thresholds(model, X_val, y_val, X_test, y_test)
        >>> print(f"F1 improvement: {results['improvement']:.4f}")
    """

    def _to_matrix(proba):
        # MultiOutputClassifier.predict_proba returns a list of per-label
        # (n_samples, 2) arrays; keep the positive-class column and stack.
        if isinstance(proba, list):
            return np.column_stack([p[:, 1] for p in proba])
        return proba

    print("Getting probability predictions on validation set...")
    y_val_proba = _to_matrix(model.predict_proba(X_val))

    # Thresholds are fit on validation data only, to avoid test leakage.
    print(f"Optimizing thresholds ({method})...")
    thresholds = optimize_thresholds(y_val, y_val_proba, method=method)

    print("Evaluating on test set...")
    y_test_proba = _to_matrix(model.predict_proba(X_test))

    # Baseline: fixed 0.5 threshold for every class.
    y_test_pred_default = (y_test_proba >= 0.5).astype(int)
    f1_default = f1_score(y_test, y_test_pred_default, average="weighted", zero_division=0)

    # Candidate: the thresholds optimized on the validation set.
    y_test_pred_optimized = apply_thresholds(y_test_proba, thresholds)
    f1_optimized = f1_score(y_test, y_test_pred_optimized, average="weighted", zero_division=0)

    improvement = f1_optimized - f1_default

    print("\nResults:")
    print(f" F1-score (default threshold=0.5): {f1_default:.4f}")
    print(f" F1-score (optimized thresholds): {f1_optimized:.4f}")
    if f1_default > 0:
        print(f" Improvement: {improvement:+.4f} ({improvement / f1_default * 100:+.2f}%)")
    else:
        # Guard: baseline F1 of exactly 0 would make the relative change
        # divide by zero (inf/nan with a RuntimeWarning).
        print(f" Improvement: {improvement:+.4f} (relative change undefined: baseline F1 is 0)")

    return {
        "thresholds": thresholds,
        "f1_default": f1_default,
        "f1_optimized": f1_optimized,
        "improvement": improvement,
        "y_pred_optimized": y_test_pred_optimized,
    }
models/.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /random_forest_tfidf_gridsearch.pkl
2
+ /random_forest_tfidf_gridsearch_adasyn_pca.pkl
3
+ /random_forest_tfidf_gridsearch_ros.pkl
4
+ /random_forest_tfidf_gridsearch_smote.pkl
5
+ /lightgbm_tfidf_gridsearch.pkl
6
+ /lightgbm_tfidf_gridsearch_smote.pkl
7
+ /pca_tfidf_adasyn.pkl
8
+ /label_names.pkl
9
+ /tfidf_vectorizer.pkl
10
+ /random_forest_embedding_gridsearch.pkl
11
+ /random_forest_embedding_gridsearch_smote.pkl
models/.gitkeep ADDED
File without changes
models/README.md ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: mit
4
+ tags:
5
+ - multi-label-classification
6
+ - tfidf
7
+ - embeddings
8
+ - random-forest
9
+ - oversampling
10
+ - mlsmote
11
+ - software-engineering
12
+ datasets:
13
+ - NLBSE/SkillCompetition
14
+ model-index:
15
+ - name: random_forest_tfidf_gridsearch
16
+ results:
17
+ - status: success
18
+ metrics:
19
+ cv_best_f1_micro: 0.595038375202279
20
+ test_precision_micro: 0.690371373744215
21
+ test_recall_micro: 0.5287455692919513
22
+ test_f1_micro: 0.5988446098110252
23
+ params:
24
+ estimator__max_depth: '10'
25
+ estimator__min_samples_split: '2'
26
+ estimator__n_estimators: '200'
27
+ feature_type: tfidf
28
+ model_type: RandomForest + MultiOutput
29
+ use_cleaned: 'True'
30
+ oversampling: 'False'
31
+ dvc:
32
+ path: random_forest_tfidf_gridsearch.pkl
33
+ - name: random_forest_tfidf_gridsearch_smote
34
+ results:
35
+ - status: success
36
+ metrics:
37
+ cv_best_f1_micro: 0.59092598557871
38
+ test_precision_micro: 0.6923300238053766
39
+ test_recall_micro: 0.5154318319356791
40
+ test_f1_micro: 0.59092598557871
41
+ params:
42
+ feature_type: tfidf
43
+ oversampling: 'MLSMOTE (RandomOverSampler fallback)'
44
+ dvc:
45
+ path: random_forest_tfidf_gridsearch_smote.pkl
46
+ - name: random_forest_embedding_gridsearch
47
+ results:
48
+ - status: success
49
+ metrics:
50
+ cv_best_f1_micro: 0.6012826418169578
51
+ test_precision_micro: 0.703060266254212
52
+ test_recall_micro: 0.5252460640075934
53
+ test_f1_micro: 0.6012826418169578
54
+ params:
55
+ feature_type: embedding
56
+ oversampling: 'False'
57
+ dvc:
58
+ path: random_forest_embedding_gridsearch.pkl
59
+ - name: random_forest_embedding_gridsearch_smote
60
+ results:
61
+ - status: success
62
+ metrics:
63
+ cv_best_f1_micro: 0.5962084744755453
64
+ test_precision_micro: 0.7031004709576139
65
+ test_recall_micro: 0.5175288364319172
66
+ test_f1_micro: 0.5962084744755453
67
+ params:
68
+ feature_type: embedding
69
+ oversampling: 'MLSMOTE (RandomOverSampler fallback)'
70
+ dvc:
71
+ path: random_forest_embedding_gridsearch_smote.pkl
72
+ ---
73
+
74
+
75
+ Model cards for committed models
76
+
77
+ Overview
78
+ - This file documents four trained model artifacts available in the repository: two TF‑IDF based Random Forest models (baseline and with oversampling) and two embedding‑based Random Forest models (baseline and with oversampling).
79
+ - For dataset provenance and preprocessing details see `data/README.md`.
80
+
81
+ 1) random_forest_tfidf_gridsearch
82
+
83
+ Model details
84
+ - Name: `random_forest_tfidf_gridsearch`
85
+ - Organization: Hopcroft (se4ai2526-uniba)
86
+ - Model type: `RandomForestClassifier` wrapped in `MultiOutputClassifier` for multi-label outputs
87
+ - Branch: `Milestone-4`
88
+
89
+ Intended use
90
+ - Suitable for research and benchmarking on multi-label skill prediction for GitHub PRs/issues. Not intended for automated high‑stakes decisions or profiling individuals without further validation.
91
+
92
+ Training data and preprocessing
93
+ - Dataset: Processed SkillScope Dataset (NLBSE/SkillCompetition) as prepared for this project.
94
+ - Features: TF‑IDF (unigrams and bigrams), up to `MAX_TFIDF_FEATURES=5000`.
95
+ - Feature and label files are referenced via `get_feature_paths(feature_type='tfidf', use_cleaned=True)` in `config.py`.
96
+
97
+ Evaluation
98
+ - Reported metrics include micro‑precision, micro‑recall and micro‑F1 on a held‑out test split.
99
+ - Protocol: 80/20 multilabel‑stratified split; hyperparameters selected via 5‑fold cross‑validation optimizing `f1_micro`.
100
+ - MLflow run: `random_forest_tfidf_gridsearch` (see `hopcroft_skill_classification_tool_competition/config.py`).
101
+
102
+ Limitations and recommendations
103
+ - Trained on Java repositories; generalization to other languages is not ensured.
104
+ - Label imbalance affects rare labels; apply per‑label thresholds or further sampling strategies if required.
105
+
106
+ Usage
107
+ - Artifact path: `models/random_forest_tfidf_gridsearch.pkl`.
108
+ - Example:
109
+ ```python
110
+ import joblib
111
+ model = joblib.load('models/random_forest_tfidf_gridsearch.pkl')
112
+ y = model.predict(X_tfidf)
113
+ ```
114
+
115
+ 2) random_forest_tfidf_gridsearch_smote
116
+
117
+ Model details
118
+ - Name: `random_forest_tfidf_gridsearch_smote`
119
+ - Model type: `RandomForestClassifier` inside `MultiOutputClassifier` trained with multi‑label oversampling
120
+
121
+ Intended use
122
+ - Intended to improve recall for under‑represented labels by applying MLSMOTE (or RandomOverSampler fallback) during training.
123
+
124
+ Training and preprocessing
125
+ - Features: TF‑IDF (same configuration as the baseline).
126
+ - Oversampling: local MLSMOTE implementation when available; otherwise `RandomOverSampler`. Oversampling metadata (method and synthetic sample counts) are logged to MLflow.
127
+ - Training script: `hopcroft_skill_classification_tool_competition/modeling/train.py` (action `smote`).
128
+
129
+ Evaluation
130
+ - MLflow run: `random_forest_tfidf_gridsearch_smote`.
131
+
132
+ Limitations and recommendations
133
+ - Synthetic samples may introduce distributional artifacts; validate synthetic examples and per‑label metrics before deployment.
134
+
135
+ Usage
136
+ - Artifact path: `models/random_forest_tfidf_gridsearch_smote.pkl`.
137
+
138
+ 3) random_forest_embedding_gridsearch
139
+
140
+ Model details
141
+ - Name: `random_forest_embedding_gridsearch`
142
+ - Features: sentence embeddings produced by `all-MiniLM-L6-v2` (see `config.EMBEDDING_MODEL_NAME`).
143
+
144
+ Intended use
145
+ - Uses semantic embeddings to capture contextual information from PR text; suitable for research and prototyping.
146
+
147
+ Training and preprocessing
148
+ - Embeddings generated and stored via `get_feature_paths(feature_type='embedding', use_cleaned=True)`.
149
+ - Training script: see `hopcroft_skill_classification_tool_competition/modeling/train.py`.
150
+
151
+ Evaluation
152
+ - MLflow run: `random_forest_embedding_gridsearch`.
153
+
154
+ Limitations and recommendations
155
+ - Embeddings encode dataset biases; verify performance when transferring to other repositories or languages.
156
+
157
+ Usage
158
+ - Artifact path: `models/random_forest_embedding_gridsearch.pkl`.
159
+ - Example:
160
+ ```python
161
+ model.predict(X_embeddings)
162
+ ```
163
+
164
+ 4) random_forest_embedding_gridsearch_smote
165
+
166
+ Model details
167
+ - Name: `random_forest_embedding_gridsearch_smote`
168
+ - Combines embedding features with multi‑label oversampling to address rare labels.
169
+
170
+ Training and evaluation
171
+ - Oversampling: MLSMOTE preferred; `RandomOverSampler` fallback if MLSMOTE is unavailable.
172
+ - MLflow run: `random_forest_embedding_gridsearch_smote`.
173
+
174
+ Limitations and recommendations
175
+ - Review synthetic examples and re‑evaluate on target data prior to deployment.
176
+
177
+ Usage
178
+ - Artifact path: `models/random_forest_embedding_gridsearch_smote.pkl`.
179
+
180
+ Publishing guidance for Hugging Face Hub
181
+ - The YAML front‑matter enables rendering on the Hugging Face Hub. Recommended repository contents for publishing:
182
+ - `README.md` (this file)
183
+ - model artifact(s) (`*.pkl`)
184
+ - vectorizer(s) and label map (e.g. `tfidf_vectorizer.pkl`, `label_names.pkl`)
185
+ - a minimal inference example or notebook
186
+
187
+ Evaluation Data and Protocol
188
+ - Evaluation split: an 80/20 multilabel‑stratified train/test split was used for final evaluation.
189
+ - Cross-validation: hyperparameters were selected via 5‑fold cross‑validation optimizing `f1_micro`.
190
+ - Test metrics reported: micro precision, micro recall, micro F1 (reported in the YAML `model-index` for each model).
191
+
192
+ Quantitative Analyses
193
+ - Reported unitary results: micro‑precision, micro‑recall and micro‑F1 on the held‑out test split for each model.
194
+ - Where available, `cv_best_f1_micro` is the best cross‑validation f1_micro recorded during training; when a CV value was not present in tracking, the test F1 is used as a proxy and noted in the README.
195
+ - Notes on comparability: TF‑IDF and embedding models are evaluated on the same held‑out splits (features differ); reported metrics are comparable for broad benchmarking but not for per‑label fairness analyses.
196
+
197
+ How Metrics Were Computed
198
+ - Metrics were computed using scikit‑learn's `precision_score`, `recall_score`, and `f1_score` with `average='micro'` and `zero_division=0` on the held‑out test labels and model predictions.
199
+ - Test feature and label files used are available under `data/processed/tfidf/` and `data/processed/embedding/` (paths referenced from `hopcroft_skill_classification_tool_competition.config.get_feature_paths`).
200
+
201
+ Ethical Considerations and Caveats
202
+ - The dataset contains examples from Java repositories; model generalization to other languages or domains is not guaranteed.
203
+ - Label imbalance is present; oversampling (MLSMOTE or RandomOverSampler fallback) was used in two variants to improve recall for rare labels — inspect per‑label metrics before deploying.
204
+ - The models and README are intended for research and benchmarking. They are not validated for safety‑critical or high‑stakes automated decisioning.
205
+
206
+
models/kept_label_indices.npy ADDED
Binary file (1.26 kB). View file
 
models/label_names.pkl.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: bd94d38e415f8dc2aaee3f60b6776483
3
+ size: 6708
4
+ hash: md5
5
+ path: label_names.pkl
models/random_forest_embedding_gridsearch.pkl.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: e1c1c0290e0c6036ee798275fdbad61c
3
+ size: 346568353
4
+ hash: md5
5
+ path: random_forest_embedding_gridsearch.pkl
models/random_forest_embedding_gridsearch_smote.pkl.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 4d5379a3847341f8de423778b94537b0
3
+ size: 1035016993
4
+ hash: md5
5
+ path: random_forest_embedding_gridsearch_smote.pkl
models/random_forest_tfidf_gridsearch.pkl.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 39165e064d60e4bd0688e6c7aa94258c
3
+ size: 137359137
4
+ hash: md5
5
+ path: random_forest_tfidf_gridsearch.pkl
models/random_forest_tfidf_gridsearch_adasyn_pca.pkl.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 5e0024de62e0c693cb41677084bb0fd5
3
+ size: 382639449
4
+ hash: md5
5
+ path: random_forest_tfidf_gridsearch_adasyn_pca.pkl
models/random_forest_tfidf_gridsearch_ros.pkl.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 3fda04bfcc26d1fb4350b8a31b80ebaf
3
+ size: 3011992009
4
+ hash: md5
5
+ path: random_forest_tfidf_gridsearch_ros.pkl
models/random_forest_tfidf_gridsearch_smote.pkl.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 7e39607fd740c69373376133b8c4f87b
3
+ size: 371297857
4
+ hash: md5
5
+ path: random_forest_tfidf_gridsearch_smote.pkl
models/tfidf_vectorizer.pkl.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 6f8eab4e3e9dbb44a65d6387e061d7ac
3
+ size: 76439
4
+ hash: md5
5
+ path: tfidf_vectorizer.pkl
notebooks/.gitkeep ADDED
File without changes