# NOTE: removed stray repository-browser residue that had been pasted at the
# top of this file (author "DaCrow13", commit message "Deploy to HF Spaces
# (Clean)", hash 225af6a) — those lines were not valid Python.
import argparse
import os
from pathlib import Path
from imblearn.over_sampling import ADASYN, RandomOverSampler
import joblib
import lightgbm as lgb
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.multioutput import MultiOutputClassifier
from hopcroft_skill_classification_tool_competition.config import (
ADASYN_CONFIG,
DATA_PATHS,
MLFLOW_CONFIG,
MODEL_CONFIG,
PCA_CONFIG,
TRAINING_CONFIG,
get_feature_paths,
)
# Local MLSMOTE implementation (lightweight multi-label oversampling).
# Imported lazily behind a flag so the rest of the script still works when
# the optional module (or pandas) is missing.
try:
    import pandas as pd
    from hopcroft_skill_classification_tool_competition.mlsmote import MLSMOTE as mlsmote_function
    from hopcroft_skill_classification_tool_competition.mlsmote import get_minority_instace
    _HAS_LOCAL_MLSMOTE = True
except Exception:
    # Degrade gracefully: experiments fall back to RandomOverSampler when
    # MLSMOTE cannot be imported (the flag is checked at each call site).
    mlsmote_function = None
    get_minority_instace = None
    _HAS_LOCAL_MLSMOTE = False
    print("[warning] Local MLSMOTE not available. Check mlsmote.py exists.")
# Prefer multilabel stratified splits for imbalanced multi-label data.
# Use `iterative-stratification` package when available.
try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
    _HAS_MLSTRAT = True
except Exception:
    # Optional dependency: when absent, stratified_train_test_split falls
    # back to sklearn's train_test_split (no multilabel stratification).
    MultilabelStratifiedShuffleSplit = None
    _HAS_MLSTRAT = False
# -------------------------------
# MLflow authentication and setup
# Load environment variables from .env file (for local dev)
# In Docker, env vars are set via docker-compose env_file
# -------------------------------
from dotenv import load_dotenv

load_dotenv()
# The MLFLOW_TRACKING_URI environment variable takes precedence over the
# URI configured in MLFLOW_CONFIG.
_mlflow_env_uri = os.getenv("MLFLOW_TRACKING_URI")
_configured_uri = MLFLOW_CONFIG.get("uri", "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow")
if _mlflow_env_uri:
    mlflow_uri = _mlflow_env_uri
else:
    mlflow_uri = _configured_uri
# If targeting DagsHub, require username/password; otherwise proceed.
if "dagshub.com" in mlflow_uri:
    _username = os.getenv("MLFLOW_TRACKING_USERNAME")
    _password = os.getenv("MLFLOW_TRACKING_PASSWORD")
    if not _username or not _password:
        # Fail fast at import time rather than mid-run with an auth error.
        raise ValueError(
            "Set the environment variables MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD for remote tracking"
        )
mlflow.set_tracking_uri(mlflow_uri)
# =====================================================
# Common utilities (merged from train_experiments.py)
# =====================================================
def load_data(feature_type="tfidf", use_cleaned=True):
    """Load the feature matrix and label matrix from disk.

    Args:
        feature_type: 'tfidf' or 'embedding'
        use_cleaned: whether to use cleaned data

    Returns:
        X, Y: feature matrix and label matrix
    """
    # Resolve the on-disk locations for this feature configuration.
    feature_paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
    X, Y = np.load(feature_paths["features"]), np.load(feature_paths["labels"])
    suffix = "_clean" if use_cleaned else ""
    print(f"Dataset loaded successfully: {X.shape} samples, {Y.shape} labels")
    print(f"Using feature type: {feature_type}{suffix}")
    return X, Y
def stratified_train_test_split(X, Y, test_size=None, random_state=None, fallback=True):
    """Split X, Y using multilabel stratified shuffle split when possible.

    Args:
        X: np.ndarray features
        Y: np.ndarray multi-label binary matrix (n_samples, n_labels)
        test_size: float fraction or int sample count, forwarded to the splitter;
            defaults to TRAINING_CONFIG["test_size"] when None
        random_state: int seed
        fallback: if True and multilabel splitter unavailable, use sklearn.train_test_split

    Returns:
        X_train, X_test, Y_train, Y_test

    Raises:
        RuntimeError: if iterative-stratification is unavailable and fallback is False.
    """
    if _HAS_MLSTRAT:
        # Bug fix: previously only float values were honored — an int test_size
        # (absolute sample count) was silently replaced by the configured
        # default even though the docstring promised to forward it. Forward
        # any explicit value; fall back to config only when None.
        if test_size is None:
            tst = TRAINING_CONFIG.get("test_size", 0.2)
        else:
            tst = test_size
        msss = MultilabelStratifiedShuffleSplit(
            n_splits=1, test_size=tst, random_state=random_state
        )
        train_idx, test_idx = next(msss.split(X, Y))
        return X[train_idx], X[test_idx], Y[train_idx], Y[test_idx]
    if fallback:
        print(
            "[warning] iterative-stratification not available; using standard train_test_split (no multilabel stratification). To enable stratified multilabel splitting install 'iterative-stratification'."
        )
        return train_test_split(X, Y, test_size=test_size, random_state=random_state, shuffle=True)
    raise RuntimeError(
        "iterative-stratification is required for multilabel stratified splitting but not installed."
    )
def stratified_train_val_test_split(
    X, Y, test_size=0.2, val_size=0.1, random_state=None, fallback=True
):
    """Three-way split (train/val/test) with multilabel stratification when possible.

    Args:
        X, Y: arrays
        test_size: proportion for final test set
        val_size: proportion for validation set (relative to whole dataset)
        random_state: seed
        fallback: if True, falls back to sklearn splits

    Returns:
        X_train, X_val, X_test, Y_train, Y_val, Y_test

    Raises:
        ValueError: if the requested fractions are out of range or sum >= 1.
    """
    sizes_ok = 0.0 < test_size < 1.0 and 0.0 <= val_size < 1.0 and (val_size + test_size) < 1.0
    if not sizes_ok:
        raise ValueError("test_size and val_size must be fractions in (0,1) and sum < 1")
    # Carve out the held-out test set first.
    X_rem, X_test, Y_rem, Y_test = stratified_train_test_split(
        X, Y, test_size=test_size, random_state=random_state, fallback=fallback
    )
    # Express the validation fraction relative to what remains after the test split.
    remaining = 1.0 - test_size
    rel_val = val_size / remaining if remaining > 0 else 0.0
    if rel_val <= 0:
        # Validation disabled: return empty val arrays with matching widths.
        empty_X = np.empty((0, X.shape[1]))
        empty_Y = np.empty((0, Y.shape[1]))
        return X_rem, empty_X, X_test, Y_rem, empty_Y, Y_test
    X_train, X_val, Y_train, Y_val = stratified_train_test_split(
        X_rem, Y_rem, test_size=rel_val, random_state=random_state, fallback=fallback
    )
    return X_train, X_val, X_test, Y_train, Y_val, Y_test
def _check_label_coverage(Y_train: np.ndarray, Y_val: np.ndarray, min_train: int = 1):
"""Check that each label appears at least `min_train` times in train and
at least once in train+val. Prints a warning if some labels are scarce in
train, and raises an error if some labels are missing entirely from
train+val (which would make learning impossible for those labels).
Args:
Y_train: (n_train, n_labels) binary matrix
Y_val: (n_val, n_labels) binary matrix (may be empty)
min_train: minimum occurrences in train to be considered "covered"
"""
# Defensive: handle empty val
if Y_val is None:
Y_val = np.empty((0, Y_train.shape[1]))
counts_train = np.sum(Y_train, axis=0)
counts_train_val = counts_train + np.sum(Y_val, axis=0)
missing_in_train = np.where(counts_train < min_train)[0]
missing_in_train_val = np.where(counts_train_val == 0)[0]
if missing_in_train.size > 0:
# Small, actionable warning for debugging
preview = missing_in_train[:10].tolist()
print(
f"[warning] {missing_in_train.size} label(s) have <{min_train} occurrences in TRAIN. Example label indices: {preview}."
)
if missing_in_train_val.size > 0:
preview = missing_in_train_val[:10].tolist()
raise ValueError(
f"{missing_in_train_val.size} label(s) have 0 occurrences in TRAIN+VAL (indices example: {preview}). "
"Reduce test/val size, aggregate labels, or ensure these labels exist in the source DB."
)
def evaluate_and_log(model, X_test, Y_test, best_params, cv_score, exp_name, extra_params=None):
    """Evaluate a fitted model on the test split, log to MLflow, and persist it.

    Logs micro-averaged precision/recall/F1 plus the CV score, records the
    best grid-search parameters (and any extras), then dumps the model under
    DATA_PATHS['models_dir'] and attaches it as an MLflow artifact.
    """
    predictions = model.predict(X_test)
    metrics = {
        "cv_best_f1_micro": cv_score,
        "test_precision_micro": precision_score(Y_test, predictions, average="micro", zero_division=0),
        "test_recall_micro": recall_score(Y_test, predictions, average="micro", zero_division=0),
        "test_f1_micro": f1_score(Y_test, predictions, average="micro", zero_division=0),
    }
    mlflow.log_metrics(metrics)
    # Grid-search winners first, then any caller-supplied extras.
    for key, value in best_params.items():
        mlflow.log_param(key, value)
    if extra_params:
        for key, value in extra_params.items():
            mlflow.log_param(key, value)
    os.makedirs(DATA_PATHS["models_dir"], exist_ok=True)
    model_path = Path(DATA_PATHS["models_dir"]) / f"{exp_name}.pkl"
    joblib.dump(model, model_path)
    mlflow.log_artifact(str(model_path), artifact_path=f"model_{exp_name}")
    print(f"Model saved to {model_path}")
    print(f"{exp_name} completed and logged successfully.\n")
def run_grid_search(X, Y):
    """Build an (unfitted) GridSearchCV over a multi-output RandomForest.

    Note: X and Y are accepted for interface symmetry but are not used here;
    the caller fits the returned grid itself.
    """
    forest = RandomForestClassifier(random_state=TRAINING_CONFIG["random_state"], n_jobs=-1)
    splitter = KFold(
        n_splits=TRAINING_CONFIG["cv_folds"],
        shuffle=True,
        random_state=TRAINING_CONFIG["random_state"],
    )
    return GridSearchCV(
        estimator=MultiOutputClassifier(forest),
        param_grid=MODEL_CONFIG["param_grid"],
        scoring="f1_micro",
        cv=splitter,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
def run_grid_search_lgb(X, Y):
    """Build an (unfitted) GridSearchCV over a multi-output LightGBM classifier.

    Note: X and Y are accepted for interface symmetry but are not used here;
    the caller fits the returned grid itself.
    """
    booster = lgb.LGBMClassifier(
        random_state=TRAINING_CONFIG["random_state"], n_jobs=1, force_row_wise=True, verbose=-1
    )
    splitter = KFold(
        n_splits=TRAINING_CONFIG["cv_folds"],
        shuffle=True,
        random_state=TRAINING_CONFIG["random_state"],
    )
    # Small fixed grid: only n_estimators and max_depth vary; the learning
    # rate and leaf count are pinned to keep the search fast.
    search_space = {
        "estimator__n_estimators": [50, 100, 200],
        "estimator__max_depth": [3, 5, 7],
        "estimator__learning_rate": [0.1],
        "estimator__num_leaves": [15],
    }
    return GridSearchCV(
        estimator=MultiOutputClassifier(booster, n_jobs=-1),
        param_grid=search_space,
        scoring="f1_micro",
        cv=splitter,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
# =====================================================
# Experiments (merged)
# =====================================================
def run_smote_experiment(X, Y, feature_type="tfidf"):
    """RandomForest experiment with MLSMOTE oversampling (ROS fallback).

    Oversamples the training split, grid-searches on the oversampled data,
    then refits the best estimator on the original (non-oversampled) train+val
    before evaluating on the held-out test set and logging to MLflow.

    Args:
        X: feature matrix (n_samples, n_features)
        Y: binary label matrix (n_samples, n_labels)
        feature_type: tag used in the logged model name
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["smote"])
    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    # Check label coverage and fail early if labels are missing from train+val
    _check_label_coverage(Y_train, Y_val)
    # Apply MLSMOTE (Multi-Label SMOTE) as per paper
    # MLSMOTE handles multi-label classification natively by considering label correlations
    print("Applying MLSMOTE (Multi-Label SMOTE) as per SkillScope paper...")
    print(f" Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")
    # Use local MLSMOTE implementation directly (function-based)
    if _HAS_LOCAL_MLSMOTE:
        try:
            # Set random seed (numpy and stdlib random) for reproducible sampling
            if TRAINING_CONFIG["random_state"] is not None:
                np.random.seed(TRAINING_CONFIG["random_state"])
                import random

                random.seed(TRAINING_CONFIG["random_state"])
            # Convert to DataFrame (MLSMOTE function expects DataFrames)
            X_train_df = pd.DataFrame(X_train)
            Y_train_df = pd.DataFrame(Y_train)
            # Get minority instances
            X_min, Y_min = get_minority_instace(X_train_df, Y_train_df)
            if len(X_min) == 0:
                print("No minority instances found, using original dataset")
                X_res, Y_res = X_train, Y_train
                oversampling_method = "None (no minority instances)"
                n_new = 0
            else:
                # Calculate number of synthetic samples: lift the rarest labels
                # toward the mean count, floored at 100 and capped at 3x the
                # number of minority instances.
                label_counts = Y_train_df.sum(axis=0)
                mean_count = int(label_counts.mean())
                min_count = int(label_counts.min())
                n_synthetic = max(100, int(mean_count - min_count))
                n_synthetic = min(n_synthetic, len(X_min) * 3)
                print(
                    f"Generating {n_synthetic} synthetic samples from {len(X_min)} minority instances"
                )
                # Apply MLSMOTE function directly
                # NOTE(review): assumes mlsmote_function returns the full
                # augmented set (minority + synthetic) and that the grid search
                # below is intended to run on that set only — confirm against
                # mlsmote.py (n_new can otherwise be negative).
                X_res_df, Y_res_df = mlsmote_function(X_min, Y_min, n_synthetic)
                # Convert back to numpy
                X_res = X_res_df.values
                Y_res = Y_res_df.values.astype(int)
                oversampling_method = "MLSMOTE (local implementation)"
                n_new = len(X_res) - len(X_train)
                print(
                    f"MLSMOTE completed: {n_new} synthetic samples generated. Total: {len(X_res)} samples"
                )
        except Exception as e:
            # Best-effort fallback: balance joint label combinations instead
            print(f"MLSMOTE failed ({e}); falling back to RandomOverSampler")
            Y_train_str = ["".join(map(str, y)) for y in Y_train]
            ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
            X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
            Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
            oversampling_method = "RandomOverSampler (MLSMOTE fallback)"
            n_new = len(X_res) - len(X_train)
    else:
        print("Local MLSMOTE not available; falling back to RandomOverSampler")
        Y_train_str = ["".join(map(str, y)) for y in Y_train]
        ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
        X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
        Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
        oversampling_method = "RandomOverSampler (no MLSMOTE)"
        n_new = len(X_res) - len(X_train)
    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_smote"):
        grid.fit(X_res, Y_res)
        # Refit final model on train + val (use original non-oversampled data for final fit)
        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)
        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            f"random_forest_{feature_type}_gridsearch_smote",
            {
                "oversampling": oversampling_method,
                "synthetic_samples": n_new,
                "n_labels": Y_train.shape[1],
            },
        )
def run_ros_experiment(X, Y):
    """RandomForest experiment with RandomOverSampler on joined label strings.

    Each multi-label row is serialized to a bit-string so the (single-label)
    RandomOverSampler can balance joint label combinations; resampled rows
    are decoded back to a binary matrix afterwards. The best grid-search
    estimator is refit on the original train+val before test evaluation.
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["ros"])
    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    # Encode each label vector as a string key so ROS treats every distinct
    # label combination as a single class.
    Y_train_str = ["".join(map(str, y)) for y in Y_train]
    ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
    X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
    # (Bug fix: removed a leftover no-op `Y.shape[1]` expression statement here.)
    Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_ros"):
        grid.fit(X_res, Y_res)
        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        # Refit the selected model on original (non-oversampled) train+val.
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)
        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "random_forest_tfidf_gridsearch_ros",
            {"oversampling": "RandomOverSampler"},
        )
def run_adasyn_pca_experiment(X, Y):
    """RandomForest experiment with PCA-compressed features and ADASYN oversampling.

    PCA is fitted on the training split, ADASYN oversamples in PCA space using
    a single label column as the class signal, and the synthetic rows are
    projected back to the original feature space before grid search.
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["adasyn_pca"])
    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    print("Applying PCA before ADASYN...")
    pca = PCA(
        n_components=PCA_CONFIG["variance_retained"], random_state=TRAINING_CONFIG["random_state"]
    )
    X_train_pca = pca.fit_transform(X_train)
    adasyn = ADASYN(
        random_state=TRAINING_CONFIG["random_state"],
        n_neighbors=ADASYN_CONFIG["n_neighbors"],
        sampling_strategy=ADASYN_CONFIG["sampling_strategy"],
    )
    # ADASYN is single-label: pick the first label column that has both
    # classes present to drive the oversampling.
    valid_label_idx = next(
        (i for i in range(Y_train.shape[1]) if len(np.unique(Y_train[:, i])) > 1), None
    )
    if valid_label_idx is None:
        # Every label column is constant; nothing to oversample.
        X_res, Y_res = X_train, Y_train
        n_new = 0
    else:
        X_res_pca, _ = adasyn.fit_resample(X_train_pca, Y_train[:, valid_label_idx])
        X_res = pca.inverse_transform(X_res_pca)
        n_new = len(X_res) - len(X_train)
        # NOTE(review): synthetic rows receive label vectors copied from
        # randomly chosen training rows, not labels derived from their
        # features — confirm this approximation is intended.
        Y_res = np.vstack([Y_train, Y_train[np.random.randint(0, len(Y_train), n_new)]])
    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_adasyn_pca"):
        grid.fit(X_res, Y_res)
        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        # Refit on original (non-PCA, non-oversampled) train+val data.
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)
        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "random_forest_tfidf_gridsearch_adasyn_pca",
            {
                "oversampling": "ADASYN + PCA",
                "pca_variance": PCA_CONFIG["variance_retained"],
                "synthetic_samples": n_new,
            },
        )
        # Persist the fitted PCA so inference can reproduce the projection.
        pca_path = Path(DATA_PATHS["models_dir"]) / "pca_tfidf_adasyn.pkl"
        joblib.dump(pca, pca_path)
        mlflow.log_artifact(str(pca_path), artifact_path="model_adasyn_pca")
def run_lightgbm(X, Y):
    """Train a multi-output LightGBM via grid search and log results to MLflow.

    No oversampling is applied; the best estimator from the search is refit
    on train+val before evaluation on the held-out test split.
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm", "LightGBM"))
    # Three-way split: train / val / test
    splits = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    X_train, X_val, X_test, Y_train, Y_val, Y_test = splits
    print("\nTraining LightGBM with GridSearchCV...")
    search = run_grid_search_lgb(X_train, Y_train)
    with mlflow.start_run(run_name="lightgbm"):
        search.fit(X_train, Y_train)
        winner = search.best_estimator_
        # Refit the winning configuration on train+val before final evaluation.
        if X_val.size:
            fit_X = np.vstack([X_train, X_val])
        else:
            fit_X = X_train
        if Y_val.size:
            fit_Y = np.vstack([Y_train, Y_val])
        else:
            fit_Y = Y_train
        winner.fit(fit_X, fit_Y)
        evaluate_and_log(
            winner,
            X_test,
            Y_test,
            search.best_params_,
            search.best_score_,
            "lightgbm_tfidf_gridsearch",
            {"oversampling": "None", "model": "LightGBM"},
        )
def run_lightgbm_smote_experiment(X, Y):
    """LightGBM experiment with MLSMOTE oversampling (ROS fallback).

    Same oversampling pipeline as run_smote_experiment, but the grid search
    runs over a multi-output LightGBM classifier instead of RandomForest.
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm_smote", "LightGBM_SMOTE"))
    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    # Apply MLSMOTE (Multi-Label SMOTE) as per paper
    print(" Applying MLSMOTE for LightGBM...")
    print(f" Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")
    # Use local MLSMOTE implementation directly (function-based)
    if _HAS_LOCAL_MLSMOTE:
        try:
            # Set random seed (numpy and stdlib random) for reproducible sampling
            if TRAINING_CONFIG["random_state"] is not None:
                np.random.seed(TRAINING_CONFIG["random_state"])
                import random

                random.seed(TRAINING_CONFIG["random_state"])
            # Convert to DataFrame (MLSMOTE function expects DataFrames)
            X_train_df = pd.DataFrame(X_train)
            Y_train_df = pd.DataFrame(Y_train)
            # Get minority instances
            X_min, Y_min = get_minority_instace(X_train_df, Y_train_df)
            if len(X_min) == 0:
                print("No minority instances found, using original dataset")
                X_res, Y_res = X_train, Y_train
                oversampling_method = "None (no minority instances)"
                n_new = 0
            else:
                # Calculate number of synthetic samples (floor 100, cap 3x minority)
                label_counts = Y_train_df.sum(axis=0)
                mean_count = int(label_counts.mean())
                min_count = int(label_counts.min())
                n_synthetic = max(100, int(mean_count - min_count))
                n_synthetic = min(n_synthetic, len(X_min) * 3)
                print(
                    f"Generating {n_synthetic} synthetic samples from {len(X_min)} minority instances"
                )
                # Apply MLSMOTE function directly
                # NOTE(review): assumes mlsmote_function returns the full
                # augmented set (minority + synthetic) — confirm against
                # mlsmote.py (n_new can otherwise be negative).
                X_res_df, Y_res_df = mlsmote_function(X_min, Y_min, n_synthetic)
                # Convert back to numpy
                X_res = X_res_df.values
                Y_res = Y_res_df.values.astype(int)
                oversampling_method = "MLSMOTE (local implementation)"
                n_new = len(X_res) - len(X_train)
                print(
                    f"MLSMOTE completed: {n_new} synthetic samples generated. Total: {len(X_res)} samples"
                )
        except Exception as e:
            # Best-effort fallback: balance joint label combinations instead
            print(f"MLSMOTE failed ({e}); falling back to RandomOverSampler")
            Y_train_str = ["".join(map(str, y)) for y in Y_train]
            ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
            X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
            Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
            oversampling_method = "RandomOverSampler (MLSMOTE fallback)"
            n_new = len(X_res) - len(X_train)
    else:
        print(" Local MLSMOTE not available; falling back to RandomOverSampler")
        Y_train_str = ["".join(map(str, y)) for y in Y_train]
        ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
        X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
        Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
        oversampling_method = "RandomOverSampler (no MLSMOTE)"
        n_new = len(X_res) - len(X_train)
    print(f"\n Training LightGBM with {oversampling_method} ({n_new} synthetic samples)...")
    grid = run_grid_search_lgb(X_res, Y_res)
    with mlflow.start_run(run_name="lightgbm_with_smote"):
        grid.fit(X_res, Y_res)
        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        # Refit on original (non-oversampled) train+val before test evaluation
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)
        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "lightgbm_tfidf_gridsearch_smote",
            {
                "oversampling": oversampling_method,
                "synthetic_samples": n_new,
                "n_labels": Y_train.shape[1],
                "model": "LightGBM",
            },
        )
# =====================================================
# Baseline training (original train.py behavior)
# =====================================================
def run_baseline_train(feature_type="tfidf", use_cleaned=True):
    """Run baseline training with configurable feature type.

    Loads features/labels, performs a stratified 80/20 train/test split,
    prunes labels absent from the training split, grid-searches a
    multi-output RandomForest, evaluates on the test split, and logs the
    model plus metrics to MLflow.

    Args:
        feature_type: 'tfidf' or 'embedding'
        use_cleaned: whether to use cleaned data
    """
    mlflow.set_experiment(
        MLFLOW_CONFIG.get("experiments", {}).get("baseline", "hopcroft_random_forest_baseline")
    )
    X, Y = load_data(feature_type=feature_type, use_cleaned=use_cleaned)
    # Use 80/20 split as per SkillScope paper (no validation set for baseline)
    print(" Using 80/20 train/test split as per paper...")
    X_train, X_test, Y_train, Y_test = stratified_train_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        random_state=TRAINING_CONFIG.get("random_state", 42),
    )
    # Remove labels that have 0 occurrences in training set (after split)
    train_counts = np.sum(Y_train, axis=0).astype(int)
    zero_in_train = np.where(train_counts == 0)[0]
    if zero_in_train.size > 0:
        kept_idx = np.where(train_counts > 0)[0]
        print(
            f"[warning] Removing {zero_in_train.size} label(s) with 0 occurrences in TRAIN set. Example removed indices: {zero_in_train[:10].tolist()}"
        )
        Y_train = Y_train[:, kept_idx]
        Y_test = Y_test[:, kept_idx]
        # Save kept indices for inference (so predictions map back to labels)
        paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
        kept_indices_path = Path(paths["features"]).parent / "kept_label_indices.npy"
        np.save(kept_indices_path, kept_idx)
        print(f"Saved kept label indices to {kept_indices_path}")
    # Now check label coverage (should pass since we removed zero-occurrence labels)
    _check_label_coverage(Y_train, np.empty((0, Y_train.shape[1])))
    base_rf = RandomForestClassifier(
        random_state=TRAINING_CONFIG.get("random_state", 42), n_jobs=-1
    )
    multi = MultiOutputClassifier(base_rf)
    # Use full param_grid from MODEL_CONFIG for optimal results as per paper
    param_grid = MODEL_CONFIG.get(
        "param_grid",
        {
            "estimator__n_estimators": [50, 100, 200],
            "estimator__max_depth": [10, 20, 30],
            "estimator__min_samples_split": [2, 5],
        },
    )
    cv = KFold(
        n_splits=TRAINING_CONFIG.get("cv_folds", 5),
        shuffle=True,
        random_state=TRAINING_CONFIG.get("random_state", 42),
    )
    # NOTE(review): the combination count below assumes param_grid contains
    # exactly these three keys — it raises KeyError for a different grid.
    print(
        f" GridSearch with {cv.n_splits} folds and {len(param_grid['estimator__n_estimators']) * len(param_grid['estimator__max_depth']) * len(param_grid['estimator__min_samples_split'])} combinations..."
    )
    grid = GridSearchCV(
        estimator=multi,
        param_grid=param_grid,
        scoring="f1_micro",
        cv=cv,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
    with mlflow.start_run(run_name="random_forest_tfidf_gridsearch"):
        grid.fit(X_train, Y_train)
        best = grid.best_estimator_
        best_params = grid.best_params_
        best_cv_score = grid.best_score_
        # No need to refit on combined train+val since we don't have a val set
        # Model is already fitted on full training data
        Y_pred_test = best.predict(X_test)
        precision = precision_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        recall = recall_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        f1 = f1_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        mlflow.log_param("model_type", "RandomForest + MultiOutput")
        for k, v in best_params.items():
            mlflow.log_param(k, v)
        mlflow.log_metric("cv_best_f1_micro", best_cv_score)
        mlflow.log_metric("test_precision_micro", precision)
        mlflow.log_metric("test_recall_micro", recall)
        mlflow.log_metric("test_f1_micro", f1)
        mlflow.log_param("feature_type", feature_type)
        mlflow.log_param("use_cleaned", use_cleaned)
        print("\n=== Training Results ===")
        print(f"Test Precision (Micro): {precision:.4f}")
        print(f"Test Recall (Micro): {recall:.4f}")
        print(f"Test F1 Score (Micro): {f1:.4f}")
        print("========================\n")
        # Persist the model and the exact test split for later inference.
        paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
        os.makedirs(paths["models_dir"], exist_ok=True)
        model_path = Path(paths["models_dir"]) / f"random_forest_{feature_type}_gridsearch.pkl"
        joblib.dump(best, model_path)
        np.save(Path(paths["features"]).parent / "X_test.npy", X_test)
        np.save(Path(paths["labels"]).parent / "Y_test.npy", Y_test)
        mlflow.sklearn.log_model(best, "model")
        print("Grid search training completed and logged successfully.")
# =====================================================
# Inference utility (merged from predict.py)
# =====================================================
def run_inference(model_path=None):
    """Evaluate a saved model on the persisted test split and log to MLflow.

    Args:
        model_path: optional path (str or Path) to a joblib-serialized model;
            defaults to the baseline RandomForest under DATA_PATHS['models_dir'].
            (Bug fix: the old annotation `model_path: str = None` contradicted
            its None default; the parameter genuinely accepts None.)
    """
    mlflow.set_experiment(
        MLFLOW_CONFIG.get("experiments", {}).get("inference", "hopcroft_random_forest_inference")
    )
    if model_path is None:
        model_path = Path(DATA_PATHS["models_dir"]) / "random_forest_tfidf_gridsearch.pkl"
    else:
        model_path = Path(model_path)
    model = joblib.load(str(model_path))
    # NOTE(review): X_test/Y_test are written by run_baseline_train next to the
    # get_feature_paths locations; here they are loaded relative to DATA_PATHS —
    # confirm the two resolve to the same directories.
    X_test = np.load(Path(DATA_PATHS["features"]).parent / "X_test.npy")
    Y_test = np.load(Path(DATA_PATHS["labels"]).parent / "Y_test.npy")
    with mlflow.start_run(run_name="random_forest_tfidf_inference"):
        Y_pred = model.predict(X_test)
        precision = precision_score(Y_test, Y_pred, average="micro", zero_division=0)
        recall = recall_score(Y_test, Y_pred, average="micro", zero_division=0)
        f1 = f1_score(Y_test, Y_pred, average="micro", zero_division=0)
        mlflow.log_metric("test_precision_micro", precision)
        mlflow.log_metric("test_recall_micro", recall)
        mlflow.log_metric("test_f1_micro", f1)
        print(f"Inference completed — Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
def _parse_args():
p = argparse.ArgumentParser(description="Unified training & experiments script")
p.add_argument(
"action",
choices=[
"baseline",
"smote",
"ros",
"adasyn_pca",
"lightgbm",
"lightgbm_smote",
"predict",
],
help="Action to run",
)
p.add_argument("--model-path", help="Custom model path for inference")
return p.parse_args()
if __name__ == "__main__":
    args = _parse_args()
    # Baseline has its own load_data logic (removes rare labels after split)
    if args.action == "baseline":
        run_baseline_train(feature_type="tfidf", use_cleaned=True)
    else:
        # Other experiments use the original load_data() logic
        X, Y = load_data(feature_type="tfidf", use_cleaned=True)
        # Dispatch on the requested experiment; 'predict' evaluates a saved model.
        if args.action == "smote":
            run_smote_experiment(X, Y)
        elif args.action == "ros":
            run_ros_experiment(X, Y)
        elif args.action == "adasyn_pca":
            run_adasyn_pca_experiment(X, Y)
        elif args.action == "lightgbm":
            run_lightgbm(X, Y)
        elif args.action == "lightgbm_smote":
            run_lightgbm_smote_experiment(X, Y)
        elif args.action == "predict":
            run_inference(args.model_path)