# NOTE: removed stray repository-browser residue that had been pasted at the
# top of this file (author "DaCrow13", commit message "Deploy to HF Spaces
# (Clean)", hash 225af6a) — those lines were not valid Python.
import argparse
import os
from pathlib import Path
from imblearn.over_sampling import ADASYN, RandomOverSampler
import joblib
import lightgbm as lgb
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.multioutput import MultiOutputClassifier
from hopcroft_skill_classification_tool_competition.config import (
ADASYN_CONFIG,
DATA_PATHS,
MLFLOW_CONFIG,
MODEL_CONFIG,
PCA_CONFIG,
TRAINING_CONFIG,
get_feature_paths,
)
# Local MLSMOTE implementation (lightweight multi-label oversampling).
# Imported lazily behind a flag so the rest of the script still works when
# the optional module (or pandas) is missing.
try:
    import pandas as pd
    from hopcroft_skill_classification_tool_competition.mlsmote import MLSMOTE as mlsmote_function
    from hopcroft_skill_classification_tool_competition.mlsmote import get_minority_instace
    _HAS_LOCAL_MLSMOTE = True
except Exception:
    # Degrade gracefully: experiments fall back to RandomOverSampler when
    # MLSMOTE cannot be imported (the flag is checked at each call site).
    mlsmote_function = None
    get_minority_instace = None
    _HAS_LOCAL_MLSMOTE = False
    print("[warning] Local MLSMOTE not available. Check mlsmote.py exists.")
# Prefer multilabel stratified splits for imbalanced multi-label data.
# Use `iterative-stratification` package when available.
try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
    _HAS_MLSTRAT = True
except Exception:
    # Optional dependency: when absent, stratified_train_test_split falls
    # back to sklearn's train_test_split (no multilabel stratification).
    MultilabelStratifiedShuffleSplit = None
    _HAS_MLSTRAT = False
# -------------------------------
# MLflow authentication and setup
# Load environment variables from .env file (for local dev)
# In Docker, env vars are set via docker-compose env_file
# -------------------------------
from dotenv import load_dotenv

load_dotenv()
# The MLFLOW_TRACKING_URI environment variable takes precedence over the
# URI configured in MLFLOW_CONFIG.
_mlflow_env_uri = os.getenv("MLFLOW_TRACKING_URI")
_configured_uri = MLFLOW_CONFIG.get("uri", "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow")
if _mlflow_env_uri:
    mlflow_uri = _mlflow_env_uri
else:
    mlflow_uri = _configured_uri
# If targeting DagsHub, require username/password; otherwise proceed.
if "dagshub.com" in mlflow_uri:
    _username = os.getenv("MLFLOW_TRACKING_USERNAME")
    _password = os.getenv("MLFLOW_TRACKING_PASSWORD")
    if not _username or not _password:
        # Fail fast at import time rather than mid-run with an auth error.
        raise ValueError(
            "Set the environment variables MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD for remote tracking"
        )
mlflow.set_tracking_uri(mlflow_uri)
# =====================================================
# Common utilities (merged from train_experiments.py)
# =====================================================
def load_data(feature_type="tfidf", use_cleaned=True):
    """Load the feature matrix and label matrix from disk.

    Args:
        feature_type: 'tfidf' or 'embedding'
        use_cleaned: whether to use cleaned data

    Returns:
        X, Y: feature matrix and label matrix
    """
    # Resolve the on-disk locations for this feature configuration.
    feature_paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
    X, Y = np.load(feature_paths["features"]), np.load(feature_paths["labels"])
    suffix = "_clean" if use_cleaned else ""
    print(f"Dataset loaded successfully: {X.shape} samples, {Y.shape} labels")
    print(f"Using feature type: {feature_type}{suffix}")
    return X, Y
def stratified_train_test_split(X, Y, test_size=None, random_state=None, fallback=True):
    """Split X, Y using multilabel stratified shuffle split when possible.

    Args:
        X: np.ndarray features
        Y: np.ndarray multi-label binary matrix (n_samples, n_labels)
        test_size: float fraction or int sample count, forwarded to the splitter;
            defaults to TRAINING_CONFIG["test_size"] when None
        random_state: int seed
        fallback: if True and multilabel splitter unavailable, use sklearn.train_test_split

    Returns:
        X_train, X_test, Y_train, Y_test

    Raises:
        RuntimeError: if iterative-stratification is unavailable and fallback is False.
    """
    if _HAS_MLSTRAT:
        # Bug fix: previously only float values were honored — an int test_size
        # (absolute sample count) was silently replaced by the configured
        # default even though the docstring promised to forward it. Forward
        # any explicit value; fall back to config only when None.
        if test_size is None:
            tst = TRAINING_CONFIG.get("test_size", 0.2)
        else:
            tst = test_size
        msss = MultilabelStratifiedShuffleSplit(
            n_splits=1, test_size=tst, random_state=random_state
        )
        train_idx, test_idx = next(msss.split(X, Y))
        return X[train_idx], X[test_idx], Y[train_idx], Y[test_idx]
    if fallback:
        print(
            "[warning] iterative-stratification not available; using standard train_test_split (no multilabel stratification). To enable stratified multilabel splitting install 'iterative-stratification'."
        )
        return train_test_split(X, Y, test_size=test_size, random_state=random_state, shuffle=True)
    raise RuntimeError(
        "iterative-stratification is required for multilabel stratified splitting but not installed."
    )
def stratified_train_val_test_split(
    X, Y, test_size=0.2, val_size=0.1, random_state=None, fallback=True
):
    """Three-way split (train/val/test) with multilabel stratification when possible.

    Args:
        X, Y: arrays
        test_size: proportion for final test set
        val_size: proportion for validation set (relative to whole dataset)
        random_state: seed
        fallback: if True, falls back to sklearn splits

    Returns:
        X_train, X_val, X_test, Y_train, Y_val, Y_test

    Raises:
        ValueError: if the requested fractions are out of range or sum >= 1.
    """
    sizes_ok = 0.0 < test_size < 1.0 and 0.0 <= val_size < 1.0 and (val_size + test_size) < 1.0
    if not sizes_ok:
        raise ValueError("test_size and val_size must be fractions in (0,1) and sum < 1")
    # Carve out the held-out test set first.
    X_rem, X_test, Y_rem, Y_test = stratified_train_test_split(
        X, Y, test_size=test_size, random_state=random_state, fallback=fallback
    )
    # Express the validation fraction relative to what remains after the test split.
    remaining = 1.0 - test_size
    rel_val = val_size / remaining if remaining > 0 else 0.0
    if rel_val <= 0:
        # Validation disabled: return empty val arrays with matching widths.
        empty_X = np.empty((0, X.shape[1]))
        empty_Y = np.empty((0, Y.shape[1]))
        return X_rem, empty_X, X_test, Y_rem, empty_Y, Y_test
    X_train, X_val, Y_train, Y_val = stratified_train_test_split(
        X_rem, Y_rem, test_size=rel_val, random_state=random_state, fallback=fallback
    )
    return X_train, X_val, X_test, Y_train, Y_val, Y_test
def _check_label_coverage(Y_train: np.ndarray, Y_val: np.ndarray, min_train: int = 1):
"""Check that each label appears at least `min_train` times in train and
at least once in train+val. Prints a warning if some labels are scarce in
train, and raises an error if some labels are missing entirely from
train+val (which would make learning impossible for those labels).
Args:
Y_train: (n_train, n_labels) binary matrix
Y_val: (n_val, n_labels) binary matrix (may be empty)
min_train: minimum occurrences in train to be considered "covered"
"""
# Defensive: handle empty val
if Y_val is None:
Y_val = np.empty((0, Y_train.shape[1]))
counts_train = np.sum(Y_train, axis=0)
counts_train_val = counts_train + np.sum(Y_val, axis=0)
missing_in_train = np.where(counts_train < min_train)[0]
missing_in_train_val = np.where(counts_train_val == 0)[0]
if missing_in_train.size > 0:
# Small, actionable warning for debugging
preview = missing_in_train[:10].tolist()
print(
f"[warning] {missing_in_train.size} label(s) have <{min_train} occurrences in TRAIN. Example label indices: {preview}."
)
if missing_in_train_val.size > 0:
preview = missing_in_train_val[:10].tolist()
raise ValueError(
f"{missing_in_train_val.size} label(s) have 0 occurrences in TRAIN+VAL (indices example: {preview}). "
"Reduce test/val size, aggregate labels, or ensure these labels exist in the source DB."
)
def evaluate_and_log(model, X_test, Y_test, best_params, cv_score, exp_name, extra_params=None):
    """Evaluate a fitted model on the test split, log to MLflow, and persist it.

    Logs micro-averaged precision/recall/F1 plus the CV score, records the
    best grid-search parameters (and any extras), then dumps the model under
    DATA_PATHS['models_dir'] and attaches it as an MLflow artifact.
    """
    predictions = model.predict(X_test)
    metrics = {
        "cv_best_f1_micro": cv_score,
        "test_precision_micro": precision_score(Y_test, predictions, average="micro", zero_division=0),
        "test_recall_micro": recall_score(Y_test, predictions, average="micro", zero_division=0),
        "test_f1_micro": f1_score(Y_test, predictions, average="micro", zero_division=0),
    }
    mlflow.log_metrics(metrics)
    # Grid-search winners first, then any caller-supplied extras.
    for key, value in best_params.items():
        mlflow.log_param(key, value)
    if extra_params:
        for key, value in extra_params.items():
            mlflow.log_param(key, value)
    os.makedirs(DATA_PATHS["models_dir"], exist_ok=True)
    model_path = Path(DATA_PATHS["models_dir"]) / f"{exp_name}.pkl"
    joblib.dump(model, model_path)
    mlflow.log_artifact(str(model_path), artifact_path=f"model_{exp_name}")
    print(f"Model saved to {model_path}")
    print(f"{exp_name} completed and logged successfully.\n")
def run_grid_search(X, Y):
    """Build an (unfitted) GridSearchCV over a multi-output RandomForest.

    Note: X and Y are accepted for interface symmetry but are not used here;
    the caller fits the returned grid itself.
    """
    forest = RandomForestClassifier(random_state=TRAINING_CONFIG["random_state"], n_jobs=-1)
    splitter = KFold(
        n_splits=TRAINING_CONFIG["cv_folds"],
        shuffle=True,
        random_state=TRAINING_CONFIG["random_state"],
    )
    return GridSearchCV(
        estimator=MultiOutputClassifier(forest),
        param_grid=MODEL_CONFIG["param_grid"],
        scoring="f1_micro",
        cv=splitter,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
def run_grid_search_lgb(X, Y):
    """Build an (unfitted) GridSearchCV over a multi-output LightGBM classifier.

    Note: X and Y are accepted for interface symmetry but are not used here;
    the caller fits the returned grid itself.
    """
    booster = lgb.LGBMClassifier(
        random_state=TRAINING_CONFIG["random_state"], n_jobs=1, force_row_wise=True, verbose=-1
    )
    splitter = KFold(
        n_splits=TRAINING_CONFIG["cv_folds"],
        shuffle=True,
        random_state=TRAINING_CONFIG["random_state"],
    )
    # Small fixed grid: only n_estimators and max_depth vary; the learning
    # rate and leaf count are pinned to keep the search fast.
    search_space = {
        "estimator__n_estimators": [50, 100, 200],
        "estimator__max_depth": [3, 5, 7],
        "estimator__learning_rate": [0.1],
        "estimator__num_leaves": [15],
    }
    return GridSearchCV(
        estimator=MultiOutputClassifier(booster, n_jobs=-1),
        param_grid=search_space,
        scoring="f1_micro",
        cv=splitter,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
# =====================================================
# Experiments (merged)
# =====================================================
def run_smote_experiment(X, Y, feature_type="tfidf"):
    """RandomForest experiment with MLSMOTE oversampling (ROS fallback).

    Oversamples the training split, grid-searches on the oversampled data,
    then refits the best estimator on the original (non-oversampled) train+val
    before evaluating on the held-out test set and logging to MLflow.

    Args:
        X: feature matrix (n_samples, n_features)
        Y: binary label matrix (n_samples, n_labels)
        feature_type: tag used in the logged model name
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["smote"])
    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    # Check label coverage and fail early if labels are missing from train+val
    _check_label_coverage(Y_train, Y_val)
    # Apply MLSMOTE (Multi-Label SMOTE) as per paper
    # MLSMOTE handles multi-label classification natively by considering label correlations
    print("Applying MLSMOTE (Multi-Label SMOTE) as per SkillScope paper...")
    print(f" Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")
    # Use local MLSMOTE implementation directly (function-based)
    if _HAS_LOCAL_MLSMOTE:
        try:
            # Set random seed (numpy and stdlib random) for reproducible sampling
            if TRAINING_CONFIG["random_state"] is not None:
                np.random.seed(TRAINING_CONFIG["random_state"])
                import random

                random.seed(TRAINING_CONFIG["random_state"])
            # Convert to DataFrame (MLSMOTE function expects DataFrames)
            X_train_df = pd.DataFrame(X_train)
            Y_train_df = pd.DataFrame(Y_train)
            # Get minority instances
            X_min, Y_min = get_minority_instace(X_train_df, Y_train_df)
            if len(X_min) == 0:
                print("No minority instances found, using original dataset")
                X_res, Y_res = X_train, Y_train
                oversampling_method = "None (no minority instances)"
                n_new = 0
            else:
                # Calculate number of synthetic samples: lift the rarest labels
                # toward the mean count, floored at 100 and capped at 3x the
                # number of minority instances.
                label_counts = Y_train_df.sum(axis=0)
                mean_count = int(label_counts.mean())
                min_count = int(label_counts.min())
                n_synthetic = max(100, int(mean_count - min_count))
                n_synthetic = min(n_synthetic, len(X_min) * 3)
                print(
                    f"Generating {n_synthetic} synthetic samples from {len(X_min)} minority instances"
                )
                # Apply MLSMOTE function directly
                # NOTE(review): assumes mlsmote_function returns the full
                # augmented set (minority + synthetic) and that the grid search
                # below is intended to run on that set only — confirm against
                # mlsmote.py (n_new can otherwise be negative).
                X_res_df, Y_res_df = mlsmote_function(X_min, Y_min, n_synthetic)
                # Convert back to numpy
                X_res = X_res_df.values
                Y_res = Y_res_df.values.astype(int)
                oversampling_method = "MLSMOTE (local implementation)"
                n_new = len(X_res) - len(X_train)
                print(
                    f"MLSMOTE completed: {n_new} synthetic samples generated. Total: {len(X_res)} samples"
                )
        except Exception as e:
            # Best-effort fallback: balance joint label combinations instead
            print(f"MLSMOTE failed ({e}); falling back to RandomOverSampler")
            Y_train_str = ["".join(map(str, y)) for y in Y_train]
            ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
            X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
            Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
            oversampling_method = "RandomOverSampler (MLSMOTE fallback)"
            n_new = len(X_res) - len(X_train)
    else:
        print("Local MLSMOTE not available; falling back to RandomOverSampler")
        Y_train_str = ["".join(map(str, y)) for y in Y_train]
        ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
        X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
        Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
        oversampling_method = "RandomOverSampler (no MLSMOTE)"
        n_new = len(X_res) - len(X_train)
    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_smote"):
        grid.fit(X_res, Y_res)
        # Refit final model on train + val (use original non-oversampled data for final fit)
        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)
        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            f"random_forest_{feature_type}_gridsearch_smote",
            {
                "oversampling": oversampling_method,
                "synthetic_samples": n_new,
                "n_labels": Y_train.shape[1],
            },
        )
def run_ros_experiment(X, Y):
    """RandomForest experiment with RandomOverSampler on joined label strings.

    Each multi-label row is serialized to a bit-string so the (single-label)
    RandomOverSampler can balance joint label combinations; resampled rows
    are decoded back to a binary matrix afterwards. The best grid-search
    estimator is refit on the original train+val before test evaluation.
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["ros"])
    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    # Encode each label vector as a string key so ROS treats every distinct
    # label combination as a single class.
    Y_train_str = ["".join(map(str, y)) for y in Y_train]
    ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
    X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
    # (Bug fix: removed a leftover no-op `Y.shape[1]` expression statement here.)
    Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_ros"):
        grid.fit(X_res, Y_res)
        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        # Refit the selected model on original (non-oversampled) train+val.
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)
        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "random_forest_tfidf_gridsearch_ros",
            {"oversampling": "RandomOverSampler"},
        )
def run_adasyn_pca_experiment(X, Y):
    """RandomForest experiment with PCA-compressed features and ADASYN oversampling.

    PCA is fitted on the training split, ADASYN oversamples in PCA space using
    a single label column as the class signal, and the synthetic rows are
    projected back to the original feature space before grid search.
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["adasyn_pca"])
    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    print("Applying PCA before ADASYN...")
    pca = PCA(
        n_components=PCA_CONFIG["variance_retained"], random_state=TRAINING_CONFIG["random_state"]
    )
    X_train_pca = pca.fit_transform(X_train)
    adasyn = ADASYN(
        random_state=TRAINING_CONFIG["random_state"],
        n_neighbors=ADASYN_CONFIG["n_neighbors"],
        sampling_strategy=ADASYN_CONFIG["sampling_strategy"],
    )
    # ADASYN is single-label: pick the first label column that has both
    # classes present to drive the oversampling.
    valid_label_idx = next(
        (i for i in range(Y_train.shape[1]) if len(np.unique(Y_train[:, i])) > 1), None
    )
    if valid_label_idx is None:
        # Every label column is constant; nothing to oversample.
        X_res, Y_res = X_train, Y_train
        n_new = 0
    else:
        X_res_pca, _ = adasyn.fit_resample(X_train_pca, Y_train[:, valid_label_idx])
        X_res = pca.inverse_transform(X_res_pca)
        n_new = len(X_res) - len(X_train)
        # NOTE(review): synthetic rows receive label vectors copied from
        # randomly chosen training rows, not labels derived from their
        # features — confirm this approximation is intended.
        Y_res = np.vstack([Y_train, Y_train[np.random.randint(0, len(Y_train), n_new)]])
    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_adasyn_pca"):
        grid.fit(X_res, Y_res)
        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        # Refit on original (non-PCA, non-oversampled) train+val data.
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)
        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "random_forest_tfidf_gridsearch_adasyn_pca",
            {
                "oversampling": "ADASYN + PCA",
                "pca_variance": PCA_CONFIG["variance_retained"],
                "synthetic_samples": n_new,
            },
        )
        # Persist the fitted PCA so inference can reproduce the projection.
        pca_path = Path(DATA_PATHS["models_dir"]) / "pca_tfidf_adasyn.pkl"
        joblib.dump(pca, pca_path)
        mlflow.log_artifact(str(pca_path), artifact_path="model_adasyn_pca")
def run_lightgbm(X, Y):
    """Train a multi-output LightGBM via grid search and log results to MLflow.

    No oversampling is applied; the best estimator from the search is refit
    on train+val before evaluation on the held-out test split.
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm", "LightGBM"))
    # Three-way split: train / val / test
    splits = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    X_train, X_val, X_test, Y_train, Y_val, Y_test = splits
    print("\nTraining LightGBM with GridSearchCV...")
    search = run_grid_search_lgb(X_train, Y_train)
    with mlflow.start_run(run_name="lightgbm"):
        search.fit(X_train, Y_train)
        winner = search.best_estimator_
        # Refit the winning configuration on train+val before final evaluation.
        if X_val.size:
            fit_X = np.vstack([X_train, X_val])
        else:
            fit_X = X_train
        if Y_val.size:
            fit_Y = np.vstack([Y_train, Y_val])
        else:
            fit_Y = Y_train
        winner.fit(fit_X, fit_Y)
        evaluate_and_log(
            winner,
            X_test,
            Y_test,
            search.best_params_,
            search.best_score_,
            "lightgbm_tfidf_gridsearch",
            {"oversampling": "None", "model": "LightGBM"},
        )
def run_lightgbm_smote_experiment(X, Y):
    """LightGBM experiment with MLSMOTE oversampling (ROS fallback).

    Same oversampling pipeline as run_smote_experiment, but the grid search
    runs over a multi-output LightGBM classifier instead of RandomForest.
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm_smote", "LightGBM_SMOTE"))
    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    # Apply MLSMOTE (Multi-Label SMOTE) as per paper
    print(" Applying MLSMOTE for LightGBM...")
    print(f" Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")
    # Use local MLSMOTE implementation directly (function-based)
    if _HAS_LOCAL_MLSMOTE:
        try:
            # Set random seed (numpy and stdlib random) for reproducible sampling
            if TRAINING_CONFIG["random_state"] is not None:
                np.random.seed(TRAINING_CONFIG["random_state"])
                import random

                random.seed(TRAINING_CONFIG["random_state"])
            # Convert to DataFrame (MLSMOTE function expects DataFrames)
            X_train_df = pd.DataFrame(X_train)
            Y_train_df = pd.DataFrame(Y_train)
            # Get minority instances
            X_min, Y_min = get_minority_instace(X_train_df, Y_train_df)
            if len(X_min) == 0:
                print("No minority instances found, using original dataset")
                X_res, Y_res = X_train, Y_train
                oversampling_method = "None (no minority instances)"
                n_new = 0
            else:
                # Calculate number of synthetic samples (floor 100, cap 3x minority)
                label_counts = Y_train_df.sum(axis=0)
                mean_count = int(label_counts.mean())
                min_count = int(label_counts.min())
                n_synthetic = max(100, int(mean_count - min_count))
                n_synthetic = min(n_synthetic, len(X_min) * 3)
                print(
                    f"Generating {n_synthetic} synthetic samples from {len(X_min)} minority instances"
                )
                # Apply MLSMOTE function directly
                # NOTE(review): assumes mlsmote_function returns the full
                # augmented set (minority + synthetic) — confirm against
                # mlsmote.py (n_new can otherwise be negative).
                X_res_df, Y_res_df = mlsmote_function(X_min, Y_min, n_synthetic)
                # Convert back to numpy
                X_res = X_res_df.values
                Y_res = Y_res_df.values.astype(int)
                oversampling_method = "MLSMOTE (local implementation)"
                n_new = len(X_res) - len(X_train)
                print(
                    f"MLSMOTE completed: {n_new} synthetic samples generated. Total: {len(X_res)} samples"
                )
        except Exception as e:
            # Best-effort fallback: balance joint label combinations instead
            print(f"MLSMOTE failed ({e}); falling back to RandomOverSampler")
            Y_train_str = ["".join(map(str, y)) for y in Y_train]
            ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
            X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
            Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
            oversampling_method = "RandomOverSampler (MLSMOTE fallback)"
            n_new = len(X_res) - len(X_train)
    else:
        print(" Local MLSMOTE not available; falling back to RandomOverSampler")
        Y_train_str = ["".join(map(str, y)) for y in Y_train]
        ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
        X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
        Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
        oversampling_method = "RandomOverSampler (no MLSMOTE)"
        n_new = len(X_res) - len(X_train)
    print(f"\n Training LightGBM with {oversampling_method} ({n_new} synthetic samples)...")
    grid = run_grid_search_lgb(X_res, Y_res)
    with mlflow.start_run(run_name="lightgbm_with_smote"):
        grid.fit(X_res, Y_res)
        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        # Refit on original (non-oversampled) train+val before test evaluation
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)
        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "lightgbm_tfidf_gridsearch_smote",
            {
                "oversampling": oversampling_method,
                "synthetic_samples": n_new,
                "n_labels": Y_train.shape[1],
                "model": "LightGBM",
            },
        )
# =====================================================
# Baseline training (original train.py behavior)
# =====================================================
def run_baseline_train(feature_type="tfidf", use_cleaned=True):
    """Run baseline training with configurable feature type.

    Loads features/labels, performs a stratified 80/20 train/test split,
    prunes labels absent from the training split, grid-searches a
    multi-output RandomForest, evaluates on the test split, and logs the
    model plus metrics to MLflow.

    Args:
        feature_type: 'tfidf' or 'embedding'
        use_cleaned: whether to use cleaned data
    """
    mlflow.set_experiment(
        MLFLOW_CONFIG.get("experiments", {}).get("baseline", "hopcroft_random_forest_baseline")
    )
    X, Y = load_data(feature_type=feature_type, use_cleaned=use_cleaned)
    # Use 80/20 split as per SkillScope paper (no validation set for baseline)
    print(" Using 80/20 train/test split as per paper...")
    X_train, X_test, Y_train, Y_test = stratified_train_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        random_state=TRAINING_CONFIG.get("random_state", 42),
    )
    # Remove labels that have 0 occurrences in training set (after split)
    train_counts = np.sum(Y_train, axis=0).astype(int)
    zero_in_train = np.where(train_counts == 0)[0]
    if zero_in_train.size > 0:
        kept_idx = np.where(train_counts > 0)[0]
        print(
            f"[warning] Removing {zero_in_train.size} label(s) with 0 occurrences in TRAIN set. Example removed indices: {zero_in_train[:10].tolist()}"
        )
        Y_train = Y_train[:, kept_idx]
        Y_test = Y_test[:, kept_idx]
        # Save kept indices for inference (so predictions map back to labels)
        paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
        kept_indices_path = Path(paths["features"]).parent / "kept_label_indices.npy"
        np.save(kept_indices_path, kept_idx)
        print(f"Saved kept label indices to {kept_indices_path}")
    # Now check label coverage (should pass since we removed zero-occurrence labels)
    _check_label_coverage(Y_train, np.empty((0, Y_train.shape[1])))
    base_rf = RandomForestClassifier(
        random_state=TRAINING_CONFIG.get("random_state", 42), n_jobs=-1
    )
    multi = MultiOutputClassifier(base_rf)
    # Use full param_grid from MODEL_CONFIG for optimal results as per paper
    param_grid = MODEL_CONFIG.get(
        "param_grid",
        {
            "estimator__n_estimators": [50, 100, 200],
            "estimator__max_depth": [10, 20, 30],
            "estimator__min_samples_split": [2, 5],
        },
    )
    cv = KFold(
        n_splits=TRAINING_CONFIG.get("cv_folds", 5),
        shuffle=True,
        random_state=TRAINING_CONFIG.get("random_state", 42),
    )
    # NOTE(review): the combination count below assumes param_grid contains
    # exactly these three keys — it raises KeyError for a different grid.
    print(
        f" GridSearch with {cv.n_splits} folds and {len(param_grid['estimator__n_estimators']) * len(param_grid['estimator__max_depth']) * len(param_grid['estimator__min_samples_split'])} combinations..."
    )
    grid = GridSearchCV(
        estimator=multi,
        param_grid=param_grid,
        scoring="f1_micro",
        cv=cv,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
    with mlflow.start_run(run_name="random_forest_tfidf_gridsearch"):
        grid.fit(X_train, Y_train)
        best = grid.best_estimator_
        best_params = grid.best_params_
        best_cv_score = grid.best_score_
        # No need to refit on combined train+val since we don't have a val set
        # Model is already fitted on full training data
        Y_pred_test = best.predict(X_test)
        precision = precision_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        recall = recall_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        f1 = f1_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        mlflow.log_param("model_type", "RandomForest + MultiOutput")
        for k, v in best_params.items():
            mlflow.log_param(k, v)
        mlflow.log_metric("cv_best_f1_micro", best_cv_score)
        mlflow.log_metric("test_precision_micro", precision)
        mlflow.log_metric("test_recall_micro", recall)
        mlflow.log_metric("test_f1_micro", f1)
        mlflow.log_param("feature_type", feature_type)
        mlflow.log_param("use_cleaned", use_cleaned)
        print("\n=== Training Results ===")
        print(f"Test Precision (Micro): {precision:.4f}")
        print(f"Test Recall (Micro): {recall:.4f}")
        print(f"Test F1 Score (Micro): {f1:.4f}")
        print("========================\n")
        # Persist the model and the exact test split for later inference.
        paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
        os.makedirs(paths["models_dir"], exist_ok=True)
        model_path = Path(paths["models_dir"]) / f"random_forest_{feature_type}_gridsearch.pkl"
        joblib.dump(best, model_path)
        np.save(Path(paths["features"]).parent / "X_test.npy", X_test)
        np.save(Path(paths["labels"]).parent / "Y_test.npy", Y_test)
        mlflow.sklearn.log_model(best, "model")
        print("Grid search training completed and logged successfully.")
# =====================================================
# Inference utility (merged from predict.py)
# =====================================================
def run_inference(model_path=None):
    """Evaluate a saved model on the persisted test split and log to MLflow.

    Args:
        model_path: optional path (str or Path) to a joblib-serialized model;
            defaults to the baseline RandomForest under DATA_PATHS['models_dir'].
            (Bug fix: the old annotation `model_path: str = None` contradicted
            its None default; the parameter genuinely accepts None.)
    """
    mlflow.set_experiment(
        MLFLOW_CONFIG.get("experiments", {}).get("inference", "hopcroft_random_forest_inference")
    )
    if model_path is None:
        model_path = Path(DATA_PATHS["models_dir"]) / "random_forest_tfidf_gridsearch.pkl"
    else:
        model_path = Path(model_path)
    model = joblib.load(str(model_path))
    # NOTE(review): X_test/Y_test are written by run_baseline_train next to the
    # get_feature_paths locations; here they are loaded relative to DATA_PATHS —
    # confirm the two resolve to the same directories.
    X_test = np.load(Path(DATA_PATHS["features"]).parent / "X_test.npy")
    Y_test = np.load(Path(DATA_PATHS["labels"]).parent / "Y_test.npy")
    with mlflow.start_run(run_name="random_forest_tfidf_inference"):
        Y_pred = model.predict(X_test)
        precision = precision_score(Y_test, Y_pred, average="micro", zero_division=0)
        recall = recall_score(Y_test, Y_pred, average="micro", zero_division=0)
        f1 = f1_score(Y_test, Y_pred, average="micro", zero_division=0)
        mlflow.log_metric("test_precision_micro", precision)
        mlflow.log_metric("test_recall_micro", recall)
        mlflow.log_metric("test_f1_micro", f1)
        print(f"Inference completed — Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
def _parse_args():
p = argparse.ArgumentParser(description="Unified training & experiments script")
p.add_argument(
"action",
choices=[
"baseline",
"smote",
"ros",
"adasyn_pca",
"lightgbm",
"lightgbm_smote",
"predict",
],
help="Action to run",
)
p.add_argument("--model-path", help="Custom model path for inference")
return p.parse_args()
if __name__ == "__main__":
    args = _parse_args()
    # Baseline has its own load_data logic (removes rare labels after split)
    if args.action == "baseline":
        run_baseline_train(feature_type="tfidf", use_cleaned=True)
    else:
        # Other experiments use the original load_data() logic
        X, Y = load_data(feature_type="tfidf", use_cleaned=True)
        # Dispatch on the requested experiment; 'predict' evaluates a saved model.
        if args.action == "smote":
            run_smote_experiment(X, Y)
        elif args.action == "ros":
            run_ros_experiment(X, Y)
        elif args.action == "adasyn_pca":
            run_adasyn_pca_experiment(X, Y)
        elif args.action == "lightgbm":
            run_lightgbm(X, Y)
        elif args.action == "lightgbm_smote":
            run_lightgbm_smote_experiment(X, Y)
        elif args.action == "predict":
            run_inference(args.model_path)