Spaces:

DaCrow13
/

Hopcroft-Skill-Classification

Running

File size: 31,907 Bytes

225af6a

import argparse
import os
from pathlib import Path

from imblearn.over_sampling import ADASYN, RandomOverSampler
import joblib
import lightgbm as lgb
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.multioutput import MultiOutputClassifier

from hopcroft_skill_classification_tool_competition.config import (
    ADASYN_CONFIG,
    DATA_PATHS,
    MLFLOW_CONFIG,
    MODEL_CONFIG,
    PCA_CONFIG,
    TRAINING_CONFIG,
    get_feature_paths,
)

# Local MLSMOTE implementation (lightweight multi-label oversampling)
try:
    import pandas as pd

    from hopcroft_skill_classification_tool_competition.mlsmote import MLSMOTE as mlsmote_function
    from hopcroft_skill_classification_tool_competition.mlsmote import get_minority_instace

    _HAS_LOCAL_MLSMOTE = True
except Exception:
    mlsmote_function = None
    get_minority_instace = None
    _HAS_LOCAL_MLSMOTE = False
    print("[warning] Local MLSMOTE not available. Check mlsmote.py exists.")


# Prefer multilabel stratified splits for imbalanced multi-label data.
# Use `iterative-stratification` package when available.
try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

    _HAS_MLSTRAT = True
except Exception:
    MultilabelStratifiedShuffleSplit = None
    _HAS_MLSTRAT = False


# -------------------------------
# MLflow authentication and setup
# Load environment variables from .env file (for local dev)
# In Docker, env vars are set via docker-compose env_file
# -------------------------------
from dotenv import load_dotenv

load_dotenv()

_mlflow_env_uri = os.getenv("MLFLOW_TRACKING_URI")
_configured_uri = MLFLOW_CONFIG.get("uri", "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow")

if _mlflow_env_uri:
    mlflow_uri = _mlflow_env_uri
else:
    mlflow_uri = _configured_uri

# If targeting DagsHub, require username/password; otherwise proceed.
if "dagshub.com" in mlflow_uri:
    _username = os.getenv("MLFLOW_TRACKING_USERNAME")
    _password = os.getenv("MLFLOW_TRACKING_PASSWORD")
    if not _username or not _password:
        raise ValueError(
            "Set the environment variables MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD for remote tracking"
        )

mlflow.set_tracking_uri(mlflow_uri)


# =====================================================
# Common utilities (merged from train_experiments.py)
# =====================================================
def load_data(feature_type="tfidf", use_cleaned=True):
    """Load features and labels using get_feature_paths.

    Args:
        feature_type: 'tfidf' or 'embedding'
        use_cleaned: whether to use cleaned data

    Returns:
        X, Y: feature matrix and label matrix
    """
    paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
    X = np.load(paths["features"])
    Y = np.load(paths["labels"])

    print(f"Dataset loaded successfully: {X.shape} samples, {Y.shape} labels")
    print(f"Using feature type: {feature_type}{'_clean' if use_cleaned else ''}")
    return X, Y


def stratified_train_test_split(X, Y, test_size=None, random_state=None, fallback=True):
    """Split X, Y using multilabel stratified shuffle split when possible.

    Args:
        X: np.ndarray features
        Y: np.ndarray multi-label binary matrix (n_samples, n_labels)
        test_size: float or int, forwarded to splitter
        random_state: int
        fallback: if True and multilabel splitter unavailable, use sklearn.train_test_split

    Returns:
        X_train, X_test, Y_train, Y_test
    """
    if _HAS_MLSTRAT:
        if isinstance(test_size, float):
            tst = test_size
        else:
            # default to TRAINING_CONFIG if not provided
            tst = TRAINING_CONFIG.get("test_size", 0.2)

        msss = MultilabelStratifiedShuffleSplit(
            n_splits=1, test_size=tst, random_state=random_state
        )
        train_idx, test_idx = next(msss.split(X, Y))
        return X[train_idx], X[test_idx], Y[train_idx], Y[test_idx]

    if fallback:
        print(
            "[warning] iterative-stratification not available; using standard train_test_split (no multilabel stratification). To enable stratified multilabel splitting install 'iterative-stratification'."
        )
        return train_test_split(X, Y, test_size=test_size, random_state=random_state, shuffle=True)

    raise RuntimeError(
        "iterative-stratification is required for multilabel stratified splitting but not installed."
    )


def stratified_train_val_test_split(
    X, Y, test_size=0.2, val_size=0.1, random_state=None, fallback=True
):
    """Split X, Y into train, val, test with multilabel stratification when possible.

    Args:
        X, Y: arrays
        test_size: proportion for final test set
        val_size: proportion for validation set (relative to whole dataset)
        random_state: seed
        fallback: if True, falls back to sklearn splits

    Returns:
        X_train, X_val, X_test, Y_train, Y_val, Y_test
    """
    if not (0.0 < test_size < 1.0 and 0.0 <= val_size < 1.0 and val_size + test_size < 1.0):
        raise ValueError("test_size and val_size must be fractions in (0,1) and sum < 1")

    # First split off the final test set
    X_rem, X_test, Y_rem, Y_test = stratified_train_test_split(
        X, Y, test_size=test_size, random_state=random_state, fallback=fallback
    )

    # Compute validation size relative to the remaining data
    rel_val = 0.0
    if (1.0 - test_size) > 0:
        rel_val = val_size / (1.0 - test_size)
    else:
        rel_val = 0.0

    if rel_val <= 0:
        # No validation requested
        return X_rem, np.empty((0, X.shape[1])), X_test, Y_rem, np.empty((0, Y.shape[1])), Y_test

    X_train, X_val, Y_train, Y_val = stratified_train_test_split(
        X_rem, Y_rem, test_size=rel_val, random_state=random_state, fallback=fallback
    )

    return X_train, X_val, X_test, Y_train, Y_val, Y_test


def _check_label_coverage(Y_train: np.ndarray, Y_val: np.ndarray, min_train: int = 1):
    """Check that each label appears at least `min_train` times in train and
    at least once in train+val. Prints a warning if some labels are scarce in
    train, and raises an error if some labels are missing entirely from
    train+val (which would make learning impossible for those labels).

    Args:
        Y_train: (n_train, n_labels) binary matrix
        Y_val: (n_val, n_labels) binary matrix (may be empty)
        min_train: minimum occurrences in train to be considered "covered"
    """
    # Defensive: handle empty val
    if Y_val is None:
        Y_val = np.empty((0, Y_train.shape[1]))

    counts_train = np.sum(Y_train, axis=0)
    counts_train_val = counts_train + np.sum(Y_val, axis=0)

    missing_in_train = np.where(counts_train < min_train)[0]
    missing_in_train_val = np.where(counts_train_val == 0)[0]

    if missing_in_train.size > 0:
        # Small, actionable warning for debugging
        preview = missing_in_train[:10].tolist()
        print(
            f"[warning] {missing_in_train.size} label(s) have <{min_train} occurrences in TRAIN. Example label indices: {preview}."
        )

    if missing_in_train_val.size > 0:
        preview = missing_in_train_val[:10].tolist()
        raise ValueError(
            f"{missing_in_train_val.size} label(s) have 0 occurrences in TRAIN+VAL (indices example: {preview}). "
            "Reduce test/val size, aggregate labels, or ensure these labels exist in the source DB."
        )


def evaluate_and_log(model, X_test, Y_test, best_params, cv_score, exp_name, extra_params=None):
    Y_pred = model.predict(X_test)
    precision = precision_score(Y_test, Y_pred, average="micro", zero_division=0)
    recall = recall_score(Y_test, Y_pred, average="micro", zero_division=0)
    f1 = f1_score(Y_test, Y_pred, average="micro", zero_division=0)

    mlflow.log_metrics(
        {
            "cv_best_f1_micro": cv_score,
            "test_precision_micro": precision,
            "test_recall_micro": recall,
            "test_f1_micro": f1,
        }
    )

    for k, v in best_params.items():
        mlflow.log_param(k, v)
    if extra_params:
        for k, v in extra_params.items():
            mlflow.log_param(k, v)

    os.makedirs(DATA_PATHS["models_dir"], exist_ok=True)
    model_path = Path(DATA_PATHS["models_dir"]) / f"{exp_name}.pkl"
    joblib.dump(model, model_path)
    mlflow.log_artifact(str(model_path), artifact_path=f"model_{exp_name}")
    print(f"Model saved to {model_path}")
    print(f"{exp_name} completed and logged successfully.\n")


def run_grid_search(X, Y):
    base_rf = RandomForestClassifier(random_state=TRAINING_CONFIG["random_state"], n_jobs=-1)
    multi = MultiOutputClassifier(base_rf)
    cv = KFold(
        n_splits=TRAINING_CONFIG["cv_folds"],
        shuffle=True,
        random_state=TRAINING_CONFIG["random_state"],
    )
    grid = GridSearchCV(
        estimator=multi,
        param_grid=MODEL_CONFIG["param_grid"],
        scoring="f1_micro",
        cv=cv,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
    return grid


def run_grid_search_lgb(X, Y):
    base_lgb = lgb.LGBMClassifier(
        random_state=TRAINING_CONFIG["random_state"], n_jobs=1, force_row_wise=True, verbose=-1
    )
    multi = MultiOutputClassifier(base_lgb, n_jobs=-1)
    cv = KFold(
        n_splits=TRAINING_CONFIG["cv_folds"],
        shuffle=True,
        random_state=TRAINING_CONFIG["random_state"],
    )
    lgb_param_grid = {
        "estimator__n_estimators": [50, 100, 200],
        "estimator__max_depth": [3, 5, 7],
        "estimator__learning_rate": [0.1],
        "estimator__num_leaves": [15],
    }
    grid = GridSearchCV(
        estimator=multi,
        param_grid=lgb_param_grid,
        scoring="f1_micro",
        cv=cv,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
    return grid


# =====================================================
# Experiments (merged)
# =====================================================
def run_smote_experiment(X, Y, feature_type="tfidf"):
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["smote"])

    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    # Check label coverage and fail early if labels are missing from train+val
    _check_label_coverage(Y_train, Y_val)

    # Apply MLSMOTE (Multi-Label SMOTE) as per paper
    # MLSMOTE handles multi-label classification natively by considering label correlations
    print("Applying MLSMOTE (Multi-Label SMOTE) as per SkillScope paper...")
    print(f"   Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")

    # Use local MLSMOTE implementation directly (function-based)
    if _HAS_LOCAL_MLSMOTE:
        try:
            # Set random seed
            if TRAINING_CONFIG["random_state"] is not None:
                np.random.seed(TRAINING_CONFIG["random_state"])
                import random

                random.seed(TRAINING_CONFIG["random_state"])

            # Convert to DataFrame (MLSMOTE function expects DataFrames)
            X_train_df = pd.DataFrame(X_train)
            Y_train_df = pd.DataFrame(Y_train)

            # Get minority instances
            X_min, Y_min = get_minority_instace(X_train_df, Y_train_df)

            if len(X_min) == 0:
                print("No minority instances found, using original dataset")
                X_res, Y_res = X_train, Y_train
                oversampling_method = "None (no minority instances)"
                n_new = 0
            else:
                # Calculate number of synthetic samples
                label_counts = Y_train_df.sum(axis=0)
                mean_count = int(label_counts.mean())
                min_count = int(label_counts.min())
                n_synthetic = max(100, int(mean_count - min_count))
                n_synthetic = min(n_synthetic, len(X_min) * 3)

                print(
                    f"Generating {n_synthetic} synthetic samples from {len(X_min)} minority instances"
                )

                # Apply MLSMOTE function directly
                X_res_df, Y_res_df = mlsmote_function(X_min, Y_min, n_synthetic)

                # Convert back to numpy
                X_res = X_res_df.values
                Y_res = Y_res_df.values.astype(int)

                oversampling_method = "MLSMOTE (local implementation)"
                n_new = len(X_res) - len(X_train)
                print(
                    f"MLSMOTE completed: {n_new} synthetic samples generated. Total: {len(X_res)} samples"
                )
        except Exception as e:
            print(f"MLSMOTE failed ({e}); falling back to RandomOverSampler")
            Y_train_str = ["".join(map(str, y)) for y in Y_train]
            ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
            X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
            Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
            oversampling_method = "RandomOverSampler (MLSMOTE fallback)"
            n_new = len(X_res) - len(X_train)
    else:
        print("Local MLSMOTE not available; falling back to RandomOverSampler")
        Y_train_str = ["".join(map(str, y)) for y in Y_train]
        ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
        X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
        Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
        oversampling_method = "RandomOverSampler (no MLSMOTE)"
        n_new = len(X_res) - len(X_train)

    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_smote"):
        grid.fit(X_res, Y_res)

        # Refit final model on train + val (use original non-oversampled data for final fit)
        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)

        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            f"random_forest_{feature_type}_gridsearch_smote",
            {
                "oversampling": oversampling_method,
                "synthetic_samples": n_new,
                "n_labels": Y_train.shape[1],
            },
        )


def run_ros_experiment(X, Y):
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["ros"])

    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )

    Y_train_str = ["".join(map(str, y)) for y in Y_train]
    ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
    X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)

    Y.shape[1]
    Y_res = np.array([[int(c) for c in s] for s in Y_res_str])

    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_ros"):
        grid.fit(X_res, Y_res)

        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)

        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "random_forest_tfidf_gridsearch_ros",
            {"oversampling": "RandomOverSampler"},
        )


def run_adasyn_pca_experiment(X, Y):
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["adasyn_pca"])

    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )

    print("Applying PCA before ADASYN...")
    pca = PCA(
        n_components=PCA_CONFIG["variance_retained"], random_state=TRAINING_CONFIG["random_state"]
    )
    X_train_pca = pca.fit_transform(X_train)

    adasyn = ADASYN(
        random_state=TRAINING_CONFIG["random_state"],
        n_neighbors=ADASYN_CONFIG["n_neighbors"],
        sampling_strategy=ADASYN_CONFIG["sampling_strategy"],
    )

    valid_label_idx = next(
        (i for i in range(Y_train.shape[1]) if len(np.unique(Y_train[:, i])) > 1), None
    )

    if valid_label_idx is None:
        X_res, Y_res = X_train, Y_train
        n_new = 0
    else:
        X_res_pca, _ = adasyn.fit_resample(X_train_pca, Y_train[:, valid_label_idx])
        X_res = pca.inverse_transform(X_res_pca)
        n_new = len(X_res) - len(X_train)
        Y_res = np.vstack([Y_train, Y_train[np.random.randint(0, len(Y_train), n_new)]])

    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_adasyn_pca"):
        grid.fit(X_res, Y_res)

        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)

        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "random_forest_tfidf_gridsearch_adasyn_pca",
            {
                "oversampling": "ADASYN + PCA",
                "pca_variance": PCA_CONFIG["variance_retained"],
                "synthetic_samples": n_new,
            },
        )
        pca_path = Path(DATA_PATHS["models_dir"]) / "pca_tfidf_adasyn.pkl"
        joblib.dump(pca, pca_path)
        mlflow.log_artifact(str(pca_path), artifact_path="model_adasyn_pca")


def run_lightgbm(X, Y):
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm", "LightGBM"))

    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )

    print("\nTraining LightGBM with GridSearchCV...")
    grid = run_grid_search_lgb(X_train, Y_train)

    with mlflow.start_run(run_name="lightgbm"):
        grid.fit(X_train, Y_train)

        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)

        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "lightgbm_tfidf_gridsearch",
            {"oversampling": "None", "model": "LightGBM"},
        )


def run_lightgbm_smote_experiment(X, Y):
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm_smote", "LightGBM_SMOTE"))

    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )

    # Apply MLSMOTE (Multi-Label SMOTE) as per paper
    print(" Applying MLSMOTE for LightGBM...")
    print(f"   Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")

    # Use local MLSMOTE implementation directly (function-based)
    if _HAS_LOCAL_MLSMOTE:
        try:
            # Set random seed
            if TRAINING_CONFIG["random_state"] is not None:
                np.random.seed(TRAINING_CONFIG["random_state"])
                import random

                random.seed(TRAINING_CONFIG["random_state"])

            # Convert to DataFrame (MLSMOTE function expects DataFrames)
            X_train_df = pd.DataFrame(X_train)
            Y_train_df = pd.DataFrame(Y_train)

            # Get minority instances
            X_min, Y_min = get_minority_instace(X_train_df, Y_train_df)

            if len(X_min) == 0:
                print("No minority instances found, using original dataset")
                X_res, Y_res = X_train, Y_train
                oversampling_method = "None (no minority instances)"
                n_new = 0
            else:
                # Calculate number of synthetic samples
                label_counts = Y_train_df.sum(axis=0)
                mean_count = int(label_counts.mean())
                min_count = int(label_counts.min())
                n_synthetic = max(100, int(mean_count - min_count))
                n_synthetic = min(n_synthetic, len(X_min) * 3)

                print(
                    f"Generating {n_synthetic} synthetic samples from {len(X_min)} minority instances"
                )

                # Apply MLSMOTE function directly
                X_res_df, Y_res_df = mlsmote_function(X_min, Y_min, n_synthetic)

                # Convert back to numpy
                X_res = X_res_df.values
                Y_res = Y_res_df.values.astype(int)

                oversampling_method = "MLSMOTE (local implementation)"
                n_new = len(X_res) - len(X_train)
                print(
                    f"MLSMOTE completed: {n_new} synthetic samples generated. Total: {len(X_res)} samples"
                )
        except Exception as e:
            print(f"MLSMOTE failed ({e}); falling back to RandomOverSampler")
            Y_train_str = ["".join(map(str, y)) for y in Y_train]
            ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
            X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
            Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
            oversampling_method = "RandomOverSampler (MLSMOTE fallback)"
            n_new = len(X_res) - len(X_train)
    else:
        print("  Local MLSMOTE not available; falling back to RandomOverSampler")
        Y_train_str = ["".join(map(str, y)) for y in Y_train]
        ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
        X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
        Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
        oversampling_method = "RandomOverSampler (no MLSMOTE)"
        n_new = len(X_res) - len(X_train)

    print(f"\n Training LightGBM with {oversampling_method} ({n_new} synthetic samples)...")
    grid = run_grid_search_lgb(X_res, Y_res)

    with mlflow.start_run(run_name="lightgbm_with_smote"):
        grid.fit(X_res, Y_res)

        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)

        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "lightgbm_tfidf_gridsearch_smote",
            {
                "oversampling": oversampling_method,
                "synthetic_samples": n_new,
                "n_labels": Y_train.shape[1],
                "model": "LightGBM",
            },
        )


# =====================================================
# Baseline training (original train.py behavior)
# =====================================================
def run_baseline_train(feature_type="tfidf", use_cleaned=True):
    """Run baseline training with configurable feature type.

    Args:
        feature_type: 'tfidf' or 'embedding'
        use_cleaned: whether to use cleaned data
    """
    mlflow.set_experiment(
        MLFLOW_CONFIG.get("experiments", {}).get("baseline", "hopcroft_random_forest_baseline")
    )

    X, Y = load_data(feature_type=feature_type, use_cleaned=use_cleaned)

    # Use 80/20 split as per SkillScope paper (no validation set for baseline)
    print(" Using 80/20 train/test split as per paper...")
    X_train, X_test, Y_train, Y_test = stratified_train_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        random_state=TRAINING_CONFIG.get("random_state", 42),
    )

    # Remove labels that have 0 occurrences in training set (after split)
    train_counts = np.sum(Y_train, axis=0).astype(int)
    zero_in_train = np.where(train_counts == 0)[0]

    if zero_in_train.size > 0:
        kept_idx = np.where(train_counts > 0)[0]
        print(
            f"[warning] Removing {zero_in_train.size} label(s) with 0 occurrences in TRAIN set. Example removed indices: {zero_in_train[:10].tolist()}"
        )
        Y_train = Y_train[:, kept_idx]
        Y_test = Y_test[:, kept_idx]

        # Save kept indices for inference
        paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
        kept_indices_path = Path(paths["features"]).parent / "kept_label_indices.npy"
        np.save(kept_indices_path, kept_idx)
        print(f"Saved kept label indices to {kept_indices_path}")

    # Now check label coverage (should pass since we removed zero-occurrence labels)
    _check_label_coverage(Y_train, np.empty((0, Y_train.shape[1])))

    base_rf = RandomForestClassifier(
        random_state=TRAINING_CONFIG.get("random_state", 42), n_jobs=-1
    )
    multi = MultiOutputClassifier(base_rf)

    # Use full param_grid from MODEL_CONFIG for optimal results as per paper
    param_grid = MODEL_CONFIG.get(
        "param_grid",
        {
            "estimator__n_estimators": [50, 100, 200],
            "estimator__max_depth": [10, 20, 30],
            "estimator__min_samples_split": [2, 5],
        },
    )

    cv = KFold(
        n_splits=TRAINING_CONFIG.get("cv_folds", 5),
        shuffle=True,
        random_state=TRAINING_CONFIG.get("random_state", 42),
    )

    print(
        f" GridSearch with {cv.n_splits} folds and {len(param_grid['estimator__n_estimators']) * len(param_grid['estimator__max_depth']) * len(param_grid['estimator__min_samples_split'])} combinations..."
    )

    grid = GridSearchCV(
        estimator=multi,
        param_grid=param_grid,
        scoring="f1_micro",
        cv=cv,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )

    with mlflow.start_run(run_name="random_forest_tfidf_gridsearch"):
        grid.fit(X_train, Y_train)

        best = grid.best_estimator_
        best_params = grid.best_params_
        best_cv_score = grid.best_score_

        # No need to refit on combined train+val since we don't have a val set
        # Model is already fitted on full training data

        Y_pred_test = best.predict(X_test)

        precision = precision_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        recall = recall_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        f1 = f1_score(Y_test, Y_pred_test, average="micro", zero_division=0)

        mlflow.log_param("model_type", "RandomForest + MultiOutput")
        for k, v in best_params.items():
            mlflow.log_param(k, v)
        mlflow.log_metric("cv_best_f1_micro", best_cv_score)

        mlflow.log_metric("test_precision_micro", precision)
        mlflow.log_metric("test_recall_micro", recall)
        mlflow.log_metric("test_f1_micro", f1)
        mlflow.log_param("feature_type", feature_type)
        mlflow.log_param("use_cleaned", use_cleaned)

        print("\n=== Training Results ===")
        print(f"Test Precision (Micro): {precision:.4f}")
        print(f"Test Recall (Micro):    {recall:.4f}")
        print(f"Test F1 Score (Micro):  {f1:.4f}")
        print("========================\n")

        paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
        os.makedirs(paths["models_dir"], exist_ok=True)

        model_path = Path(paths["models_dir"]) / f"random_forest_{feature_type}_gridsearch.pkl"
        joblib.dump(best, model_path)

        np.save(Path(paths["features"]).parent / "X_test.npy", X_test)
        np.save(Path(paths["labels"]).parent / "Y_test.npy", Y_test)

        mlflow.sklearn.log_model(best, "model")

    print("Grid search training completed and logged successfully.")


# =====================================================
# Inference utility (merged from predict.py)
# =====================================================
def run_inference(model_path: str = None):
    mlflow.set_experiment(
        MLFLOW_CONFIG.get("experiments", {}).get("inference", "hopcroft_random_forest_inference")
    )

    if model_path is None:
        model_path = Path(DATA_PATHS["models_dir"]) / "random_forest_tfidf_gridsearch.pkl"
    else:
        model_path = Path(model_path)

    model = joblib.load(str(model_path))

    X_test = np.load(Path(DATA_PATHS["features"]).parent / "X_test.npy")
    Y_test = np.load(Path(DATA_PATHS["labels"]).parent / "Y_test.npy")

    with mlflow.start_run(run_name="random_forest_tfidf_inference"):
        Y_pred = model.predict(X_test)

        precision = precision_score(Y_test, Y_pred, average="micro", zero_division=0)
        recall = recall_score(Y_test, Y_pred, average="micro", zero_division=0)
        f1 = f1_score(Y_test, Y_pred, average="micro", zero_division=0)

        mlflow.log_metric("test_precision_micro", precision)
        mlflow.log_metric("test_recall_micro", recall)
        mlflow.log_metric("test_f1_micro", f1)

    print(f"Inference completed — Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")


def _parse_args():
    p = argparse.ArgumentParser(description="Unified training & experiments script")
    p.add_argument(
        "action",
        choices=[
            "baseline",
            "smote",
            "ros",
            "adasyn_pca",
            "lightgbm",
            "lightgbm_smote",
            "predict",
        ],
        help="Action to run",
    )
    p.add_argument("--model-path", help="Custom model path for inference")
    return p.parse_args()


if __name__ == "__main__":
    args = _parse_args()

    # Baseline has its own load_data logic (removes rare labels after split)
    if args.action == "baseline":
        run_baseline_train(feature_type="tfidf", use_cleaned=True)
    else:
        # Other experiments use the original load_data() logic
        X, Y = load_data(feature_type="tfidf", use_cleaned=True)

        if args.action == "smote":
            run_smote_experiment(X, Y)
        elif args.action == "ros":
            run_ros_experiment(X, Y)
        elif args.action == "adasyn_pca":
            run_adasyn_pca_experiment(X, Y)
        elif args.action == "lightgbm":
            run_lightgbm(X, Y)
        elif args.action == "lightgbm_smote":
            run_lightgbm_smote_experiment(X, Y)
        elif args.action == "predict":
            run_inference(args.model_path)