import argparse
import os
from pathlib import Path
import random

from imblearn.over_sampling import ADASYN, RandomOverSampler
import joblib
import lightgbm as lgb
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import NearestNeighbors

from hopcroft_skill_classification_tool_competition.config import (
    ADASYN_CONFIG,
    DATA_PATHS,
    MLFLOW_CONFIG,
    MODEL_CONFIG,
    PCA_CONFIG,
    TRAINING_CONFIG,
    get_feature_paths,
)

# Local MLSMOTE implementation (lightweight multi-label oversampling)
try:
    import pandas as pd

    from hopcroft_skill_classification_tool_competition.mlsmote import MLSMOTE as mlsmote_function
    from hopcroft_skill_classification_tool_competition.mlsmote import get_minority_instace

    _HAS_LOCAL_MLSMOTE = True
except Exception:
    mlsmote_function = None
    get_minority_instace = None
    _HAS_LOCAL_MLSMOTE = False
    print("[warning] Local MLSMOTE not available. Check mlsmote.py exists.")

# Prefer multilabel stratified splits for imbalanced multi-label data.
# Use `iterative-stratification` package when available.
try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

    _HAS_MLSTRAT = True
except Exception:
    MultilabelStratifiedShuffleSplit = None
    _HAS_MLSTRAT = False

# -------------------------------
# MLflow authentication and setup
# Load environment variables from .env file (for local dev)
# In Docker, env vars are set via docker-compose env_file
# -------------------------------
from dotenv import load_dotenv

load_dotenv()

_mlflow_env_uri = os.getenv("MLFLOW_TRACKING_URI")
_configured_uri = MLFLOW_CONFIG.get("uri", "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow")

# A non-empty MLFLOW_TRACKING_URI environment variable wins over the configured URI.
mlflow_uri = _mlflow_env_uri if _mlflow_env_uri else _configured_uri

# If targeting DagsHub, require username/password; otherwise proceed.
if "dagshub.com" in mlflow_uri:
    _username = os.getenv("MLFLOW_TRACKING_USERNAME")
    _password = os.getenv("MLFLOW_TRACKING_PASSWORD")
    if not _username or not _password:
        raise ValueError(
            "Set the environment variables MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD for remote tracking"
        )

mlflow.set_tracking_uri(mlflow_uri)


# =====================================================
# Common utilities (merged from train_experiments.py)
# =====================================================
def load_data(feature_type="tfidf", use_cleaned=True):
    """Load features and labels using get_feature_paths.

    Args:
        feature_type: 'tfidf' or 'embedding'
        use_cleaned: whether to use cleaned data

    Returns:
        X, Y: feature matrix and label matrix
    """
    paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
    X = np.load(paths["features"])
    Y = np.load(paths["labels"])

    print(f"Dataset loaded successfully: {X.shape} samples, {Y.shape} labels")
    print(f"Using feature type: {feature_type}{'_clean' if use_cleaned else ''}")
    return X, Y


def stratified_train_test_split(X, Y, test_size=None, random_state=None, fallback=True):
    """Split X, Y using multilabel stratified shuffle split when possible.

    Args:
        X: np.ndarray features
        Y: np.ndarray multi-label binary matrix (n_samples, n_labels)
        test_size: float or int, forwarded to the splitter. When None, the
            value of TRAINING_CONFIG["test_size"] (default 0.2) is used.
        random_state: int
        fallback: if True and multilabel splitter unavailable, use sklearn.train_test_split

    Returns:
        X_train, X_test, Y_train, Y_test

    Raises:
        RuntimeError: if iterative-stratification is not installed and
            `fallback` is False.
    """
    # Resolve the effective size once so that the stratified and the fallback
    # paths agree. Only a missing value is replaced by the configured default;
    # an explicit int (absolute number of test samples) is forwarded as-is.
    if test_size is None:
        test_size = TRAINING_CONFIG.get("test_size", 0.2)

    if _HAS_MLSTRAT:
        msss = MultilabelStratifiedShuffleSplit(
            n_splits=1, test_size=test_size, random_state=random_state
        )
        train_idx, test_idx = next(msss.split(X, Y))
        return X[train_idx], X[test_idx], Y[train_idx], Y[test_idx]

    if fallback:
        print(
            "[warning] iterative-stratification not available; using standard train_test_split (no multilabel stratification). To enable stratified multilabel splitting install 'iterative-stratification'."
        )
        return train_test_split(X, Y, test_size=test_size, random_state=random_state, shuffle=True)

    raise RuntimeError(
        "iterative-stratification is required for multilabel stratified splitting but not installed."
    )


def stratified_train_val_test_split(
    X, Y, test_size=0.2, val_size=0.1, random_state=None, fallback=True
):
    """Split X, Y into train, val, test with multilabel stratification when possible.

    Args:
        X, Y: arrays
        test_size: proportion for final test set
        val_size: proportion for validation set (relative to whole dataset)
        random_state: seed
        fallback: if True, falls back to sklearn splits

    Returns:
        X_train, X_val, X_test, Y_train, Y_val, Y_test. When `val_size` is 0
        the validation arrays are empty with zero rows.

    Raises:
        ValueError: if the sizes are not fractions or leave no training data.
    """
    if not (0.0 < test_size < 1.0 and 0.0 <= val_size < 1.0 and val_size + test_size < 1.0):
        raise ValueError("test_size and val_size must be fractions in (0,1) and sum < 1")

    # First split off the final test set
    X_rem, X_test, Y_rem, Y_test = stratified_train_test_split(
        X, Y, test_size=test_size, random_state=random_state, fallback=fallback
    )

    # Validation size relative to the remaining data. The check above
    # guarantees 1 - test_size > 0, so the division is always safe.
    rel_val = val_size / (1.0 - test_size)

    if rel_val <= 0:
        # No validation requested
        return X_rem, np.empty((0, X.shape[1])), X_test, Y_rem, np.empty((0, Y.shape[1])), Y_test

    X_train, X_val, Y_train, Y_val = stratified_train_test_split(
        X_rem, Y_rem, test_size=rel_val, random_state=random_state, fallback=fallback
    )
    return X_train, X_val, X_test, Y_train, Y_val, Y_test


def _check_label_coverage(Y_train: np.ndarray, Y_val: np.ndarray, min_train: int = 1) -> None:
    """Check that each label appears at least `min_train` times in train and at least once in train+val.

    Prints a warning if some labels are scarce in train, and raises an error if
    some labels are missing entirely from train+val (which would make learning
    impossible for those labels).

    Args:
        Y_train: (n_train, n_labels) binary matrix
        Y_val: (n_val, n_labels) binary matrix (may be empty)
        min_train: minimum occurrences in train to be considered "covered"

    Raises:
        ValueError: if at least one label has no occurrence in train+val.
    """
    # Defensive: handle empty val
    if Y_val is None:
        Y_val = np.empty((0, Y_train.shape[1]))

    counts_train = np.sum(Y_train, axis=0)
    counts_train_val = counts_train + np.sum(Y_val, axis=0)

    missing_in_train = np.where(counts_train < min_train)[0]
    missing_in_train_val = np.where(counts_train_val == 0)[0]

    if missing_in_train.size > 0:
        # Small, actionable warning for debugging
        preview = missing_in_train[:10].tolist()
        print(
            f"[warning] {missing_in_train.size} label(s) have <{min_train} occurrences in TRAIN. Example label indices: {preview}."
        )

    if missing_in_train_val.size > 0:
        preview = missing_in_train_val[:10].tolist()
        raise ValueError(
            f"{missing_in_train_val.size} label(s) have 0 occurrences in TRAIN+VAL (indices example: {preview}). "
            "Reduce test/val size, aggregate labels, or ensure these labels exist in the source DB."
        )


def evaluate_and_log(model, X_test, Y_test, best_params, cv_score, exp_name, extra_params=None):
    """Score `model` on the test set, log to MLflow and persist the model.

    Micro-averaged precision, recall and F1 are logged as metrics together
    with `cv_score`; `best_params` and `extra_params` are logged as params.
    The model is dumped with joblib to DATA_PATHS["models_dir"]/<exp_name>.pkl
    and that file is logged as an MLflow artifact.

    Args:
        model: fitted estimator exposing `predict`
        X_test, Y_test: held-out features and labels
        best_params: dict of hyper-parameters to log
        cv_score: cross-validation score logged as "cv_best_f1_micro"
        exp_name: used for the model file name and the artifact path
        extra_params: optional dict of additional params to log
    """
    Y_pred = model.predict(X_test)
    precision = precision_score(Y_test, Y_pred, average="micro", zero_division=0)
    recall = recall_score(Y_test, Y_pred, average="micro", zero_division=0)
    f1 = f1_score(Y_test, Y_pred, average="micro", zero_division=0)

    mlflow.log_metrics(
        {
            "cv_best_f1_micro": cv_score,
            "test_precision_micro": precision,
            "test_recall_micro": recall,
            "test_f1_micro": f1,
        }
    )

    for k, v in best_params.items():
        mlflow.log_param(k, v)
    if extra_params:
        for k, v in extra_params.items():
            mlflow.log_param(k, v)

    os.makedirs(DATA_PATHS["models_dir"], exist_ok=True)
    model_path = Path(DATA_PATHS["models_dir"]) / f"{exp_name}.pkl"
    joblib.dump(model, model_path)
    mlflow.log_artifact(str(model_path), artifact_path=f"model_{exp_name}")
    print(f"Model saved to {model_path}")
    print(f"{exp_name} completed and logged successfully.\n")


def run_grid_search(X, Y):
    """Build (but do not fit) a GridSearchCV over a multi-output random forest.

    `X` and `Y` are not used here; the caller fits the returned object. They
    are kept in the signature so existing callers keep working.

    Returns:
        An unfitted GridSearchCV using MODEL_CONFIG["param_grid"], micro-F1
        scoring and a shuffled KFold configured from TRAINING_CONFIG.
    """
    base_rf = RandomForestClassifier(random_state=TRAINING_CONFIG["random_state"], n_jobs=-1)
    multi = MultiOutputClassifier(base_rf)
    cv = KFold(
        n_splits=TRAINING_CONFIG["cv_folds"],
        shuffle=True,
        random_state=TRAINING_CONFIG["random_state"],
    )
    grid = GridSearchCV(
        estimator=multi,
        param_grid=MODEL_CONFIG["param_grid"],
        scoring="f1_micro",
        cv=cv,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
    return grid


def run_grid_search_lgb(X, Y):
    """Build (but do not fit) a GridSearchCV over a multi-output LightGBM model.

    `X` and `Y` are not used here; the caller fits the returned object. They
    are kept in the signature so existing callers keep working.

    Returns:
        An unfitted GridSearchCV with a fixed LightGBM parameter grid, micro-F1
        scoring and a shuffled KFold configured from TRAINING_CONFIG.
    """
    # The base learner is single-threaded; parallelism comes from the
    # per-label MultiOutputClassifier and from the grid search.
    base_lgb = lgb.LGBMClassifier(
        random_state=TRAINING_CONFIG["random_state"], n_jobs=1, force_row_wise=True, verbose=-1
    )
    multi = MultiOutputClassifier(base_lgb, n_jobs=-1)
    cv = KFold(
        n_splits=TRAINING_CONFIG["cv_folds"],
        shuffle=True,
        random_state=TRAINING_CONFIG["random_state"],
    )
    lgb_param_grid = {
        "estimator__n_estimators": [50, 100, 200],
        "estimator__max_depth": [3, 5, 7],
        "estimator__learning_rate": [0.1],
        "estimator__num_leaves": [15],
    }
    grid = GridSearchCV(
        estimator=multi,
        param_grid=lgb_param_grid,
        scoring="f1_micro",
        cv=cv,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
    return grid
# =====================================================
# Experiments (merged)
# =====================================================
def _split_train_val_test(X, Y):
    """Split X, Y into train/val/test using the sizes and seed from TRAINING_CONFIG."""
    return stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )


def _labelset_keys(Y):
    """Encode every row of a binary label matrix as a string of 0/1 digits."""
    # Cast to int first so float or bool matrices yield "01..." rather than
    # "0.01.0..." / "FalseTrue...", which could not be decoded again.
    return ["".join(map(str, row)) for row in np.asarray(Y).astype(int)]


def _labelset_matrix(keys):
    """Decode strings produced by `_labelset_keys` back into an integer label matrix."""
    return np.array([[int(digit) for digit in key] for key in keys])


def _random_oversample(X_train, Y_train):
    """Oversample with RandomOverSampler, treating each distinct label row as one class."""
    ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
    X_res, keys_res = ros.fit_resample(X_train, _labelset_keys(Y_train))
    return X_res, _labelset_matrix(keys_res)


def _apply_local_mlsmote(X_train, Y_train):
    """Run the local MLSMOTE functions and return (X_res, Y_res, method description)."""
    seed = TRAINING_CONFIG["random_state"]
    if seed is not None:
        # Seed both generators; presumably the local MLSMOTE draws from one
        # of them - TODO confirm against mlsmote.py.
        np.random.seed(seed)
        random.seed(seed)

    # Convert to DataFrame (MLSMOTE function expects DataFrames)
    X_train_df = pd.DataFrame(X_train)
    Y_train_df = pd.DataFrame(Y_train)

    # Get minority instances
    X_min, Y_min = get_minority_instace(X_train_df, Y_train_df)
    if len(X_min) == 0:
        print("No minority instances found, using original dataset")
        return X_train, Y_train, "None (no minority instances)"

    # Number of synthetic samples: gap between the mean and the rarest label
    # count, at least 100, capped at three times the minority instances.
    label_counts = Y_train_df.sum(axis=0)
    mean_count = int(label_counts.mean())
    min_count = int(label_counts.min())
    n_synthetic = max(100, int(mean_count - min_count))
    n_synthetic = min(n_synthetic, len(X_min) * 3)
    print(f"Generating {n_synthetic} synthetic samples from {len(X_min)} minority instances")

    # NOTE(review): only the minority subset is passed in and the result is
    # used as the whole resampled training set. Whether the returned frames
    # also contain the non-minority training rows depends on mlsmote.py,
    # which is not visible here - confirm; otherwise the count printed below
    # can be negative and majority samples are dropped from the grid search.
    X_res_df, Y_res_df = mlsmote_function(X_min, Y_min, n_synthetic)

    # Convert back to numpy
    X_res = X_res_df.values
    Y_res = Y_res_df.values.astype(int)
    n_new = len(X_res) - len(X_train)
    print(f"MLSMOTE completed: {n_new} synthetic samples generated. Total: {len(X_res)} samples")
    return X_res, Y_res, "MLSMOTE (local implementation)"


def _oversample_with_mlsmote(X_train, Y_train, unavailable_message):
    """Oversample with MLSMOTE, falling back to RandomOverSampler if it is missing or fails.

    Args:
        X_train, Y_train: training features and binary label matrix
        unavailable_message: text printed when the local MLSMOTE import failed

    Returns:
        X_res, Y_res, oversampling_method, n_new where n_new is
        len(X_res) - len(X_train).
    """
    if _HAS_LOCAL_MLSMOTE:
        try:
            X_res, Y_res, method = _apply_local_mlsmote(X_train, Y_train)
        except Exception as e:
            # Deliberate best effort: any MLSMOTE failure degrades to random
            # oversampling instead of aborting the experiment.
            print(f"MLSMOTE failed ({e}); falling back to RandomOverSampler")
            X_res, Y_res = _random_oversample(X_train, Y_train)
            method = "RandomOverSampler (MLSMOTE fallback)"
    else:
        print(unavailable_message)
        X_res, Y_res = _random_oversample(X_train, Y_train)
        method = "RandomOverSampler (no MLSMOTE)"
    return X_res, Y_res, method, len(X_res) - len(X_train)


def _nearest_neighbour_labels(X_ref, Y_ref, X_query):
    """Give every query row a copy of the label row of its nearest reference sample."""
    if len(X_query) == 0:
        return np.empty((0, Y_ref.shape[1]), dtype=Y_ref.dtype)
    nn = NearestNeighbors(n_neighbors=1).fit(X_ref)
    nearest = nn.kneighbors(X_query, return_distance=False).ravel()
    return Y_ref[nearest]


def _stack_train_val(X_train, Y_train, X_val, Y_val):
    """Stack train and validation data row-wise; return train unchanged when validation is empty."""
    X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
    Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
    return X_comb, Y_comb


def _search_refit_and_log(grid, X_fit, Y_fit, splits, exp_name, extra_params):
    """Fit the grid search, refit the best estimator on train+val and log the test metrics.

    Intended to be called inside a `with mlflow.start_run(...)` block.

    Args:
        grid: unfitted GridSearchCV
        X_fit, Y_fit: data used for the hyper-parameter search (may be oversampled)
        splits: (X_train, X_val, X_test, Y_train, Y_val, Y_test)
        exp_name: experiment name forwarded to evaluate_and_log
        extra_params: additional params forwarded to evaluate_and_log
    """
    X_train, X_val, X_test, Y_train, Y_val, Y_test = splits
    grid.fit(X_fit, Y_fit)

    best_params = grid.best_params_
    best_cv = grid.best_score_
    final_model = grid.best_estimator_

    # Refit final model on train + val (use original non-oversampled data for final fit)
    X_comb, Y_comb = _stack_train_val(X_train, Y_train, X_val, Y_val)
    final_model.fit(X_comb, Y_comb)

    evaluate_and_log(final_model, X_test, Y_test, best_params, best_cv, exp_name, extra_params)


def run_smote_experiment(X, Y, feature_type="tfidf"):
    """Random-forest grid search on MLSMOTE-oversampled training data.

    Args:
        X, Y: full feature matrix and binary label matrix
        feature_type: only used to build the logged experiment/model name
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["smote"])

    # Split into train / val / test
    splits = _split_train_val_test(X, Y)
    X_train, X_val, _, Y_train, Y_val, _ = splits

    # Check label coverage and fail early if labels are missing from train+val
    _check_label_coverage(Y_train, Y_val)

    # Apply MLSMOTE (Multi-Label SMOTE) as per paper
    print("Applying MLSMOTE (Multi-Label SMOTE) as per SkillScope paper...")
    print(f" Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")
    X_res, Y_res, oversampling_method, n_new = _oversample_with_mlsmote(
        X_train, Y_train, "Local MLSMOTE not available; falling back to RandomOverSampler"
    )

    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_smote"):
        _search_refit_and_log(
            grid,
            X_res,
            Y_res,
            splits,
            f"random_forest_{feature_type}_gridsearch_smote",
            {
                "oversampling": oversampling_method,
                "synthetic_samples": n_new,
                "n_labels": Y_train.shape[1],
            },
        )


def run_ros_experiment(X, Y):
    """Random-forest grid search on RandomOverSampler-oversampled training data."""
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["ros"])

    # Split into train / val / test
    splits = _split_train_val_test(X, Y)
    X_train, Y_train = splits[0], splits[3]

    X_res, Y_res = _random_oversample(X_train, Y_train)

    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_ros"):
        _search_refit_and_log(
            grid,
            X_res,
            Y_res,
            splits,
            "random_forest_tfidf_gridsearch_ros",
            {"oversampling": "RandomOverSampler"},
        )


def run_adasyn_pca_experiment(X, Y):
    """Random-forest grid search on training data oversampled with ADASYN in PCA space.

    ADASYN is single-label, so it is run on the first label column that has
    both classes in the training set. The fitted PCA is saved next to the
    model and logged as an MLflow artifact.
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["adasyn_pca"])

    # Split into train / val / test
    splits = _split_train_val_test(X, Y)
    X_train, Y_train = splits[0], splits[3]

    print("Applying PCA before ADASYN...")
    pca = PCA(
        n_components=PCA_CONFIG["variance_retained"], random_state=TRAINING_CONFIG["random_state"]
    )
    X_train_pca = pca.fit_transform(X_train)

    adasyn = ADASYN(
        random_state=TRAINING_CONFIG["random_state"],
        n_neighbors=ADASYN_CONFIG["n_neighbors"],
        sampling_strategy=ADASYN_CONFIG["sampling_strategy"],
    )

    valid_label_idx = next(
        (i for i in range(Y_train.shape[1]) if len(np.unique(Y_train[:, i])) > 1), None
    )

    if valid_label_idx is None:
        X_res, Y_res = X_train, Y_train
        n_new = 0
    else:
        X_res_pca, y_res_single = adasyn.fit_resample(X_train_pca, Y_train[:, valid_label_idx])
        X_res = pca.inverse_transform(X_res_pca)
        n_orig = len(X_train)
        n_new = len(X_res) - n_orig
        # Synthetic rows need a full label vector. Copy the labels of the
        # nearest real training sample in PCA space instead of drawing a
        # random training row, which attached unrelated labels to synthetic
        # features and was not reproducible (unseeded RNG).
        # Assumes the resampled output lists the original rows first and the
        # synthetic ones after them - TODO confirm for the installed imblearn.
        Y_syn = _nearest_neighbour_labels(X_train_pca, Y_train, X_res_pca[n_orig:])
        # For the balanced label keep the class ADASYN itself assigned.
        Y_syn[:, valid_label_idx] = y_res_single[n_orig:]
        Y_res = np.vstack([Y_train, Y_syn])

    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_adasyn_pca"):
        _search_refit_and_log(
            grid,
            X_res,
            Y_res,
            splits,
            "random_forest_tfidf_gridsearch_adasyn_pca",
            {
                "oversampling": "ADASYN + PCA",
                "pca_variance": PCA_CONFIG["variance_retained"],
                "synthetic_samples": n_new,
            },
        )

        pca_path = Path(DATA_PATHS["models_dir"]) / "pca_tfidf_adasyn.pkl"
        joblib.dump(pca, pca_path)
        mlflow.log_artifact(str(pca_path), artifact_path="model_adasyn_pca")


def run_lightgbm(X, Y):
    """LightGBM grid search on the plain (non-oversampled) training data."""
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm", "LightGBM"))

    # Split into train / val / test
    splits = _split_train_val_test(X, Y)
    X_train, Y_train = splits[0], splits[3]

    print("\nTraining LightGBM with GridSearchCV...")
    grid = run_grid_search_lgb(X_train, Y_train)

    with mlflow.start_run(run_name="lightgbm"):
        _search_refit_and_log(
            grid,
            X_train,
            Y_train,
            splits,
            "lightgbm_tfidf_gridsearch",
            {"oversampling": "None", "model": "LightGBM"},
        )


def run_lightgbm_smote_experiment(X, Y):
    """LightGBM grid search on MLSMOTE-oversampled training data."""
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm_smote", "LightGBM_SMOTE"))

    # Split into train / val / test
    splits = _split_train_val_test(X, Y)
    X_train, Y_train = splits[0], splits[3]

    # Apply MLSMOTE (Multi-Label SMOTE) as per paper
    print(" Applying MLSMOTE for LightGBM...")
    print(f" Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")
    X_res, Y_res, oversampling_method, n_new = _oversample_with_mlsmote(
        X_train, Y_train, " Local MLSMOTE not available; falling back to RandomOverSampler"
    )

    print(f"\n Training LightGBM with {oversampling_method} ({n_new} synthetic samples)...")
    grid = run_grid_search_lgb(X_res, Y_res)

    with mlflow.start_run(run_name="lightgbm_with_smote"):
        _search_refit_and_log(
            grid,
            X_res,
            Y_res,
            splits,
            "lightgbm_tfidf_gridsearch_smote",
            {
                "oversampling": oversampling_method,
                "synthetic_samples": n_new,
                "n_labels": Y_train.shape[1],
                "model": "LightGBM",
            },
        )


# =====================================================
# Baseline training (original train.py behavior)
# =====================================================
def run_baseline_train(feature_type="tfidf", use_cleaned=True):
    """Run baseline training with configurable feature type.

    Loads the data, makes a train/test split, drops labels that never occur
    in the training part, grid-searches a multi-output random forest, logs
    metrics and the model to MLflow, and saves the model and the test split
    to disk.

    Args:
        feature_type: 'tfidf' or 'embedding'
        use_cleaned: whether to use cleaned data
    """
    mlflow.set_experiment(
        MLFLOW_CONFIG.get("experiments", {}).get("baseline", "hopcroft_random_forest_baseline")
    )

    X, Y = load_data(feature_type=feature_type, use_cleaned=use_cleaned)
    paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)

    # Use 80/20 split as per SkillScope paper (no validation set for baseline)
    print(" Using 80/20 train/test split as per paper...")
    X_train, X_test, Y_train, Y_test = stratified_train_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        random_state=TRAINING_CONFIG.get("random_state", 42),
    )

    # Remove labels that have 0 occurrences in training set (after split)
    train_counts = np.sum(Y_train, axis=0).astype(int)
    zero_in_train = np.where(train_counts == 0)[0]

    if zero_in_train.size > 0:
        kept_idx = np.where(train_counts > 0)[0]
        print(
            f"[warning] Removing {zero_in_train.size} label(s) with 0 occurrences in TRAIN set. Example removed indices: {zero_in_train[:10].tolist()}"
        )
        Y_train = Y_train[:, kept_idx]
        Y_test = Y_test[:, kept_idx]

        # Save kept indices for inference
        kept_indices_path = Path(paths["features"]).parent / "kept_label_indices.npy"
        np.save(kept_indices_path, kept_idx)
        print(f"Saved kept label indices to {kept_indices_path}")

    # Now check label coverage (should pass since we removed zero-occurrence labels)
    _check_label_coverage(Y_train, np.empty((0, Y_train.shape[1])))

    base_rf = RandomForestClassifier(
        random_state=TRAINING_CONFIG.get("random_state", 42), n_jobs=-1
    )
    multi = MultiOutputClassifier(base_rf)

    # Use full param_grid from MODEL_CONFIG for optimal results as per paper
    param_grid = MODEL_CONFIG.get(
        "param_grid",
        {
            "estimator__n_estimators": [50, 100, 200],
            "estimator__max_depth": [10, 20, 30],
            "estimator__min_samples_split": [2, 5],
        },
    )

    cv = KFold(
        n_splits=TRAINING_CONFIG.get("cv_folds", 5),
        shuffle=True,
        random_state=TRAINING_CONFIG.get("random_state", 42),
    )

    # Count combinations over whatever keys the grid defines instead of three
    # hard-coded ones, so a differently shaped grid neither crashes this
    # message nor reports a wrong number.
    n_combinations = int(np.prod([len(values) for values in param_grid.values()]))
    print(f" GridSearch with {cv.n_splits} folds and {n_combinations} combinations...")

    grid = GridSearchCV(
        estimator=multi,
        param_grid=param_grid,
        scoring="f1_micro",
        cv=cv,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )

    with mlflow.start_run(run_name="random_forest_tfidf_gridsearch"):
        grid.fit(X_train, Y_train)

        best = grid.best_estimator_
        best_params = grid.best_params_
        best_cv_score = grid.best_score_

        # No need to refit on combined train+val since we don't have a val set
        # Model is already fitted on full training data (refit=True)

        Y_pred_test = best.predict(X_test)

        precision = precision_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        recall = recall_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        f1 = f1_score(Y_test, Y_pred_test, average="micro", zero_division=0)

        mlflow.log_param("model_type", "RandomForest + MultiOutput")
        for k, v in best_params.items():
            mlflow.log_param(k, v)
        mlflow.log_metric("cv_best_f1_micro", best_cv_score)

        mlflow.log_metric("test_precision_micro", precision)
        mlflow.log_metric("test_recall_micro", recall)
        mlflow.log_metric("test_f1_micro", f1)
        mlflow.log_param("feature_type", feature_type)
        mlflow.log_param("use_cleaned", use_cleaned)

        print("\n=== Training Results ===")
        print(f"Test Precision (Micro): {precision:.4f}")
        print(f"Test Recall (Micro): {recall:.4f}")
        print(f"Test F1 Score (Micro): {f1:.4f}")
        print("========================\n")

        os.makedirs(paths["models_dir"], exist_ok=True)

        model_path = Path(paths["models_dir"]) / f"random_forest_{feature_type}_gridsearch.pkl"
        joblib.dump(best, model_path)

        # Persist the exact test split used for the metrics above.
        np.save(Path(paths["features"]).parent / "X_test.npy", X_test)
        np.save(Path(paths["labels"]).parent / "Y_test.npy", Y_test)

        mlflow.sklearn.log_model(best, "model")

        print("Grid search training completed and logged successfully.")
# =====================================================
# Inference utility (merged from predict.py)
# =====================================================
def run_inference(model_path: str = None):
    """Evaluate a saved model on the saved test split and log the metrics to MLflow.

    Args:
        model_path: path of the joblib model file. When None,
            DATA_PATHS["models_dir"]/random_forest_tfidf_gridsearch.pkl is used.
    """
    mlflow.set_experiment(
        MLFLOW_CONFIG.get("experiments", {}).get("inference", "hopcroft_random_forest_inference")
    )

    if model_path is None:
        model_path = Path(DATA_PATHS["models_dir"]) / "random_forest_tfidf_gridsearch.pkl"
    else:
        model_path = Path(model_path)

    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code; only point --model-path at model files from a trusted source.
    model = joblib.load(str(model_path))

    X_test = np.load(Path(DATA_PATHS["features"]).parent / "X_test.npy")
    Y_test = np.load(Path(DATA_PATHS["labels"]).parent / "Y_test.npy")

    with mlflow.start_run(run_name="random_forest_tfidf_inference"):
        Y_pred = model.predict(X_test)

        precision = precision_score(Y_test, Y_pred, average="micro", zero_division=0)
        recall = recall_score(Y_test, Y_pred, average="micro", zero_division=0)
        f1 = f1_score(Y_test, Y_pred, average="micro", zero_division=0)

        mlflow.log_metric("test_precision_micro", precision)
        mlflow.log_metric("test_recall_micro", recall)
        mlflow.log_metric("test_f1_micro", f1)

        print(f"Inference completed — Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")


def _parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: list of argument strings; None (the default) makes argparse
            read sys.argv[1:].

    Returns:
        Namespace with `action` and `model_path`.
    """
    p = argparse.ArgumentParser(description="Unified training & experiments script")
    p.add_argument(
        "action",
        choices=[
            "baseline",
            "smote",
            "ros",
            "adasyn_pca",
            "lightgbm",
            "lightgbm_smote",
            "predict",
        ],
        help="Action to run",
    )
    p.add_argument("--model-path", help="Custom model path for inference")
    return p.parse_args(argv)


if __name__ == "__main__":
    args = _parse_args()

    if args.action == "baseline":
        # Baseline has its own load_data logic (removes rare labels after split)
        run_baseline_train(feature_type="tfidf", use_cleaned=True)
    elif args.action == "predict":
        # Inference reads the saved test split itself, so the training
        # features/labels are not loaded for this action.
        run_inference(args.model_path)
    else:
        # Other experiments use the original load_data() logic
        experiments = {
            "smote": run_smote_experiment,
            "ros": run_ros_experiment,
            "adasyn_pca": run_adasyn_pca_experiment,
            "lightgbm": run_lightgbm,
            "lightgbm_smote": run_lightgbm_smote_experiment,
        }
        X, Y = load_data(feature_type="tfidf", use_cleaned=True)
        experiments[args.action](X, Y)