# Scraped from a Hugging Face Space page (status: "Sleeping") — unified training & experiments script.
| import argparse | |
| import os | |
| from pathlib import Path | |
| from imblearn.over_sampling import ADASYN, RandomOverSampler | |
| import joblib | |
| import lightgbm as lgb | |
| import mlflow | |
| import mlflow.sklearn | |
| import numpy as np | |
| from sklearn.decomposition import PCA | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.metrics import f1_score, precision_score, recall_score | |
| from sklearn.model_selection import GridSearchCV, KFold, train_test_split | |
| from sklearn.multioutput import MultiOutputClassifier | |
| from hopcroft_skill_classification_tool_competition.config import ( | |
| ADASYN_CONFIG, | |
| DATA_PATHS, | |
| MLFLOW_CONFIG, | |
| MODEL_CONFIG, | |
| PCA_CONFIG, | |
| TRAINING_CONFIG, | |
| get_feature_paths, | |
| ) | |
# Local MLSMOTE implementation (lightweight multi-label oversampling)
try:
    import pandas as pd
    from hopcroft_skill_classification_tool_competition.mlsmote import MLSMOTE as mlsmote_function
    from hopcroft_skill_classification_tool_competition.mlsmote import get_minority_instace
    _HAS_LOCAL_MLSMOTE = True
except Exception:
    # Degrade gracefully: experiments fall back to RandomOverSampler when
    # the local MLSMOTE module (or pandas) cannot be imported.
    mlsmote_function = None
    get_minority_instace = None
    _HAS_LOCAL_MLSMOTE = False
    print("[warning] Local MLSMOTE not available. Check mlsmote.py exists.")
# Prefer multilabel stratified splits for imbalanced multi-label data.
# Use `iterative-stratification` package when available.
try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
    _HAS_MLSTRAT = True
except Exception:
    # None sentinel lets the split helpers fall back to sklearn's splitter.
    MultilabelStratifiedShuffleSplit = None
    _HAS_MLSTRAT = False
# -------------------------------
# MLflow authentication and setup
# Load environment variables from .env file (for local dev)
# In Docker, env vars are set via docker-compose env_file
# -------------------------------
from dotenv import load_dotenv
load_dotenv()
# The environment variable takes precedence over the configured URI.
_mlflow_env_uri = os.getenv("MLFLOW_TRACKING_URI")
_configured_uri = MLFLOW_CONFIG.get("uri", "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow")
if _mlflow_env_uri:
    mlflow_uri = _mlflow_env_uri
else:
    mlflow_uri = _configured_uri
# If targeting DagsHub, require username/password; otherwise proceed.
if "dagshub.com" in mlflow_uri:
    _username = os.getenv("MLFLOW_TRACKING_USERNAME")
    _password = os.getenv("MLFLOW_TRACKING_PASSWORD")
    if not _username or not _password:
        # Fail fast at import time rather than mid-run when logging starts.
        raise ValueError(
            "Set the environment variables MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD for remote tracking"
        )
mlflow.set_tracking_uri(mlflow_uri)
| # ===================================================== | |
| # Common utilities (merged from train_experiments.py) | |
| # ===================================================== | |
def load_data(feature_type="tfidf", use_cleaned=True):
    """Load the feature matrix and label matrix from disk.

    Args:
        feature_type: 'tfidf' or 'embedding'
        use_cleaned: whether to use cleaned data

    Returns:
        X, Y: feature matrix and label matrix
    """
    feature_paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
    features = np.load(feature_paths["features"])
    labels = np.load(feature_paths["labels"])
    print(f"Dataset loaded successfully: {features.shape} samples, {labels.shape} labels")
    cleaned_tag = "_clean" if use_cleaned else ""
    print(f"Using feature type: {feature_type}{cleaned_tag}")
    return features, labels
def stratified_train_test_split(X, Y, test_size=None, random_state=None, fallback=True):
    """Split X, Y using multilabel stratified shuffle split when possible.

    Args:
        X: np.ndarray features
        Y: np.ndarray multi-label binary matrix (n_samples, n_labels)
        test_size: float fraction or int count, forwarded to the splitter;
            defaults to TRAINING_CONFIG["test_size"] when None
        random_state: int seed
        fallback: if True and multilabel splitter unavailable, use sklearn.train_test_split

    Returns:
        X_train, X_test, Y_train, Y_test

    Raises:
        RuntimeError: if the stratified splitter is unavailable and fallback is False.
    """
    if _HAS_MLSTRAT:
        # Honor any explicitly provided size (float fraction or int count).
        # Previously only floats were honored and int values were silently
        # replaced by the config default, contradicting the docstring.
        if test_size is not None:
            tst = test_size
        else:
            # default to TRAINING_CONFIG if not provided
            tst = TRAINING_CONFIG.get("test_size", 0.2)
        msss = MultilabelStratifiedShuffleSplit(
            n_splits=1, test_size=tst, random_state=random_state
        )
        train_idx, test_idx = next(msss.split(X, Y))
        return X[train_idx], X[test_idx], Y[train_idx], Y[test_idx]
    if fallback:
        print(
            "[warning] iterative-stratification not available; using standard train_test_split (no multilabel stratification). To enable stratified multilabel splitting install 'iterative-stratification'."
        )
        return train_test_split(X, Y, test_size=test_size, random_state=random_state, shuffle=True)
    raise RuntimeError(
        "iterative-stratification is required for multilabel stratified splitting but not installed."
    )
def stratified_train_val_test_split(
    X, Y, test_size=0.2, val_size=0.1, random_state=None, fallback=True
):
    """Split X, Y into train, val, test with multilabel stratification when possible.

    Args:
        X, Y: arrays
        test_size: proportion for final test set
        val_size: proportion for validation set (relative to whole dataset)
        random_state: seed
        fallback: if True, falls back to sklearn splits

    Returns:
        X_train, X_val, X_test, Y_train, Y_val, Y_test

    Raises:
        ValueError: if the requested fractions are out of range or sum >= 1.
    """
    fracs_ok = 0.0 < test_size < 1.0 and 0.0 <= val_size < 1.0
    if not (fracs_ok and val_size + test_size < 1.0):
        raise ValueError("test_size and val_size must be fractions in (0,1) and sum < 1")
    # Carve off the held-out test set first.
    X_rem, X_test, Y_rem, Y_test = stratified_train_test_split(
        X, Y, test_size=test_size, random_state=random_state, fallback=fallback
    )
    # Validation fraction must be rescaled relative to the remaining data.
    remaining = 1.0 - test_size
    rel_val = val_size / remaining if remaining > 0 else 0.0
    if rel_val <= 0:
        # No validation requested: return empty val arrays with matching width.
        empty_X = np.empty((0, X.shape[1]))
        empty_Y = np.empty((0, Y.shape[1]))
        return X_rem, empty_X, X_test, Y_rem, empty_Y, Y_test
    X_train, X_val, Y_train, Y_val = stratified_train_test_split(
        X_rem, Y_rem, test_size=rel_val, random_state=random_state, fallback=fallback
    )
    return X_train, X_val, X_test, Y_train, Y_val, Y_test
| def _check_label_coverage(Y_train: np.ndarray, Y_val: np.ndarray, min_train: int = 1): | |
| """Check that each label appears at least `min_train` times in train and | |
| at least once in train+val. Prints a warning if some labels are scarce in | |
| train, and raises an error if some labels are missing entirely from | |
| train+val (which would make learning impossible for those labels). | |
| Args: | |
| Y_train: (n_train, n_labels) binary matrix | |
| Y_val: (n_val, n_labels) binary matrix (may be empty) | |
| min_train: minimum occurrences in train to be considered "covered" | |
| """ | |
| # Defensive: handle empty val | |
| if Y_val is None: | |
| Y_val = np.empty((0, Y_train.shape[1])) | |
| counts_train = np.sum(Y_train, axis=0) | |
| counts_train_val = counts_train + np.sum(Y_val, axis=0) | |
| missing_in_train = np.where(counts_train < min_train)[0] | |
| missing_in_train_val = np.where(counts_train_val == 0)[0] | |
| if missing_in_train.size > 0: | |
| # Small, actionable warning for debugging | |
| preview = missing_in_train[:10].tolist() | |
| print( | |
| f"[warning] {missing_in_train.size} label(s) have <{min_train} occurrences in TRAIN. Example label indices: {preview}." | |
| ) | |
| if missing_in_train_val.size > 0: | |
| preview = missing_in_train_val[:10].tolist() | |
| raise ValueError( | |
| f"{missing_in_train_val.size} label(s) have 0 occurrences in TRAIN+VAL (indices example: {preview}). " | |
| "Reduce test/val size, aggregate labels, or ensure these labels exist in the source DB." | |
| ) | |
def evaluate_and_log(model, X_test, Y_test, best_params, cv_score, exp_name, extra_params=None):
    """Evaluate a fitted model on the test split, log to MLflow, and persist it.

    Args:
        model: fitted estimator exposing .predict
        X_test, Y_test: held-out features and multi-label targets
        best_params: grid-search best parameters, logged one by one
        cv_score: best cross-validation f1_micro score
        exp_name: name used for the saved model file and artifact path
        extra_params: optional extra MLflow params to log
    """
    predictions = model.predict(X_test)
    micro_metrics = {
        "cv_best_f1_micro": cv_score,
        "test_precision_micro": precision_score(Y_test, predictions, average="micro", zero_division=0),
        "test_recall_micro": recall_score(Y_test, predictions, average="micro", zero_division=0),
        "test_f1_micro": f1_score(Y_test, predictions, average="micro", zero_division=0),
    }
    mlflow.log_metrics(micro_metrics)
    for key, value in best_params.items():
        mlflow.log_param(key, value)
    if extra_params:
        for key, value in extra_params.items():
            mlflow.log_param(key, value)
    # Persist the model locally, then attach it to the run as an artifact.
    os.makedirs(DATA_PATHS["models_dir"], exist_ok=True)
    model_file = Path(DATA_PATHS["models_dir"]) / f"{exp_name}.pkl"
    joblib.dump(model, model_file)
    mlflow.log_artifact(str(model_file), artifact_path=f"model_{exp_name}")
    print(f"Model saved to {model_file}")
    print(f"{exp_name} completed and logged successfully.\n")
def run_grid_search(X, Y):
    """Build a GridSearchCV over a multi-output random forest.

    Note: X and Y are accepted for interface symmetry but are not used here;
    callers invoke .fit on the returned grid themselves.

    Returns:
        An unfitted GridSearchCV instance configured from MODEL_CONFIG.
    """
    forest = RandomForestClassifier(random_state=TRAINING_CONFIG["random_state"], n_jobs=-1)
    wrapped = MultiOutputClassifier(forest)
    folds = KFold(
        n_splits=TRAINING_CONFIG["cv_folds"],
        shuffle=True,
        random_state=TRAINING_CONFIG["random_state"],
    )
    return GridSearchCV(
        estimator=wrapped,
        param_grid=MODEL_CONFIG["param_grid"],
        scoring="f1_micro",
        cv=folds,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
def run_grid_search_lgb(X, Y):
    """Build a GridSearchCV over a multi-output LightGBM classifier.

    Note: X and Y are accepted for interface symmetry with run_grid_search
    but are not used here; callers invoke .fit themselves.

    Returns:
        An unfitted GridSearchCV instance over a small LightGBM search space.
    """
    booster = lgb.LGBMClassifier(
        random_state=TRAINING_CONFIG["random_state"], n_jobs=1, force_row_wise=True, verbose=-1
    )
    # Parallelism lives at the multi-output wrapper level, so each booster
    # runs single-threaded (n_jobs=1) to avoid oversubscription.
    wrapped = MultiOutputClassifier(booster, n_jobs=-1)
    folds = KFold(
        n_splits=TRAINING_CONFIG["cv_folds"],
        shuffle=True,
        random_state=TRAINING_CONFIG["random_state"],
    )
    search_space = {
        "estimator__n_estimators": [50, 100, 200],
        "estimator__max_depth": [3, 5, 7],
        "estimator__learning_rate": [0.1],
        "estimator__num_leaves": [15],
    }
    return GridSearchCV(
        estimator=wrapped,
        param_grid=search_space,
        scoring="f1_micro",
        cv=folds,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
| # ===================================================== | |
| # Experiments (merged) | |
| # ===================================================== | |
def run_smote_experiment(X, Y, feature_type="tfidf"):
    """Train a multi-output random forest with MLSMOTE oversampling.

    Splits into train/val/test, oversamples the training set (MLSMOTE when
    available, RandomOverSampler otherwise), grid-searches on the oversampled
    data, then refits the best estimator on the original (non-oversampled)
    train+val data and logs metrics/model to MLflow.

    Args:
        X: feature matrix
        Y: multi-label binary matrix
        feature_type: tag used in the logged experiment/model name
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["smote"])
    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    # Check label coverage and fail early if labels are missing from train+val
    _check_label_coverage(Y_train, Y_val)
    # Apply MLSMOTE (Multi-Label SMOTE) as per paper
    # MLSMOTE handles multi-label classification natively by considering label correlations
    print("Applying MLSMOTE (Multi-Label SMOTE) as per SkillScope paper...")
    print(f" Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")
    # Use local MLSMOTE implementation directly (function-based)
    if _HAS_LOCAL_MLSMOTE:
        try:
            # Seed both numpy and stdlib random so MLSMOTE draws are reproducible
            if TRAINING_CONFIG["random_state"] is not None:
                np.random.seed(TRAINING_CONFIG["random_state"])
                import random
                random.seed(TRAINING_CONFIG["random_state"])
            # Convert to DataFrame (MLSMOTE function expects DataFrames)
            X_train_df = pd.DataFrame(X_train)
            Y_train_df = pd.DataFrame(Y_train)
            # Get minority instances
            X_min, Y_min = get_minority_instace(X_train_df, Y_train_df)
            if len(X_min) == 0:
                print("No minority instances found, using original dataset")
                X_res, Y_res = X_train, Y_train
                oversampling_method = "None (no minority instances)"
                n_new = 0
            else:
                # Calculate number of synthetic samples: at least 100, capped
                # at 3x the minority pool, guided by the mean-min count gap
                label_counts = Y_train_df.sum(axis=0)
                mean_count = int(label_counts.mean())
                min_count = int(label_counts.min())
                n_synthetic = max(100, int(mean_count - min_count))
                n_synthetic = min(n_synthetic, len(X_min) * 3)
                print(
                    f"Generating {n_synthetic} synthetic samples from {len(X_min)} minority instances"
                )
                # Apply MLSMOTE function directly
                X_res_df, Y_res_df = mlsmote_function(X_min, Y_min, n_synthetic)
                # Convert back to numpy
                X_res = X_res_df.values
                Y_res = Y_res_df.values.astype(int)
                oversampling_method = "MLSMOTE (local implementation)"
                n_new = len(X_res) - len(X_train)
                print(
                    f"MLSMOTE completed: {n_new} synthetic samples generated. Total: {len(X_res)} samples"
                )
        except Exception as e:
            print(f"MLSMOTE failed ({e}); falling back to RandomOverSampler")
            # Encode each label row as a bit-string so the single-label
            # sampler can balance full label combinations
            Y_train_str = ["".join(map(str, y)) for y in Y_train]
            ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
            X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
            Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
            oversampling_method = "RandomOverSampler (MLSMOTE fallback)"
            n_new = len(X_res) - len(X_train)
    else:
        print("Local MLSMOTE not available; falling back to RandomOverSampler")
        Y_train_str = ["".join(map(str, y)) for y in Y_train]
        ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
        X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
        Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
        oversampling_method = "RandomOverSampler (no MLSMOTE)"
        n_new = len(X_res) - len(X_train)
    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_smote"):
        grid.fit(X_res, Y_res)
        # Refit final model on train + val (use original non-oversampled data for final fit)
        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)
        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            f"random_forest_{feature_type}_gridsearch_smote",
            {
                "oversampling": oversampling_method,
                "synthetic_samples": n_new,
                "n_labels": Y_train.shape[1],
            },
        )
def run_ros_experiment(X, Y):
    """Random forest experiment with RandomOverSampler over label combinations.

    Each multi-label row is encoded as a bit-string so the single-label
    RandomOverSampler can balance full label combinations. The best grid
    estimator is refit on the original (non-oversampled) train+val data
    before evaluation and MLflow logging.
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["ros"])
    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    # Encode each label row as a bit-string so ROS sees one class per combination
    Y_train_str = ["".join(map(str, y)) for y in Y_train]
    ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
    X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
    # (Fix: removed a leftover no-op `Y.shape[1]` expression statement here.)
    Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_ros"):
        grid.fit(X_res, Y_res)
        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        # Final fit uses original (non-oversampled) train+val data
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)
        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "random_forest_tfidf_gridsearch_ros",
            {"oversampling": "RandomOverSampler"},
        )
def run_adasyn_pca_experiment(X, Y):
    """Random forest experiment using ADASYN oversampling in PCA space.

    PCA compresses the training features, ADASYN oversamples on a single
    label column in the reduced space, and synthetic points are projected
    back with inverse_transform before grid search. The fitted PCA is saved
    alongside the model so inference can apply the same projection.
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["adasyn_pca"])
    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    print("Applying PCA before ADASYN...")
    pca = PCA(
        n_components=PCA_CONFIG["variance_retained"], random_state=TRAINING_CONFIG["random_state"]
    )
    X_train_pca = pca.fit_transform(X_train)
    adasyn = ADASYN(
        random_state=TRAINING_CONFIG["random_state"],
        n_neighbors=ADASYN_CONFIG["n_neighbors"],
        sampling_strategy=ADASYN_CONFIG["sampling_strategy"],
    )
    # ADASYN is single-label: pick the first label column with both classes present
    valid_label_idx = next(
        (i for i in range(Y_train.shape[1]) if len(np.unique(Y_train[:, i])) > 1), None
    )
    if valid_label_idx is None:
        # No label column has both classes; skip oversampling entirely
        X_res, Y_res = X_train, Y_train
        n_new = 0
    else:
        X_res_pca, _ = adasyn.fit_resample(X_train_pca, Y_train[:, valid_label_idx])
        X_res = pca.inverse_transform(X_res_pca)
        n_new = len(X_res) - len(X_train)
        # NOTE(review): synthetic rows receive label vectors copied from
        # randomly chosen training rows, not labels derived from the ADASYN
        # neighborhood — confirm this approximation is intended.
        Y_res = np.vstack([Y_train, Y_train[np.random.randint(0, len(Y_train), n_new)]])
    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_adasyn_pca"):
        grid.fit(X_res, Y_res)
        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        # Final fit uses the original (non-oversampled) train+val data
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)
        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "random_forest_tfidf_gridsearch_adasyn_pca",
            {
                "oversampling": "ADASYN + PCA",
                "pca_variance": PCA_CONFIG["variance_retained"],
                "synthetic_samples": n_new,
            },
        )
        # Persist the fitted PCA so inference can reuse the same projection
        pca_path = Path(DATA_PATHS["models_dir"]) / "pca_tfidf_adasyn.pkl"
        joblib.dump(pca, pca_path)
        mlflow.log_artifact(str(pca_path), artifact_path="model_adasyn_pca")
def run_lightgbm(X, Y):
    """Grid-search a multi-output LightGBM baseline (no oversampling).

    Splits into train/val/test, grid-searches on the train split, refits the
    winning estimator on train+val, and logs metrics/model to MLflow.
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm", "LightGBM"))
    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    print("\nTraining LightGBM with GridSearchCV...")
    search = run_grid_search_lgb(X_train, Y_train)
    with mlflow.start_run(run_name="lightgbm"):
        search.fit(X_train, Y_train)
        winning_params = search.best_params_
        winning_cv = search.best_score_
        final_model = search.best_estimator_
        # Refit on train+val when a validation split exists
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)
        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            winning_params,
            winning_cv,
            "lightgbm_tfidf_gridsearch",
            {"oversampling": "None", "model": "LightGBM"},
        )
def run_lightgbm_smote_experiment(X, Y):
    """Grid-search a multi-output LightGBM model on MLSMOTE-oversampled data.

    Mirrors run_smote_experiment but with LightGBM as the base estimator:
    split, oversample train (MLSMOTE or RandomOverSampler fallback),
    grid-search on the oversampled data, then refit the best estimator on
    the original (non-oversampled) train+val data and log to MLflow.
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm_smote", "LightGBM_SMOTE"))
    # Split into train / val / test
    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    # Apply MLSMOTE (Multi-Label SMOTE) as per paper
    print(" Applying MLSMOTE for LightGBM...")
    print(f" Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")
    # Use local MLSMOTE implementation directly (function-based)
    if _HAS_LOCAL_MLSMOTE:
        try:
            # Seed both numpy and stdlib random so MLSMOTE draws are reproducible
            if TRAINING_CONFIG["random_state"] is not None:
                np.random.seed(TRAINING_CONFIG["random_state"])
                import random
                random.seed(TRAINING_CONFIG["random_state"])
            # Convert to DataFrame (MLSMOTE function expects DataFrames)
            X_train_df = pd.DataFrame(X_train)
            Y_train_df = pd.DataFrame(Y_train)
            # Get minority instances
            X_min, Y_min = get_minority_instace(X_train_df, Y_train_df)
            if len(X_min) == 0:
                print("No minority instances found, using original dataset")
                X_res, Y_res = X_train, Y_train
                oversampling_method = "None (no minority instances)"
                n_new = 0
            else:
                # Calculate number of synthetic samples: at least 100, capped
                # at 3x the minority pool, guided by the mean-min count gap
                label_counts = Y_train_df.sum(axis=0)
                mean_count = int(label_counts.mean())
                min_count = int(label_counts.min())
                n_synthetic = max(100, int(mean_count - min_count))
                n_synthetic = min(n_synthetic, len(X_min) * 3)
                print(
                    f"Generating {n_synthetic} synthetic samples from {len(X_min)} minority instances"
                )
                # Apply MLSMOTE function directly
                X_res_df, Y_res_df = mlsmote_function(X_min, Y_min, n_synthetic)
                # Convert back to numpy
                X_res = X_res_df.values
                Y_res = Y_res_df.values.astype(int)
                oversampling_method = "MLSMOTE (local implementation)"
                n_new = len(X_res) - len(X_train)
                print(
                    f"MLSMOTE completed: {n_new} synthetic samples generated. Total: {len(X_res)} samples"
                )
        except Exception as e:
            print(f"MLSMOTE failed ({e}); falling back to RandomOverSampler")
            # Encode each label row as a bit-string so the single-label
            # sampler can balance full label combinations
            Y_train_str = ["".join(map(str, y)) for y in Y_train]
            ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
            X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
            Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
            oversampling_method = "RandomOverSampler (MLSMOTE fallback)"
            n_new = len(X_res) - len(X_train)
    else:
        print(" Local MLSMOTE not available; falling back to RandomOverSampler")
        Y_train_str = ["".join(map(str, y)) for y in Y_train]
        ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
        X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
        Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
        oversampling_method = "RandomOverSampler (no MLSMOTE)"
        n_new = len(X_res) - len(X_train)
    print(f"\n Training LightGBM with {oversampling_method} ({n_new} synthetic samples)...")
    grid = run_grid_search_lgb(X_res, Y_res)
    with mlflow.start_run(run_name="lightgbm_with_smote"):
        grid.fit(X_res, Y_res)
        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        # Final fit uses the original (non-oversampled) train+val data
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)
        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "lightgbm_tfidf_gridsearch_smote",
            {
                "oversampling": oversampling_method,
                "synthetic_samples": n_new,
                "n_labels": Y_train.shape[1],
                "model": "LightGBM",
            },
        )
| # ===================================================== | |
| # Baseline training (original train.py behavior) | |
| # ===================================================== | |
def run_baseline_train(feature_type="tfidf", use_cleaned=True):
    """Run baseline training with configurable feature type.

    Loads features, makes an 80/20 stratified split, drops labels absent
    from the training set (saving the kept indices for inference),
    grid-searches a multi-output random forest, and logs metrics and the
    model to MLflow.

    Args:
        feature_type: 'tfidf' or 'embedding'
        use_cleaned: whether to use cleaned data
    """
    mlflow.set_experiment(
        MLFLOW_CONFIG.get("experiments", {}).get("baseline", "hopcroft_random_forest_baseline")
    )
    X, Y = load_data(feature_type=feature_type, use_cleaned=use_cleaned)
    # Use 80/20 split as per SkillScope paper (no validation set for baseline)
    print(" Using 80/20 train/test split as per paper...")
    X_train, X_test, Y_train, Y_test = stratified_train_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        random_state=TRAINING_CONFIG.get("random_state", 42),
    )
    # Remove labels that have 0 occurrences in training set (after split)
    train_counts = np.sum(Y_train, axis=0).astype(int)
    zero_in_train = np.where(train_counts == 0)[0]
    if zero_in_train.size > 0:
        kept_idx = np.where(train_counts > 0)[0]
        print(
            f"[warning] Removing {zero_in_train.size} label(s) with 0 occurrences in TRAIN set. Example removed indices: {zero_in_train[:10].tolist()}"
        )
        Y_train = Y_train[:, kept_idx]
        Y_test = Y_test[:, kept_idx]
        # Save kept indices for inference
        paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
        kept_indices_path = Path(paths["features"]).parent / "kept_label_indices.npy"
        np.save(kept_indices_path, kept_idx)
        print(f"Saved kept label indices to {kept_indices_path}")
    # Now check label coverage (should pass since we removed zero-occurrence labels)
    _check_label_coverage(Y_train, np.empty((0, Y_train.shape[1])))
    base_rf = RandomForestClassifier(
        random_state=TRAINING_CONFIG.get("random_state", 42), n_jobs=-1
    )
    multi = MultiOutputClassifier(base_rf)
    # Use full param_grid from MODEL_CONFIG for optimal results as per paper
    param_grid = MODEL_CONFIG.get(
        "param_grid",
        {
            "estimator__n_estimators": [50, 100, 200],
            "estimator__max_depth": [10, 20, 30],
            "estimator__min_samples_split": [2, 5],
        },
    )
    cv = KFold(
        n_splits=TRAINING_CONFIG.get("cv_folds", 5),
        shuffle=True,
        random_state=TRAINING_CONFIG.get("random_state", 42),
    )
    # Fix: count combinations generically. The old message hard-indexed three
    # specific grid keys and raised KeyError for any differently-shaped
    # MODEL_CONFIG param_grid.
    n_combinations = 1
    for values in param_grid.values():
        n_combinations *= len(values)
    print(f" GridSearch with {cv.n_splits} folds and {n_combinations} combinations...")
    grid = GridSearchCV(
        estimator=multi,
        param_grid=param_grid,
        scoring="f1_micro",
        cv=cv,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
    with mlflow.start_run(run_name="random_forest_tfidf_gridsearch"):
        grid.fit(X_train, Y_train)
        best = grid.best_estimator_
        best_params = grid.best_params_
        best_cv_score = grid.best_score_
        # No refit on train+val: there is no validation set for the baseline,
        # so the refit estimator already saw all training data.
        Y_pred_test = best.predict(X_test)
        precision = precision_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        recall = recall_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        f1 = f1_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        mlflow.log_param("model_type", "RandomForest + MultiOutput")
        for k, v in best_params.items():
            mlflow.log_param(k, v)
        mlflow.log_metric("cv_best_f1_micro", best_cv_score)
        mlflow.log_metric("test_precision_micro", precision)
        mlflow.log_metric("test_recall_micro", recall)
        mlflow.log_metric("test_f1_micro", f1)
        mlflow.log_param("feature_type", feature_type)
        mlflow.log_param("use_cleaned", use_cleaned)
        print("\n=== Training Results ===")
        print(f"Test Precision (Micro): {precision:.4f}")
        print(f"Test Recall (Micro): {recall:.4f}")
        print(f"Test F1 Score (Micro): {f1:.4f}")
        print("========================\n")
        paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
        os.makedirs(paths["models_dir"], exist_ok=True)
        model_path = Path(paths["models_dir"]) / f"random_forest_{feature_type}_gridsearch.pkl"
        joblib.dump(best, model_path)
        # Persist the exact test split so the standalone inference action
        # can evaluate against it later.
        np.save(Path(paths["features"]).parent / "X_test.npy", X_test)
        np.save(Path(paths["labels"]).parent / "Y_test.npy", Y_test)
        mlflow.sklearn.log_model(best, "model")
        print("Grid search training completed and logged successfully.")
| # ===================================================== | |
| # Inference utility (merged from predict.py) | |
| # ===================================================== | |
def run_inference(model_path: str = None):
    """Load a saved model, score the persisted test split, and log metrics.

    Args:
        model_path: optional path to a .pkl model; defaults to the baseline
            random forest produced by the training step.
    """
    mlflow.set_experiment(
        MLFLOW_CONFIG.get("experiments", {}).get("inference", "hopcroft_random_forest_inference")
    )
    resolved_path = (
        Path(DATA_PATHS["models_dir"]) / "random_forest_tfidf_gridsearch.pkl"
        if model_path is None
        else Path(model_path)
    )
    model = joblib.load(str(resolved_path))
    # Evaluate against the test split persisted at training time.
    X_test = np.load(Path(DATA_PATHS["features"]).parent / "X_test.npy")
    Y_test = np.load(Path(DATA_PATHS["labels"]).parent / "Y_test.npy")
    with mlflow.start_run(run_name="random_forest_tfidf_inference"):
        Y_pred = model.predict(X_test)
        precision = precision_score(Y_test, Y_pred, average="micro", zero_division=0)
        recall = recall_score(Y_test, Y_pred, average="micro", zero_division=0)
        f1 = f1_score(Y_test, Y_pred, average="micro", zero_division=0)
        for metric_name, metric_value in (
            ("test_precision_micro", precision),
            ("test_recall_micro", recall),
            ("test_f1_micro", f1),
        ):
            mlflow.log_metric(metric_name, metric_value)
        print(f"Inference completed — Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
| def _parse_args(): | |
| p = argparse.ArgumentParser(description="Unified training & experiments script") | |
| p.add_argument( | |
| "action", | |
| choices=[ | |
| "baseline", | |
| "smote", | |
| "ros", | |
| "adasyn_pca", | |
| "lightgbm", | |
| "lightgbm_smote", | |
| "predict", | |
| ], | |
| help="Action to run", | |
| ) | |
| p.add_argument("--model-path", help="Custom model path for inference") | |
| return p.parse_args() | |
| if __name__ == "__main__": | |
| args = _parse_args() | |
| # Baseline has its own load_data logic (removes rare labels after split) | |
| if args.action == "baseline": | |
| run_baseline_train(feature_type="tfidf", use_cleaned=True) | |
| else: | |
| # Other experiments use the original load_data() logic | |
| X, Y = load_data(feature_type="tfidf", use_cleaned=True) | |
| if args.action == "smote": | |
| run_smote_experiment(X, Y) | |
| elif args.action == "ros": | |
| run_ros_experiment(X, Y) | |
| elif args.action == "adasyn_pca": | |
| run_adasyn_pca_experiment(X, Y) | |
| elif args.action == "lightgbm": | |
| run_lightgbm(X, Y) | |
| elif args.action == "lightgbm_smote": | |
| run_lightgbm_smote_experiment(X, Y) | |
| elif args.action == "predict": | |
| run_inference(args.model_path) | |