|
|
import argparse |
|
|
import os |
|
|
from pathlib import Path |
|
|
|
|
|
from imblearn.over_sampling import ADASYN, RandomOverSampler |
|
|
import joblib |
|
|
import lightgbm as lgb |
|
|
import mlflow |
|
|
import mlflow.sklearn |
|
|
import numpy as np |
|
|
from sklearn.decomposition import PCA |
|
|
from sklearn.ensemble import RandomForestClassifier |
|
|
from sklearn.metrics import f1_score, precision_score, recall_score |
|
|
from sklearn.model_selection import GridSearchCV, KFold, train_test_split |
|
|
from sklearn.multioutput import MultiOutputClassifier |
|
|
|
|
|
from hopcroft_skill_classification_tool_competition.config import ( |
|
|
ADASYN_CONFIG, |
|
|
DATA_PATHS, |
|
|
MLFLOW_CONFIG, |
|
|
MODEL_CONFIG, |
|
|
PCA_CONFIG, |
|
|
TRAINING_CONFIG, |
|
|
get_feature_paths, |
|
|
) |
|
|
|
|
|
|
|
|
# Optional project-local MLSMOTE (multi-label SMOTE) implementation.
# pandas is imported here because it is only needed by the MLSMOTE code path.
# On any import failure the names are still defined (as None) so the
# experiment functions can guard on _HAS_LOCAL_MLSMOTE and fall back to
# RandomOverSampler instead of crashing at import time.
try:
    import pandas as pd

    from hopcroft_skill_classification_tool_competition.mlsmote import MLSMOTE as mlsmote_function
    from hopcroft_skill_classification_tool_competition.mlsmote import get_minority_instace

    _HAS_LOCAL_MLSMOTE = True
except Exception:
    # Keep the names bound so later references do not raise NameError.
    mlsmote_function = None
    get_minority_instace = None
    _HAS_LOCAL_MLSMOTE = False
    print("[warning] Local MLSMOTE not available. Check mlsmote.py exists.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Optional dependency: iterative-stratification provides multilabel-aware
# shuffle splitting. When absent, stratified_train_test_split falls back to
# plain sklearn train_test_split (see its `fallback` parameter).
try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

    _HAS_MLSTRAT = True
except Exception:
    MultilabelStratifiedShuffleSplit = None
    _HAS_MLSTRAT = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from dotenv import load_dotenv

# Pull MLflow credentials / tracking URI from a local .env file (no-op if absent).
load_dotenv()

# Resolve the tracking URI: the MLFLOW_TRACKING_URI environment variable wins
# over the configured value so deployments can redirect without a code change.
_mlflow_env_uri = os.getenv("MLFLOW_TRACKING_URI")
_configured_uri = MLFLOW_CONFIG.get("uri", "https://dagshub.com/se4ai2526-uniba/Hopcroft.mlflow")

if _mlflow_env_uri:
    mlflow_uri = _mlflow_env_uri
else:
    mlflow_uri = _configured_uri

# DagsHub-hosted tracking servers require basic-auth credentials; fail fast at
# import time rather than mid-training when the first run would be logged.
if "dagshub.com" in mlflow_uri:
    _username = os.getenv("MLFLOW_TRACKING_USERNAME")
    _password = os.getenv("MLFLOW_TRACKING_PASSWORD")
    if not _username or not _password:
        raise ValueError(
            "Set the environment variables MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD for remote tracking"
        )

mlflow.set_tracking_uri(mlflow_uri)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_data(feature_type="tfidf", use_cleaned=True):
    """Load features and labels using get_feature_paths.

    Args:
        feature_type: 'tfidf' or 'embedding'
        use_cleaned: whether to use cleaned data

    Returns:
        X, Y: feature matrix and label matrix
    """
    resolved = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)

    # Both arrays are stored as .npy files next to each other.
    X = np.load(resolved["features"])
    Y = np.load(resolved["labels"])

    print(f"Dataset loaded successfully: {X.shape} samples, {Y.shape} labels")
    print(f"Using feature type: {feature_type}{'_clean' if use_cleaned else ''}")
    return X, Y
|
|
|
|
|
|
|
|
def stratified_train_test_split(X, Y, test_size=None, random_state=None, fallback=True):
    """Split X, Y using multilabel stratified shuffle split when possible.

    Args:
        X: np.ndarray features
        Y: np.ndarray multi-label binary matrix (n_samples, n_labels)
        test_size: float or int, forwarded to splitter
        random_state: int
        fallback: if True and multilabel splitter unavailable, use sklearn.train_test_split

    Returns:
        X_train, X_test, Y_train, Y_test
    """
    if _HAS_MLSTRAT:
        # Only a float is forwarded as-is; anything else (None, int, ...) is
        # replaced with the configured default fraction.
        if isinstance(test_size, float):
            fraction = test_size
        else:
            fraction = TRAINING_CONFIG.get("test_size", 0.2)

        splitter = MultilabelStratifiedShuffleSplit(
            n_splits=1, test_size=fraction, random_state=random_state
        )
        train_idx, test_idx = next(splitter.split(X, Y))
        return X[train_idx], X[test_idx], Y[train_idx], Y[test_idx]

    if not fallback:
        raise RuntimeError(
            "iterative-stratification is required for multilabel stratified splitting but not installed."
        )

    # Degrade gracefully: an unstratified split is better than no split at all.
    print(
        "[warning] iterative-stratification not available; using standard train_test_split (no multilabel stratification). To enable stratified multilabel splitting install 'iterative-stratification'."
    )
    return train_test_split(X, Y, test_size=test_size, random_state=random_state, shuffle=True)
|
|
|
|
|
|
|
|
def stratified_train_val_test_split(
    X, Y, test_size=0.2, val_size=0.1, random_state=None, fallback=True
):
    """Split X, Y into train, val, test with multilabel stratification when possible.

    Args:
        X, Y: arrays
        test_size: proportion for final test set
        val_size: proportion for validation set (relative to whole dataset)
        random_state: seed
        fallback: if True, falls back to sklearn splits

    Returns:
        X_train, X_val, X_test, Y_train, Y_val, Y_test
    """
    fractions_ok = (
        0.0 < test_size < 1.0 and 0.0 <= val_size < 1.0 and val_size + test_size < 1.0
    )
    if not fractions_ok:
        raise ValueError("test_size and val_size must be fractions in (0,1) and sum < 1")

    # First carve the test set off the full dataset.
    X_rem, X_test, Y_rem, Y_test = stratified_train_test_split(
        X, Y, test_size=test_size, random_state=random_state, fallback=fallback
    )

    # val_size is expressed relative to the whole dataset; rescale it to the
    # remaining (non-test) portion before the second split.
    remaining = 1.0 - test_size
    rel_val = val_size / remaining if remaining > 0 else 0.0

    if rel_val <= 0:
        # No validation set requested: return empty val arrays with the same
        # number of columns as X and Y.
        empty_X = np.empty((0, X.shape[1]))
        empty_Y = np.empty((0, Y.shape[1]))
        return X_rem, empty_X, X_test, Y_rem, empty_Y, Y_test

    X_train, X_val, Y_train, Y_val = stratified_train_test_split(
        X_rem, Y_rem, test_size=rel_val, random_state=random_state, fallback=fallback
    )

    return X_train, X_val, X_test, Y_train, Y_val, Y_test
|
|
|
|
|
|
|
|
def _check_label_coverage(Y_train: np.ndarray, Y_val: np.ndarray, min_train: int = 1): |
|
|
"""Check that each label appears at least `min_train` times in train and |
|
|
at least once in train+val. Prints a warning if some labels are scarce in |
|
|
train, and raises an error if some labels are missing entirely from |
|
|
train+val (which would make learning impossible for those labels). |
|
|
|
|
|
Args: |
|
|
Y_train: (n_train, n_labels) binary matrix |
|
|
Y_val: (n_val, n_labels) binary matrix (may be empty) |
|
|
min_train: minimum occurrences in train to be considered "covered" |
|
|
""" |
|
|
|
|
|
if Y_val is None: |
|
|
Y_val = np.empty((0, Y_train.shape[1])) |
|
|
|
|
|
counts_train = np.sum(Y_train, axis=0) |
|
|
counts_train_val = counts_train + np.sum(Y_val, axis=0) |
|
|
|
|
|
missing_in_train = np.where(counts_train < min_train)[0] |
|
|
missing_in_train_val = np.where(counts_train_val == 0)[0] |
|
|
|
|
|
if missing_in_train.size > 0: |
|
|
|
|
|
preview = missing_in_train[:10].tolist() |
|
|
print( |
|
|
f"[warning] {missing_in_train.size} label(s) have <{min_train} occurrences in TRAIN. Example label indices: {preview}." |
|
|
) |
|
|
|
|
|
if missing_in_train_val.size > 0: |
|
|
preview = missing_in_train_val[:10].tolist() |
|
|
raise ValueError( |
|
|
f"{missing_in_train_val.size} label(s) have 0 occurrences in TRAIN+VAL (indices example: {preview}). " |
|
|
"Reduce test/val size, aggregate labels, or ensure these labels exist in the source DB." |
|
|
) |
|
|
|
|
|
|
|
|
def evaluate_and_log(model, X_test, Y_test, best_params, cv_score, exp_name, extra_params=None):
    """Score `model` on the test split, log everything to MLflow, and persist
    the fitted model as a pickle under DATA_PATHS['models_dir'].

    Args:
        model: fitted estimator exposing .predict
        X_test, Y_test: held-out evaluation split
        best_params: grid-search winning hyperparameters (logged individually)
        cv_score: best cross-validation f1_micro from the search
        exp_name: experiment tag; also used as the model filename stem
        extra_params: optional extra key/value pairs to log as params
    """
    predictions = model.predict(X_test)

    metrics = {
        "cv_best_f1_micro": cv_score,
        "test_precision_micro": precision_score(Y_test, predictions, average="micro", zero_division=0),
        "test_recall_micro": recall_score(Y_test, predictions, average="micro", zero_division=0),
        "test_f1_micro": f1_score(Y_test, predictions, average="micro", zero_division=0),
    }
    mlflow.log_metrics(metrics)

    # Log the winning hyperparameters plus any experiment-specific extras.
    for key, value in best_params.items():
        mlflow.log_param(key, value)
    for key, value in (extra_params or {}).items():
        mlflow.log_param(key, value)

    os.makedirs(DATA_PATHS["models_dir"], exist_ok=True)
    model_path = Path(DATA_PATHS["models_dir"]) / f"{exp_name}.pkl"
    joblib.dump(model, model_path)
    mlflow.log_artifact(str(model_path), artifact_path=f"model_{exp_name}")
    print(f"Model saved to {model_path}")
    print(f"{exp_name} completed and logged successfully.\n")
|
|
|
|
|
|
|
|
def run_grid_search(X, Y):
    """Build an unfitted GridSearchCV over a multi-output RandomForest.

    X and Y are accepted for interface symmetry with the experiment runners
    but are not used here; callers invoke .fit on the returned object.
    """
    forest = RandomForestClassifier(random_state=TRAINING_CONFIG["random_state"], n_jobs=-1)
    wrapped = MultiOutputClassifier(forest)

    folds = KFold(
        n_splits=TRAINING_CONFIG["cv_folds"],
        shuffle=True,
        random_state=TRAINING_CONFIG["random_state"],
    )

    return GridSearchCV(
        estimator=wrapped,
        param_grid=MODEL_CONFIG["param_grid"],
        scoring="f1_micro",
        cv=folds,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
|
|
|
|
|
|
|
|
def run_grid_search_lgb(X, Y):
    """Build an unfitted GridSearchCV over a multi-output LightGBM classifier.

    X and Y are accepted for interface symmetry but not used here; the caller
    fits the returned grid. Each LGBM uses a single thread (n_jobs=1) while
    the MultiOutputClassifier and the grid parallelize across labels/folds.
    """
    booster = lgb.LGBMClassifier(
        random_state=TRAINING_CONFIG["random_state"], n_jobs=1, force_row_wise=True, verbose=-1
    )
    wrapped = MultiOutputClassifier(booster, n_jobs=-1)

    folds = KFold(
        n_splits=TRAINING_CONFIG["cv_folds"],
        shuffle=True,
        random_state=TRAINING_CONFIG["random_state"],
    )

    # Small, focused grid: learning_rate and num_leaves are held fixed.
    search_space = {
        "estimator__n_estimators": [50, 100, 200],
        "estimator__max_depth": [3, 5, 7],
        "estimator__learning_rate": [0.1],
        "estimator__num_leaves": [15],
    }

    return GridSearchCV(
        estimator=wrapped,
        param_grid=search_space,
        scoring="f1_micro",
        cv=folds,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_smote_experiment(X, Y, feature_type="tfidf"):
    """Random-forest experiment with MLSMOTE oversampling.

    Splits the data, oversamples the training set with the local MLSMOTE
    implementation (falling back to RandomOverSampler on failure or when
    MLSMOTE is unavailable), grid-searches on the oversampled data, refits
    the best model on the original train+val split, and logs to MLflow.

    Args:
        X: feature matrix
        Y: (n_samples, n_labels) binary label matrix
        feature_type: tag used in the logged experiment name
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["smote"])

    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )

    # Fail fast if some labels are entirely absent from train+val.
    _check_label_coverage(Y_train, Y_val)

    print("Applying MLSMOTE (Multi-Label SMOTE) as per SkillScope paper...")
    print(f" Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")

    if _HAS_LOCAL_MLSMOTE:
        try:
            # Seed numpy and stdlib random so MLSMOTE's sampling is reproducible.
            if TRAINING_CONFIG["random_state"] is not None:
                np.random.seed(TRAINING_CONFIG["random_state"])
                import random

                random.seed(TRAINING_CONFIG["random_state"])

            # The MLSMOTE helpers operate on pandas DataFrames.
            X_train_df = pd.DataFrame(X_train)
            Y_train_df = pd.DataFrame(Y_train)

            # Extract the samples tagged to minority labels.
            X_min, Y_min = get_minority_instace(X_train_df, Y_train_df)

            if len(X_min) == 0:
                print("No minority instances found, using original dataset")
                X_res, Y_res = X_train, Y_train
                oversampling_method = "None (no minority instances)"
                n_new = 0
            else:
                # Heuristic sample budget: at least 100, roughly the mean-min
                # label-count gap, capped at 3x the minority pool size.
                label_counts = Y_train_df.sum(axis=0)
                mean_count = int(label_counts.mean())
                min_count = int(label_counts.min())
                n_synthetic = max(100, int(mean_count - min_count))
                n_synthetic = min(n_synthetic, len(X_min) * 3)

                print(
                    f"Generating {n_synthetic} synthetic samples from {len(X_min)} minority instances"
                )

                X_res_df, Y_res_df = mlsmote_function(X_min, Y_min, n_synthetic)

                # NOTE(review): X_res comes from mlsmote_function's output only;
                # whether it contains the full training set or just the minority
                # pool plus synthetics depends on mlsmote.py — confirm, since
                # n_new below is computed relative to the full training set.
                X_res = X_res_df.values
                Y_res = Y_res_df.values.astype(int)

                oversampling_method = "MLSMOTE (local implementation)"
                n_new = len(X_res) - len(X_train)
                print(
                    f"MLSMOTE completed: {n_new} synthetic samples generated. Total: {len(X_res)} samples"
                )
        except Exception as e:
            # Best-effort fallback: encode each label row as a bit-string so
            # RandomOverSampler can resample on unique label combinations.
            print(f"MLSMOTE failed ({e}); falling back to RandomOverSampler")
            Y_train_str = ["".join(map(str, y)) for y in Y_train]
            ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
            X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
            Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
            oversampling_method = "RandomOverSampler (MLSMOTE fallback)"
            n_new = len(X_res) - len(X_train)
    else:
        # MLSMOTE module never imported: same RandomOverSampler fallback.
        print("Local MLSMOTE not available; falling back to RandomOverSampler")
        Y_train_str = ["".join(map(str, y)) for y in Y_train]
        ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
        X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
        Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
        oversampling_method = "RandomOverSampler (no MLSMOTE)"
        n_new = len(X_res) - len(X_train)

    # Hyperparameter search runs on the oversampled data ...
    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_smote"):
        grid.fit(X_res, Y_res)

        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        # ... but the final model is refit on the ORIGINAL train+val split,
        # so oversampling only influences hyperparameter selection.
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)

        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            f"random_forest_{feature_type}_gridsearch_smote",
            {
                "oversampling": oversampling_method,
                "synthetic_samples": n_new,
                "n_labels": Y_train.shape[1],
            },
        )
|
|
|
|
|
|
|
|
def run_ros_experiment(X, Y):
    """Random-forest experiment with RandomOverSampler oversampling.

    Each multi-label row is encoded as a bit-string ("0110...") so the
    single-label RandomOverSampler can resample on unique label combinations;
    the strings are decoded back into a binary matrix afterwards. The grid
    search runs on the oversampled data, then the best model is refit on the
    original train+val split and evaluated on the held-out test set.

    Args:
        X: feature matrix
        Y: (n_samples, n_labels) binary label matrix
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["ros"])

    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )

    # Encode label rows as bit-strings for the single-label sampler.
    Y_train_str = ["".join(map(str, y)) for y in Y_train]
    ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
    X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)

    # Decode bit-strings back into a binary label matrix.
    # (Removed a stray no-op expression `Y.shape[1]` that had no effect.)
    Y_res = np.array([[int(c) for c in s] for s in Y_res_str])

    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_ros"):
        grid.fit(X_res, Y_res)

        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        # Refit the winning configuration on the original (non-oversampled)
        # train+val data before final evaluation.
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)

        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "random_forest_tfidf_gridsearch_ros",
            {"oversampling": "RandomOverSampler"},
        )
|
|
|
|
|
|
|
|
def run_adasyn_pca_experiment(X, Y):
    """Random-forest experiment with ADASYN oversampling performed in PCA space.

    Training features are projected with PCA, oversampled with ADASYN on a
    single label column, projected back with the inverse transform, and used
    for the grid search. The final model is refit on the original train+val
    split; the fitted PCA is persisted alongside the model.

    Args:
        X: feature matrix
        Y: (n_samples, n_labels) binary label matrix
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"]["adasyn_pca"])

    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )

    print("Applying PCA before ADASYN...")
    pca = PCA(
        n_components=PCA_CONFIG["variance_retained"], random_state=TRAINING_CONFIG["random_state"]
    )
    X_train_pca = pca.fit_transform(X_train)

    adasyn = ADASYN(
        random_state=TRAINING_CONFIG["random_state"],
        n_neighbors=ADASYN_CONFIG["n_neighbors"],
        sampling_strategy=ADASYN_CONFIG["sampling_strategy"],
    )

    # ADASYN is single-label: pick the first label column that actually has
    # both classes present and oversample with respect to that column only.
    valid_label_idx = next(
        (i for i in range(Y_train.shape[1]) if len(np.unique(Y_train[:, i])) > 1), None
    )

    if valid_label_idx is None:
        # Every label column is constant; nothing for ADASYN to balance.
        X_res, Y_res = X_train, Y_train
        n_new = 0
    else:
        X_res_pca, _ = adasyn.fit_resample(X_train_pca, Y_train[:, valid_label_idx])
        X_res = pca.inverse_transform(X_res_pca)
        n_new = len(X_res) - len(X_train)
        # NOTE(review): the synthetic rows receive label vectors copied at
        # random from existing training rows, so their labels are NOT derived
        # from the ADASYN-generated features — confirm this heuristic is
        # intentional.
        Y_res = np.vstack([Y_train, Y_train[np.random.randint(0, len(Y_train), n_new)]])

    grid = run_grid_search(X_res, Y_res)
    with mlflow.start_run(run_name="random_forest_with_adasyn_pca"):
        grid.fit(X_res, Y_res)

        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        # Refit on the original (non-oversampled) train+val data.
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)

        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "random_forest_tfidf_gridsearch_adasyn_pca",
            {
                "oversampling": "ADASYN + PCA",
                "pca_variance": PCA_CONFIG["variance_retained"],
                "synthetic_samples": n_new,
            },
        )
        # Persist the fitted PCA so inference can apply the same projection.
        pca_path = Path(DATA_PATHS["models_dir"]) / "pca_tfidf_adasyn.pkl"
        joblib.dump(pca, pca_path)
        mlflow.log_artifact(str(pca_path), artifact_path="model_adasyn_pca")
|
|
|
|
|
|
|
|
def run_lightgbm(X, Y):
    """LightGBM experiment without oversampling: grid search, refit on
    train+val, evaluate on test, and log everything to MLflow.

    Args:
        X: feature matrix
        Y: (n_samples, n_labels) binary label matrix
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm", "LightGBM"))

    splits = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )
    X_train, X_val, X_test, Y_train, Y_val, Y_test = splits

    print("\nTraining LightGBM with GridSearchCV...")
    searcher = run_grid_search_lgb(X_train, Y_train)

    with mlflow.start_run(run_name="lightgbm"):
        searcher.fit(X_train, Y_train)

        chosen_params = searcher.best_params_
        cv_f1 = searcher.best_score_
        model = searcher.best_estimator_

        # Refit the winning configuration on train+val before final scoring.
        if X_val.size:
            fit_X = np.vstack([X_train, X_val])
            fit_Y = np.vstack([Y_train, Y_val])
        else:
            fit_X, fit_Y = X_train, Y_train
        model.fit(fit_X, fit_Y)

        evaluate_and_log(
            model,
            X_test,
            Y_test,
            chosen_params,
            cv_f1,
            "lightgbm_tfidf_gridsearch",
            {"oversampling": "None", "model": "LightGBM"},
        )
|
|
|
|
|
|
|
|
def run_lightgbm_smote_experiment(X, Y):
    """LightGBM experiment with MLSMOTE oversampling.

    Mirrors run_smote_experiment but trains LightGBM instead of a random
    forest: oversample the training set (MLSMOTE, with RandomOverSampler
    fallbacks), grid-search on the oversampled data, refit the best model on
    the original train+val split, and log to MLflow.

    Args:
        X: feature matrix
        Y: (n_samples, n_labels) binary label matrix
    """
    mlflow.set_experiment(MLFLOW_CONFIG["experiments"].get("lightgbm_smote", "LightGBM_SMOTE"))

    X_train, X_val, X_test, Y_train, Y_val, Y_test = stratified_train_val_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        val_size=TRAINING_CONFIG.get("val_size", 0.1),
        random_state=TRAINING_CONFIG["random_state"],
    )

    print(" Applying MLSMOTE for LightGBM...")
    print(f" Original training set: {X_train.shape[0]} samples, {Y_train.shape[1]} labels")

    if _HAS_LOCAL_MLSMOTE:
        try:
            # Seed numpy and stdlib random so MLSMOTE's sampling is reproducible.
            if TRAINING_CONFIG["random_state"] is not None:
                np.random.seed(TRAINING_CONFIG["random_state"])
                import random

                random.seed(TRAINING_CONFIG["random_state"])

            # The MLSMOTE helpers operate on pandas DataFrames.
            X_train_df = pd.DataFrame(X_train)
            Y_train_df = pd.DataFrame(Y_train)

            # Extract the samples tagged to minority labels.
            X_min, Y_min = get_minority_instace(X_train_df, Y_train_df)

            if len(X_min) == 0:
                print("No minority instances found, using original dataset")
                X_res, Y_res = X_train, Y_train
                oversampling_method = "None (no minority instances)"
                n_new = 0
            else:
                # Heuristic sample budget: at least 100, roughly the mean-min
                # label-count gap, capped at 3x the minority pool size.
                label_counts = Y_train_df.sum(axis=0)
                mean_count = int(label_counts.mean())
                min_count = int(label_counts.min())
                n_synthetic = max(100, int(mean_count - min_count))
                n_synthetic = min(n_synthetic, len(X_min) * 3)

                print(
                    f"Generating {n_synthetic} synthetic samples from {len(X_min)} minority instances"
                )

                X_res_df, Y_res_df = mlsmote_function(X_min, Y_min, n_synthetic)

                # NOTE(review): X_res comes from mlsmote_function's output only;
                # whether it includes the full training set depends on
                # mlsmote.py — confirm, since n_new below assumes it does.
                X_res = X_res_df.values
                Y_res = Y_res_df.values.astype(int)

                oversampling_method = "MLSMOTE (local implementation)"
                n_new = len(X_res) - len(X_train)
                print(
                    f"MLSMOTE completed: {n_new} synthetic samples generated. Total: {len(X_res)} samples"
                )
        except Exception as e:
            # Best-effort fallback: bit-string encoding lets the single-label
            # RandomOverSampler resample on unique label combinations.
            print(f"MLSMOTE failed ({e}); falling back to RandomOverSampler")
            Y_train_str = ["".join(map(str, y)) for y in Y_train]
            ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
            X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
            Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
            oversampling_method = "RandomOverSampler (MLSMOTE fallback)"
            n_new = len(X_res) - len(X_train)
    else:
        # MLSMOTE module never imported: same RandomOverSampler fallback.
        print(" Local MLSMOTE not available; falling back to RandomOverSampler")
        Y_train_str = ["".join(map(str, y)) for y in Y_train]
        ros = RandomOverSampler(random_state=TRAINING_CONFIG["random_state"])
        X_res, Y_res_str = ros.fit_resample(X_train, Y_train_str)
        Y_res = np.array([[int(c) for c in s] for s in Y_res_str])
        oversampling_method = "RandomOverSampler (no MLSMOTE)"
        n_new = len(X_res) - len(X_train)

    print(f"\n Training LightGBM with {oversampling_method} ({n_new} synthetic samples)...")
    # Hyperparameter search runs on the oversampled data ...
    grid = run_grid_search_lgb(X_res, Y_res)

    with mlflow.start_run(run_name="lightgbm_with_smote"):
        grid.fit(X_res, Y_res)

        best_params = grid.best_params_
        best_cv = grid.best_score_
        final_model = grid.best_estimator_
        # ... but the final model is refit on the ORIGINAL train+val split,
        # so oversampling only influences hyperparameter selection.
        X_comb = np.vstack([X_train, X_val]) if X_val.size else X_train
        Y_comb = np.vstack([Y_train, Y_val]) if Y_val.size else Y_train
        final_model.fit(X_comb, Y_comb)

        evaluate_and_log(
            final_model,
            X_test,
            Y_test,
            best_params,
            best_cv,
            "lightgbm_tfidf_gridsearch_smote",
            {
                "oversampling": oversampling_method,
                "synthetic_samples": n_new,
                "n_labels": Y_train.shape[1],
                "model": "LightGBM",
            },
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_baseline_train(feature_type="tfidf", use_cleaned=True):
    """Run baseline training with configurable feature type.

    Loads features, does an 80/20 stratified split, drops labels that never
    occur in TRAIN, grid-searches a multi-output random forest, evaluates on
    the test split, and logs/persists the model and the test arrays.

    Args:
        feature_type: 'tfidf' or 'embedding'
        use_cleaned: whether to use cleaned data
    """
    mlflow.set_experiment(
        MLFLOW_CONFIG.get("experiments", {}).get("baseline", "hopcroft_random_forest_baseline")
    )

    X, Y = load_data(feature_type=feature_type, use_cleaned=use_cleaned)

    print(" Using 80/20 train/test split as per paper...")
    X_train, X_test, Y_train, Y_test = stratified_train_test_split(
        X,
        Y,
        test_size=TRAINING_CONFIG.get("test_size", 0.2),
        random_state=TRAINING_CONFIG.get("random_state", 42),
    )

    # Labels with zero positives in TRAIN cannot be learned; detect them.
    train_counts = np.sum(Y_train, axis=0).astype(int)
    zero_in_train = np.where(train_counts == 0)[0]

    if zero_in_train.size > 0:
        kept_idx = np.where(train_counts > 0)[0]
        print(
            f"[warning] Removing {zero_in_train.size} label(s) with 0 occurrences in TRAIN set. Example removed indices: {zero_in_train[:10].tolist()}"
        )
        # Drop the uncovered label columns from both splits.
        Y_train = Y_train[:, kept_idx]
        Y_test = Y_test[:, kept_idx]

        # Persist the surviving label indices so downstream consumers can map
        # model outputs back to the original label space.
        # NOTE(review): this save must stay under the guard above — kept_idx
        # is only defined when labels were actually removed.
        paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
        kept_indices_path = Path(paths["features"]).parent / "kept_label_indices.npy"
        np.save(kept_indices_path, kept_idx)
        print(f"Saved kept label indices to {kept_indices_path}")

    # No validation set in the baseline: pass an empty VAL matrix.
    _check_label_coverage(Y_train, np.empty((0, Y_train.shape[1])))

    base_rf = RandomForestClassifier(
        random_state=TRAINING_CONFIG.get("random_state", 42), n_jobs=-1
    )
    multi = MultiOutputClassifier(base_rf)

    # Use the configured grid when present, otherwise a sensible default.
    param_grid = MODEL_CONFIG.get(
        "param_grid",
        {
            "estimator__n_estimators": [50, 100, 200],
            "estimator__max_depth": [10, 20, 30],
            "estimator__min_samples_split": [2, 5],
        },
    )

    cv = KFold(
        n_splits=TRAINING_CONFIG.get("cv_folds", 5),
        shuffle=True,
        random_state=TRAINING_CONFIG.get("random_state", 42),
    )

    print(
        f" GridSearch with {cv.n_splits} folds and {len(param_grid['estimator__n_estimators']) * len(param_grid['estimator__max_depth']) * len(param_grid['estimator__min_samples_split'])} combinations..."
    )

    grid = GridSearchCV(
        estimator=multi,
        param_grid=param_grid,
        scoring="f1_micro",
        cv=cv,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )

    with mlflow.start_run(run_name="random_forest_tfidf_gridsearch"):
        grid.fit(X_train, Y_train)

        best = grid.best_estimator_
        best_params = grid.best_params_
        best_cv_score = grid.best_score_

        # Evaluate the refit best estimator on the held-out test split.
        Y_pred_test = best.predict(X_test)

        precision = precision_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        recall = recall_score(Y_test, Y_pred_test, average="micro", zero_division=0)
        f1 = f1_score(Y_test, Y_pred_test, average="micro", zero_division=0)

        mlflow.log_param("model_type", "RandomForest + MultiOutput")
        for k, v in best_params.items():
            mlflow.log_param(k, v)
        mlflow.log_metric("cv_best_f1_micro", best_cv_score)

        mlflow.log_metric("test_precision_micro", precision)
        mlflow.log_metric("test_recall_micro", recall)
        mlflow.log_metric("test_f1_micro", f1)
        mlflow.log_param("feature_type", feature_type)
        mlflow.log_param("use_cleaned", use_cleaned)

        print("\n=== Training Results ===")
        print(f"Test Precision (Micro): {precision:.4f}")
        print(f"Test Recall (Micro): {recall:.4f}")
        print(f"Test F1 Score (Micro): {f1:.4f}")
        print("========================\n")

        paths = get_feature_paths(feature_type=feature_type, use_cleaned=use_cleaned)
        os.makedirs(paths["models_dir"], exist_ok=True)

        model_path = Path(paths["models_dir"]) / f"random_forest_{feature_type}_gridsearch.pkl"
        joblib.dump(best, model_path)

        # Persist the test split so run_inference can re-score later.
        np.save(Path(paths["features"]).parent / "X_test.npy", X_test)
        np.save(Path(paths["labels"]).parent / "Y_test.npy", Y_test)

        mlflow.sklearn.log_model(best, "model")

        print("Grid search training completed and logged successfully.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_inference(model_path: str = None):
    """Load a persisted model, score the saved test split, and log metrics.

    Args:
        model_path: optional pickle path; defaults to the baseline
            grid-search artifact under DATA_PATHS['models_dir'].
    """
    mlflow.set_experiment(
        MLFLOW_CONFIG.get("experiments", {}).get("inference", "hopcroft_random_forest_inference")
    )

    # Resolve which pickle to load.
    if model_path is None:
        resolved_path = Path(DATA_PATHS["models_dir"]) / "random_forest_tfidf_gridsearch.pkl"
    else:
        resolved_path = Path(model_path)

    model = joblib.load(str(resolved_path))

    # Test arrays persisted by run_baseline_train.
    X_test = np.load(Path(DATA_PATHS["features"]).parent / "X_test.npy")
    Y_test = np.load(Path(DATA_PATHS["labels"]).parent / "Y_test.npy")

    with mlflow.start_run(run_name="random_forest_tfidf_inference"):
        Y_pred = model.predict(X_test)

        precision = precision_score(Y_test, Y_pred, average="micro", zero_division=0)
        recall = recall_score(Y_test, Y_pred, average="micro", zero_division=0)
        f1 = f1_score(Y_test, Y_pred, average="micro", zero_division=0)

        for metric_name, metric_value in (
            ("test_precision_micro", precision),
            ("test_recall_micro", recall),
            ("test_f1_micro", f1),
        ):
            mlflow.log_metric(metric_name, metric_value)

        print(f"Inference completed — Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
|
|
|
|
|
|
|
|
def _parse_args(): |
|
|
p = argparse.ArgumentParser(description="Unified training & experiments script") |
|
|
p.add_argument( |
|
|
"action", |
|
|
choices=[ |
|
|
"baseline", |
|
|
"smote", |
|
|
"ros", |
|
|
"adasyn_pca", |
|
|
"lightgbm", |
|
|
"lightgbm_smote", |
|
|
"predict", |
|
|
], |
|
|
help="Action to run", |
|
|
) |
|
|
p.add_argument("--model-path", help="Custom model path for inference") |
|
|
return p.parse_args() |
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    args = _parse_args()

    # The baseline loads its own data internally; every other action shares
    # a single load of the tfidf/cleaned feature set.
    if args.action == "baseline":
        run_baseline_train(feature_type="tfidf", use_cleaned=True)
    else:
        X, Y = load_data(feature_type="tfidf", use_cleaned=True)

        if args.action == "smote":
            run_smote_experiment(X, Y)
        elif args.action == "ros":
            run_ros_experiment(X, Y)
        elif args.action == "adasyn_pca":
            run_adasyn_pca_experiment(X, Y)
        elif args.action == "lightgbm":
            run_lightgbm(X, Y)
        elif args.action == "lightgbm_smote":
            run_lightgbm_smote_experiment(X, Y)
        elif args.action == "predict":
            # NOTE(review): run_inference loads its own persisted test arrays,
            # so the X, Y loaded above are unused for this action.
            run_inference(args.model_path)
|
|
|