import os
import sys
import json
import logging
import time

import numpy as np
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from matplotlib import pyplot as plt
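
# Make the project root importable when this script is run directly.
# The triple dirname walks up three levels, so this file is assumed to live
# three directories below the repo root.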
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(name)s | %(levelname)s | %(message)s")
logger = logging.getLogger("logistic_model")


def load_data(splits_dir):
    """Load the train and val dataframes, keeping the clean_text and text_length_bucket columns."""
    train_df = pd.read_csv(os.path.join(splits_dir, "df_train.csv"))
    val_df = pd.read_csv(os.path.join(splits_dir, "df_val.csv"))
    # Fill NaN just in case
    train_df["clean_text"] = train_df["clean_text"].fillna("")
    val_df["clean_text"] = val_df["clean_text"].fillna("")
    return train_df, val_df


def plot_and_save_cm(y_true, y_pred, path, title="Logistic Regression Confusion Matrix"):
    """Save a confusion matrix plot as a PNG."""
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.matshow(cm, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center', size='xx-large')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(title)
    plt.tight_layout()
    plt.savefig(path)
    plt.close()


def train_logistic_model(cfg, splits_dir, save_dir):
    logger.info("Initializing Logistic Regression training...")
    os.makedirs(save_dir, exist_ok=True)
    train_df, val_df = load_data(splits_dir)
    y_train = train_df["binary_label"].values
    y_val = val_df["binary_label"].values
    max_features = cfg.get("preprocessing", {}).get("max_tfidf_features", 50000)

    # Build the feature stack with a ColumnTransformer:
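    #   - TF-IDF over unigrams and bigrams of clean_text
    #   - one-hot encoding of the categorical text_length_bucket
    #   - all other columns are dropped (remainder="drop")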
    preprocessor = ColumnTransformer(
        transformers=[
            ("tfidf", TfidfVectorizer(max_features=max_features, ngram_range=(1, 2)), "clean_text"),
            ("cat", OneHotEncoder(handle_unknown="ignore"), ["text_length_bucket"]),
        ],
        remainder="drop",
    )

    # Define the model
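    # class_weight="balanced" reweights classes inversely to their frequency,
    # which guards against imbalance in binary_label.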
    log_reg = LogisticRegression(class_weight="balanced", random_state=42, max_iter=1000)
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", log_reg),
    ])

    # K-fold OOF predictions
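    # Out-of-fold probabilities are leakage-free predictions for every train row;
    # they are saved to disk, presumably as inputs to a downstream
    # stacking/ensembling step (the consumer is not shown in this file).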
| logger.info("Generating 5-Fold OOF predictions on Train set...") | |
| cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) | |
| # Using method='predict_proba' returns a 2D array [n_samples, 2] | |
| oof_probas = cross_val_predict(pipeline, train_df, y_train, cv=cv, method='predict_proba', n_jobs=-1) | |
| np.save(os.path.join(save_dir, "lr_oof.npy"), oof_probas[:, 1]) | |
| logger.info("Saved OOF predictions (lr_oof.npy)") | |
| # Hyperparameter Tuning on full Train via GridSearch | |
| logger.info("Hyperparameter tuning C over 5-folds...") | |
| param_grid = {'classifier__C': [0.1, 1.0, 10.0]} | |
| grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1) | |
| grid_search.fit(train_df, y_train) | |
| best_pipeline = grid_search.best_estimator_ | |
| logger.info(f"Best parameter C: {grid_search.best_params_['classifier__C']}") | |
| # Validation Evaluation | |
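    # GridSearchCV refits the best estimator on the full train set (refit=True
    # by default), so it is scored once on the held-out validation split using
    # the default 0.5 cutoff on the positive-class probability.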
    val_probas = best_pipeline.predict_proba(val_df)[:, 1]
    val_preds = (val_probas >= 0.5).astype(int)
    logger.info("Validation Classification Report:\n" + classification_report(y_val, val_preds))
    roc_auc = roc_auc_score(y_val, val_probas)
    logger.info(f"ROC-AUC: {roc_auc:.4f}")

    # Generate evaluation artifacts
    plot_and_save_cm(y_val, val_preds, os.path.join(save_dir, "cm.png"))

    # Compute accuracy per text length bucket on the validation set
    bucket_acc = {}
    for b in ["short", "medium", "long"]:
        b_mask = (val_df["text_length_bucket"] == b)
        if b_mask.sum() > 0:
            acc = (val_preds[b_mask] == y_val[b_mask]).mean()
            bucket_acc[b] = acc

    metrics = {
        "roc_auc": float(roc_auc),
        "bucket_accuracy": {k: float(v) for k, v in bucket_acc.items()},
    }
    with open(os.path.join(save_dir, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    # Save the fitted pipeline
    joblib.dump(best_pipeline, os.path.join(save_dir, "logistic_model.pkl"))
    logger.info("Saved Logistic Regression pipeline to logistic_model.pkl.")


if __name__ == "__main__":
    import yaml

    cfg_path = os.path.join(_PROJECT_ROOT, "config", "config.yaml")
    with open(cfg_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    s_dir = os.path.join(_PROJECT_ROOT, config["paths"]["splits_dir"])
    m_dir = os.path.join(_PROJECT_ROOT, config["paths"]["models_dir"], "logistic_model")

    t0 = time.time()
    train_logistic_model(config, s_dir, m_dir)
    print(f"Total time: {time.time() - t0:.2f}s")