Spaces:
Running
Running
| """TF-IDF + Logistic Regression baseline model for intent classification.""" | |
| import json | |
| import pickle | |
| from pathlib import Path | |
| from typing import Dict, List, Tuple | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import pandas as pd | |
| import seaborn as sns | |
| from loguru import logger | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.metrics import ( | |
| classification_report, | |
| confusion_matrix, | |
| f1_score, | |
| ) | |
| from sklearn.pipeline import Pipeline | |
| from src.data.dataset import INTENT_CATEGORIES | |
| def build_pipeline( | |
| max_features: int = 10000, | |
| ngram_range: Tuple[int, int] = (1, 2), | |
| min_df: int = 2, | |
| sublinear_tf: bool = True, | |
| C: float = 1.0, | |
| max_iter: int = 1000, | |
| seed: int = 42, | |
| ) -> Pipeline: | |
| """Build and return an unfitted TF-IDF + LogisticRegression sklearn Pipeline.""" | |
| tfidf = TfidfVectorizer( | |
| max_features=max_features, | |
| ngram_range=tuple(ngram_range), | |
| min_df=min_df, | |
| sublinear_tf=sublinear_tf, | |
| ) | |
| lr = LogisticRegression( | |
| C=C, | |
| max_iter=max_iter, | |
| class_weight="balanced", | |
| solver="lbfgs", | |
| random_state=seed, | |
| ) | |
| return Pipeline([("tfidf", tfidf), ("clf", lr)]) | |
| def train( | |
| train_df: pd.DataFrame, | |
| val_df: pd.DataFrame, | |
| cfg: dict, | |
| save_dir: str, | |
| ) -> Pipeline: | |
| """Train the baseline pipeline and evaluate on validation set. | |
| Args: | |
| train_df: Training DataFrame with 'text' and 'label' columns. | |
| val_df: Validation DataFrame with 'text' and 'label' columns. | |
| cfg: Config dict (from config.yaml). | |
| save_dir: Directory to save the fitted pipeline. | |
| Returns: | |
| Fitted sklearn Pipeline. | |
| """ | |
| bc = cfg["baseline"] | |
| pipeline = build_pipeline( | |
| max_features=bc["tfidf"]["max_features"], | |
| ngram_range=bc["tfidf"]["ngram_range"], | |
| min_df=bc["tfidf"]["min_df"], | |
| sublinear_tf=bc["tfidf"]["sublinear_tf"], | |
| C=bc["logistic_regression"]["C"], | |
| max_iter=bc["logistic_regression"]["max_iter"], | |
| seed=bc["logistic_regression"]["seed"], | |
| ) | |
| logger.info(f"Training baseline on {len(train_df):,} examples…") | |
| pipeline.fit(train_df["text"], train_df["label"]) | |
| val_preds = pipeline.predict(val_df["text"]) | |
| val_f1 = f1_score(val_df["label"], val_preds, average="weighted") | |
| logger.info(f"Validation weighted F1: {val_f1:.4f}") | |
| Path(save_dir).mkdir(parents=True, exist_ok=True) | |
| model_path = Path(save_dir) / "baseline_pipeline.pkl" | |
| with open(model_path, "wb") as f: | |
| pickle.dump(pipeline, f) | |
| logger.info(f"Saved baseline pipeline → {model_path}") | |
| return pipeline | |
| def evaluate( | |
| pipeline: Pipeline, | |
| test_df: pd.DataFrame, | |
| results_dir: str, | |
| ) -> Dict: | |
| """Evaluate the baseline on the test set and save artifacts. | |
| Args: | |
| pipeline: Fitted sklearn Pipeline. | |
| test_df: Test DataFrame with 'text' and 'label' columns. | |
| results_dir: Directory to save evaluation artifacts. | |
| Returns: | |
| Dictionary with classification report metrics. | |
| """ | |
| Path(results_dir).mkdir(parents=True, exist_ok=True) | |
| preds = pipeline.predict(test_df["text"]) | |
| labels = sorted(INTENT_CATEGORIES) | |
| report = classification_report( | |
| test_df["label"], preds, labels=labels, output_dict=True | |
| ) | |
| report_text = classification_report(test_df["label"], preds, labels=labels) | |
| logger.info(f"Baseline classification report:\n{report_text}") | |
| # Save JSON report | |
| report_path = Path(results_dir) / "baseline_classification_report.json" | |
| with open(report_path, "w") as f: | |
| json.dump(report, f, indent=2) | |
| logger.info(f"Saved classification report → {report_path}") | |
| # Confusion matrix | |
| cm = confusion_matrix(test_df["label"], preds, labels=labels) | |
| fig, ax = plt.subplots(figsize=(10, 8)) | |
| sns.heatmap( | |
| cm, | |
| annot=True, | |
| fmt="d", | |
| cmap="Blues", | |
| xticklabels=labels, | |
| yticklabels=labels, | |
| ax=ax, | |
| ) | |
| ax.set_title("Baseline Confusion Matrix (TF-IDF + LR)") | |
| ax.set_xlabel("Predicted") | |
| ax.set_ylabel("True") | |
| plt.tight_layout() | |
| cm_path = Path(results_dir) / "baseline_confusion_matrix.png" | |
| fig.savefig(cm_path, dpi=150) | |
| plt.close(fig) | |
| logger.info(f"Saved confusion matrix → {cm_path}") | |
| weighted_f1 = report["weighted avg"]["f1-score"] | |
| logger.info(f"Baseline test weighted F1: {weighted_f1:.4f}") | |
| return report | |
| def load_pipeline(save_dir: str) -> Pipeline: | |
| """Load and return the saved baseline pipeline from disk.""" | |
| path = Path(save_dir) / "baseline_pipeline.pkl" | |
| with open(path, "rb") as f: | |
| pipeline = pickle.load(f) | |
| logger.info(f"Loaded baseline pipeline from {path}") | |
| return pipeline | |