pro580's picture
Fix rate limiter to use X-Forwarded-For header behind HF proxy
e323466
Raw
History Blame Contribute Delete
4.85 kB
"""TF-IDF + Logistic Regression baseline model for intent classification."""
import json
import pickle
from pathlib import Path
from typing import Dict, List, Tuple
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from loguru import logger
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
classification_report,
confusion_matrix,
f1_score,
)
from sklearn.pipeline import Pipeline
from src.data.dataset import INTENT_CATEGORIES
def build_pipeline(
    max_features: int = 10000,
    ngram_range: Tuple[int, int] = (1, 2),
    min_df: int = 2,
    sublinear_tf: bool = True,
    C: float = 1.0,
    max_iter: int = 1000,
    seed: int = 42,
) -> Pipeline:
    """Assemble an unfitted two-step text classification pipeline.

    The pipeline has a ``"tfidf"`` step (``TfidfVectorizer``) followed by a
    ``"clf"`` step (``LogisticRegression`` with the ``lbfgs`` solver and
    ``class_weight="balanced"``).

    Args:
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
        ngram_range: ``(min_n, max_n)`` pair forwarded to the vectorizer. It is
            coerced with ``tuple()``, so any two-item sequence (e.g. a list
            read from a config file) is accepted.
        min_df: Forwarded to ``TfidfVectorizer(min_df=...)``.
        sublinear_tf: Forwarded to ``TfidfVectorizer(sublinear_tf=...)``.
        C: Forwarded to ``LogisticRegression(C=...)``.
        max_iter: Forwarded to ``LogisticRegression(max_iter=...)``.
        seed: Used as the classifier's ``random_state``.

    Returns:
        An unfitted sklearn ``Pipeline`` with steps ``"tfidf"`` and ``"clf"``.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=tuple(ngram_range),
        max_features=max_features,
        sublinear_tf=sublinear_tf,
        min_df=min_df,
    )
    classifier = LogisticRegression(
        solver="lbfgs",
        class_weight="balanced",
        random_state=seed,
        max_iter=max_iter,
        C=C,
    )
    steps = [("tfidf", vectorizer), ("clf", classifier)]
    return Pipeline(steps=steps)
def train(
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    cfg: dict,
    save_dir: str,
) -> Pipeline:
    """Fit the baseline pipeline, report validation F1, and pickle the model.

    Hyperparameters are read from ``cfg["baseline"]["tfidf"]`` and
    ``cfg["baseline"]["logistic_regression"]``; a missing key raises
    ``KeyError``. The fitted pipeline is written to
    ``<save_dir>/baseline_pipeline.pkl`` (the directory is created if needed).

    Args:
        train_df: Training DataFrame with 'text' and 'label' columns.
        val_df: Validation DataFrame with 'text' and 'label' columns.
        cfg: Config dict (from config.yaml).
        save_dir: Directory to save the fitted pipeline.

    Returns:
        Fitted sklearn Pipeline.
    """
    baseline_cfg = cfg["baseline"]

    # Collect vectorizer settings first, then classifier settings, so the
    # config keys are looked up in the same order as the keyword arguments.
    tfidf_cfg = baseline_cfg["tfidf"]
    tfidf_kwargs = {
        key: tfidf_cfg[key]
        for key in ("max_features", "ngram_range", "min_df", "sublinear_tf")
    }
    lr_cfg = baseline_cfg["logistic_regression"]
    lr_kwargs = {key: lr_cfg[key] for key in ("C", "max_iter", "seed")}

    pipeline = build_pipeline(**tfidf_kwargs, **lr_kwargs)

    logger.info(f"Training baseline on {len(train_df):,} examples…")
    pipeline.fit(train_df["text"], train_df["label"])

    # Quick sanity metric on the held-out validation split.
    val_preds = pipeline.predict(val_df["text"])
    val_f1 = f1_score(val_df["label"], val_preds, average="weighted")
    logger.info(f"Validation weighted F1: {val_f1:.4f}")

    # Persist only after training succeeded.
    out_dir = Path(save_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    model_path = out_dir / "baseline_pipeline.pkl"
    with model_path.open("wb") as handle:
        pickle.dump(pipeline, handle)
    logger.info(f"Saved baseline pipeline → {model_path}")

    return pipeline
def evaluate(
    pipeline: Pipeline,
    test_df: pd.DataFrame,
    results_dir: str,
) -> Dict:
    """Evaluate the baseline on the test set and save artifacts.

    Writes two files into ``results_dir`` (created if missing):

    * ``baseline_classification_report.json`` - the dict form of the
      classification report.
    * ``baseline_confusion_matrix.png`` - a heatmap of the confusion matrix.

    Both the report and the matrix are computed over
    ``sorted(INTENT_CATEGORIES)``, so rows and columns follow that order.

    Args:
        pipeline: Fitted sklearn Pipeline.
        test_df: Test DataFrame with 'text' and 'label' columns.
        results_dir: Directory to save evaluation artifacts.

    Returns:
        Dictionary with classification report metrics, as returned by
        ``classification_report(..., output_dict=True)``.
    """
    out_dir = Path(results_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    y_true = test_df["label"]
    preds = pipeline.predict(test_df["text"])
    labels = sorted(INTENT_CATEGORIES)

    # The report is computed twice: once as a dict (saved and returned) and
    # once as preformatted text for the log.
    report = classification_report(
        y_true, preds, labels=labels, output_dict=True
    )
    report_text = classification_report(y_true, preds, labels=labels)
    logger.info(f"Baseline classification report:\n{report_text}")

    # Save JSON report
    report_path = out_dir / "baseline_classification_report.json"
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    logger.info(f"Saved classification report → {report_path}")

    # Confusion matrix
    cm = confusion_matrix(y_true, preds, labels=labels)
    cm_path = out_dir / "baseline_confusion_matrix.png"
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=labels,
            yticklabels=labels,
            ax=ax,
        )
        ax.set_title("Baseline Confusion Matrix (TF-IDF + LR)")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        # Lay out this figure explicitly rather than pyplot's implicit
        # "current figure".
        fig.tight_layout()
        fig.savefig(cm_path, dpi=150)
    finally:
        # Always release the figure, even if plotting or saving fails;
        # otherwise it stays registered with pyplot and leaks memory.
        plt.close(fig)
    logger.info(f"Saved confusion matrix → {cm_path}")

    weighted_f1 = report["weighted avg"]["f1-score"]
    logger.info(f"Baseline test weighted F1: {weighted_f1:.4f}")
    return report
def load_pipeline(save_dir: str) -> Pipeline:
    """Read the pickled baseline pipeline back from ``save_dir``.

    The file is expected at ``<save_dir>/baseline_pipeline.pkl``.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at directories whose contents are trusted.

    Args:
        save_dir: Directory containing ``baseline_pipeline.pkl``.

    Returns:
        The object stored in the pickle file.
    """
    model_path = Path(save_dir) / "baseline_pipeline.pkl"
    with model_path.open("rb") as handle:
        loaded = pickle.load(handle)
    logger.info(f"Loaded baseline pipeline from {model_path}")
    return loaded