Spaces:

pro580
/

customer-support-agent

Running

App Files Files Community

customer-support-agent / src /models /baseline.py

pro580

Fix rate limiter to use X-Forwarded-For header behind HF proxy

e323466 3 months ago

Raw

History Blame Contribute Delete

4.85 kB

	"""TF-IDF + Logistic Regression baseline model for intent classification."""

	import json
	import pickle
	from pathlib import Path
	from typing import Dict, List, Tuple

	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	import seaborn as sns
	from loguru import logger
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import (
	classification_report,
	confusion_matrix,
	f1_score,
	)
	from sklearn.pipeline import Pipeline

	from src.data.dataset import INTENT_CATEGORIES


	def build_pipeline(
	    max_features: int = 10000,
	    ngram_range: Tuple[int, int] = (1, 2),
	    min_df: int = 2,
	    sublinear_tf: bool = True,
	    C: float = 1.0,
	    max_iter: int = 1000,
	    seed: int = 42,
	) -> Pipeline:
	    """Assemble an unfitted two-step pipeline: TF-IDF features, then logistic regression.

	    Args:
	        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
	        ngram_range: Forwarded to ``TfidfVectorizer(ngram_range=...)`` after
	            being coerced to a tuple, so any two-item sequence is accepted.
	        min_df: Forwarded to ``TfidfVectorizer(min_df=...)``.
	        sublinear_tf: Forwarded to ``TfidfVectorizer(sublinear_tf=...)``.
	        C: Forwarded to ``LogisticRegression(C=...)``.
	        max_iter: Forwarded to ``LogisticRegression(max_iter=...)``.
	        seed: Used as the classifier's ``random_state``.

	    Returns:
	        A ``Pipeline`` with the steps ``"tfidf"`` and ``"clf"``, not yet fitted.
	    """
	    # Coerce up front: callers may hand over a list instead of a tuple
	    # (presumably when the value comes from a parsed config file).
	    ngram_bounds = tuple(ngram_range)

	    vectorizer = TfidfVectorizer(
	        max_features=max_features,
	        ngram_range=ngram_bounds,
	        min_df=min_df,
	        sublinear_tf=sublinear_tf,
	    )

	    # The classifier is always built with balanced class weights and the
	    # lbfgs solver; only C, max_iter and the seed are tunable from outside.
	    classifier = LogisticRegression(
	        C=C,
	        max_iter=max_iter,
	        class_weight="balanced",
	        solver="lbfgs",
	        random_state=seed,
	    )

	    steps = [
	        ("tfidf", vectorizer),
	        ("clf", classifier),
	    ]
	    return Pipeline(steps)


	def train(
	    train_df: pd.DataFrame,
	    val_df: pd.DataFrame,
	    cfg: dict,
	    save_dir: str,
	) -> Pipeline:
	    """Fit the baseline pipeline, log its validation score, and pickle it.

	    Args:
	        train_df: Training DataFrame; its 'text' and 'label' columns are used
	            to fit the pipeline.
	        val_df: Validation DataFrame; its 'text' column is predicted on and
	            its 'label' column is used to score the predictions.
	        cfg: Config dict. Hyperparameters are read from
	            ``cfg["baseline"]["tfidf"]`` (``max_features``, ``ngram_range``,
	            ``min_df``, ``sublinear_tf``) and
	            ``cfg["baseline"]["logistic_regression"]`` (``C``, ``max_iter``,
	            ``seed``).
	        save_dir: Directory that receives ``baseline_pipeline.pkl``; it is
	            created (including parents) when missing.

	    Returns:
	        The fitted sklearn Pipeline.
	    """
	    baseline_cfg = cfg["baseline"]

	    # TF-IDF settings are looked up first, then the classifier settings.
	    tfidf_cfg = baseline_cfg["tfidf"]
	    vectorizer_args = {
	        "max_features": tfidf_cfg["max_features"],
	        "ngram_range": tfidf_cfg["ngram_range"],
	        "min_df": tfidf_cfg["min_df"],
	        "sublinear_tf": tfidf_cfg["sublinear_tf"],
	    }
	    lr_cfg = baseline_cfg["logistic_regression"]
	    classifier_args = {
	        "C": lr_cfg["C"],
	        "max_iter": lr_cfg["max_iter"],
	        "seed": lr_cfg["seed"],
	    }
	    model = build_pipeline(**vectorizer_args, **classifier_args)

	    logger.info(f"Training baseline on {len(train_df):,} examples…")
	    model.fit(train_df["text"], train_df["label"])

	    # Score on the validation split; the number is logged only, not returned.
	    predicted = model.predict(val_df["text"])
	    val_f1 = f1_score(val_df["label"], predicted, average="weighted")
	    logger.info(f"Validation weighted F1: {val_f1:.4f}")

	    # Persist the fitted pipeline next to any other artifacts in save_dir.
	    out_dir = Path(save_dir)
	    out_dir.mkdir(parents=True, exist_ok=True)
	    model_path = out_dir / "baseline_pipeline.pkl"
	    with model_path.open("wb") as fh:
	        pickle.dump(model, fh)
	    logger.info(f"Saved baseline pipeline → {model_path}")

	    return model


	def evaluate(
	    pipeline: Pipeline,
	    test_df: pd.DataFrame,
	    results_dir: str,
	) -> Dict:
	    """Evaluate the baseline on the test set and save artifacts.

	    Two files are written into ``results_dir`` (created if missing):
	    ``baseline_classification_report.json`` and
	    ``baseline_confusion_matrix.png``. The text report and the weighted F1
	    are also logged.

	    Args:
	        pipeline: Fitted sklearn Pipeline.
	        test_df: Test DataFrame with 'text' and 'label' columns.
	        results_dir: Directory to save evaluation artifacts.

	    Returns:
	        The classification report as a dictionary (the ``output_dict=True``
	        form of ``classification_report``).
	    """
	    out_dir = Path(results_dir)
	    out_dir.mkdir(parents=True, exist_ok=True)

	    preds = pipeline.predict(test_df["text"])
	    y_true = test_df["label"]
	    # Fix the label order so the report rows and the confusion-matrix axes
	    # line up with each other.
	    labels = sorted(INTENT_CATEGORIES)

	    report = classification_report(
	        y_true, preds, labels=labels, output_dict=True
	    )
	    report_text = classification_report(y_true, preds, labels=labels)
	    logger.info(f"Baseline classification report:\n{report_text}")

	    # Save JSON report
	    report_path = out_dir / "baseline_classification_report.json"
	    with open(report_path, "w", encoding="utf-8") as f:
	        json.dump(report, f, indent=2)
	    logger.info(f"Saved classification report → {report_path}")

	    # Confusion matrix
	    cm = confusion_matrix(y_true, preds, labels=labels)
	    cm_path = out_dir / "baseline_confusion_matrix.png"
	    fig, ax = plt.subplots(figsize=(10, 8))
	    try:
	        sns.heatmap(
	            cm,
	            annot=True,
	            fmt="d",
	            cmap="Blues",
	            xticklabels=labels,
	            yticklabels=labels,
	            ax=ax,
	        )
	        ax.set_title("Baseline Confusion Matrix (TF-IDF + LR)")
	        ax.set_xlabel("Predicted")
	        ax.set_ylabel("True")
	        # Lay out this specific figure rather than pyplot's implicit
	        # "current figure", which other code could have changed.
	        fig.tight_layout()
	        fig.savefig(cm_path, dpi=150)
	    finally:
	        # Always release the figure, even if drawing or saving failed;
	        # pyplot otherwise keeps it alive.
	        plt.close(fig)
	    logger.info(f"Saved confusion matrix → {cm_path}")

	    weighted_f1 = report["weighted avg"]["f1-score"]
	    logger.info(f"Baseline test weighted F1: {weighted_f1:.4f}")
	    return report


	def load_pipeline(save_dir: str) -> Pipeline:
	    """Read the pickled baseline pipeline back from ``save_dir``.

	    Args:
	        save_dir: Directory expected to contain ``baseline_pipeline.pkl``.

	    Returns:
	        The object unpickled from that file.

	    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
	    file, so ``save_dir`` should only ever point at trusted artifacts.
	    """
	    pkl_path = Path(save_dir).joinpath("baseline_pipeline.pkl")
	    with pkl_path.open("rb") as fh:
	        restored = pickle.load(fh)
	    logger.info(f"Loaded baseline pipeline from {pkl_path}")
	    return restored