""" Train query intent classifier. Uses Hugging Face transformer for sequence classification. """ import json from pathlib import Path import pandas as pd from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report, f1_score from transformers import ( AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, ) from datasets import Dataset from config import ( DATA_PATH, MODEL_DIR, RANDOM_STATE, HF_MODEL, QUERY_COLUMN, INTENT_COLUMN, INTENT_LABELS, ) def load_data(path: str) -> pd.DataFrame: df = pd.read_csv(path) if QUERY_COLUMN not in df.columns or INTENT_COLUMN not in df.columns: raise ValueError(f"Need columns: {QUERY_COLUMN}, {INTENT_COLUMN}") return df def main(): if not Path(DATA_PATH).exists(): print(f"Data not found at {DATA_PATH}. Create data/query_intent.csv with query, intent.") return df = load_data(DATA_PATH) df[INTENT_COLUMN] = pd.Categorical(df[INTENT_COLUMN], categories=INTENT_LABELS) df["label"] = df[INTENT_COLUMN].cat.codes train_df, val_df = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE, stratify=df["label"]) tokenizer = AutoTokenizer.from_pretrained(HF_MODEL) model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL, num_labels=len(INTENT_LABELS)) def tokenize(examples): return tokenizer(examples[QUERY_COLUMN], truncation=True, max_length=128, padding="max_length") train_ds = Dataset.from_pandas(train_df[["query", "label"]].rename(columns={"query": QUERY_COLUMN})) val_ds = Dataset.from_pandas(val_df[["query", "label"]].rename(columns={"query": QUERY_COLUMN})) train_ds = train_ds.map(tokenize, batched=True, remove_columns=[QUERY_COLUMN]) val_ds = val_ds.map(tokenize, batched=True, remove_columns=[QUERY_COLUMN]) train_ds.set_format("torch") val_ds.set_format("torch") args = TrainingArguments( output_dir=str(MODEL_DIR), num_train_epochs=3, per_device_train_batch_size=16, per_device_eval_batch_size=32, evaluation_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True, metric_for_best_model="f1", greater_is_better=True, ) def compute_metrics(eval_pred): preds = eval_pred.predictions.argmax(axis=1) return {"f1": float(f1_score(eval_pred.label_ids, preds, average="macro"))} trainer = Trainer( model=model, args=args, train_dataset=train_ds, eval_dataset=val_ds, compute_metrics=compute_metrics, ) trainer.train() trainer.save_model(str(MODEL_DIR)) tokenizer.save_pretrained(str(MODEL_DIR)) pred = trainer.predict(val_ds) report = classification_report( val_df["label"].values, pred.predictions.argmax(axis=1), target_names=INTENT_LABELS, output_dict=True, ) with open(MODEL_DIR / "metrics.json", "w") as f: json.dump({"classification_report": report}, f, indent=2) print(classification_report(val_df["label"].values, pred.predictions.argmax(axis=1), target_names=INTENT_LABELS)) if __name__ == "__main__": main()