| """ | |
| Train query intent classifier. | |
| Uses Hugging Face transformer for sequence classification. | |
| """ | |
| import json | |
| from pathlib import Path | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import classification_report, f1_score | |
| from transformers import ( | |
| AutoTokenizer, | |
| AutoModelForSequenceClassification, | |
| TrainingArguments, | |
| Trainer, | |
| ) | |
| from datasets import Dataset | |
| from config import ( | |
| DATA_PATH, | |
| MODEL_DIR, | |
| RANDOM_STATE, | |
| HF_MODEL, | |
| QUERY_COLUMN, | |
| INTENT_COLUMN, | |
| INTENT_LABELS, | |
| ) | |
def load_data(path: str) -> pd.DataFrame:
    """Read the intent CSV at *path*, verifying the required columns exist.

    Raises:
        ValueError: if either the query or the intent column is absent.
    """
    frame = pd.read_csv(path)
    missing = {QUERY_COLUMN, INTENT_COLUMN} - set(frame.columns)
    if missing:
        raise ValueError(f"Need columns: {QUERY_COLUMN}, {INTENT_COLUMN}")
    return frame
def main():
    """Fine-tune a sequence-classification model on the query-intent dataset.

    Reads DATA_PATH, trains HF_MODEL with len(INTENT_LABELS) output classes,
    saves the best checkpoint and tokenizer to MODEL_DIR, and writes the
    validation classification report to MODEL_DIR/metrics.json.
    """
    if not Path(DATA_PATH).exists():
        print(f"Data not found at {DATA_PATH}. Create data/query_intent.csv with query, intent.")
        return

    df = load_data(DATA_PATH)

    # Fail fast on intents missing from INTENT_LABELS: pd.Categorical with a
    # fixed category list silently maps unknown values to NaN (cat code -1),
    # which would corrupt the training labels instead of surfacing an error.
    unknown = set(df[INTENT_COLUMN].unique()) - set(INTENT_LABELS)
    if unknown:
        raise ValueError(f"Intents not in INTENT_LABELS: {sorted(unknown)}")

    df[INTENT_COLUMN] = pd.Categorical(df[INTENT_COLUMN], categories=INTENT_LABELS)
    df["label"] = df[INTENT_COLUMN].cat.codes
    train_df, val_df = train_test_split(
        df, test_size=0.2, random_state=RANDOM_STATE, stratify=df["label"]
    )

    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        HF_MODEL, num_labels=len(INTENT_LABELS)
    )

    def tokenize(examples):
        # Fixed-length padding keeps batches uniform without a data collator.
        return tokenizer(examples[QUERY_COLUMN], truncation=True, max_length=128, padding="max_length")

    # BUG FIX: the original selected the hard-coded "query" column and renamed
    # it to QUERY_COLUMN, which breaks whenever QUERY_COLUMN != "query".
    # Select by QUERY_COLUMN directly. preserve_index=False stops
    # Dataset.from_pandas from carrying the pandas index along as a stray
    # "__index_level_0__" column.
    train_ds = Dataset.from_pandas(train_df[[QUERY_COLUMN, "label"]], preserve_index=False)
    val_ds = Dataset.from_pandas(val_df[[QUERY_COLUMN, "label"]], preserve_index=False)
    train_ds = train_ds.map(tokenize, batched=True, remove_columns=[QUERY_COLUMN])
    val_ds = val_ds.map(tokenize, batched=True, remove_columns=[QUERY_COLUMN])
    train_ds.set_format("torch")
    val_ds.set_format("torch")

    args = TrainingArguments(
        output_dir=str(MODEL_DIR),
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Reload the checkpoint with the best macro-F1 before saving/predicting.
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
    )

    def compute_metrics(eval_pred):
        # Macro-F1 weights every intent equally, so rare classes still count.
        preds = eval_pred.predictions.argmax(axis=1)
        return {"f1": float(f1_score(eval_pred.label_ids, preds, average="macro"))}

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    trainer.save_model(str(MODEL_DIR))
    tokenizer.save_pretrained(str(MODEL_DIR))

    # Evaluate the (best) model on the validation split and persist metrics.
    pred = trainer.predict(val_ds)
    val_preds = pred.predictions.argmax(axis=1)  # compute once, reuse below
    report = classification_report(
        val_df["label"].values,
        val_preds,
        target_names=INTENT_LABELS,
        output_dict=True,
    )
    with open(MODEL_DIR / "metrics.json", "w") as f:
        json.dump({"classification_report": report}, f, indent=2)
    print(classification_report(val_df["label"].values, val_preds, target_names=INTENT_LABELS))
if __name__ == "__main__":
    # Entry point: run training only when executed as a script, not on import.
    main()