# syeedalireza's picture
# Upload folder using huggingface_hub
# fe67d4b verified
"""
Train query intent classifier.
Uses Hugging Face transformer for sequence classification.
"""
import json
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
)
from datasets import Dataset
from config import (
DATA_PATH,
MODEL_DIR,
RANDOM_STATE,
HF_MODEL,
QUERY_COLUMN,
INTENT_COLUMN,
INTENT_LABELS,
)
def load_data(path: str) -> pd.DataFrame:
    """Read the labeled-query CSV and validate its schema.

    Raises ValueError when either required column is absent.
    """
    frame = pd.read_csv(path)
    required = (QUERY_COLUMN, INTENT_COLUMN)
    if any(col not in frame.columns for col in required):
        raise ValueError(f"Need columns: {QUERY_COLUMN}, {INTENT_COLUMN}")
    return frame
def main():
    """Fine-tune an intent classifier on the query CSV.

    Loads DATA_PATH, encodes intents as integer labels in INTENT_LABELS
    order, fine-tunes HF_MODEL for sequence classification, then saves
    the model, tokenizer, and a classification report under MODEL_DIR.
    """
    if not Path(DATA_PATH).exists():
        print(f"Data not found at {DATA_PATH}. Create data/query_intent.csv with query, intent.")
        return
    df = load_data(DATA_PATH)
    # Fail fast on intents outside INTENT_LABELS: the original silently mapped
    # them to categorical code -1, which only blows up later inside training.
    unknown = set(df[INTENT_COLUMN].unique()) - set(INTENT_LABELS)
    if unknown:
        raise ValueError(f"Unknown intents not in INTENT_LABELS: {sorted(map(str, unknown))}")
    # Fixed label ordering so model output indices match INTENT_LABELS.
    df[INTENT_COLUMN] = pd.Categorical(df[INTENT_COLUMN], categories=INTENT_LABELS)
    df["label"] = df[INTENT_COLUMN].cat.codes
    train_df, val_df = train_test_split(
        df, test_size=0.2, random_state=RANDOM_STATE, stratify=df["label"]
    )
    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL, num_labels=len(INTENT_LABELS))

    def tokenize(examples):
        # Fixed-length padding so batches need no dynamic collator.
        return tokenizer(examples[QUERY_COLUMN], truncation=True, max_length=128, padding="max_length")

    # Select by QUERY_COLUMN directly; the original hard-coded "query" and
    # renamed it, which raised KeyError whenever QUERY_COLUMN != "query".
    train_ds = Dataset.from_pandas(train_df[[QUERY_COLUMN, "label"]])
    val_ds = Dataset.from_pandas(val_df[[QUERY_COLUMN, "label"]])
    train_ds = train_ds.map(tokenize, batched=True, remove_columns=[QUERY_COLUMN])
    val_ds = val_ds.map(tokenize, batched=True, remove_columns=[QUERY_COLUMN])
    train_ds.set_format("torch")
    val_ds.set_format("torch")
    args = TrainingArguments(
        output_dir=str(MODEL_DIR),
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        # eval and save strategies must match for load_best_model_at_end.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
    )

    def compute_metrics(eval_pred):
        # Macro F1 so rare intents weigh equally in model selection.
        preds = eval_pred.predictions.argmax(axis=1)
        return {"f1": float(f1_score(eval_pred.label_ids, preds, average="macro"))}

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    trainer.save_model(str(MODEL_DIR))
    tokenizer.save_pretrained(str(MODEL_DIR))
    pred = trainer.predict(val_ds)
    y_true = val_df["label"].values
    y_pred = pred.predictions.argmax(axis=1)
    # Pin `labels` so the report stays aligned with INTENT_LABELS even when a
    # class is missing from the validation split (otherwise sklearn raises on
    # the target_names length mismatch).
    label_ids = list(range(len(INTENT_LABELS)))
    report = classification_report(
        y_true,
        y_pred,
        labels=label_ids,
        target_names=INTENT_LABELS,
        output_dict=True,
    )
    with open(MODEL_DIR / "metrics.json", "w") as f:
        json.dump({"classification_report": report}, f, indent=2)
    print(classification_report(y_true, y_pred, labels=label_ids, target_names=INTENT_LABELS))
# Run training only when executed as a script, not on import.
if __name__ == "__main__":
    main()