multilingual-chatbot / src /finalize_intent.py
momenalhamza's picture
Deploy chatbot: code + RAG + Qwen (3 BERT classifiers loaded from HF Hub)
469ef7f verified
"""Finalize the intent classifier from the existing best checkpoint.

Training was stopped at the end of epoch 3/5 with best eval_f1_macro=0.9402.
Rather than resuming training (the checkpoint has no optimizer state because
train_intent.py used save_only_model=True), we accept the epoch-3 model as
final, run TEST evaluation, and save:

    models/intent_classifier/                   (best model + tokenizer + labels.json)
    models/intent_classifier/eval_results.json  (test metrics + per-language breakdown)

Usage:
    python src/finalize_intent.py
"""
from __future__ import annotations
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
import json
import shutil
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from datasets import load_from_disk
from sklearn.metrics import (
accuracy_score, classification_report, f1_score,
precision_score, recall_score,
)
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
DataCollatorWithPadding,
Trainer,
TrainingArguments,
)
# Project root: two directory levels above this file (the usage line in the
# module docstring runs the script as ``src/finalize_intent.py``).
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Processed intent dataset; main() loads it with ``load_from_disk``.
DATA_DIR = PROJECT_ROOT.joinpath("data", "processed", "intent")
# Label mapping; main() reads ``label_to_id`` and ``id_to_label`` from it.
LABELS_FILE = DATA_DIR.joinpath("labels.json")

# Destination for the finalized model, tokenizer, labels and eval results.
OUT_DIR = PROJECT_ROOT.joinpath("models", "intent_classifier")
# Checkpoint that main() loads and accepts as the final model.
CHECKPOINT_DIR = OUT_DIR.joinpath("runs", "checkpoint-3336")

# Token limit handed to the tokenizer (with truncation) for the test split.
MAX_LENGTH = 128
def _compute_metrics(eval_pred) -> dict[str, float]:
    """Compute aggregate classification scores for a ``(logits, labels)`` pair."""
    logits, labels = eval_pred
    # The predictions may arrive as a tuple; the logits are its first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted", zero_division=0),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
        "precision": precision_score(labels, preds, average="weighted", zero_division=0),
        "recall": recall_score(labels, preds, average="weighted", zero_division=0),
    }


def _per_language_metrics(languages, pred_ids, true_ids) -> dict[str, dict[str, Any]]:
    """Score predictions separately for every distinct language tag.

    Args:
        languages: Per-row language tags, aligned with ``pred_ids``/``true_ids``.
        pred_ids: Array of predicted label ids.
        true_ids: Array of gold label ids.

    Returns:
        Mapping of language tag to its row count (``n``), ``accuracy``,
        ``f1_weighted`` and ``f1_macro``, in sorted tag order.
    """
    tags = list(languages)
    per_lang: dict[str, dict[str, Any]] = {}
    for lang in sorted(set(tags)):
        # Every tag comes from ``tags`` itself, so the mask is never empty.
        mask = np.array([tag == lang for tag in tags])
        lang_pred = pred_ids[mask]
        lang_true = true_ids[mask]
        per_lang[lang] = {
            "n": int(mask.sum()),
            "accuracy": float(accuracy_score(lang_true, lang_pred)),
            "f1_weighted": float(
                f1_score(lang_true, lang_pred, average="weighted", zero_division=0)
            ),
            "f1_macro": float(
                f1_score(lang_true, lang_pred, average="macro", zero_division=0)
            ),
        }
    return per_lang


def main() -> int:
    """Evaluate the accepted checkpoint on the TEST split and export it.

    Loads the model and tokenizer from ``CHECKPOINT_DIR``, scores the ``test``
    split of the dataset stored in ``DATA_DIR``, prints a classification report
    and (when a ``language`` column exists) per-language metrics, then writes
    the model, tokenizer, ``labels.json`` and ``eval_results.json`` into
    ``OUT_DIR``. The temporary ``tmp_eval`` directory is removed even when
    evaluation or export fails.

    Returns:
        Process exit code: ``0`` on success, ``2`` when the checkpoint
        directory does not exist.
    """
    if not CHECKPOINT_DIR.exists():
        print(f"ERROR: checkpoint not found at {CHECKPOINT_DIR}", file=sys.stderr)
        return 2

    print("=" * 72)
    print(f"Finalize intent classifier from {CHECKPOINT_DIR.name}")
    print("=" * 72)
    print(f" Checkpoint: {CHECKPOINT_DIR}")
    print(f" Out dir : {OUT_DIR}")

    # Explicit UTF-8: do not depend on the platform's locale encoding.
    labels_payload = json.loads(LABELS_FILE.read_text(encoding="utf-8"))
    label_to_id: dict[str, int] = labels_payload["label_to_id"]
    # JSON object keys are strings; convert them back to integer ids.
    id_to_label: dict[int, str] = {
        int(k): v for k, v in labels_payload["id_to_label"].items()
    }
    label_names = [id_to_label[i] for i in range(len(id_to_label))]
    num_labels = len(label_names)

    ds = load_from_disk(str(DATA_DIR))
    test_split = ds["test"]
    print(f" Test rows : {len(test_split)}")

    tokenizer = AutoTokenizer.from_pretrained(str(CHECKPOINT_DIR))
    model = AutoModelForSequenceClassification.from_pretrained(str(CHECKPOINT_DIR))

    def tokenize(batch: dict[str, list]) -> dict[str, Any]:
        return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)

    # Keep only the label column next to the tokenizer outputs.
    drop_cols = [c for c in test_split.column_names if c not in ("label",)]
    test_tok = test_split.map(
        tokenize, batched=True, remove_columns=drop_cols, desc="Tokenizing test"
    )

    tmp_eval_dir = OUT_DIR / "tmp_eval"
    try:
        eval_args = TrainingArguments(
            output_dir=str(tmp_eval_dir),
            per_device_eval_batch_size=16,
            fp16=torch.cuda.is_available(),
            report_to="none",
            dataloader_num_workers=0,
        )
        trainer = Trainer(
            model=model,
            args=eval_args,
            data_collator=DataCollatorWithPadding(tokenizer),
            compute_metrics=_compute_metrics,
        )

        print("\nEvaluating on TEST split ...")
        # A single predict() pass yields both the raw predictions and the
        # "test_"-prefixed metrics, so the split is not run through the
        # model twice.
        test_pred = trainer.predict(test_tok, metric_key_prefix="test")
        test_metrics = test_pred.metrics or {}
        predictions = test_pred.predictions
        test_logits = predictions[0] if isinstance(predictions, tuple) else predictions
        pred_ids = np.argmax(test_logits, axis=-1)
        true_ids = test_pred.label_ids

        report_kwargs = {
            "labels": list(range(num_labels)),
            "target_names": label_names,
            "zero_division": 0,
        }
        report_dict = classification_report(
            true_ids, pred_ids, output_dict=True, **report_kwargs
        )
        report_text = classification_report(true_ids, pred_ids, **report_kwargs)
        print("\nClassification report on TEST:")
        print(report_text)

        per_lang: dict[str, dict[str, Any]] = {}
        if "language" in test_split.column_names:
            per_lang = _per_language_metrics(test_split["language"], pred_ids, true_ids)
        if per_lang:
            print("\nPer-language metrics on TEST:")
            for lang, m in per_lang.items():
                print(f" {lang}: n={m['n']} acc={m['accuracy']:.4f} "
                      f"f1_w={m['f1_weighted']:.4f} f1_m={m['f1_macro']:.4f}")

        OUT_DIR.mkdir(parents=True, exist_ok=True)
        trainer.save_model(str(OUT_DIR))
        tokenizer.save_pretrained(str(OUT_DIR))
        shutil.copy(LABELS_FILE, OUT_DIR / "labels.json")

        payload = {
            "model_name": "distilbert-base-multilingual-cased",
            "task": "intent",
            "num_labels": num_labels,
            "labels": label_to_id,
            "source_checkpoint": str(CHECKPOINT_DIR.relative_to(PROJECT_ROOT)),
            # Keep only plain numeric metrics so the payload stays JSON-friendly.
            "test_metrics": {k: float(v) for k, v in test_metrics.items()
                             if isinstance(v, (int, float))},
            "classification_report": report_dict,
            "per_language": per_lang,
            "training": {
                "epochs_completed": 3,
                "epochs_planned": 5,
                "note": "training stopped at end of epoch 3; checkpoint accepted as final "
                        "(best eval_f1_macro=0.9402 at epoch 3, curve still improving but "
                        "optimizer state was not saved due to save_only_model=True).",
            },
        }
        # ensure_ascii=False can emit non-ASCII text, so write it as UTF-8
        # explicitly instead of relying on the locale encoding.
        (OUT_DIR / "eval_results.json").write_text(
            json.dumps(payload, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        print(f"\n[OK] Saved final model to {OUT_DIR}")
        print(f"[OK] Saved eval_results.json to {OUT_DIR / 'eval_results.json'}")
    finally:
        # Best-effort removal of the Trainer scratch directory, also on failure.
        shutil.rmtree(tmp_eval_dir, ignore_errors=True)
    return 0
if __name__ == "__main__":
    # Run only as a script; main()'s return value becomes the exit status.
    raise SystemExit(main())