"""Finalize the intent classifier from the existing best checkpoint. We stopped training at end of epoch 3/5 with best eval_f1_macro=0.9402. Rather than resuming training (the checkpoint has no optimizer state because train_intent.py used save_only_model=True), we accept the epoch-3 model as final, run TEST evaluation, and save: models/intent_classifier/ (best model + tokenizer + labels.json) models/intent_classifier/eval_results.json (test metrics + per-language breakdown) Usage: python src/finalize_intent.py """ from __future__ import annotations import os os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") import json import shutil import sys from pathlib import Path from typing import Any import numpy as np import torch from datasets import load_from_disk from sklearn.metrics import ( accuracy_score, classification_report, f1_score, precision_score, recall_score, ) from transformers import ( AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments, ) PROJECT_ROOT = Path(__file__).resolve().parent.parent DATA_DIR = PROJECT_ROOT / "data" / "processed" / "intent" LABELS_FILE = DATA_DIR / "labels.json" OUT_DIR = PROJECT_ROOT / "models" / "intent_classifier" CHECKPOINT_DIR = OUT_DIR / "runs" / "checkpoint-3336" MAX_LENGTH = 128 def main() -> int: if not CHECKPOINT_DIR.exists(): print(f"ERROR: checkpoint not found at {CHECKPOINT_DIR}", file=sys.stderr) return 2 print("=" * 72) print("Finalize intent classifier from checkpoint-3336") print("=" * 72) print(f" Checkpoint: {CHECKPOINT_DIR}") print(f" Out dir : {OUT_DIR}") labels_payload = json.loads(LABELS_FILE.read_text()) label_to_id: dict[str, int] = labels_payload["label_to_id"] id_to_label: dict[int, str] = {int(k): v for k, v in labels_payload["id_to_label"].items()} label_names = [id_to_label[i] for i in range(len(id_to_label))] num_labels = len(label_names) ds = load_from_disk(str(DATA_DIR)) print(f" Test rows : {len(ds['test'])}") tokenizer = AutoTokenizer.from_pretrained(str(CHECKPOINT_DIR)) model = AutoModelForSequenceClassification.from_pretrained(str(CHECKPOINT_DIR)) def tokenize(batch: dict[str, list]) -> dict[str, Any]: return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH) drop_cols = [c for c in ds["test"].column_names if c not in ("label",)] test_tok = ds["test"].map(tokenize, batched=True, remove_columns=drop_cols, desc="Tokenizing test") eval_args = TrainingArguments( output_dir=str(OUT_DIR / "tmp_eval"), per_device_eval_batch_size=16, fp16=torch.cuda.is_available(), report_to="none", dataloader_num_workers=0, ) def compute_metrics(eval_pred) -> dict[str, float]: logits, labels = eval_pred if isinstance(logits, tuple): logits = logits[0] preds = np.argmax(logits, axis=-1) return { "accuracy": accuracy_score(labels, preds), "f1": f1_score(labels, preds, average="weighted", zero_division=0), "f1_macro": f1_score(labels, preds, average="macro", zero_division=0), "precision": precision_score(labels, preds, average="weighted", zero_division=0), "recall": recall_score(labels, preds, average="weighted", zero_division=0), } trainer = Trainer( model=model, args=eval_args, data_collator=DataCollatorWithPadding(tokenizer), compute_metrics=compute_metrics, ) print("\nEvaluating on TEST split ...") test_metrics = trainer.evaluate(test_tok, metric_key_prefix="test") test_pred = trainer.predict(test_tok) test_logits = test_pred.predictions[0] if isinstance(test_pred.predictions, tuple) else test_pred.predictions pred_ids = np.argmax(test_logits, axis=-1) true_ids = test_pred.label_ids report_dict = classification_report( true_ids, pred_ids, labels=list(range(num_labels)), target_names=label_names, output_dict=True, zero_division=0, ) report_text = classification_report( true_ids, pred_ids, labels=list(range(num_labels)), target_names=label_names, zero_division=0, ) print("\nClassification report on TEST:") print(report_text) test_with_lang = ds["test"] per_lang: dict[str, dict[str, float]] = {} if "language" in test_with_lang.column_names: languages = test_with_lang["language"] for lang in sorted(set(languages)): mask = np.array([la == lang for la in languages]) if not mask.any(): continue lp = pred_ids[mask] lt = true_ids[mask] per_lang[lang] = { "n": int(mask.sum()), "accuracy": float(accuracy_score(lt, lp)), "f1_weighted": float(f1_score(lt, lp, average="weighted", zero_division=0)), "f1_macro": float(f1_score(lt, lp, average="macro", zero_division=0)), } print("\nPer-language metrics on TEST:") for lang, m in per_lang.items(): print(f" {lang}: n={m['n']} acc={m['accuracy']:.4f} " f"f1_w={m['f1_weighted']:.4f} f1_m={m['f1_macro']:.4f}") OUT_DIR.mkdir(parents=True, exist_ok=True) trainer.save_model(str(OUT_DIR)) tokenizer.save_pretrained(str(OUT_DIR)) shutil.copy(LABELS_FILE, OUT_DIR / "labels.json") payload = { "model_name": "distilbert-base-multilingual-cased", "task": "intent", "num_labels": num_labels, "labels": label_to_id, "source_checkpoint": str(CHECKPOINT_DIR.relative_to(PROJECT_ROOT)), "test_metrics": {k: float(v) for k, v in test_metrics.items() if isinstance(v, (int, float))}, "classification_report": report_dict, "per_language": per_lang, "training": { "epochs_completed": 3, "epochs_planned": 5, "note": "training stopped at end of epoch 3; checkpoint accepted as final " "(best eval_f1_macro=0.9402 at epoch 3, curve still improving but " "optimizer state was not saved due to save_only_model=True).", }, } (OUT_DIR / "eval_results.json").write_text( json.dumps(payload, indent=2, ensure_ascii=False) ) print(f"\n[OK] Saved final model to {OUT_DIR}") print(f"[OK] Saved eval_results.json to {OUT_DIR / 'eval_results.json'}") tmp = OUT_DIR / "tmp_eval" if tmp.exists(): shutil.rmtree(tmp, ignore_errors=True) return 0 if __name__ == "__main__": sys.exit(main())