# Source: Hugging Face Spaces — momenalhamza / multilingual-chatbot (Space status: Sleeping)
# File size: 6,820 Bytes
# Revision: 469ef7f

"""Finalize the intent classifier from the existing best checkpoint.

We stopped training at end of epoch 3/5 with best eval_f1_macro=0.9402.
Rather than resuming training (the checkpoint has no optimizer state because
train_intent.py used save_only_model=True), we accept the epoch-3 model as
final, run TEST evaluation, and save:

  models/intent_classifier/                   (best model + tokenizer + labels.json)
  models/intent_classifier/eval_results.json  (test metrics + per-language breakdown)

Usage:
  python src/finalize_intent.py
"""

from __future__ import annotations

import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import json
import shutil
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from datasets import load_from_disk
from sklearn.metrics import (
    accuracy_score, classification_report, f1_score,
    precision_score, recall_score,
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# The script is run as ``python src/finalize_intent.py``, so the project root
# is two levels above this file's resolved location.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Processed intent dataset (loaded with ``load_from_disk``) and its label map.
DATA_DIR = PROJECT_ROOT.joinpath("data", "processed", "intent")
LABELS_FILE = DATA_DIR.joinpath("labels.json")

# Destination of the finalized model, and the checkpoint accepted as final.
OUT_DIR = PROJECT_ROOT.joinpath("models", "intent_classifier")
CHECKPOINT_DIR = OUT_DIR.joinpath("runs", "checkpoint-3336")

# Token limit applied (with truncation) when tokenizing the test split.
MAX_LENGTH = 128


def main() -> int:
    """Evaluate the accepted checkpoint on the TEST split and export it as final.

    Steps:
      1. Load the label mapping from ``LABELS_FILE`` and the dataset from
         ``DATA_DIR``.
      2. Load tokenizer and model from ``CHECKPOINT_DIR`` and tokenize the
         test split.
      3. Run one prediction pass to obtain logits and the aggregate
         ``test_*`` metrics.
      4. Build the classification report and, when the test split has a
         ``language`` column, per-language accuracy / F1.
      5. Save the model, tokenizer, ``labels.json`` and ``eval_results.json``
         into ``OUT_DIR``.

    The temporary ``Trainer`` output directory (``OUT_DIR / "tmp_eval"``) is
    removed whether or not evaluation succeeds.

    Returns:
        ``0`` on success, ``2`` when ``CHECKPOINT_DIR`` does not exist.
    """
    if not CHECKPOINT_DIR.exists():
        print(f"ERROR: checkpoint not found at {CHECKPOINT_DIR}", file=sys.stderr)
        return 2

    print("=" * 72)
    # Derive the name from the constant so the banner cannot drift from it.
    print(f"Finalize intent classifier from {CHECKPOINT_DIR.name}")
    print("=" * 72)
    print(f"  Checkpoint: {CHECKPOINT_DIR}")
    print(f"  Out dir   : {OUT_DIR}")

    # Explicit UTF-8: label text must not depend on the platform's locale.
    labels_payload = json.loads(LABELS_FILE.read_text(encoding="utf-8"))
    label_to_id: dict[str, int] = labels_payload["label_to_id"]
    # JSON object keys are always strings; convert them back to integer ids.
    id_to_label: dict[int, str] = {
        int(k): v for k, v in labels_payload["id_to_label"].items()
    }
    label_names = [id_to_label[i] for i in range(len(id_to_label))]
    num_labels = len(label_names)
    all_label_ids = list(range(num_labels))

    ds = load_from_disk(str(DATA_DIR))
    print(f"  Test rows : {len(ds['test'])}")

    tokenizer = AutoTokenizer.from_pretrained(str(CHECKPOINT_DIR))
    model = AutoModelForSequenceClassification.from_pretrained(str(CHECKPOINT_DIR))

    def tokenize(batch: dict[str, list]) -> dict[str, Any]:
        """Tokenize a batch of ``text`` values, truncating to ``MAX_LENGTH``."""
        return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)

    # Keep only ``label``; every other raw column is dropped from the
    # tokenized copy. ``ds["test"]`` itself is left untouched and is read
    # again below for the per-language breakdown.
    drop_cols = [c for c in ds["test"].column_names if c not in ("label",)]
    test_tok = ds["test"].map(tokenize, batched=True, remove_columns=drop_cols,
                              desc="Tokenizing test")

    tmp_eval_dir = OUT_DIR / "tmp_eval"
    eval_args = TrainingArguments(
        output_dir=str(tmp_eval_dir),
        per_device_eval_batch_size=16,
        fp16=torch.cuda.is_available(),
        report_to="none",
        dataloader_num_workers=0,
    )

    def compute_metrics(eval_pred) -> dict[str, float]:
        """Return accuracy plus weighted/macro F1 and weighted precision/recall."""
        logits, labels = eval_pred
        if isinstance(logits, tuple):
            logits = logits[0]
        preds = np.argmax(logits, axis=-1)
        return {
            "accuracy": accuracy_score(labels, preds),
            "f1": f1_score(labels, preds, average="weighted", zero_division=0),
            "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
            "precision": precision_score(labels, preds, average="weighted", zero_division=0),
            "recall": recall_score(labels, preds, average="weighted", zero_division=0),
        }

    try:
        trainer = Trainer(
            model=model,
            args=eval_args,
            data_collator=DataCollatorWithPadding(tokenizer),
            compute_metrics=compute_metrics,
        )

        print("\nEvaluating on TEST split ...")
        # One pass is enough: ``predict`` returns the logits and the
        # ``test_``-prefixed metrics, so a separate ``evaluate`` call would
        # only repeat the same inference.
        test_pred = trainer.predict(test_tok, metric_key_prefix="test")
        test_metrics = test_pred.metrics
        if isinstance(test_pred.predictions, tuple):
            test_logits = test_pred.predictions[0]
        else:
            test_logits = test_pred.predictions
        pred_ids = np.argmax(test_logits, axis=-1)
        true_ids = test_pred.label_ids

        report_dict = classification_report(
            true_ids, pred_ids,
            labels=all_label_ids,
            target_names=label_names,
            output_dict=True, zero_division=0,
        )
        report_text = classification_report(
            true_ids, pred_ids,
            labels=all_label_ids,
            target_names=label_names,
            zero_division=0,
        )
        print("\nClassification report on TEST:")
        print(report_text)

        test_with_lang = ds["test"]
        per_lang: dict[str, dict[str, float]] = {}
        if "language" in test_with_lang.column_names:
            languages = list(test_with_lang["language"])
            # Object dtype keeps the values as-is so ``==`` compares them
            # element-wise; building this once replaces a per-language
            # Python-level scan of the whole column.
            lang_arr = np.asarray(languages, dtype=object)
            for lang in sorted(set(languages)):
                # ``lang`` is drawn from ``languages``, so the mask always
                # selects at least one row.
                mask = lang_arr == lang
                lp = pred_ids[mask]
                lt = true_ids[mask]
                per_lang[lang] = {
                    "n": int(mask.sum()),
                    "accuracy": float(accuracy_score(lt, lp)),
                    "f1_weighted": float(f1_score(lt, lp, average="weighted", zero_division=0)),
                    "f1_macro": float(f1_score(lt, lp, average="macro", zero_division=0)),
                }
            print("\nPer-language metrics on TEST:")
            for lang, m in per_lang.items():
                print(f"  {lang}: n={m['n']}  acc={m['accuracy']:.4f}  "
                      f"f1_w={m['f1_weighted']:.4f}  f1_m={m['f1_macro']:.4f}")

        OUT_DIR.mkdir(parents=True, exist_ok=True)
        trainer.save_model(str(OUT_DIR))
        tokenizer.save_pretrained(str(OUT_DIR))
        shutil.copy(LABELS_FILE, OUT_DIR / "labels.json")

        payload = {
            "model_name": "distilbert-base-multilingual-cased",
            "task": "intent",
            "num_labels": num_labels,
            "labels": label_to_id,
            "source_checkpoint": str(CHECKPOINT_DIR.relative_to(PROJECT_ROOT)),
            "test_metrics": {k: float(v) for k, v in test_metrics.items()
                             if isinstance(v, (int, float))},
            "classification_report": report_dict,
            "per_language": per_lang,
            "training": {
                "epochs_completed": 3,
                "epochs_planned": 5,
                "note": "training stopped at end of epoch 3; checkpoint accepted as final "
                        "(best eval_f1_macro=0.9402 at epoch 3, curve still improving but "
                        "optimizer state was not saved due to save_only_model=True).",
            },
        }
        # ``ensure_ascii=False`` emits raw non-ASCII characters, so the file
        # must be written as UTF-8 rather than in the locale's default
        # encoding.
        (OUT_DIR / "eval_results.json").write_text(
            json.dumps(payload, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        print(f"\n[OK] Saved final model to {OUT_DIR}")
        print(f"[OK] Saved eval_results.json to {OUT_DIR / 'eval_results.json'}")
    finally:
        # Remove the scratch directory even when evaluation or saving fails;
        # ``ignore_errors`` also covers the case where it was never created.
        shutil.rmtree(tmp_eval_dir, ignore_errors=True)
    return 0


# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())