Spaces:
Sleeping
Sleeping
| """Finalize the intent classifier from the existing best checkpoint. | |
| We stopped training at end of epoch 3/5 with best eval_f1_macro=0.9402. | |
| Rather than resuming training (the checkpoint has no optimizer state because | |
| train_intent.py used save_only_model=True), we accept the epoch-3 model as | |
| final, run TEST evaluation, and save: | |
| models/intent_classifier/ (best model + tokenizer + labels.json) | |
| models/intent_classifier/eval_results.json (test metrics + per-language breakdown) | |
| Usage: | |
| python src/finalize_intent.py | |
| """ | |
| from __future__ import annotations | |
| import os | |
| os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") | |
| import json | |
| import shutil | |
| import sys | |
| from pathlib import Path | |
| from typing import Any | |
| import numpy as np | |
| import torch | |
| from datasets import load_from_disk | |
| from sklearn.metrics import ( | |
| accuracy_score, classification_report, f1_score, | |
| precision_score, recall_score, | |
| ) | |
| from transformers import ( | |
| AutoModelForSequenceClassification, | |
| AutoTokenizer, | |
| DataCollatorWithPadding, | |
| Trainer, | |
| TrainingArguments, | |
| ) | |
# Repository root: this file is expected to live one directory below it
# (the usage line in the module docstring runs it as src/finalize_intent.py).
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Processed intent dataset (loaded with datasets.load_from_disk) and its
# label mapping file.
DATA_DIR = PROJECT_ROOT.joinpath("data", "processed", "intent")
LABELS_FILE = DATA_DIR.joinpath("labels.json")

# Destination of the finalized model, and the training checkpoint that is
# promoted to "final".
OUT_DIR = PROJECT_ROOT.joinpath("models", "intent_classifier")
CHECKPOINT_DIR = OUT_DIR.joinpath("runs", "checkpoint-3336")

# Tokenizer truncation length (in tokens) applied to the test split.
MAX_LENGTH = 128
def _load_labels(labels_file: Path) -> tuple[dict[str, int], list[str]]:
    """Read the labels JSON and return (label_to_id, label names ordered by id)."""
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    payload = json.loads(labels_file.read_text(encoding="utf-8"))
    label_to_id: dict[str, int] = payload["label_to_id"]
    # JSON object keys are always strings, so convert them back to int ids.
    id_to_label = {int(k): v for k, v in payload["id_to_label"].items()}
    label_names = [id_to_label[i] for i in range(len(id_to_label))]
    return label_to_id, label_names


def _compute_metrics(eval_pred) -> dict[str, float]:
    """Compute accuracy plus weighted/macro scores from a (logits, labels) pair."""
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted", zero_division=0),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
        "precision": precision_score(labels, preds, average="weighted", zero_division=0),
        "recall": recall_score(labels, preds, average="weighted", zero_division=0),
    }


def _per_language_metrics(languages, pred_ids, true_ids) -> dict[str, dict[str, float]]:
    """Break accuracy / F1 down by the value of the per-row language column."""
    # Build the array once; comparing it against each language yields the row
    # mask without a Python-level loop per language.
    lang_arr = np.asarray(list(languages), dtype=object)
    per_lang: dict[str, dict[str, float]] = {}
    for lang in sorted(set(lang_arr.tolist())):
        mask = lang_arr == lang
        lp = pred_ids[mask]
        lt = true_ids[mask]
        per_lang[lang] = {
            "n": int(mask.sum()),
            "accuracy": float(accuracy_score(lt, lp)),
            "f1_weighted": float(f1_score(lt, lp, average="weighted", zero_division=0)),
            "f1_macro": float(f1_score(lt, lp, average="macro", zero_division=0)),
        }
    return per_lang


def main() -> int:
    """Evaluate the chosen checkpoint on the TEST split and save it as final.

    Loads the model and tokenizer from CHECKPOINT_DIR, runs one prediction
    pass over the test split, prints a classification report and (when the
    dataset has a "language" column) per-language metrics, then writes the
    model, tokenizer, labels.json and eval_results.json into OUT_DIR.

    Returns:
        0 on success, 2 when the checkpoint directory does not exist.
    """
    if not CHECKPOINT_DIR.exists():
        print(f"ERROR: checkpoint not found at {CHECKPOINT_DIR}", file=sys.stderr)
        return 2

    print("=" * 72)
    print("Finalize intent classifier from checkpoint-3336")
    print("=" * 72)
    print(f" Checkpoint: {CHECKPOINT_DIR}")
    print(f" Out dir : {OUT_DIR}")

    label_to_id, label_names = _load_labels(LABELS_FILE)
    num_labels = len(label_names)

    ds = load_from_disk(str(DATA_DIR))
    test_ds = ds["test"]
    print(f" Test rows : {len(test_ds)}")

    tokenizer = AutoTokenizer.from_pretrained(str(CHECKPOINT_DIR))
    model = AutoModelForSequenceClassification.from_pretrained(str(CHECKPOINT_DIR))

    def tokenize(batch: dict[str, list]) -> dict[str, Any]:
        return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)

    # Keep only "label"; every other raw column is replaced by tokenizer output.
    drop_cols = [c for c in test_ds.column_names if c != "label"]
    test_tok = test_ds.map(tokenize, batched=True, remove_columns=drop_cols,
                           desc="Tokenizing test")

    tmp_eval_dir = OUT_DIR / "tmp_eval"
    eval_args = TrainingArguments(
        output_dir=str(tmp_eval_dir),
        per_device_eval_batch_size=16,
        fp16=torch.cuda.is_available(),
        report_to="none",
        dataloader_num_workers=0,
    )
    trainer = Trainer(
        model=model,
        args=eval_args,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=_compute_metrics,
    )

    try:
        print("\nEvaluating on TEST split ...")
        # A single predict() pass yields both the raw predictions and the
        # "test_"-prefixed metrics, so the test set is not run twice.
        test_pred = trainer.predict(test_tok, metric_key_prefix="test")
        test_metrics = test_pred.metrics
        predictions = test_pred.predictions
        test_logits = predictions[0] if isinstance(predictions, tuple) else predictions
        pred_ids = np.argmax(test_logits, axis=-1)
        true_ids = test_pred.label_ids

        report_kwargs = {
            "labels": list(range(num_labels)),
            "target_names": label_names,
            "zero_division": 0,
        }
        report_dict = classification_report(true_ids, pred_ids, output_dict=True,
                                            **report_kwargs)
        report_text = classification_report(true_ids, pred_ids, **report_kwargs)
        print("\nClassification report on TEST:")
        print(report_text)

        per_lang: dict[str, dict[str, float]] = {}
        if "language" in test_ds.column_names:
            per_lang = _per_language_metrics(test_ds["language"], pred_ids, true_ids)
            print("\nPer-language metrics on TEST:")
            for lang, m in per_lang.items():
                print(f" {lang}: n={m['n']} acc={m['accuracy']:.4f} "
                      f"f1_w={m['f1_weighted']:.4f} f1_m={m['f1_macro']:.4f}")

        OUT_DIR.mkdir(parents=True, exist_ok=True)
        trainer.save_model(str(OUT_DIR))
        tokenizer.save_pretrained(str(OUT_DIR))
        shutil.copy(LABELS_FILE, OUT_DIR / "labels.json")

        payload = {
            "model_name": "distilbert-base-multilingual-cased",
            "task": "intent",
            "num_labels": num_labels,
            "labels": label_to_id,
            "source_checkpoint": str(CHECKPOINT_DIR.relative_to(PROJECT_ROOT)),
            "test_metrics": {k: float(v) for k, v in test_metrics.items()
                             if isinstance(v, (int, float))},
            "classification_report": report_dict,
            "per_language": per_lang,
            "training": {
                "epochs_completed": 3,
                "epochs_planned": 5,
                "note": "training stopped at end of epoch 3; checkpoint accepted as final "
                        "(best eval_f1_macro=0.9402 at epoch 3, curve still improving but "
                        "optimizer state was not saved due to save_only_model=True).",
            },
        }
        results_path = OUT_DIR / "eval_results.json"
        # ensure_ascii=False may emit non-ASCII text, so the file must be
        # written as UTF-8 regardless of the platform's default encoding.
        results_path.write_text(
            json.dumps(payload, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        print(f"\n[OK] Saved final model to {OUT_DIR}")
        print(f"[OK] Saved eval_results.json to {results_path}")
    finally:
        # Remove the Trainer scratch directory even when evaluation or
        # saving fails part-way through.
        shutil.rmtree(tmp_eval_dir, ignore_errors=True)

    return 0
# Script entry point: propagate main()'s integer status as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())