"""Recover eval_results.json for the NER model from the already-saved checkpoint. train_ner.py finished training and saved the model to models/ner_model/, but crashed on json.dumps with `TypeError: Object of type int64 is not JSON serializable` (seqeval's classification_report returns numpy.int64 for the 'support' fields). The model itself is fine — we just need to regenerate eval_results.json. Usage: python src/finalize_ner.py """ from __future__ import annotations import os os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") import json import shutil import sys from pathlib import Path from typing import Any import numpy as np import torch from datasets import load_from_disk from seqeval.metrics import ( classification_report as seq_classification_report, f1_score as seq_f1, precision_score as seq_p, recall_score as seq_r, ) from transformers import ( AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments, ) PROJECT_ROOT = Path(__file__).resolve().parent.parent DATA_DIR = PROJECT_ROOT / "data" / "processed" / "ner" LABELS_FILE = DATA_DIR / "labels.json" OUT_DIR = PROJECT_ROOT / "models" / "ner_model" MAX_LENGTH = 128 def _to_jsonable(obj: Any) -> Any: """Recursively convert numpy scalars/arrays to plain Python types.""" if isinstance(obj, dict): return {k: _to_jsonable(v) for k, v in obj.items()} if isinstance(obj, (list, tuple)): return [_to_jsonable(v) for v in obj] if isinstance(obj, np.ndarray): return obj.tolist() if isinstance(obj, np.integer): return int(obj) if isinstance(obj, np.floating): return float(obj) return obj def main() -> int: if not (OUT_DIR / "config.json").exists(): print(f"ERROR: saved NER model not found at {OUT_DIR}", file=sys.stderr) return 2 print("=" * 72) print("Finalize NER eval_results.json from saved model") print("=" * 72) print(f" Model dir : {OUT_DIR}") labels_payload = json.loads(LABELS_FILE.read_text()) label_to_id: dict[str, int] = labels_payload["label_to_id"] id_to_label: dict[int, str] = {int(k): v for k, v in labels_payload["id_to_label"].items()} num_labels = len(id_to_label) ds = load_from_disk(str(DATA_DIR)) print(f" Test rows : {len(ds['test'])}") tokenizer = AutoTokenizer.from_pretrained(str(OUT_DIR)) model = AutoModelForTokenClassification.from_pretrained(str(OUT_DIR)) def tokenize_and_align(batch: dict[str, list]) -> dict[str, Any]: tokenized = tokenizer( batch["tokens"], is_split_into_words=True, truncation=True, max_length=MAX_LENGTH, ) all_labels = [] for i, word_tag_ids in enumerate(batch["ner_tag_ids"]): word_ids = tokenized.word_ids(batch_index=i) previous_word: int | None = None label_ids: list[int] = [] for wid in word_ids: if wid is None: label_ids.append(-100) elif wid != previous_word: label_ids.append(int(word_tag_ids[wid])) else: label_ids.append(-100) previous_word = wid all_labels.append(label_ids) tokenized["labels"] = all_labels return tokenized drop_cols = [c for c in ds["test"].column_names if c not in ("language",)] test_tok = ds["test"].map( tokenize_and_align, batched=True, remove_columns=drop_cols, desc="Tokenizing + aligning test", ) eval_args = TrainingArguments( output_dir=str(OUT_DIR / "tmp_eval"), per_device_eval_batch_size=16, fp16=torch.cuda.is_available(), report_to="none", dataloader_num_workers=0, ) def _decode(predictions: np.ndarray, labels: np.ndarray) -> tuple[list[list[str]], list[list[str]]]: true_preds: list[list[str]] = [] true_labels: list[list[str]] = [] for pred_seq, lab_seq in zip(predictions, labels): tp, tl = [], [] for p, l in zip(pred_seq, lab_seq): if l == -100: continue tp.append(id_to_label[int(p)]) tl.append(id_to_label[int(l)]) true_preds.append(tp) true_labels.append(tl) return true_preds, true_labels def compute_metrics(eval_pred) -> dict[str, float]: logits, labels = eval_pred if isinstance(logits, tuple): logits = logits[0] preds = np.argmax(logits, axis=-1) true_preds, true_labels = _decode(preds, labels) return { "f1": seq_f1(true_labels, true_preds), "precision": seq_p(true_labels, true_preds), "recall": seq_r(true_labels, true_preds), } trainer = Trainer( model=model, args=eval_args, data_collator=DataCollatorForTokenClassification(tokenizer), compute_metrics=compute_metrics, ) print("\nEvaluating on TEST split ...") test_metrics = trainer.evaluate(test_tok, metric_key_prefix="test") test_pred = trainer.predict(test_tok) test_logits = test_pred.predictions[0] if isinstance(test_pred.predictions, tuple) else test_pred.predictions pred_ids = np.argmax(test_logits, axis=-1) true_preds, true_labels = _decode(pred_ids, test_pred.label_ids) report_dict = seq_classification_report( true_labels, true_preds, output_dict=True, zero_division=0, ) report_text = seq_classification_report(true_labels, true_preds, zero_division=0) print("\nEntity-level classification report on TEST:") print(report_text) test_with_lang = ds["test"] per_lang: dict[str, dict[str, float]] = {} if "language" in test_with_lang.column_names: languages = test_with_lang["language"] for lang in sorted(set(languages)): mask = [la == lang for la in languages] sub_preds = [tp for tp, m in zip(true_preds, mask) if m] sub_labels = [tl for tl, m in zip(true_labels, mask) if m] if not sub_preds: continue per_lang[lang] = { "n": int(sum(mask)), "f1": float(seq_f1(sub_labels, sub_preds)), "precision": float(seq_p(sub_labels, sub_preds)), "recall": float(seq_r(sub_labels, sub_preds)), } print("\nPer-language entity-level metrics on TEST:") for lang, m in per_lang.items(): print(f" {lang}: n={m['n']} P={m['precision']:.4f} " f"R={m['recall']:.4f} F1={m['f1']:.4f}") payload = { "model_name": "distilbert-base-multilingual-cased", "task": "ner", "num_labels": num_labels, "labels": label_to_id, "test_metrics": {k: float(v) for k, v in test_metrics.items() if isinstance(v, (int, float, np.integer, np.floating))}, "classification_report": _to_jsonable(report_dict), "per_language": per_lang, "training": { "epochs": 5, "per_device_batch": 8, "grad_accum": 2, "effective_batch": 16, "learning_rate": 2e-5, "warmup_steps": 100, "fp16": True, "note": "Recovered via finalize_ner.py after train_ner.py crashed on json.dumps " "(numpy int64 in seqeval report 'support'). Model itself was fully trained " "and saved; this script only regenerates eval_results.json.", }, } (OUT_DIR / "eval_results.json").write_text( json.dumps(payload, indent=2, ensure_ascii=False) ) print(f"\n[OK] Saved eval_results.json to {OUT_DIR / 'eval_results.json'}") tmp = OUT_DIR / "tmp_eval" if tmp.exists(): shutil.rmtree(tmp, ignore_errors=True) return 0 if __name__ == "__main__": sys.exit(main())