Spaces:
Sleeping
Sleeping
| """Recover eval_results.json for the NER model from the already-saved checkpoint. | |
| train_ner.py finished training and saved the model to models/ner_model/, but | |
| crashed on json.dumps with `TypeError: Object of type int64 is not JSON | |
| serializable` (seqeval's classification_report returns numpy.int64 for the | |
| 'support' fields). The model itself is fine — we just need to regenerate | |
| eval_results.json. | |
| Usage: | |
| python src/finalize_ner.py | |
| """ | |
| from __future__ import annotations | |
| import os | |
| os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") | |
| import json | |
| import shutil | |
| import sys | |
| from pathlib import Path | |
| from typing import Any | |
| import numpy as np | |
| import torch | |
| from datasets import load_from_disk | |
| from seqeval.metrics import ( | |
| classification_report as seq_classification_report, | |
| f1_score as seq_f1, precision_score as seq_p, recall_score as seq_r, | |
| ) | |
| from transformers import ( | |
| AutoModelForTokenClassification, | |
| AutoTokenizer, | |
| DataCollatorForTokenClassification, | |
| Trainer, | |
| TrainingArguments, | |
| ) | |
# Directory containing this script; the project root is one level above it.
_SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = _SCRIPT_DIR.parent

# Processed NER dataset directory (loaded with `load_from_disk`) and its label map.
DATA_DIR = PROJECT_ROOT.joinpath("data", "processed", "ner")
LABELS_FILE = DATA_DIR.joinpath("labels.json")

# Saved model/tokenizer directory; eval_results.json is written here as well.
OUT_DIR = PROJECT_ROOT.joinpath("models", "ner_model")

# Tokenizer truncation length (passed as `max_length` with `truncation=True`).
MAX_LENGTH = 128
| def _to_jsonable(obj: Any) -> Any: | |
| """Recursively convert numpy scalars/arrays to plain Python types.""" | |
| if isinstance(obj, dict): | |
| return {k: _to_jsonable(v) for k, v in obj.items()} | |
| if isinstance(obj, (list, tuple)): | |
| return [_to_jsonable(v) for v in obj] | |
| if isinstance(obj, np.ndarray): | |
| return obj.tolist() | |
| if isinstance(obj, np.integer): | |
| return int(obj) | |
| if isinstance(obj, np.floating): | |
| return float(obj) | |
| return obj | |
def main() -> int:
    """Re-evaluate the saved NER model on the test split and write eval_results.json.

    Loads the tokenizer and model from ``OUT_DIR``, tokenizes the ``test``
    split stored under ``DATA_DIR``, runs a single prediction pass, prints the
    entity-level classification report (plus per-language scores when the
    split has a ``language`` column) and writes the collected results to
    ``OUT_DIR / "eval_results.json"``.

    Returns:
        ``0`` on success, ``2`` when ``OUT_DIR / "config.json"`` does not exist.
    """
    if not (OUT_DIR / "config.json").exists():
        print(f"ERROR: saved NER model not found at {OUT_DIR}", file=sys.stderr)
        return 2

    print("=" * 72)
    print("Finalize NER eval_results.json from saved model")
    print("=" * 72)
    print(f" Model dir : {OUT_DIR}")

    # Decode explicitly as UTF-8 rather than the platform's locale encoding.
    labels_payload = json.loads(LABELS_FILE.read_text(encoding="utf-8"))
    label_to_id: dict[str, int] = labels_payload["label_to_id"]
    # JSON object keys are strings; turn them back into integer label ids.
    id_to_label: dict[int, str] = {int(k): v for k, v in labels_payload["id_to_label"].items()}
    num_labels = len(id_to_label)

    ds = load_from_disk(str(DATA_DIR))
    print(f" Test rows : {len(ds['test'])}")

    tokenizer = AutoTokenizer.from_pretrained(str(OUT_DIR))
    model = AutoModelForTokenClassification.from_pretrained(str(OUT_DIR))

    def tokenize_and_align(batch: dict[str, list]) -> dict[str, Any]:
        """Tokenize pre-split words and align word-level tag ids to sub-tokens.

        Only the first sub-token of each word keeps the word's tag id; special
        tokens and continuation sub-tokens get -100 (the value `_decode` skips).
        """
        tokenized = tokenizer(
            batch["tokens"],
            is_split_into_words=True,
            truncation=True,
            max_length=MAX_LENGTH,
        )
        all_labels = []
        for i, word_tag_ids in enumerate(batch["ner_tag_ids"]):
            word_ids = tokenized.word_ids(batch_index=i)
            previous_word: int | None = None
            label_ids: list[int] = []
            for wid in word_ids:
                if wid is None:
                    # Token that maps to no input word (e.g. a special token).
                    label_ids.append(-100)
                elif wid != previous_word:
                    # First sub-token of a new word carries the word's label.
                    label_ids.append(int(word_tag_ids[wid]))
                else:
                    # Continuation sub-token of the same word.
                    label_ids.append(-100)
                previous_word = wid
            all_labels.append(label_ids)
        tokenized["labels"] = all_labels
        return tokenized

    # Drop every raw column except "language" from the tokenized dataset.
    drop_cols = [c for c in ds["test"].column_names if c not in ("language",)]
    test_tok = ds["test"].map(
        tokenize_and_align, batched=True,
        remove_columns=drop_cols, desc="Tokenizing + aligning test",
    )

    tmp_dir = OUT_DIR / "tmp_eval"
    eval_args = TrainingArguments(
        output_dir=str(tmp_dir),
        per_device_eval_batch_size=16,
        fp16=torch.cuda.is_available(),
        report_to="none",
        dataloader_num_workers=0,
    )

    def _decode(predictions: np.ndarray, labels: np.ndarray) -> tuple[list[list[str]], list[list[str]]]:
        """Map predicted/gold id sequences to label strings, skipping -100 positions."""
        true_preds: list[list[str]] = []
        true_labels: list[list[str]] = []
        for pred_seq, lab_seq in zip(predictions, labels):
            tp, tl = [], []
            for p, l in zip(pred_seq, lab_seq):
                if l == -100:
                    continue
                tp.append(id_to_label[int(p)])
                tl.append(id_to_label[int(l)])
            true_preds.append(tp)
            true_labels.append(tl)
        return true_preds, true_labels

    def compute_metrics(eval_pred) -> dict[str, float]:
        """Entity-level F1/precision/recall (seqeval) for the Trainer."""
        logits, labels = eval_pred
        if isinstance(logits, tuple):
            logits = logits[0]
        preds = np.argmax(logits, axis=-1)
        true_preds, true_labels = _decode(preds, labels)
        return {
            "f1": seq_f1(true_labels, true_preds),
            "precision": seq_p(true_labels, true_preds),
            "recall": seq_r(true_labels, true_preds),
        }

    try:
        trainer = Trainer(
            model=model,
            args=eval_args,
            data_collator=DataCollatorForTokenClassification(tokenizer),
            compute_metrics=compute_metrics,
        )
        print("\nEvaluating on TEST split ...")
        # One pass only: predict() returns the "test_"-prefixed metrics together
        # with the raw predictions, so a separate evaluate() run is not needed.
        test_pred = trainer.predict(test_tok, metric_key_prefix="test")
    finally:
        # Remove the scratch output directory even when evaluation fails.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    test_metrics = test_pred.metrics or {}
    test_logits = test_pred.predictions[0] if isinstance(test_pred.predictions, tuple) else test_pred.predictions
    pred_ids = np.argmax(test_logits, axis=-1)
    true_preds, true_labels = _decode(pred_ids, test_pred.label_ids)

    report_dict = seq_classification_report(
        true_labels, true_preds, output_dict=True, zero_division=0,
    )
    report_text = seq_classification_report(true_labels, true_preds, zero_division=0)
    print("\nEntity-level classification report on TEST:")
    print(report_text)

    per_lang: dict[str, dict[str, float]] = {}
    if "language" in ds["test"].column_names:
        # NOTE(review): assumes predictions come back in the same row order as
        # ds["test"], so they can be sliced by the untokenized language column.
        languages = ds["test"]["language"]
        for lang in sorted(set(languages)):
            mask = [la == lang for la in languages]
            sub_preds = [tp for tp, m in zip(true_preds, mask) if m]
            sub_labels = [tl for tl, m in zip(true_labels, mask) if m]
            if not sub_preds:
                continue
            per_lang[lang] = {
                "n": int(sum(mask)),
                "f1": float(seq_f1(sub_labels, sub_preds)),
                "precision": float(seq_p(sub_labels, sub_preds)),
                "recall": float(seq_r(sub_labels, sub_preds)),
            }
        print("\nPer-language entity-level metrics on TEST:")
        for lang, m in per_lang.items():
            print(f" {lang}: n={m['n']} P={m['precision']:.4f} "
                  f"R={m['recall']:.4f} F1={m['f1']:.4f}")

    payload = {
        "model_name": "distilbert-base-multilingual-cased",
        "task": "ner",
        "num_labels": num_labels,
        "labels": label_to_id,
        # Keep only numeric metrics and coerce them to plain floats.
        "test_metrics": {k: float(v) for k, v in test_metrics.items()
                         if isinstance(v, (int, float, np.integer, np.floating))},
        # seqeval's report holds numpy scalars that json.dumps cannot serialize.
        "classification_report": _to_jsonable(report_dict),
        "per_language": per_lang,
        "training": {
            "epochs": 5,
            "per_device_batch": 8,
            "grad_accum": 2,
            "effective_batch": 16,
            "learning_rate": 2e-5,
            "warmup_steps": 100,
            "fp16": True,
            "note": "Recovered via finalize_ner.py after train_ner.py crashed on json.dumps "
                    "(numpy int64 in seqeval report 'support'). Model itself was fully trained "
                    "and saved; this script only regenerates eval_results.json.",
        },
    }

    results_path = OUT_DIR / "eval_results.json"
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # written as UTF-8 instead of the locale's default encoding.
    results_path.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"\n[OK] Saved eval_results.json to {OUT_DIR / 'eval_results.json'}")
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())