# multilingual-chatbot/src/finalize_ner.py
# Uploaded by momenalhamza
# Commit 469ef7f (verified):
#   Deploy chatbot: code + RAG + Qwen (3 BERT classifiers loaded from HF Hub)
"""Recover eval_results.json for the NER model from the already-saved checkpoint.
train_ner.py finished training and saved the model to models/ner_model/, but
crashed on json.dumps with `TypeError: Object of type int64 is not JSON
serializable` (seqeval's classification_report returns numpy.int64 for the
'support' fields). The model itself is fine — we just need to regenerate
eval_results.json.
Usage:
python src/finalize_ner.py
"""
from __future__ import annotations
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
import json
import shutil
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from datasets import load_from_disk
from seqeval.metrics import (
classification_report as seq_classification_report,
f1_score as seq_f1, precision_score as seq_p, recall_score as seq_r,
)
from transformers import (
AutoModelForTokenClassification,
AutoTokenizer,
DataCollatorForTokenClassification,
Trainer,
TrainingArguments,
)
# Repository root: this file's directory is one level below it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Processed NER dataset directory and the label-mapping file stored inside it.
DATA_DIR = PROJECT_ROOT.joinpath("data", "processed", "ner")
LABELS_FILE = DATA_DIR.joinpath("labels.json")

# Directory holding the saved NER checkpoint; eval_results.json is written here.
OUT_DIR = PROJECT_ROOT.joinpath("models", "ner_model")

# Maximum tokenized sequence length passed to the tokenizer (with truncation).
MAX_LENGTH = 128
def _to_jsonable(obj: Any) -> Any:
    """Recursively convert numpy scalars/arrays to plain Python types.

    Args:
        obj: Arbitrary (possibly nested) structure of dicts, lists, tuples,
            numpy arrays and numpy scalars.

    Returns:
        A copy of ``obj`` in which dict values, list/tuple items (tuples
        become lists), numpy arrays and numpy bool/integer/floating scalars
        are replaced by built-in Python equivalents. Dict keys that are numpy
        scalars are converted as well, because ``json.dumps`` rejects them.
        Anything else is returned unchanged.
    """
    if isinstance(obj, dict):
        # Only numpy-scalar keys are rewritten; other keys are kept as-is so
        # hashable non-numpy keys (e.g. tuples) are never turned into lists.
        return {
            (_to_jsonable(k) if isinstance(k, np.generic) else k): _to_jsonable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [_to_jsonable(v) for v in obj]
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # np.bool_ is not a subclass of np.integer, so it needs its own branch.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj
def main() -> int:
    """Regenerate ``eval_results.json`` for the already-saved NER model.

    Loads the model and tokenizer from ``OUT_DIR``, tokenizes the ``test``
    split of the dataset stored under ``DATA_DIR``, runs one prediction pass
    and writes the overall metrics, the seqeval classification report and
    per-language metrics to ``OUT_DIR / "eval_results.json"``.

    Returns:
        Process exit code: ``0`` on success, ``2`` when the saved model or the
        labels file cannot be found.
    """
    if not (OUT_DIR / "config.json").exists():
        print(f"ERROR: saved NER model not found at {OUT_DIR}", file=sys.stderr)
        return 2
    if not LABELS_FILE.exists():
        # Fail the same way as a missing model instead of with a traceback.
        print(f"ERROR: labels file not found at {LABELS_FILE}", file=sys.stderr)
        return 2

    print("=" * 72)
    print("Finalize NER eval_results.json from saved model")
    print("=" * 72)
    print(f"  Model dir : {OUT_DIR}")

    # Read explicitly as UTF-8 so the result does not depend on the locale.
    labels_payload = json.loads(LABELS_FILE.read_text(encoding="utf-8"))
    label_to_id: dict[str, int] = labels_payload["label_to_id"]
    # JSON object keys are strings; the model emits integer class ids.
    id_to_label: dict[int, str] = {
        int(k): v for k, v in labels_payload["id_to_label"].items()
    }
    num_labels = len(id_to_label)

    ds = load_from_disk(str(DATA_DIR))
    print(f"  Test rows : {len(ds['test'])}")

    tokenizer = AutoTokenizer.from_pretrained(str(OUT_DIR))
    model = AutoModelForTokenClassification.from_pretrained(str(OUT_DIR))

    def tokenize_and_align(batch: dict[str, list]) -> dict[str, Any]:
        """Tokenize pre-split words and align word-level tag ids to sub-tokens.

        Only the first sub-token of each word keeps the word's tag id; special
        tokens and continuation sub-tokens get ``-100`` so they are skipped
        when decoding predictions.
        """
        tokenized = tokenizer(
            batch["tokens"],
            is_split_into_words=True,
            truncation=True,
            max_length=MAX_LENGTH,
        )
        all_labels = []
        for i, word_tag_ids in enumerate(batch["ner_tag_ids"]):
            word_ids = tokenized.word_ids(batch_index=i)
            previous_word: int | None = None
            label_ids: list[int] = []
            for wid in word_ids:
                if wid is None:
                    # Token that does not map to any input word.
                    label_ids.append(-100)
                elif wid != previous_word:
                    # First sub-token of a new word carries the label.
                    label_ids.append(int(word_tag_ids[wid]))
                else:
                    # Continuation sub-token of the same word.
                    label_ids.append(-100)
                previous_word = wid
            all_labels.append(label_ids)
        tokenized["labels"] = all_labels
        return tokenized

    # Drop every raw column except "language" while tokenizing.
    drop_cols = [c for c in ds["test"].column_names if c not in ("language",)]
    test_tok = ds["test"].map(
        tokenize_and_align,
        batched=True,
        remove_columns=drop_cols,
        desc="Tokenizing + aligning test",
    )

    tmp_dir = OUT_DIR / "tmp_eval"
    eval_args = TrainingArguments(
        output_dir=str(tmp_dir),
        per_device_eval_batch_size=16,
        fp16=torch.cuda.is_available(),
        report_to="none",
        dataloader_num_workers=0,
    )

    def _decode(
        predictions: np.ndarray, labels: np.ndarray
    ) -> tuple[list[list[str]], list[list[str]]]:
        """Map id sequences to tag strings, skipping positions labelled -100."""
        true_preds: list[list[str]] = []
        true_labels: list[list[str]] = []
        for pred_seq, lab_seq in zip(predictions, labels):
            tp, tl = [], []
            for p, l in zip(pred_seq, lab_seq):
                if l == -100:
                    continue
                tp.append(id_to_label[int(p)])
                tl.append(id_to_label[int(l)])
            true_preds.append(tp)
            true_labels.append(tl)
        return true_preds, true_labels

    def compute_metrics(eval_pred) -> dict[str, float]:
        """Compute entity-level F1 / precision / recall with seqeval."""
        logits, labels = eval_pred
        if isinstance(logits, tuple):
            logits = logits[0]
        preds = np.argmax(logits, axis=-1)
        true_preds, true_labels = _decode(preds, labels)
        return {
            "f1": seq_f1(true_labels, true_preds),
            "precision": seq_p(true_labels, true_preds),
            "recall": seq_r(true_labels, true_preds),
        }

    print("\nEvaluating on TEST split ...")
    try:
        trainer = Trainer(
            model=model,
            args=eval_args,
            data_collator=DataCollatorForTokenClassification(tokenizer),
            compute_metrics=compute_metrics,
        )
        # A single predict() call yields both the raw predictions and the
        # "test_"-prefixed metrics, so the test set is only run through the
        # model once.
        test_pred = trainer.predict(test_tok, metric_key_prefix="test")
    finally:
        # Remove the scratch output dir even when evaluation fails.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    test_metrics = test_pred.metrics or {}
    test_logits = (
        test_pred.predictions[0]
        if isinstance(test_pred.predictions, tuple)
        else test_pred.predictions
    )
    pred_ids = np.argmax(test_logits, axis=-1)
    true_preds, true_labels = _decode(pred_ids, test_pred.label_ids)

    report_dict = seq_classification_report(
        true_labels, true_preds, output_dict=True, zero_division=0,
    )
    report_text = seq_classification_report(true_labels, true_preds, zero_division=0)
    print("\nEntity-level classification report on TEST:")
    print(report_text)

    # Per-language breakdown; relies on predictions being in dataset row order.
    test_with_lang = ds["test"]
    per_lang: dict[str, dict[str, float]] = {}
    if "language" in test_with_lang.column_names:
        languages = test_with_lang["language"]
        for lang in sorted(set(languages)):
            mask = [la == lang for la in languages]
            sub_preds = [tp for tp, m in zip(true_preds, mask) if m]
            sub_labels = [tl for tl, m in zip(true_labels, mask) if m]
            if not sub_preds:
                continue
            per_lang[lang] = {
                "n": int(sum(mask)),
                "f1": float(seq_f1(sub_labels, sub_preds)),
                "precision": float(seq_p(sub_labels, sub_preds)),
                "recall": float(seq_r(sub_labels, sub_preds)),
            }
        print("\nPer-language entity-level metrics on TEST:")
        for lang, m in per_lang.items():
            print(f"  {lang}: n={m['n']}  P={m['precision']:.4f}  "
                  f"R={m['recall']:.4f}  F1={m['f1']:.4f}")

    payload = {
        "model_name": "distilbert-base-multilingual-cased",
        "task": "ner",
        "num_labels": num_labels,
        "labels": label_to_id,
        "test_metrics": {
            k: float(v)
            for k, v in test_metrics.items()
            if isinstance(v, (int, float, np.integer, np.floating))
        },
        # seqeval returns numpy scalars here (the cause of the original crash).
        "classification_report": _to_jsonable(report_dict),
        "per_language": per_lang,
        "training": {
            "epochs": 5,
            "per_device_batch": 8,
            "grad_accum": 2,
            "effective_batch": 16,
            "learning_rate": 2e-5,
            "warmup_steps": 100,
            "fp16": True,
            "note": "Recovered via finalize_ner.py after train_ner.py crashed on json.dumps "
                    "(numpy int64 in seqeval report 'support'). Model itself was fully trained "
                    "and saved; this script only regenerates eval_results.json.",
        },
    }

    results_path = OUT_DIR / "eval_results.json"
    # ensure_ascii=False can emit non-ASCII characters, so pin the encoding
    # instead of relying on the platform default.
    results_path.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"\n[OK] Saved eval_results.json to {results_path}")
    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())