aimabai committed on
Commit
9f9a082
·
verified ·
1 Parent(s): 4b36065

Upload eval_bleu.py

Files changed (1)
  1. eval_bleu.py +134 -0
eval_bleu.py ADDED
@@ -0,0 +1,134 @@
+ #!/usr/bin/env python3
+ import os
+
+ # Must be set before torch initializes CUDA for the mask to take effect.
+ os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4,5"  # specify your GPU IDs here
+
+ import json
+ import re
+
+ import torch
+ import torch._dynamo
+ from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, Gemma3ForCausalLM
+
+ torch._dynamo.config.suppress_errors = True
+ torch.set_float32_matmul_precision('high')  # allow TF32 matmuls (silences the matmul-precision warning)
+
+ # ─── CONFIG ─────────────────────────────────────────────────────────
+ SRC_LANG, TGT_LANG = "en", "kk"
+ MODEL_PATH = "/raid/srp_base_model_training/abai_workspace/models/sync_kk_en/checkpoint-final"
+ TEST_FILE = "/raid/srp_base_model_training/abai_workspace/data/flores/en_to_kk_formatted.jsonl"  # JSONL with fields system, user, assistant
+ OUTPUT_JSON = f"eval_sync_KKEN_data_{SRC_LANG}_to_{TGT_LANG}.json"
+ MAX_NEW_TOKS = 64
+ # ────────────────────────────────────────────────────────────────────
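+ # Each TEST_FILE line is expected to look like (illustrative values):
+ #   {"system": "Translate from English to Kazakh.",
+ #    "user": "<src=en><tgt=kk>\nHello", "assistant": "Сәлем"}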
+ def clean_user_field(user_str: str) -> str:
+     """
+     Remove leading <src=xx><tgt=yy> tags and any whitespace/newlines after them.
+     """
+     # Matches <src=..><tgt=..> plus any trailing whitespace/newlines.
+     return re.sub(r'^<src=[^>]+><tgt=[^>]+>\s*', '', user_str)
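+ # e.g. clean_user_field("<src=en><tgt=kk>\nGood morning.") -> "Good morning."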
+
+ def load_model_and_tokenizer():
+     print(f"Loading model/tokenizer from {MODEL_PATH} …")
+     tok = AutoTokenizer.from_pretrained(MODEL_PATH)
+     model = Gemma3ForCausalLM.from_pretrained(
+         MODEL_PATH,
+         torch_dtype=torch.bfloat16,
+         device_map="auto",  # auto-shard if you have multiple GPUs
+     )
+     model.eval()
+     return tok, model
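+ # device_map="auto" places shards on whichever GPUs remain visible after the
+ # CUDA_VISIBLE_DEVICES mask above; inputs are moved to model.device at call time.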
+
+ def build_prompt(system: str, user: str) -> str:
+     # NOTE: stock Gemma chat templates use the role name "model" rather than
+     # "assistant"; keep whatever turn format the checkpoint was fine-tuned with.
+     return (
+         f"<start_of_turn>system\n{system}<end_of_turn>\n"
+         f"<start_of_turn>user\n{user}<end_of_turn>\n"
+         f"<start_of_turn>assistant"
+     )
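+ # e.g. build_prompt("Translate.", "Hello") ->
+ #   "<start_of_turn>system\nTranslate.<end_of_turn>\n<start_of_turn>user\nHello<end_of_turn>\n<start_of_turn>assistant"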
+
+ def run_inference(tok, model, system: str, user: str) -> str:
+     prompt = build_prompt(system, user)
+     inputs = tok(prompt, return_tensors="pt", truncation=True).to(model.device)
+     input_len = inputs["input_ids"].shape[-1]
+
+     with torch.inference_mode():
+         out = model.generate(
+             **inputs,
+             max_new_tokens=MAX_NEW_TOKS,
+             do_sample=False,  # greedy decoding for a deterministic eval
+             eos_token_id=tok.convert_tokens_to_ids("<end_of_turn>"),
+             pad_token_id=tok.eos_token_id,
+         )
+     # Keep only the newly generated tokens; drop the echoed prompt.
+     gen_ids = out[0][input_len:]
+     return tok.decode(gen_ids, skip_special_tokens=True).strip()
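+ # truncation=True clips prompts to the tokenizer's model_max_length; generation
+ # stops at <end_of_turn> or after MAX_NEW_TOKS tokens, whichever comes first.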
+
+ def load_test_examples(path: str):
+     examples = []
+     with open(path, encoding="utf-8") as f:
+         for line in f:
+             obj = json.loads(line)
+             examples.append((obj["system"].strip(),
+                              obj["user"].strip(),
+                              obj["assistant"].strip()))
+     return examples
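+ # Returns a list of (system, user, assistant) tuples, one per JSONL line.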
+
+ def evaluate_bleu_nltk(hyps, refs):
+     """
+     Compute corpus-level 4-gram BLEU using NLTK.
+     - hyps: list of hypothesis strings
+     - refs: list of reference strings
+     Returns BLEU as a percentage (e.g. 27.53).
+     """
+     # 1) tokenize on whitespace
+     tokenized_hyps = [hyp.split() for hyp in hyps]
+     # NLTK expects a list of references per hypothesis
+     tokenized_refs = [[ref.split()] for ref in refs]
+
+     # 2) smoothing to avoid zero scores on short examples
+     smoothing = SmoothingFunction().method1
+
+     # 3) compute corpus_bleu with uniform 4-gram weights
+     score = corpus_bleu(
+         tokenized_refs,
+         tokenized_hyps,
+         weights=(0.25, 0.25, 0.25, 0.25),
+         smoothing_function=smoothing,
+     )
+
+     # corpus_bleu returns a fraction in [0, 1]; scale to a percentage
+     return round(score * 100, 2)
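+ # Sanity check: an identical hypothesis/reference pair of 4+ tokens scores 100.0,
+ # e.g. evaluate_bleu_nltk(["the quick brown fox"], ["the quick brown fox"]) -> 100.0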
+
+ def main():
+     tok, model = load_model_and_tokenizer()
+     examples = load_test_examples(TEST_FILE)
+     hyps, refs, users = [], [], []
+
+     for system, user, assistant in tqdm(examples, desc="Translating"):
+         clean_user = clean_user_field(user)
+         hyp = run_inference(tok, model, system, clean_user)
+         hyps.append(hyp)
+         refs.append(assistant)
+         users.append(clean_user)  # collect cleaned user texts
+
+     bleu_score = evaluate_bleu_nltk(hyps, refs)
+
+     # Save results, using cleaned user strings
+     out = {
+         "model": MODEL_PATH,
+         "bleu": bleu_score,
+         "examples": [],
+     }
+     for (s, _, r), u_clean, h in zip(examples, users, hyps):
+         out["examples"].append({
+             "system": s,
+             "user": u_clean,  # no tags here
+             "reference": r,
+             "hypothesis": h,
+         })
+
+     with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
+         json.dump(out, f, ensure_ascii=False, indent=2)
+     print(f"✅ Saved cleaned evaluation to {OUTPUT_JSON}")
+
+
+ if __name__ == "__main__":
+     main()