aimabai committed on
Commit
9f9a082
·
verified ·
1 Parent(s): 4b36065

Upload eval_bleu.py

Files changed (1)
  1. eval_bleu.py +134 -0
eval_bleu.py ADDED
@@ -0,0 +1,134 @@
+ #!/usr/bin/env python3
+ import os
+
+ # Must be set before torch initializes CUDA for the mask to take effect.
+ os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4,5"  # specify your GPU IDs here
+
+ import json
+ import re
+
+ import torch
+ import torch._dynamo
+ from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, Gemma3ForCausalLM
+
+ torch._dynamo.config.suppress_errors = True
+ torch.set_float32_matmul_precision('high')  # allow TF32 matmuls (silences the matmul-precision warning)
+
+ # ─── CONFIG ─────────────────────────────────────────────────────────
+ SRC_LANG, TGT_LANG = "en", "kk"
+ MODEL_PATH = "/raid/srp_base_model_training/abai_workspace/models/sync_kk_en/checkpoint-final"
+ TEST_FILE = "/raid/srp_base_model_training/abai_workspace/data/flores/en_to_kk_formatted.jsonl"  # JSONL with fields system, user, assistant
+ OUTPUT_JSON = f"eval_sync_KKEN_data_{SRC_LANG}_to_{TGT_LANG}.json"
+ MAX_NEW_TOKS = 64
+ # ────────────────────────────────────────────────────────────────────
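+ # Each TEST_FILE line is expected to look like (illustrative values):
+ #   {"system": "Translate from English to Kazakh.",
+ #    "user": "<src=en><tgt=kk>\nHello", "assistant": "Сәлем"}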
+ def clean_user_field(user_str: str) -> str:
+     """
+     Remove leading <src=xx><tgt=yy> tags and any whitespace/newlines after them.
+     """
+     # Matches <src=..><tgt=..> plus any trailing whitespace/newlines.
+     return re.sub(r'^<src=[^>]+><tgt=[^>]+>\s*', '', user_str)
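+ # e.g. clean_user_field("<src=en><tgt=kk>\nGood morning.") -> "Good morning."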
+
+ def load_model_and_tokenizer():
+     print(f"Loading model/tokenizer from {MODEL_PATH} …")
+     tok = AutoTokenizer.from_pretrained(MODEL_PATH)
+     model = Gemma3ForCausalLM.from_pretrained(
+         MODEL_PATH,
+         torch_dtype=torch.bfloat16,
+         device_map="auto",  # auto-shard if you have multiple GPUs
+     )
+     model.eval()
+     return tok, model
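+ # device_map="auto" places shards on whichever GPUs remain visible after the
+ # CUDA_VISIBLE_DEVICES mask above; inputs are moved to model.device at call time.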
+
+ def build_prompt(system: str, user: str) -> str:
+     # NOTE: stock Gemma chat templates use the role name "model" rather than
+     # "assistant"; keep whatever turn format the checkpoint was fine-tuned with.
+     return (
+         f"<start_of_turn>system\n{system}<end_of_turn>\n"
+         f"<start_of_turn>user\n{user}<end_of_turn>\n"
+         f"<start_of_turn>assistant"
+     )
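+ # e.g. build_prompt("Translate.", "Hello") ->
+ #   "<start_of_turn>system\nTranslate.<end_of_turn>\n<start_of_turn>user\nHello<end_of_turn>\n<start_of_turn>assistant"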
+
+ def run_inference(tok, model, system: str, user: str) -> str:
+     prompt = build_prompt(system, user)
+     inputs = tok(prompt, return_tensors="pt", truncation=True).to(model.device)
+     input_len = inputs["input_ids"].shape[-1]
+
+     with torch.inference_mode():
+         out = model.generate(
+             **inputs,
+             max_new_tokens=MAX_NEW_TOKS,
+             do_sample=False,  # greedy decoding for a deterministic eval
+             eos_token_id=tok.convert_tokens_to_ids("<end_of_turn>"),
+             pad_token_id=tok.eos_token_id,
+         )
+     # Keep only the newly generated tokens; drop the echoed prompt.
+     gen_ids = out[0][input_len:]
+     return tok.decode(gen_ids, skip_special_tokens=True).strip()
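+ # truncation=True clips prompts to the tokenizer's model_max_length; generation
+ # stops at <end_of_turn> or after MAX_NEW_TOKS tokens, whichever comes first.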
+
+ def load_test_examples(path: str):
+     examples = []
+     with open(path, encoding="utf-8") as f:
+         for line in f:
+             obj = json.loads(line)
+             examples.append((obj["system"].strip(),
+                              obj["user"].strip(),
+                              obj["assistant"].strip()))
+     return examples
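+ # Returns a list of (system, user, assistant) tuples, one per JSONL line.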
+
+ def evaluate_bleu_nltk(hyps, refs):
+     """
+     Compute corpus-level 4-gram BLEU using NLTK.
+     - hyps: list of hypothesis strings
+     - refs: list of reference strings
+     Returns BLEU as a percentage (e.g. 27.53).
+     """
+     # 1) tokenize on whitespace
+     tokenized_hyps = [hyp.split() for hyp in hyps]
+     # NLTK expects a list of references per hypothesis
+     tokenized_refs = [[ref.split()] for ref in refs]
+
+     # 2) smoothing to avoid zero scores on short examples
+     smoothing = SmoothingFunction().method1
+
+     # 3) compute corpus_bleu with uniform 4-gram weights
+     score = corpus_bleu(
+         tokenized_refs,
+         tokenized_hyps,
+         weights=(0.25, 0.25, 0.25, 0.25),
+         smoothing_function=smoothing,
+     )
+
+     # corpus_bleu returns a fraction in [0, 1]; scale to a percentage
+     return round(score * 100, 2)
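+ # Sanity check: an identical hypothesis/reference pair of 4+ tokens scores 100.0,
+ # e.g. evaluate_bleu_nltk(["the quick brown fox"], ["the quick brown fox"]) -> 100.0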
+
+ def main():
+     tok, model = load_model_and_tokenizer()
+     examples = load_test_examples(TEST_FILE)
+     hyps, refs, users = [], [], []
+
+     for system, user, assistant in tqdm(examples, desc="Translating"):
+         clean_user = clean_user_field(user)
+         hyp = run_inference(tok, model, system, clean_user)
+         hyps.append(hyp)
+         refs.append(assistant)
+         users.append(clean_user)  # collect cleaned user texts
+
+     bleu_score = evaluate_bleu_nltk(hyps, refs)
+
+     # Save results, using cleaned user strings
+     out = {
+         "model": MODEL_PATH,
+         "bleu": bleu_score,
+         "examples": [],
+     }
+     for (s, _, r), u_clean, h in zip(examples, users, hyps):
+         out["examples"].append({
+             "system": s,
+             "user": u_clean,  # no tags here
+             "reference": r,
+             "hypothesis": h,
+         })
+
+     with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
+         json.dump(out, f, ensure_ascii=False, indent=2)
+     print(f"✅ Saved cleaned evaluation to {OUTPUT_JSON}")
+
+
+ if __name__ == "__main__":
+     main()