#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Single-phase evaluator (DeepSeek API) — Calculate EM / F1 only.
Usage Example
--------
python eval_single_phase.py --input data/2wikimqa.jsonl
"""
import argparse, time, jsonlines, os
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI
from utils.metrics import qa_em_score, qa_f1_score
from utils.llmjudge import judge_answer_with_api
# -------------------- CLI --------------------
parser = argparse.ArgumentParser("Single-phase evaluator")
parser.add_argument("--input", required=True, help="Path to the *.jsonl file to evaluate")
parser.add_argument("--model", default="deepseek-r1")
parser.add_argument("--temperature", type=float, default=0.5)
parser.add_argument("--max_tokens", type=int, default=30)
parser.add_argument("--sleep", type=float, default=0.0)
args = parser.parse_args()

# OpenAI-compatible client; endpoint and key are taken from the environment.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_BASE_URL"),
)
# -------------------- helper --------------------
def ask(context: str, question: str) -> str:
    """Query the chat model with the passages and question.

    Returns the stripped model answer, or the sentinel string "None"
    when the API returns empty content.
    """
    system_prompt = ("You are a QA assistant. "
                     "Answer strictly based on the passages; "
                     "output only the final answer.")
    user_prompt = (f"Answer the question and output only the final answer without extra words. "
                   f"Passages:\n{context}\n\nQuestion: {question}\nAnswer:")
    resp = client.chat.completions.create(
        model=args.model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=args.temperature,
        max_tokens=args.max_tokens,
    )
    content = resp.choices[0].message.content
    # Empty/None content maps to the "None" sentinel checked by the caller.
    return content.strip() if content else "None"
# -------------------- core eval --------------------
def evaluate_file(path: Path):
    """Evaluate one *.jsonl file and print an EM / F1 summary.

    Each record must provide "input" (the question), "context" (the
    passages) and "answers" (a gold answer string or a list of them).
    Samples whose prediction is the "None" sentinel (empty API reply)
    still count in the denominator and score 0 — matching the original
    accounting.

    Parameters
    ----------
    path : Path
        Path to the *.jsonl file; its stem is used as the dataset name.
    """
    dataset = path.stem
    # Keyed by question text; a duplicated question overwrites the earlier record.
    data = {obj["input"]: obj for obj in jsonlines.open(path)}
    total = len(data)
    if total == 0:
        # Guard: an empty file would otherwise raise ZeroDivisionError below.
        print(f"\n=== {dataset.upper()} SUMMARY ===")
        print("No samples found.")
        print("-" * 40 + "\n")
        return
    em_hits = 0
    f1_sum = 0.0
    for q, item in tqdm(data.items(), desc=f"{dataset}"):
        ctx = item["context"]
        golds = item["answers"] if isinstance(item["answers"], list) else [item["answers"]]
        pred = ask(ctx, q).split('.', 1)[0]  # Cut off extra explanations after the first '.'
        # Skip scoring when the API gave no answer or the record has no golds
        # (max() over an empty sequence would raise ValueError).
        if pred == "None" or not golds:
            continue
        em = max(qa_em_score(pred, g) for g in golds)
        f1 = max(qa_f1_score(pred, g) for g in golds)
        em_hits += em
        f1_sum += f1
        if args.sleep:  # optional throttle between API calls
            time.sleep(args.sleep)
    print(f"\n=== {dataset.upper()} SUMMARY ===")
    print(f"Total samples : {total}")
    print(f"Exact Match : {em_hits}/{total} ({em_hits/total:.2%})")
    print(f"Average F1 : {f1_sum/total:.4f}")
    print("-" * 40 + "\n")
# -------------------- run --------------------
# NOTE: fixed a stray "|" artifact that trailed the final call and broke parsing.
input_path = Path(args.input)
if not input_path.exists():
    # Exit with a clear message instead of a FileNotFoundError traceback.
    raise SystemExit(f"File does not exist: {input_path}")
evaluate_file(input_path)