#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Single-phase evaluator (DeepSeek API) — Calculate EM / F1 only.

Usage Example
-------------
python eval_single_phase.py --input data/2wikimqa.jsonl
"""

import argparse, time, jsonlines, os
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI
from utils.metrics import qa_em_score, qa_f1_score
from utils.llmjudge import judge_answer_with_api

# -------------------- CLI --------------------
# Module-level argument parsing: this file is a run-directly script, so
# flags are parsed as soon as the module executes.
p = argparse.ArgumentParser("Single-phase evaluator")
p.add_argument("--input", required=True, help="Path to the *.jsonl file to evaluate")
p.add_argument("--model",       default="deepseek-r1")    # chat model name sent to the API
p.add_argument("--temperature", type=float, default=0.5)  # sampling temperature
p.add_argument("--max_tokens",  type=int,   default=30)   # short cap: only the final answer is wanted
p.add_argument("--sleep",       type=float, default=0.0)  # optional per-request delay (rate limiting)
args = p.parse_args()

# OpenAI-compatible client; endpoint and key come from the environment
# (OPENAI_BASE_URL / OPENAI_API_KEY) so no credentials live in the code.
# NOTE(review): both may be None if the env vars are unset — the client is
# only exercised when ask() is called.
client = OpenAI(
    base_url=os.environ.get("OPENAI_BASE_URL"),
    api_key=os.environ.get("OPENAI_API_KEY")
)

# -------------------- helper --------------------
def ask(context: str, question: str) -> str:
    """Query the chat model and return its final answer.

    Returns the stripped completion text, or the sentinel string
    "None" when the model produced an empty completion (callers skip
    such samples).
    """
    system_msg = (
        "You are a QA assistant. Answer strictly based on the passages; "
        "output only the final answer."
    )
    user_msg = (
        "Answer the question and output only the final answer without extra words. "
        f"Passages:\n{context}\n\nQuestion: {question}\nAnswer:"
    )
    resp = client.chat.completions.create(
        model=args.model,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        temperature=args.temperature,
        max_tokens=args.max_tokens,
    )
    content = resp.choices[0].message.content
    # Empty/None completions are mapped to the sentinel "None".
    return content.strip() if content else "None"


# -------------------- core eval --------------------
def evaluate_file(path: Path) -> None:
    """Evaluate one *.jsonl file and print an EM / F1 summary.

    Each JSONL record must carry "input" (question), "context"
    (passages) and "answers" (gold answer string or list of strings).
    Duplicate questions collapse to the last record (dict keyed on
    "input").

    Fixes over the previous version: samples skipped because the model
    returned an empty completion (the "None" sentinel from ask()) no
    longer inflate the metric denominator, and an empty / fully-skipped
    file no longer raises ZeroDivisionError.
    """
    dataset = path.stem
    data = {obj["input"]: obj for obj in jsonlines.open(path)}

    total     = len(data)
    evaluated = 0      # samples that actually produced a prediction
    em_hits   = 0
    f1_sum    = 0.0

    for q, item in tqdm(data.items(), desc=f"{dataset}"):
        ctx   = item["context"]
        golds = item["answers"] if isinstance(item["answers"], list) else [item["answers"]]

        pred = ask(ctx, q).split('.', 1)[0]   # cut off extra explanations
        if pred == "None":                    # empty-completion sentinel from ask()
            continue
        evaluated += 1
        # Score against the best-matching gold answer.
        em_hits += max(qa_em_score(pred, g) for g in golds)
        f1_sum  += max(qa_f1_score(pred, g) for g in golds)

        if args.sleep:
            time.sleep(args.sleep)

    print(f"\n=== {dataset.upper()} SUMMARY ===")
    print(f"Total samples : {total}")
    print(f"Skipped       : {total - evaluated}")
    if evaluated:   # guard against division by zero
        print(f"Exact Match   : {em_hits}/{evaluated}  ({em_hits/evaluated:.2%})")
        print(f"Average F1    : {f1_sum/evaluated:.4f}")
    else:
        print("No samples were evaluated.")
    print("-" * 40 + "\n")


# -------------------- run --------------------
# Fail fast with a clear message before any API call is issued.
input_path = Path(args.input)
if not input_path.exists():
    raise SystemExit(f"File does not exist: {input_path}")

evaluate_file(input_path)