"""๊ณตํต ํ๊ฐ ์คํฌ๋ฆฝํธ: vLLM ์๋ฒ์ ์ฐ๊ฒฐํ์ฌ HRM8K ์ ์ฒด 841๋ฌธ์ ํ๊ฐ (temperature=0)"""
import os, json, re, sys, asyncio
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from openai import OpenAI
# Prompt prepended to every question (via the {sp} slot in evaluate()'s
# ChatPromptTemplate). It instructs the model to reason step by step and to
# put the final answer as \boxed{integer} on the last line — extract_boxed()
# depends on that format. Text appears to be Korean rendered through a broken
# encoding; preserved byte-for-byte since it is sent to the model at runtime.
MATH_SYSTEM_PROMPT = """์ฃผ์ด์ง ์ํ ๋ฌธ์ ๋ฅผ ๋จ๊ณ๋ณ๋ก ํ๊ณ ๋ต๋ณ์ ์์ฑํ์ธ์.
๋ฐ๋์ ์ต์ข
๋ต๋ณ์ \\boxed{์ ์} ํ์์ผ๋ก ๋ง์ง๋ง ์ค์ ์ถ๋ ฅํ์ธ์.
์์: \\boxed{42}"""
def extract_boxed(text):
    """Return the stripped contents of the last ``\\boxed{...}`` in *text*.

    Scans for the matching closing brace instead of using ``[^}]+``, so
    answers containing nested braces (e.g. ``\\boxed{\\frac{1}{2}}``) are
    captured whole rather than truncated at the first ``}``. Behavior for
    flat contents like ``\\boxed{42}`` is unchanged.

    Returns:
        The last boxed answer as a stripped string, or None when *text*
        contains no complete \\boxed{...} group.
    """
    marker = "\\boxed{"
    found = []
    start = text.find(marker)
    while start != -1:
        i = start + len(marker)
        begin = i
        depth = 1  # we are just inside the opening brace
        while i < len(text) and depth:
            ch = text[i]
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
            i += 1
        if depth == 0:  # only count properly closed groups
            found.append(text[begin:i - 1])
        start = text.find(marker, i)
    return found[-1].strip() if found else None
def normalize(a):
    """Canonicalize an answer for string comparison.

    Strips commas, spaces, and surrounding whitespace; numeric strings that
    are integer-valued collapse to their integer form ("42.0" -> "42").
    Non-numeric answers are returned as the cleaned string; None passes
    through as None.

    The original bare ``except:`` also swallowed SystemExit and
    KeyboardInterrupt; this narrows it to the exceptions the conversion can
    actually raise (ValueError from float()/int(), OverflowError from
    int(inf)) without changing results.
    """
    if a is None:
        return None
    s = str(a).replace(",", "").replace(" ", "").strip()
    try:
        n = float(s)
        return str(int(n)) if n == int(n) else str(n)
    except (ValueError, OverflowError):
        # Not numeric (or inf/nan): compare as the cleaned string.
        return s
def check(pred, gt):
    """Return True iff *pred* and *gt* normalize to the same string.

    Either side normalizing to None (missing answer) counts as incorrect.
    """
    norm_pred = normalize(pred)
    norm_gt = normalize(gt)
    if norm_pred is None or norm_gt is None:
        return False
    return norm_pred == norm_gt
async def evaluate(label="", save_path=None):
    """Run the full HRM8K evaluation against a local vLLM server.

    Discovers the served model via the OpenAI-compatible endpoint, sends
    every question through a langchain chain at temperature=0, extracts the
    \\boxed{...} answer, and prints per-source and overall accuracy.

    Args:
        label: tag included in the printed report and the result object.
        save_path: optional path; when set, the summary plus per-item
            details are written there as JSON (parent dirs created).

    Returns:
        dict with keys label, correct, total, accuracy, by_source.

    Fixes vs. original: both file handles now force UTF-8 (the dataset and
    output contain non-ASCII text; the platform default could mis-decode
    it), and the overall accuracy is guarded against an empty dataset
    instead of raising ZeroDivisionError.
    """
    # Ask the server which model it is serving; vLLM exposes exactly the
    # loaded model(s) through the OpenAI-compatible /models endpoint.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")
    model_name = client.models.list().data[0].id
    print(f"๋ชจ๋ธ: {model_name}")

    # NOTE(review): dataset path is hard-coded relative to the CWD.
    with open("data/HRM8k_eval.json", encoding="utf-8") as f:
        data = json.load(f)
    print(f"ํ๊ฐ: {len(data)}๊ฐ (temperature=0, max_tokens=2048)")

    llm = ChatOpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123",
                     model=model_name, temperature=0, max_tokens=2048)
    prompt = ChatPromptTemplate([("user", "{sp}\n\n{q}")]).partial(sp=MATH_SYSTEM_PROMPT)
    chain = prompt | llm | StrOutputParser()

    # Fan out all questions concurrently; the server batches the requests.
    inputs = [{"q": item["question"]} for item in data]
    results = await chain.abatch(inputs, config={"max_concurrency": 400})

    by_src = {}   # per-source tallies: {"correct", "total", "no_boxed"}
    details = []  # per-item records for the optional JSON dump
    for item, res in zip(data, results):
        src = item.get("source", "?")
        stats = by_src.setdefault(src, {"correct": 0, "total": 0, "no_boxed": 0})
        stats["total"] += 1
        pred = extract_boxed(res)
        is_correct = False
        if pred is None:
            stats["no_boxed"] += 1  # model never emitted \boxed{...}
        elif check(pred, item["answer"]):
            stats["correct"] += 1
            is_correct = True
        details.append({
            "question": item["question"][:80],  # truncate for readability
            "source": src,
            "gt": str(item["answer"])[-30:] if isinstance(item["answer"], str) else str(item["answer"]),
            "pred": pred,
            "correct": is_correct,
        })

    tc = sum(v["correct"] for v in by_src.values())
    tt = sum(v["total"] for v in by_src.values())
    accuracy = tc / tt * 100 if tt else 0.0  # guard: empty dataset
    print(f"\n=== {label} ๊ฒฐ๊ณผ (temperature=0) ===")
    for s in sorted(by_src):
        v = by_src[s]
        # per-source total is always >= 1 here, so the division is safe
        print(f" [{s.upper()}] {v['correct']}/{v['total']} ({v['correct']/v['total']*100:.1f}%) | boxed๋ฏธ์ถ๋ ฅ: {v['no_boxed']}")
    print(f" [์ ์ฒด] {tc}/{tt} ({accuracy:.1f}%)")

    result_obj = {"label": label, "correct": tc, "total": tt, "accuracy": accuracy, "by_source": by_src}
    if save_path:
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump({"result": result_obj, "details": details}, f, ensure_ascii=False, indent=2)
        print(f" ๊ฒฐ๊ณผ ์ ์ฅ: {save_path}")
    return result_obj
if __name__ == "__main__":
    # CLI: python <script> [label] [save_path]
    cli_args = sys.argv[1:]
    run_label = cli_args[0] if cli_args else "eval"
    out_path = cli_args[1] if len(cli_args) > 1 else None
    asyncio.run(evaluate(run_label, out_path))