clone

File size: 1,912 Bytes

661c54a

# score_results.py
import argparse, json, re
from typing import List, Dict, Any

def normalize(s: str) -> str:
    """Return a canonical, comparison-friendly form of *s*.

    Triple-backtick code-fence markers are replaced by a space, the text is
    lowercased, surrounding whitespace is removed, and every internal run of
    whitespace is collapsed to a single space so that alignment and
    indentation differences do not affect comparison.
    """
    without_fences = s.replace("```", " ")
    # Splitting on whitespace and re-joining both trims the ends and squeezes
    # every whitespace run down to one space.
    tokens = without_fences.lower().split()
    return " ".join(tokens)

def _as_text(value: Any) -> str:
    """Coerce a JSON field to text: ``None`` becomes ``""``, non-strings go through ``str``."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    return str(value)


def main():
    """Score model predictions against ground truth and write a JSON report.

    Command-line arguments:
        --pred_path: JSON file holding a list of records. Each record is read
            through the keys ``id``, ``ground_truth`` and ``model_output``.
        --out_path: destination of the report (summary plus per-record
            details). Missing parent directories are created.

    A record is scored only when it has a non-empty ground truth. It counts
    as a match when the normalized model output is non-empty and occurs as a
    substring of the normalized ground truth. Records without ground truth
    are listed in the details with ``match`` set to ``None``.

    Raises:
        OSError: if the prediction file cannot be read or the report cannot
            be written.
        json.JSONDecodeError: if the prediction file is not valid JSON.
    """
    import os

    ap = argparse.ArgumentParser()
    ap.add_argument("--pred_path", type=str, required=True, help="eval 的输出 JSON")
    ap.add_argument("--out_path", type=str, default="./valid_clean/valid.json", help="评分明细输出 JSON")
    args = ap.parse_args()

    with open(args.pred_path, "r", encoding="utf-8") as f:
        preds: List[Dict[str, Any]] = json.load(f)

    rows = []
    hit, total = 0, 0
    for item in preds:
        gt = item.get("ground_truth", "")
        pred = item.get("model_output", "")
        # Only samples that actually have a ground truth are scored.
        if gt is None or gt == "":
            rows.append({
                "id": item.get("id"),
                "match": None,
                "reason": "missing_ground_truth",
                "ground_truth": gt,
                "model_output": pred
            })
            continue

        total += 1
        # JSON may carry null or non-string values (e.g. numeric answers);
        # coerce them so normalization cannot fail on a missing str method.
        ngt = normalize(_as_text(gt))
        npred = normalize(_as_text(pred))

        # The empty string is a substring of every string, so a blank
        # prediction must be rejected explicitly or it would always "match".
        match = bool(npred) and npred in ngt
        if match:
            hit += 1

        rows.append({
            "id": item.get("id"),
            "match": bool(match),
            "ground_truth": gt,
            "model_output": pred
        })

    summary = {
        "total_with_gt": total,
        "matched": hit,
        "accuracy": (hit / total) if total > 0 else None
    }

    out = {"summary": summary, "details": rows}

    # The default output path points into a sub-directory that may not exist
    # yet; create it so the run does not fail after all the scoring work.
    out_dir = os.path.dirname(args.out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(args.out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print(f"[SUMMARY] matched {hit}/{total} = {summary['accuracy']:.4f}" if total else "[SUMMARY] no GT")

# Run the scorer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()