# clone/evaluation/get_valid_score.py
# chenyili818's picture
# Add files using upload-large-folder tool
# 661c54a verified
# score_results.py
import argparse
import json
import os
import re
from typing import Any, Dict, List
def normalize(s: str) -> str:
    """Return a canonical form of *s* for lenient text comparison.

    Triple-backtick code-fence markers are replaced by a space, the text is
    lower-cased, and every run of whitespace is collapsed to a single space
    with leading/trailing whitespace removed, so alignment and indentation
    differences do not affect the comparison.
    """
    unfenced = s.replace("```", " ")
    # split() with no argument drops leading/trailing whitespace and breaks
    # on whitespace runs; re-joining with one space collapses those runs.
    return " ".join(unfenced.lower().split())
def _as_text(value: Any) -> str:
    """Return *value* as a string, mapping ``None`` to the empty string."""
    return "" if value is None else str(value)


def _score_predictions(preds: List[Dict[str, Any]]):
    """Score every prediction record against its ground truth.

    A record counts as matched when its normalized ``model_output`` is
    non-empty and is a substring of its normalized ``ground_truth``.
    Records whose ``ground_truth`` is missing, ``None`` or empty are listed
    in the details but excluded from the totals.

    Args:
        preds: Records read from the prediction JSON; each may carry the
            keys ``id``, ``ground_truth`` and ``model_output``.

    Returns:
        A ``(rows, hit, total)`` tuple: the per-record detail dicts, the
        number of matched records, and the number of records scored.
    """
    rows: List[Dict[str, Any]] = []
    hit, total = 0, 0
    for item in preds:
        gt = item.get("ground_truth", "")
        pred = item.get("model_output", "")

        # Only samples that have a ground truth are scored.
        if gt is None or gt == "":
            rows.append({
                "id": item.get("id"),
                "match": None,
                "reason": "missing_ground_truth",
                "ground_truth": gt,
                "model_output": pred,
            })
            continue

        total += 1
        # Coerce before normalizing so a JSON null or a non-string value
        # does not raise AttributeError inside normalize().
        ngt = normalize(_as_text(gt))
        npred = normalize(_as_text(pred))

        # The empty string is a substring of every string, so an empty
        # prediction must be rejected explicitly or it would count as a hit.
        match = bool(npred) and npred in ngt
        if match:
            hit += 1

        row = {
            "id": item.get("id"),
            "match": match,
            "ground_truth": gt,
            "model_output": pred,
        }
        if not npred:
            row["reason"] = "empty_model_output"
        rows.append(row)
    return rows, hit, total


def main():
    """Score an eval output file and write a summary plus per-item details.

    Reads the JSON list given by ``--pred_path``, scores each record, writes
    ``{"summary": ..., "details": ...}`` to ``--out_path`` (creating the
    parent directory if needed), and prints a one-line summary.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--pred_path", type=str, required=True, help="eval 的输出 JSON")
    ap.add_argument("--out_path", type=str, default="./valid_clean/valid.json", help="评分明细输出 JSON")
    args = ap.parse_args()

    with open(args.pred_path, "r", encoding="utf-8") as f:
        preds: List[Dict[str, Any]] = json.load(f)

    rows, hit, total = _score_predictions(preds)

    summary = {
        "total_with_gt": total,
        "matched": hit,
        "accuracy": (hit / total) if total > 0 else None,
    }
    out = {"summary": summary, "details": rows}

    # The default output path points into a sub-directory that may not exist
    # yet; create it so the results are not lost after scoring.
    out_dir = os.path.dirname(args.out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(args.out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    if total:
        print(f"[SUMMARY] matched {hit}/{total} = {summary['accuracy']:.4f}")
    else:
        print("[SUMMARY] no GT")
# Run the scorer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()