#!/usr/bin/env python3
"""
Split the GPT-5 inference JSONL by model (gpt-5, gpt-5-mini, gpt-5-nano),
merge each model's generated_text with the subclaims data, and run
evaluate_scores_bn.py on each model's data.

Usage:
    python run_eval_gpt5_models.py
"""
import json
import os
import subprocess
import sys
from collections import defaultdict
from datetime import datetime

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
JSONL_PATH = os.path.join(
    SCRIPT_DIR,
    "results/bn/gpt5_inference_all_wo_gs_20260314_173736.jsonl",
)
SUBCLAIMS_PATH = os.path.join(SCRIPT_DIR, "dataset/bn/test_bn_subclaims.json")
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "evaluation/bn/eval_gpt5_models")
EVAL_SCRIPT = os.path.join(SCRIPT_DIR, "evaluate_scores_bn.py")

MODELS = ["gpt-5", "gpt-5-mini", "gpt-5-nano"]


def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Load all inference items from the JSONL file (one JSON object per line).
    with open(JSONL_PATH, "r", encoding="utf-8") as f:
        all_items = [json.loads(line) for line in f if line.strip()]

    # Load the subclaims dataset and index it by (doc_id, label) for fast lookup.
    with open(SUBCLAIMS_PATH, "r", encoding="utf-8") as f:
        subclaims_data = json.load(f)

    sc_by_key = {}
    for sc in subclaims_data:
        key = (sc["doc_id"], sc["label"])
        sc_by_key[key] = sc

    # Group the inference items by model name.
    by_model = defaultdict(list)
    for item in all_items:
        by_model[item["model"]].append(item)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    for model_name in MODELS:
        items = by_model.get(model_name, [])
        if not items:
            print(f"[WARN] No items for model {model_name}, skipping.")
            continue

        safe_name = model_name.replace("-", "_")

        # Merge each item's generated_text with its matching subclaims entry;
        # items without a subclaims match are counted and skipped.
        prepared = []
        skipped_no_subclaims = 0
        for item in items:
            doc_id = item["doc_id"]
            label = item["gold_label"]
            key = (doc_id, label)
            sc = sc_by_key.get(key)
            if sc is None:
                skipped_no_subclaims += 1
                continue
            prepared.append({
                "doc_id": doc_id,
                "label": label,
                "fulltext": sc.get("fulltext", ""),
                "summary_text": sc.get("summary", ""),
                "summary_subclaims": sc.get("summary_subclaims", []),
                "fulltext_subclaims": sc.get("fulltext_subclaims", []),
                "generated_text": item.get("generated_text", ""),
            })

        # Write the prepared per-model input file for the evaluation script.
        input_path = os.path.join(
            OUTPUT_DIR, f"{safe_name}_prepared_{timestamp}.json"
        )
        with open(input_path, "w", encoding="utf-8") as f:
            json.dump(prepared, f, indent=2, ensure_ascii=False)

        print(f"\n{'=' * 60}")
        print(f"Model: {model_name}")
        print(f"  Total JSONL items   : {len(items)}")
        print(f"  Matched w/ subclaims: {len(prepared)}")
        print(f"  Skipped (no subcl.) : {skipped_no_subclaims}")
        print(f"  Prepared file       : {input_path}")
        print(f"{'=' * 60}")

        # Run evaluate_scores_bn.py on the prepared file.
        output_path = os.path.join(
            OUTPUT_DIR, f"{safe_name}_eval_results_{timestamp}.json"
        )
        cmd = [
            sys.executable,
            EVAL_SCRIPT,
            "--input", input_path,
            "--output", output_path,
        ]
        print(f"  Running: {' '.join(cmd)}")
        result = subprocess.run(cmd, cwd=SCRIPT_DIR)
        if result.returncode != 0:
            print(f"  [ERROR] Evaluation failed for {model_name} (exit code {result.returncode})")
        else:
            print(f"  [OK] Results saved to: {output_path}")


if __name__ == "__main__":
    main()