"""
Split the GPT-5 inference JSONL by model (gpt-5, gpt-5-mini, gpt-5-nano),
merge each model's generated_text with the subclaims data, and run
evaluate_scores_bn.py on each model's data.

Usage:
    python run_eval_gpt5_models.py
"""

import json
import os
import subprocess
import sys
from collections import defaultdict
from datetime import datetime

# Directory containing this script; all paths below are resolved against it
# so the script works regardless of the caller's CWD.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Combined inference output (one JSON object per line, all models mixed).
JSONL_PATH = os.path.join(
    SCRIPT_DIR,
    "results/bn/gpt5_inference_all_wo_gs_20260314_173736.jsonl",
)
# Gold subclaims data keyed by (doc_id, label) — see main().
SUBCLAIMS_PATH = os.path.join(SCRIPT_DIR, "dataset/bn/test_bn_subclaims.json")
# Per-model prepared inputs and evaluation results are written here.
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "evaluation/bn/eval_gpt5_models")
EVAL_SCRIPT = os.path.join(SCRIPT_DIR, "evaluate_scores_bn.py")

# Models to evaluate; items for any other model name in the JSONL are ignored.
MODELS = ["gpt-5", "gpt-5-mini", "gpt-5-nano"]
|
def _prepare_items(items, sc_by_key):
    """Merge inference items with their subclaims record.

    Args:
        items: inference dicts for one model (must have "doc_id",
            "gold_label", and optionally "generated_text").
        sc_by_key: subclaims records indexed by (doc_id, label).

    Returns:
        (prepared, skipped): ``prepared`` is the list of merged dicts in the
        shape expected by evaluate_scores_bn.py; ``skipped`` counts items
        whose (doc_id, gold_label) key had no subclaims entry.
    """
    prepared = []
    skipped = 0
    for item in items:
        doc_id = item["doc_id"]
        label = item["gold_label"]
        sc = sc_by_key.get((doc_id, label))
        if sc is None:
            # No gold subclaims for this item — nothing to evaluate against.
            skipped += 1
            continue
        prepared.append({
            "doc_id": doc_id,
            "label": label,
            "fulltext": sc.get("fulltext", ""),
            "summary_text": sc.get("summary", ""),
            "summary_subclaims": sc.get("summary_subclaims", []),
            "fulltext_subclaims": sc.get("fulltext_subclaims", []),
            "generated_text": item.get("generated_text", ""),
        })
    return prepared, skipped


def main():
    """Split the inference JSONL by model, merge each model's items with the
    subclaims data, write a prepared JSON file per model, and run the
    evaluation script on each one via a subprocess."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # One JSON object per line; skip blank lines defensively.
    with open(JSONL_PATH, "r", encoding="utf-8") as f:
        all_items = [json.loads(line) for line in f if line.strip()]

    with open(SUBCLAIMS_PATH, "r", encoding="utf-8") as f:
        subclaims_data = json.load(f)

    # Index subclaims by (doc_id, label) for O(1) lookup during the join.
    # NOTE(review): assumes (doc_id, label) is unique in the subclaims file;
    # later duplicates would silently win.
    sc_by_key = {(sc["doc_id"], sc["label"]): sc for sc in subclaims_data}

    # Bucket inference items by the model that produced them.
    by_model = defaultdict(list)
    for item in all_items:
        by_model[item["model"]].append(item)

    # Single timestamp shared by all output files of this run.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    for model_name in MODELS:
        items = by_model.get(model_name, [])
        if not items:
            print(f"[WARN] No items for model {model_name}, skipping.")
            continue

        # Filename-safe variant of the model name (dashes -> underscores).
        safe_name = model_name.replace("-", "_")

        prepared, skipped_no_subclaims = _prepare_items(items, sc_by_key)

        input_path = os.path.join(
            OUTPUT_DIR, f"{safe_name}_prepared_{timestamp}.json"
        )
        with open(input_path, "w", encoding="utf-8") as f:
            json.dump(prepared, f, indent=2, ensure_ascii=False)

        print(f"\n{'='*60}")
        print(f"Model: {model_name}")
        print(f" Total JSONL items : {len(items)}")
        print(f" Matched w/ subclaims: {len(prepared)}")
        print(f" Skipped (no subcl.) : {skipped_no_subclaims}")
        print(f" Prepared file : {input_path}")
        print(f"{'='*60}")

        output_path = os.path.join(
            OUTPUT_DIR, f"{safe_name}_eval_results_{timestamp}.json"
        )

        # Run the evaluator with the same interpreter; list form avoids the
        # shell and quoting issues.
        cmd = [
            sys.executable,
            EVAL_SCRIPT,
            "--input", input_path,
            "--output", output_path,
        ]

        print(f" Running: {' '.join(cmd)}")
        result = subprocess.run(cmd, cwd=SCRIPT_DIR)

        # A failed evaluation for one model does not abort the others.
        if result.returncode != 0:
            print(f" [ERROR] Evaluation failed for {model_name} (exit code {result.returncode})")
        else:
            print(f" [OK] Results saved to: {output_path}")
|
|
|
|
if __name__ == "__main__":
    main()