readCtrl_lambda/code/fine_tune_sft_dpo/run_eval_gpt5_models.py
#!/usr/bin/env python3
"""
Split the GPT-5 inference JSONL by model (gpt-5, gpt-5-mini, gpt-5-nano),
merge each model's generated_text with the subclaims data, and run
evaluate_scores_bn.py on each model's data.

Usage:
    python run_eval_gpt5_models.py
"""
import json
import os
import subprocess
import sys
from collections import defaultdict
from datetime import datetime
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
JSONL_PATH = os.path.join(
    SCRIPT_DIR,
    "results/bn/gpt5_inference_all_wo_gs_20260314_173736.jsonl",
)
SUBCLAIMS_PATH = os.path.join(SCRIPT_DIR, "dataset/bn/test_bn_subclaims.json")
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "evaluation/bn/eval_gpt5_models")
EVAL_SCRIPT = os.path.join(SCRIPT_DIR, "evaluate_scores_bn.py")
MODELS = ["gpt-5", "gpt-5-mini", "gpt-5-nano"]


def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Load the GPT-5 inference outputs (one JSON object per line).
    with open(JSONL_PATH, "r", encoding="utf-8") as f:
        all_items = [json.loads(line) for line in f if line.strip()]

    # Load the subclaims data and index it by (doc_id, label) for lookup.
    with open(SUBCLAIMS_PATH, "r", encoding="utf-8") as f:
        subclaims_data = json.load(f)

    sc_by_key = {}
    for sc in subclaims_data:
        key = (sc["doc_id"], sc["label"])
        sc_by_key[key] = sc
    # Group inference items by model name.
    by_model = defaultdict(list)
    for item in all_items:
        by_model[item["model"]].append(item)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    for model_name in MODELS:
        items = by_model.get(model_name, [])
        if not items:
            print(f"[WARN] No items for model {model_name}, skipping.")
            continue

        safe_name = model_name.replace("-", "_")

        # Merge each item's generated_text with its subclaims entry,
        # matched on (doc_id, gold_label).
        prepared = []
        skipped_no_subclaims = 0
        for item in items:
            doc_id = item["doc_id"]
            label = item["gold_label"]
            key = (doc_id, label)
            sc = sc_by_key.get(key)
            if sc is None:
                skipped_no_subclaims += 1
                continue
            prepared.append({
                "doc_id": doc_id,
                "label": label,
                "fulltext": sc.get("fulltext", ""),
                "summary_text": sc.get("summary", ""),
                "summary_subclaims": sc.get("summary_subclaims", []),
                "fulltext_subclaims": sc.get("fulltext_subclaims", []),
                "generated_text": item.get("generated_text", ""),
            })
        # Write the merged records for this model to a timestamped JSON file.
        input_path = os.path.join(
            OUTPUT_DIR, f"{safe_name}_prepared_{timestamp}.json"
        )
        with open(input_path, "w", encoding="utf-8") as f:
            json.dump(prepared, f, indent=2, ensure_ascii=False)

        print(f"\n{'='*60}")
        print(f"Model: {model_name}")
        print(f"  Total JSONL items   : {len(items)}")
        print(f"  Matched w/ subclaims: {len(prepared)}")
        print(f"  Skipped (no subcl.) : {skipped_no_subclaims}")
        print(f"  Prepared file       : {input_path}")
        print(f"{'='*60}")
        # Run evaluate_scores_bn.py on the prepared file.
        output_path = os.path.join(
            OUTPUT_DIR, f"{safe_name}_eval_results_{timestamp}.json"
        )
        cmd = [
            sys.executable,
            EVAL_SCRIPT,
            "--input", input_path,
            "--output", output_path,
        ]
        print(f"  Running: {' '.join(cmd)}")
        result = subprocess.run(cmd, cwd=SCRIPT_DIR)
        if result.returncode != 0:
            print(f"  [ERROR] Evaluation failed for {model_name} (exit code {result.returncode})")
        else:
            print(f"  [OK] Results saved to: {output_path}")


if __name__ == "__main__":
    main()