"""Offline evaluation: average reward components over a JSONL file of
model inference outputs, using the scoring functions from reward_new_v5.

Usage:
    python this_script.py path/to/outputs.jsonl
"""

import argparse
import json
import os
from pathlib import Path
from typing import Any, Dict, Tuple

from tqdm import tqdm

from reward_new_v5 import (
    compute_score,
    compute_completeness_reward,
    compute_hallucination_score_vs_input,
    _compute_classifier_reward,
)

# ---------------------------------------------------------------------------
# Optional external metadata: verified_combined_0-80_clean200.json
# ---------------------------------------------------------------------------
VERIFIED_COMBINED_PATH = (
    "/home/mshahidul/readctrl/code/readctrl_rl_inference/verified_combined_0-80_clean200.json"
)

# Lazily-built index: (doc_id, label) -> {"summary": ..., "fulltext": ...}.
_VERIFIED_INDEX: Dict[Tuple[int, str], Dict[str, Any]] = {}
_VERIFIED_LOADED = False


def _load_verified_index() -> None:
    """Lazily populate ``_VERIFIED_INDEX`` from ``VERIFIED_COMBINED_PATH``.

    Idempotent: the file is read at most once per process. A missing or
    unparsable file leaves the index empty — callers then fall back to
    prompt parsing — so all failures here are swallowed deliberately.
    """
    global _VERIFIED_LOADED, _VERIFIED_INDEX
    if _VERIFIED_LOADED:
        return
    _VERIFIED_LOADED = True
    if not os.path.exists(VERIFIED_COMBINED_PATH):
        return
    try:
        with open(VERIFIED_COMBINED_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception:
        # Best-effort metadata source: ignore read/parse errors entirely.
        return
    index: Dict[Tuple[int, str], Dict[str, Any]] = {}
    for row in data:
        try:
            doc_id = int(row.get("doc_id"))
        except Exception:
            # Rows without a usable integer doc_id cannot be indexed.
            continue
        label = str(row.get("label", "")).strip()
        if not label:
            continue
        index[(doc_id, label)] = {
            "summary": row.get("summary", ""),
            "fulltext": row.get("fulltext", ""),
        }
    _VERIFIED_INDEX = index


def _lookup_verified(doc_id: Any, label: str) -> Dict[str, Any]:
    """
    Try to fetch (summary, fulltext) for a given (doc_id, label) pair
    from verified_combined_0-80_clean200.json. Returns {} if not found.
    """
    if doc_id is None or not label:
        return {}
    _load_verified_index()
    try:
        doc_id_int = int(doc_id)
    except Exception:
        return {}
    return _VERIFIED_INDEX.get((doc_id_int, label.strip()), {})


def build_solution_str(prediction_text: str, target_level: str) -> str:
    """Wrap a prediction in the fenced-JSON format expected by compute_score."""
    payload = {target_level: prediction_text}
    return f"```json\n{json.dumps(payload, ensure_ascii=False)}\n```"


def build_ground_truth(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build ground_truth dict for compute_score from a JSONL row.

    Priority:
      1. Use external metadata from verified_combined_0-80_clean200.json
         (matched by doc_id + label).
      2. Fallback: parse summary / source text from the prompt field.
    """
    summary_text = ""
    input_text = ""

    # 1) Try to get from verified_combined_0-80_clean200.json
    doc_id = example.get("doc_id")
    gold_label = str(example.get("gold_label", "")).strip()
    meta = _lookup_verified(doc_id, gold_label)
    if meta:
        summary_text = str(meta.get("summary", "")).strip()
        input_text = str(meta.get("fulltext", "")).strip()

    # 2) Fallback: parse from prompt if needed
    if not summary_text or not input_text:
        prompt: str = example.get("prompt", "")
        # Very lightweight parsing based on the known template in the prompt.
        marker_summary = "- Gold Summary (the anchor reference summary):"
        marker_source = "- Source Text (detailed content):"
        if marker_summary in prompt and marker_source in prompt:
            # Single split (the original split the same string twice).
            before_source, after_source = prompt.split(marker_source, 1)
            if not summary_text and marker_summary in before_source:
                summary_text = before_source.split(marker_summary, 1)[1].strip()
            if not input_text:
                input_text = after_source.strip()

    return {
        "summary_text": summary_text,
        "input_text": input_text,
    }


def score_row(example: Dict[str, Any]) -> Tuple[float, float, float, float]:
    """Score one JSONL row.

    Returns a 4-tuple ``(total_reward, completeness, classifier,
    hallucination)``. Any component that cannot be computed (missing
    label/prediction, no summary or source text) is NaN so that
    ``compute_avg_scores`` can exclude it from the average.
    """
    nan = float("nan")

    # BUG FIX: the original returned a bare float here, which crashed the
    # tuple unpacking at the call site (`reward, compl, clf, hallu = ...`).
    # Also tolerate a non-string gold_label (e.g. explicit JSON null),
    # which previously raised AttributeError on .strip().
    gold_label = example.get("gold_label", "")
    gold_label = gold_label.strip() if isinstance(gold_label, str) else ""
    if not gold_label:
        return nan, nan, nan, nan

    # Prefer explicit JSON in "prediction" if present; otherwise use
    # "generated_text".
    raw_prediction = example.get("prediction")
    if isinstance(raw_prediction, str) and raw_prediction.strip():
        try:
            parsed = json.loads(raw_prediction)
            prediction_text = parsed.get(gold_label, "")
        except Exception:
            prediction_text = example.get("generated_text", "")
    else:
        prediction_text = example.get("generated_text", "")

    if not prediction_text or not prediction_text.strip():
        return nan, nan, nan, nan

    # Build common pieces
    solution_str = build_solution_str(prediction_text, gold_label)
    ground_truth = build_ground_truth(example)
    extra_info = {"target_level": gold_label}

    # Overall reward (for reference)
    total_reward = compute_score(
        data_source="jsonl_offline_eval",
        solution_str=solution_str,
        ground_truth=ground_truth,
        extra_info=extra_info,
    )

    summary_text = ground_truth.get("summary_text", "")
    input_text = ground_truth.get("input_text", "")

    # Component scores
    completeness = None
    if summary_text and summary_text.strip():
        completeness = compute_completeness_reward(
            summary_text=summary_text,
            generated_text=prediction_text,
            threshold=0.5,
            batch_size=128,
        )

    classifier = _compute_classifier_reward(gold_label, prediction_text)

    hallucination = None
    if input_text and input_text.strip():
        hallucination = compute_hallucination_score_vs_input(
            input_text=input_text,
            generated_text=prediction_text,
            threshold=0.5,
            batch_size=128,
        )

    # Normalise None → NaN for easy averaging
    def _to_float(x: Any) -> float:
        return nan if x is None else float(x)

    return (
        float(total_reward),
        _to_float(completeness),
        float(classifier),
        _to_float(hallucination),
    )


def compute_avg_scores(path: str) -> Tuple[float, float, float, float]:
    """Average each score component over all rows of a JSONL file.

    NaN components are excluded from their respective averages; a
    component with zero valid rows averages to NaN (never a
    ZeroDivisionError). Malformed or blank lines are skipped.
    """
    total_reward = 0.0
    total_compl = 0.0
    total_class = 0.0
    total_hallu = 0.0
    n_reward = 0
    n_compl = 0
    n_class = 0
    n_hallu = 0

    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Scoring examples"):
            line = line.strip()
            if not line:
                continue
            try:
                example = json.loads(line)
            except Exception:
                # Skip lines that are not valid JSON.
                continue
            reward, compl, clf, hallu = score_row(example)
            # Reward
            if reward == reward:  # not NaN
                total_reward += reward
                n_reward += 1
            # Completeness
            if compl == compl:
                total_compl += compl
                n_compl += 1
            # Classifier
            if clf == clf:
                total_class += clf
                n_class += 1
            # Hallucination
            if hallu == hallu:
                total_hallu += hallu
                n_hallu += 1

    def _avg(total: float, n: int) -> float:
        if n == 0:
            return float("nan")
        return total / n

    return (
        _avg(total_reward, n_reward),
        _avg(total_compl, n_compl),
        _avg(total_class, n_class),
        _avg(total_hallu, n_hallu),
    )


def _parse_args() -> argparse.Namespace:
    """Parse the single positional argument: path to the JSONL file."""
    parser = argparse.ArgumentParser(
        description=(
            "Compute average reward over a JSONL file "
            "containing GPT-5 inference outputs."
        )
    )
    parser.add_argument(
        "jsonl_path",
        type=str,
        help="Path to JSONL file with GPT-5 inference outputs.",
    )
    return parser.parse_args()


def _save_results(
    jsonl_path: str,
    avg_reward: float,
    avg_compl: float,
    avg_class: float,
    avg_hallu: float,
) -> None:
    """
    Save aggregate metrics to test_result_v5 as a JSON file.
    """
    output_dir = Path("/home/mshahidul/readctrl/code/readctrl_rl_inference/test_result_v5")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save using the input filename stem so the stats file
    # clearly corresponds to the original JSONL.
    stem = os.path.splitext(os.path.basename(jsonl_path))[0]
    out_path = output_dir / f"{stem}.json"

    payload = {
        "input_jsonl": os.path.abspath(jsonl_path),
        "avg_reward": avg_reward,
        "avg_completeness": avg_compl,
        "avg_classifier": avg_class,
        "avg_hallucination": avg_hallu,
    }
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


def main() -> None:
    """CLI entry point: score the file, print averages, persist them."""
    args = _parse_args()
    avg_reward, avg_compl, avg_class, avg_hallu = compute_avg_scores(args.jsonl_path)

    # Plain-text, easy-to-parse output
    print(f"avg_reward = {avg_reward:.6f}")
    print(f"avg_completeness = {avg_compl:.6f}")
    print(f"avg_classifier = {avg_class:.6f}")
    print(f"avg_hallucination = {avg_hallu:.6f}")

    # Save to JSON in test_result_v5 for later analysis.
    _save_results(
        jsonl_path=args.jsonl_path,
        avg_reward=avg_reward,
        avg_compl=avg_compl,
        avg_class=avg_class,
        avg_hallu=avg_hallu,
    )


if __name__ == "__main__":
    main()