| import argparse |
| import json |
| import os |
| from pathlib import Path |
| from typing import Any, Dict, Tuple |
|
|
| from tqdm import tqdm |
|
|
| from reward_new_v5 import ( |
| compute_score, |
| compute_completeness_reward, |
| compute_hallucination_score_vs_input, |
| _compute_classifier_reward, |
| ) |
|
|
|
|
| |
| |
| |
|
|
# Absolute path to the verified metadata file that maps (doc_id, label)
# pairs to their gold summary and full source text.
VERIFIED_COMBINED_PATH = (
    "/home/mshahidul/readctrl/code/readctrl_rl_inference/verified_combined_0-80_clean200.json"
)

# Lazily-built cache: (doc_id, label) -> {"summary": str, "fulltext": str}.
# Populated by _load_verified_index() on first lookup.
_VERIFIED_INDEX: Dict[Tuple[int, str], Dict[str, Any]] = {}
# One-shot guard so the index file is read (or attempted) at most once.
_VERIFIED_LOADED = False
|
|
|
|
def _load_verified_index() -> None:
    """Populate the module-level (doc_id, label) -> metadata cache, at most once.

    Best-effort loader: a missing or unreadable file, or a JSON parse error,
    simply leaves the index empty rather than raising.
    """
    global _VERIFIED_LOADED, _VERIFIED_INDEX
    if _VERIFIED_LOADED:
        return
    # Mark loaded up front so a failing file is not re-read on every lookup.
    _VERIFIED_LOADED = True

    if not os.path.exists(VERIFIED_COMBINED_PATH):
        return
    try:
        with open(VERIFIED_COMBINED_PATH, "r", encoding="utf-8") as fh:
            rows = json.load(fh)
    except Exception:
        return

    built: Dict[Tuple[int, str], Dict[str, Any]] = {}
    for entry in rows:
        try:
            numeric_id = int(entry.get("doc_id"))
        except Exception:
            # Rows without a usable integer doc_id cannot be keyed; skip.
            continue
        lbl = str(entry.get("label", "")).strip()
        if not lbl:
            continue
        built[(numeric_id, lbl)] = {
            "summary": entry.get("summary", ""),
            "fulltext": entry.get("fulltext", ""),
        }
    _VERIFIED_INDEX = built
|
|
|
|
| def _lookup_verified(doc_id: Any, label: str) -> Dict[str, Any]: |
| """ |
| Try to fetch (summary, fulltext) for a given (doc_id, label) pair |
| from verified_combined_0-80_clean200.json. Returns {} if not found. |
| """ |
| if doc_id is None or not label: |
| return {} |
| _load_verified_index() |
| try: |
| doc_id_int = int(doc_id) |
| except Exception: |
| return {} |
| key = (doc_id_int, label.strip()) |
| return _VERIFIED_INDEX.get(key, {}) |
|
|
|
|
def build_solution_str(prediction_text: str, target_level: str) -> str:
    """Wrap a prediction in the fenced-JSON format that compute_score parses."""
    encoded = json.dumps({target_level: prediction_text}, ensure_ascii=False)
    return "```json\n" + encoded + "\n```"
|
|
|
|
def build_ground_truth(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build the ground_truth dict consumed by compute_score from a JSONL row.

    Priority:
      1. External metadata from verified_combined_0-80_clean200.json,
         matched on (doc_id, gold_label).
      2. Fallback: recover the summary / source text from the prompt field
         by splitting on the known section markers.
    """
    summary_text = ""
    input_text = ""

    gold_label = str(example.get("gold_label", "")).strip()
    meta = _lookup_verified(example.get("doc_id"), gold_label)
    if meta:
        summary_text = str(meta.get("summary", "")).strip()
        input_text = str(meta.get("fulltext", "")).strip()

    # Fall back to the prompt only for whichever piece is still missing.
    if not (summary_text and input_text):
        prompt: str = example.get("prompt", "")
        marker_summary = "- Gold Summary (the anchor reference summary):"
        marker_source = "- Source Text (detailed content):"

        if marker_summary in prompt and marker_source in prompt:
            before_source, after_source = prompt.split(marker_source, 1)

            if not summary_text and marker_summary in before_source:
                _, summary_part = before_source.split(marker_summary, 1)
                summary_text = summary_part.strip()
            if not input_text:
                input_text = after_source.strip()

    return {
        "summary_text": summary_text,
        "input_text": input_text,
    }
|
|
|
|
def score_row(example: Dict[str, Any]) -> Tuple[float, float, float, float]:
    """
    Score a single JSONL row.

    Returns a 4-tuple (total_reward, completeness, classifier, hallucination).
    Any component that cannot be computed (missing gold label, empty or
    non-string prediction, missing summary/source text) is NaN so callers
    can skip it per-metric.
    """
    nan = float("nan")

    # Coerce defensively: a missing or null gold_label becomes "".
    gold_label = str(example.get("gold_label") or "").strip()
    if not gold_label:
        # BUG FIX: this early return previously yielded a bare float("nan"),
        # which broke the caller's 4-way tuple unpacking.
        return nan, nan, nan, nan

    # Prefer the structured JSON prediction; fall back to generated_text on
    # any parse failure (the .get() is inside the try so a non-dict payload
    # also falls through to the fallback).
    raw_prediction = example.get("prediction")
    if isinstance(raw_prediction, str) and raw_prediction.strip():
        try:
            parsed = json.loads(raw_prediction)
            prediction_text = parsed.get(gold_label, "")
        except Exception:
            prediction_text = example.get("generated_text", "")
    else:
        prediction_text = example.get("generated_text", "")

    # Guard against non-string payloads (e.g. nested JSON values) as well as
    # blank text; previously a non-string here raised AttributeError.
    if not isinstance(prediction_text, str) or not prediction_text.strip():
        return nan, nan, nan, nan

    solution_str = build_solution_str(prediction_text, gold_label)
    ground_truth = build_ground_truth(example)
    extra_info = {"target_level": gold_label}

    # Combined reward from reward_new_v5.
    total_reward = compute_score(
        data_source="jsonl_offline_eval",
        solution_str=solution_str,
        ground_truth=ground_truth,
        extra_info=extra_info,
    )

    summary_text = ground_truth.get("summary_text", "")
    input_text = ground_truth.get("input_text", "")

    # Completeness requires a reference summary.
    completeness = None
    if summary_text and summary_text.strip():
        completeness = compute_completeness_reward(
            summary_text=summary_text,
            generated_text=prediction_text,
            threshold=0.5,
            batch_size=128,
        )

    classifier = _compute_classifier_reward(gold_label, prediction_text)

    # Hallucination requires the original source text.
    hallucination = None
    if input_text and input_text.strip():
        hallucination = compute_hallucination_score_vs_input(
            input_text=input_text,
            generated_text=prediction_text,
            threshold=0.5,
            batch_size=128,
        )

    def _to_float(value: Any) -> float:
        # Map "not computed" (None) to NaN so the return shape is uniform.
        return float("nan") if value is None else float(value)

    return (
        float(total_reward),
        _to_float(completeness),
        float(classifier),
        _to_float(hallucination),
    )
|
|
|
|
def compute_avg_scores(path: str) -> Tuple[float, float, float, float]:
    """
    Stream a JSONL file and average the four per-row metrics
    (reward, completeness, classifier, hallucination).

    Each metric is averaged independently over its non-NaN values; a metric
    with no valid rows averages to NaN. Blank and unparseable lines are
    skipped silently.
    """
    sums = [0.0, 0.0, 0.0, 0.0]
    counts = [0, 0, 0, 0]

    with open(path, "r", encoding="utf-8") as fh:
        for raw_line in tqdm(fh, desc="Scoring examples"):
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                record = json.loads(raw_line)
            except Exception:
                continue

            for idx, value in enumerate(score_row(record)):
                # NaN != NaN, so this keeps only valid values per metric.
                if value == value:
                    sums[idx] += value
                    counts[idx] += 1

    def _mean(total: float, n: int) -> float:
        return total / n if n else float("nan")

    return (
        _mean(sums[0], counts[0]),
        _mean(sums[1], counts[1]),
        _mean(sums[2], counts[2]),
        _mean(sums[3], counts[3]),
    )
|
|
|
|
| def _parse_args() -> argparse.Namespace: |
| parser = argparse.ArgumentParser( |
| description=( |
| "Compute average reward over a JSONL file " |
| "containing GPT-5 inference outputs." |
| ) |
| ) |
| parser.add_argument( |
| "jsonl_path", |
| type=str, |
| help="Path to JSONL file with GPT-5 inference outputs.", |
| ) |
| return parser.parse_args() |
|
|
|
|
def _save_results(
    jsonl_path: str,
    avg_reward: float,
    avg_compl: float,
    avg_class: float,
    avg_hallu: float,
) -> None:
    """
    Persist the aggregate metrics under test_result_v5/<input-stem>.json.
    """
    results_dir = Path("/home/mshahidul/readctrl/code/readctrl_rl_inference/test_result_v5")
    results_dir.mkdir(parents=True, exist_ok=True)

    # Name the output after the input file, swapping its extension for .json.
    stem = os.path.splitext(os.path.basename(jsonl_path))[0]
    destination = results_dir / f"{stem}.json"

    record = {
        "input_jsonl": os.path.abspath(jsonl_path),
        "avg_reward": avg_reward,
        "avg_completeness": avg_compl,
        "avg_classifier": avg_class,
        "avg_hallucination": avg_hallu,
    }

    with destination.open("w", encoding="utf-8") as fh:
        json.dump(record, fh, ensure_ascii=False, indent=2)
|
|
|
|
def main() -> None:
    """Entry point: score the JSONL file, print the averages, save them."""
    args = _parse_args()
    avg_reward, avg_compl, avg_class, avg_hallu = compute_avg_scores(
        args.jsonl_path
    )

    # Print each aggregate metric on its own line.
    for name, value in (
        ("avg_reward", avg_reward),
        ("avg_completeness", avg_compl),
        ("avg_classifier", avg_class),
        ("avg_hallucination", avg_hallu),
    ):
        print(f"{name} = {value:.6f}")

    _save_results(
        jsonl_path=args.jsonl_path,
        avg_reward=avg_reward,
        avg_compl=avg_compl,
        avg_class=avg_class,
        avg_hallu=avg_hallu,
    )
|
|
|
|
# Script entry point: python <this_file>.py <predictions.jsonl>
if __name__ == "__main__":
    main()
|
|
|
|