File size: 9,360 Bytes

030876e

import argparse
import json
import os
from pathlib import Path
from typing import Any, Dict, Tuple

from tqdm import tqdm

from reward_new_v5 import (
    compute_score,
    compute_completeness_reward,
    compute_hallucination_score_vs_input,
    _compute_classifier_reward,
)


# ---------------------------------------------------------------------------
# Optional external metadata: verified_combined_0-80_clean200.json
# ---------------------------------------------------------------------------

VERIFIED_COMBINED_PATH = (
    "/home/mshahidul/readctrl/code/readctrl_rl_inference/verified_combined_0-80_clean200.json"
)

_VERIFIED_INDEX: Dict[Tuple[int, str], Dict[str, Any]] = {}
_VERIFIED_LOADED = False


def _load_verified_index() -> None:
    global _VERIFIED_LOADED, _VERIFIED_INDEX
    if _VERIFIED_LOADED:
        return
    _VERIFIED_LOADED = True
    if not os.path.exists(VERIFIED_COMBINED_PATH):
        return
    try:
        with open(VERIFIED_COMBINED_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception:
        return

    index: Dict[Tuple[int, str], Dict[str, Any]] = {}
    for row in data:
        try:
            doc_id = int(row.get("doc_id"))
        except Exception:
            continue
        label = str(row.get("label", "")).strip()
        if not label:
            continue
        key = (doc_id, label)
        index[key] = {
            "summary": row.get("summary", ""),
            "fulltext": row.get("fulltext", ""),
        }
    _VERIFIED_INDEX = index


def _lookup_verified(doc_id: Any, label: str) -> Dict[str, Any]:
    """
    Try to fetch (summary, fulltext) for a given (doc_id, label) pair
    from verified_combined_0-80_clean200.json. Returns {} if not found.
    """
    if doc_id is None or not label:
        return {}
    _load_verified_index()
    try:
        doc_id_int = int(doc_id)
    except Exception:
        return {}
    key = (doc_id_int, label.strip())
    return _VERIFIED_INDEX.get(key, {})


def build_solution_str(prediction_text: str, target_level: str) -> str:
    payload = {target_level: prediction_text}
    return f"```json\n{json.dumps(payload, ensure_ascii=False)}\n```"


def build_ground_truth(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build ground_truth dict for compute_score from a JSONL row.

    Priority:
    1. Use external metadata from verified_combined_0-80_clean200.json
       (matched by doc_id + label).
    2. Fallback: parse summary / source text from the prompt field.
    """
    summary_text = ""
    input_text = ""

    # 1) Try to get from verified_combined_0-80_clean200.json
    doc_id = example.get("doc_id")
    gold_label = str(example.get("gold_label", "")).strip()
    meta = _lookup_verified(doc_id, gold_label)
    if meta:
        summary_text = str(meta.get("summary", "")).strip()
        input_text = str(meta.get("fulltext", "")).strip()

    # 2) Fallback: parse from prompt if needed
    if not summary_text or not input_text:
        prompt: str = example.get("prompt", "")

        # Very lightweight parsing based on the known template in the prompt.
        marker_summary = "- Gold Summary (the anchor reference summary):"
        marker_source = "- Source Text (detailed content):"

        if marker_summary in prompt and marker_source in prompt:
            before_source = prompt.split(marker_source, 1)[0]
            after_source = prompt.split(marker_source, 1)[1]

            if not summary_text and marker_summary in before_source:
                summary_text = before_source.split(marker_summary, 1)[1].strip()
            if not input_text:
                input_text = after_source.strip()

    return {
        "summary_text": summary_text,
        "input_text": input_text,
    }


def score_row(example: Dict[str, Any]) -> Tuple[float, float, float, float]:
    gold_label = example.get("gold_label", "").strip()
    if not gold_label:
        return float("nan")

    # Prefer explicit JSON in "prediction" if present; otherwise use "generated_text".
    raw_prediction = example.get("prediction")
    if isinstance(raw_prediction, str) and raw_prediction.strip():
        try:
            parsed = json.loads(raw_prediction)
            prediction_text = parsed.get(gold_label, "")
        except Exception:
            prediction_text = example.get("generated_text", "")
    else:
        prediction_text = example.get("generated_text", "")

    if not prediction_text or not prediction_text.strip():
        nan = float("nan")
        return nan, nan, nan, nan

    # Build common pieces
    solution_str = build_solution_str(prediction_text, gold_label)
    ground_truth = build_ground_truth(example)
    extra_info = {"target_level": gold_label}

    # Overall reward (for reference)
    total_reward = compute_score(
        data_source="jsonl_offline_eval",
        solution_str=solution_str,
        ground_truth=ground_truth,
        extra_info=extra_info,
    )

    summary_text = ground_truth.get("summary_text", "")
    input_text = ground_truth.get("input_text", "")

    # Component scores
    completeness = None
    if summary_text and summary_text.strip():
        completeness = compute_completeness_reward(
            summary_text=summary_text,
            generated_text=prediction_text,
            threshold=0.5,
            batch_size=128,
        )

    classifier = _compute_classifier_reward(gold_label, prediction_text)

    hallucination = None
    if input_text and input_text.strip():
        hallucination = compute_hallucination_score_vs_input(
            input_text=input_text,
            generated_text=prediction_text,
            threshold=0.5,
            batch_size=128,
        )

    # Normalise None → NaN for easy averaging
    def _to_float(x):
        return float("nan") if x is None else float(x)

    return (
        float(total_reward),
        _to_float(completeness),
        float(classifier),
        _to_float(hallucination),
    )


def compute_avg_scores(path: str) -> Tuple[float, float, float, float]:
    total_reward = 0.0
    total_compl = 0.0
    total_class = 0.0
    total_hallu = 0.0

    n_reward = 0
    n_compl = 0
    n_class = 0
    n_hallu = 0

    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Scoring examples"):
            line = line.strip()
            if not line:
                continue
            try:
                example = json.loads(line)
            except Exception:
                continue

            reward, compl, clf, hallu = score_row(example)

            # Reward
            if reward == reward:  # not NaN
                total_reward += reward
                n_reward += 1

            # Completeness
            if compl == compl:
                total_compl += compl
                n_compl += 1

            # Classifier
            if clf == clf:
                total_class += clf
                n_class += 1

            # Hallucination
            if hallu == hallu:
                total_hallu += hallu
                n_hallu += 1

    def _avg(total: float, n: int) -> float:
        if n == 0:
            return float("nan")
        return total / n

    return (
        _avg(total_reward, n_reward),
        _avg(total_compl, n_compl),
        _avg(total_class, n_class),
        _avg(total_hallu, n_hallu),
    )


def _parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description=(
            "Compute average reward over a JSONL file "
            "containing GPT-5 inference outputs."
        )
    )
    parser.add_argument(
        "jsonl_path",
        type=str,
        help="Path to JSONL file with GPT-5 inference outputs.",
    )
    return parser.parse_args()


def _save_results(
    jsonl_path: str,
    avg_reward: float,
    avg_compl: float,
    avg_class: float,
    avg_hallu: float,
) -> None:
    """
    Save aggregate metrics to test_result_v5 as a JSON file.
    """
    output_dir = Path("/home/mshahidul/readctrl/code/readctrl_rl_inference/test_result_v5")
    output_dir.mkdir(parents=True, exist_ok=True)

    basename = os.path.basename(jsonl_path)
    stem = os.path.splitext(basename)[0]
    # Save using the input filename stem so the stats file
    # clearly corresponds to the original JSONL.
    out_path = output_dir / f"{stem}.json"

    payload = {
        "input_jsonl": os.path.abspath(jsonl_path),
        "avg_reward": avg_reward,
        "avg_completeness": avg_compl,
        "avg_classifier": avg_class,
        "avg_hallucination": avg_hallu,
    }

    with out_path.open("w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


def main() -> None:
    args = _parse_args()
    avg_reward, avg_compl, avg_class, avg_hallu = compute_avg_scores(args.jsonl_path)

    # Plain-text, easy-to-parse output
    print(f"avg_reward         = {avg_reward:.6f}")
    print(f"avg_completeness   = {avg_compl:.6f}")
    print(f"avg_classifier     = {avg_class:.6f}")
    print(f"avg_hallucination  = {avg_hallu:.6f}")

    # Save to JSON in test_result_v5 for later analysis.
    _save_results(
        jsonl_path=args.jsonl_path,
        avg_reward=avg_reward,
        avg_compl=avg_compl,
        avg_class=avg_class,
        avg_hallu=avg_hallu,
    )


if __name__ == "__main__":
    main()