| import argparse |
| import json |
| import os |
| from pathlib import Path |
| from typing import Any, Dict, Tuple |
|
|
| from tqdm import tqdm |
|
|
| from reward_new_v5 import ( |
| compute_score, |
| compute_completeness_reward, |
| compute_hallucination_score_vs_input, |
| _compute_classifier_reward, |
| ) |
|
|
|
|
| |
| |
| |
|
|
# Absolute path to the verified metadata file that maps (doc_id, label)
# pairs to their gold summary and full source text.
VERIFIED_COMBINED_PATH = (
    "/home/mshahidul/readctrl/code/readctrl_rl_inference/verified_combined_0-80_clean200.json"
)

# Lazily-built cache: (doc_id, label) -> {"summary": str, "fulltext": str}.
# Populated by _load_verified_index() on first lookup.
_VERIFIED_INDEX: Dict[Tuple[int, str], Dict[str, Any]] = {}
# One-shot guard so the index file is read (or attempted) at most once.
_VERIFIED_LOADED = False
|
|
|
|
def _load_verified_index() -> None:
    """Populate the module-level (doc_id, label) -> metadata cache, at most once.

    Best-effort loader: a missing or unreadable file, or a JSON parse error,
    simply leaves the index empty rather than raising.
    """
    global _VERIFIED_LOADED, _VERIFIED_INDEX
    if _VERIFIED_LOADED:
        return
    # Mark loaded up front so a failing file is not re-read on every lookup.
    _VERIFIED_LOADED = True

    if not os.path.exists(VERIFIED_COMBINED_PATH):
        return
    try:
        with open(VERIFIED_COMBINED_PATH, "r", encoding="utf-8") as fh:
            rows = json.load(fh)
    except Exception:
        return

    built: Dict[Tuple[int, str], Dict[str, Any]] = {}
    for entry in rows:
        try:
            numeric_id = int(entry.get("doc_id"))
        except Exception:
            # Rows without a usable integer doc_id cannot be keyed; skip.
            continue
        lbl = str(entry.get("label", "")).strip()
        if not lbl:
            continue
        built[(numeric_id, lbl)] = {
            "summary": entry.get("summary", ""),
            "fulltext": entry.get("fulltext", ""),
        }
    _VERIFIED_INDEX = built
|
|
|
|
| def _lookup_verified(doc_id: Any, label: str) -> Dict[str, Any]: |
| """ |
| Try to fetch (summary, fulltext) for a given (doc_id, label) pair |
| from verified_combined_0-80_clean200.json. Returns {} if not found. |
| """ |
| if doc_id is None or not label: |
| return {} |
| _load_verified_index() |
| try: |
| doc_id_int = int(doc_id) |
| except Exception: |
| return {} |
| key = (doc_id_int, label.strip()) |
| return _VERIFIED_INDEX.get(key, {}) |
|
|
|
|
def build_solution_str(prediction_text: str, target_level: str) -> str:
    """Wrap a prediction in the fenced-JSON format that compute_score parses."""
    encoded = json.dumps({target_level: prediction_text}, ensure_ascii=False)
    return "```json\n" + encoded + "\n```"
|
|
|
|
def build_ground_truth(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build the ground_truth dict consumed by compute_score from a JSONL row.

    Priority:
      1. External metadata from verified_combined_0-80_clean200.json,
         matched on (doc_id, gold_label).
      2. Fallback: recover the summary / source text from the prompt field
         by splitting on the known section markers.
    """
    summary_text = ""
    input_text = ""

    gold_label = str(example.get("gold_label", "")).strip()
    meta = _lookup_verified(example.get("doc_id"), gold_label)
    if meta:
        summary_text = str(meta.get("summary", "")).strip()
        input_text = str(meta.get("fulltext", "")).strip()

    # Fall back to the prompt only for whichever piece is still missing.
    if not (summary_text and input_text):
        prompt: str = example.get("prompt", "")
        marker_summary = "- Gold Summary (the anchor reference summary):"
        marker_source = "- Source Text (detailed content):"

        if marker_summary in prompt and marker_source in prompt:
            before_source, after_source = prompt.split(marker_source, 1)

            if not summary_text and marker_summary in before_source:
                _, summary_part = before_source.split(marker_summary, 1)
                summary_text = summary_part.strip()
            if not input_text:
                input_text = after_source.strip()

    return {
        "summary_text": summary_text,
        "input_text": input_text,
    }
|
|
|
|
def score_row(example: Dict[str, Any]) -> Tuple[float, float, float, float]:
    """
    Score a single JSONL row.

    Returns a 4-tuple (total_reward, completeness, classifier, hallucination).
    Any component that cannot be computed (missing gold label, empty or
    non-string prediction, missing summary/source text) is NaN so callers
    can skip it per-metric.
    """
    nan = float("nan")

    # Coerce defensively: a missing or null gold_label becomes "".
    gold_label = str(example.get("gold_label") or "").strip()
    if not gold_label:
        # BUG FIX: this early return previously yielded a bare float("nan"),
        # which broke the caller's 4-way tuple unpacking.
        return nan, nan, nan, nan

    # Prefer the structured JSON prediction; fall back to generated_text on
    # any parse failure (the .get() is inside the try so a non-dict payload
    # also falls through to the fallback).
    raw_prediction = example.get("prediction")
    if isinstance(raw_prediction, str) and raw_prediction.strip():
        try:
            parsed = json.loads(raw_prediction)
            prediction_text = parsed.get(gold_label, "")
        except Exception:
            prediction_text = example.get("generated_text", "")
    else:
        prediction_text = example.get("generated_text", "")

    # Guard against non-string payloads (e.g. nested JSON values) as well as
    # blank text; previously a non-string here raised AttributeError.
    if not isinstance(prediction_text, str) or not prediction_text.strip():
        return nan, nan, nan, nan

    solution_str = build_solution_str(prediction_text, gold_label)
    ground_truth = build_ground_truth(example)
    extra_info = {"target_level": gold_label}

    # Combined reward from reward_new_v5.
    total_reward = compute_score(
        data_source="jsonl_offline_eval",
        solution_str=solution_str,
        ground_truth=ground_truth,
        extra_info=extra_info,
    )

    summary_text = ground_truth.get("summary_text", "")
    input_text = ground_truth.get("input_text", "")

    # Completeness requires a reference summary.
    completeness = None
    if summary_text and summary_text.strip():
        completeness = compute_completeness_reward(
            summary_text=summary_text,
            generated_text=prediction_text,
            threshold=0.5,
            batch_size=128,
        )

    classifier = _compute_classifier_reward(gold_label, prediction_text)

    # Hallucination requires the original source text.
    hallucination = None
    if input_text and input_text.strip():
        hallucination = compute_hallucination_score_vs_input(
            input_text=input_text,
            generated_text=prediction_text,
            threshold=0.5,
            batch_size=128,
        )

    def _to_float(value: Any) -> float:
        # Map "not computed" (None) to NaN so the return shape is uniform.
        return float("nan") if value is None else float(value)

    return (
        float(total_reward),
        _to_float(completeness),
        float(classifier),
        _to_float(hallucination),
    )
|
|
|
|
def compute_avg_scores(path: str) -> Tuple[float, float, float, float]:
    """
    Stream a JSONL file and average the four per-row metrics
    (reward, completeness, classifier, hallucination).

    Each metric is averaged independently over its non-NaN values; a metric
    with no valid rows averages to NaN. Blank and unparseable lines are
    skipped silently.
    """
    sums = [0.0, 0.0, 0.0, 0.0]
    counts = [0, 0, 0, 0]

    with open(path, "r", encoding="utf-8") as fh:
        for raw_line in tqdm(fh, desc="Scoring examples"):
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                record = json.loads(raw_line)
            except Exception:
                continue

            for idx, value in enumerate(score_row(record)):
                # NaN != NaN, so this keeps only valid values per metric.
                if value == value:
                    sums[idx] += value
                    counts[idx] += 1

    def _mean(total: float, n: int) -> float:
        return total / n if n else float("nan")

    return (
        _mean(sums[0], counts[0]),
        _mean(sums[1], counts[1]),
        _mean(sums[2], counts[2]),
        _mean(sums[3], counts[3]),
    )
|
|
|
|
| def _parse_args() -> argparse.Namespace: |
| parser = argparse.ArgumentParser( |
| description=( |
| "Compute average reward over a JSONL file " |
| "containing GPT-5 inference outputs." |
| ) |
| ) |
| parser.add_argument( |
| "jsonl_path", |
| type=str, |
| help="Path to JSONL file with GPT-5 inference outputs.", |
| ) |
| return parser.parse_args() |
|
|
|
|
def _save_results(
    jsonl_path: str,
    avg_reward: float,
    avg_compl: float,
    avg_class: float,
    avg_hallu: float,
) -> None:
    """
    Persist the aggregate metrics under test_result_v5/<input-stem>.json.
    """
    results_dir = Path("/home/mshahidul/readctrl/code/readctrl_rl_inference/test_result_v5")
    results_dir.mkdir(parents=True, exist_ok=True)

    # Name the output after the input file, swapping its extension for .json.
    stem = os.path.splitext(os.path.basename(jsonl_path))[0]
    destination = results_dir / f"{stem}.json"

    record = {
        "input_jsonl": os.path.abspath(jsonl_path),
        "avg_reward": avg_reward,
        "avg_completeness": avg_compl,
        "avg_classifier": avg_class,
        "avg_hallucination": avg_hallu,
    }

    with destination.open("w", encoding="utf-8") as fh:
        json.dump(record, fh, ensure_ascii=False, indent=2)
|
|
|
|
def main() -> None:
    """Entry point: score the JSONL file, print the averages, save them."""
    args = _parse_args()
    avg_reward, avg_compl, avg_class, avg_hallu = compute_avg_scores(
        args.jsonl_path
    )

    # Print each aggregate metric on its own line.
    for name, value in (
        ("avg_reward", avg_reward),
        ("avg_completeness", avg_compl),
        ("avg_classifier", avg_class),
        ("avg_hallucination", avg_hallu),
    ):
        print(f"{name} = {value:.6f}")

    _save_results(
        jsonl_path=args.jsonl_path,
        avg_reward=avg_reward,
        avg_compl=avg_compl,
        avg_class=avg_class,
        avg_hallu=avg_hallu,
    )
|
|
|
|
# Script entry point: python <this_file>.py <predictions.jsonl>
if __name__ == "__main__":
    main()
|
|
|
|