#!/usr/bin/env python3 """Build a Google-Docs-ready comparison of 5 models' input/output from JSONL files.""" import json import os BASE = os.path.dirname(os.path.abspath(__file__)) def load_by_row_indices(path, indices_wanted): out = {} with open(path) as f: for line in f: row = json.loads(line) ri = row.get("row_index") if ri in indices_wanted and ri not in out: out[ri] = row if len(out) >= len(indices_wanted): break return out def get_input(row): return (row.get("input_text") or row.get("prompt") or row.get("summary_text") or "").strip() def get_output(row): return (row.get("generated_text") or row.get("prediction") or "").strip() def main(): indices = [0, 2, 3] vllm = load_by_row_indices(os.path.join(BASE, "vllm_model_result/vllm_inference_320_en_only_srcCov_v5.jsonl"), indices) gpt5 = load_by_row_indices(os.path.join(BASE, "gpt5mini-nano_inference/gpt5_inference_gpt-5_20260302_201653.jsonl"), indices) gpt5mini = load_by_row_indices(os.path.join(BASE, "gpt5mini-nano_inference/gpt5_inference_gpt-5-mini_20260213_025254_cleaned_by_verified_combined_0-80_clean200.jsonl"), indices) gpt5nano = load_by_row_indices(os.path.join(BASE, "gpt5mini-nano_inference/gpt5_inference_gpt-5-nano_20260213_025254_cleaned_by_verified_combined_0-80_clean200.jsonl"), indices) qwen4b = load_by_row_indices(os.path.join(BASE, "vllm_model_result/qwen3-4b-instruct-base-result.jsonl"), indices) models = [ ("vllm_inference_320 (trained RL)", vllm), ("gpt-5", gpt5), ("gpt-5-mini", gpt5mini), ("gpt-5-nano", gpt5nano), ("qwen3-4B-instruct (base, no RL)", qwen4b), ] # Build HTML for Google Docs (paste into doc) html_lines = [ "
Models: (1) vllm_inference_320 — your trained RL model; (2) gpt-5; (3) gpt-5-mini; (4) gpt-5-nano; (5) qwen3-4B-instruct — base without RL.
", "Task: simplified medical/summary text (low health literacy style).
", "Note: Example 3 — GPT-5-mini and GPT-5-nano were run on a subset; their row_index 3 may refer to a different case than the other three models.
", "Input (source text):
") html_lines.append(f"{inp.replace(chr(10), '
')}
Outputs by model:
") for label, data in models: if ri not in data: html_lines.append(f"{label}: — (no row for this index)
") continue out = get_output(data[ri]) html_lines.append(f"{label}
") html_lines.append(f"{out.replace(chr(10), '
')}