"""Insert the base-vs-tuned comparison cells into the GRPO notebook.

Run this on the workbench to insert the comparison cell into the notebook.

Usage:
    python scripts/insert_comparison_cell.py
"""
import json
from pathlib import Path
from typing import Optional

# Adjust if your notebook is elsewhere.
NOTEBOOK_PATH = Path("/home/jupyter/tucano2/notebooks/v4_2_instruct_grpo.ipynb")

# Preferred location of the cell script: <repo>/notebooks/, resolved relative
# to this file (which is expected to live one directory below the repo root).
CELL_SCRIPT_PATH = (
    Path(__file__).parent.parent / "notebooks" / "cell_comparison_base_vs_tuned.py"
)
# Fall back to the absolute workbench path when the relative one does not
# exist (e.g. when this script is not located where the layout above expects).
if not CELL_SCRIPT_PATH.exists():
    CELL_SCRIPT_PATH = Path(
        "/home/jupyter/tucano2/notebooks/cell_comparison_base_vs_tuned.py"
    )


def _build_markdown_cell() -> dict:
    """Return the markdown cell that introduces the comparison code cell."""
    return {
        "cell_type": "markdown",
        "metadata": {},
        "source": [
            "---\n",
            "\n",
            "## Cell 15: Base vs Tuned Comparison (Final Evaluation)\n",
            "\n",
            "**Purpose:** Definitive A/B test — same 65 eval prompts, same generation config, \n",
            "comparing the raw base model against the GRPO-tuned best checkpoint (step 1100).\n",
            "\n",
            "**Prerequisites:** Cells 1-5 + Cell 7 + Cell 10 run. Best checkpoint at:\n",
            "`models/tucano2-0.5B-instruct-grpo-v4.2-seed42/best_checkpoint/`\n",
            "\n",
            "**Output:** Per-task reward comparison table with Wilcoxon significance test + sample outputs.\n",
            "\n",
            "**Gate:** This is the final cell. No gate — it produces the experiment's conclusion.",
        ],
    }


def _build_code_cell(cell_code: str) -> dict:
    """Return an unexecuted code cell whose source is *cell_code*.

    The source is stored as a list of lines that keep their own line endings,
    so joining the list reproduces *cell_code* exactly. Unlike splitting on
    ``"\\n"``, this does not leave a spurious empty trailing entry when the
    script ends with a newline.
    """
    return {
        "cell_type": "code",
        "execution_count": None,
        "metadata": {},
        "outputs": [],
        "source": cell_code.splitlines(keepends=True),
    }


def _source_text(cell: dict) -> str:
    """Return a cell's source as one string (it may be stored as str or list)."""
    source = cell.get("source", "")
    return source if isinstance(source, str) else "".join(source)


def main(
    notebook_path: Optional[Path] = None,
    cell_script_path: Optional[Path] = None,
) -> None:
    """Append the comparison markdown + code cells to the end of the notebook.

    Args:
        notebook_path: Notebook to modify in place. Defaults to NOTEBOOK_PATH.
        cell_script_path: Python file whose contents become the code cell.
            Defaults to CELL_SCRIPT_PATH.

    Raises:
        OSError: If either file cannot be read or the notebook cannot be
            written (for example, FileNotFoundError for a missing path).
        json.JSONDecodeError: If the notebook is not valid JSON.
        KeyError: If the notebook JSON has no top-level "cells" list.

    Running the script again is a no-op when the markdown cell is already
    present, so repeated invocations do not pile up duplicate cells.
    """
    notebook_path = Path(NOTEBOOK_PATH if notebook_path is None else notebook_path)
    cell_script_path = Path(
        CELL_SCRIPT_PATH if cell_script_path is None else cell_script_path
    )

    # Notebooks are UTF-8 JSON; be explicit instead of relying on the locale.
    nb = json.loads(notebook_path.read_text(encoding="utf-8"))
    cell_code = cell_script_path.read_text(encoding="utf-8")

    md_cell = _build_markdown_cell()
    code_cell = _build_code_cell(cell_code)

    # Idempotence guard: the markdown header identifies a previous insertion.
    header_text = _source_text(md_cell)
    already_present = any(
        cell.get("cell_type") == "markdown" and _source_text(cell) == header_text
        for cell in nb["cells"]
    )
    if already_present:
        print(f"Comparison cell already present in {notebook_path}; nothing to do.")
        return

    # Insert at end of notebook
    nb["cells"].append(md_cell)
    nb["cells"].append(code_cell)

    # Serialize fully BEFORE touching the file so that a serialization error
    # cannot leave a truncated notebook behind. End with a newline.
    payload = json.dumps(nb, ensure_ascii=False, indent=1) + "\n"
    notebook_path.write_text(payload, encoding="utf-8")

    print(f"✓ Inserted comparison cell at end of notebook ({len(nb['cells'])} cells total)")
    print(f" Notebook: {notebook_path}")


if __name__ == "__main__":
    main()