| """ |
| Run this on the workbench to insert the comparison cell into the notebook. |
| Usage: python scripts/insert_comparison_cell.py |
| """ |
| import json |
| from pathlib import Path |
|
|
# Target notebook that receives the appended comparison cells.
NOTEBOOK_PATH = Path("/home/jupyter/tucano2/notebooks/v4_2_instruct_grpo.ipynb")

# Prefer the cell script relative to this file's repo checkout; fall back to
# the fixed workbench location when the relative copy is absent.
_repo_candidate = Path(__file__).parent.parent / "notebooks" / "cell_comparison_base_vs_tuned.py"
CELL_SCRIPT_PATH = (
    _repo_candidate
    if _repo_candidate.exists()
    else Path("/home/jupyter/tucano2/notebooks/cell_comparison_base_vs_tuned.py")
)
|
|
def main():
    """Append the comparison markdown + code cells to the target notebook.

    Reads the notebook JSON from NOTEBOOK_PATH, builds a markdown header
    cell and a code cell whose source is the contents of CELL_SCRIPT_PATH,
    appends both to the end of the notebook, and writes it back in place.
    """
    # Notebook JSON is UTF-8 by spec; be explicit so the locale can't break it.
    with open(NOTEBOOK_PATH, encoding="utf-8") as f:
        nb = json.load(f)

    with open(CELL_SCRIPT_PATH, encoding="utf-8") as f:
        cell_code = f.read()

    md_cell = {
        "cell_type": "markdown",
        "metadata": {},
        "source": [
            "---\n",
            "\n",
            "## Cell 15: Base vs Tuned Comparison (Final Evaluation)\n",
            "\n",
            "**Purpose:** Definitive A/B test — same 65 eval prompts, same generation config, \n",
            "comparing the raw base model against the GRPO-tuned best checkpoint (step 1100).\n",
            "\n",
            "**Prerequisites:** Cells 1-5 + Cell 7 + Cell 10 run. Best checkpoint at:\n",
            "`models/tucano2-0.5B-instruct-grpo-v4.2-seed42/best_checkpoint/`\n",
            "\n",
            "**Output:** Per-task reward comparison table with Wilcoxon significance test + sample outputs.\n",
            "\n",
            "**Gate:** This is the final cell. No gate — it produces the experiment's conclusion."
        ]
    }

    code_cell = {
        "cell_type": "code",
        "execution_count": None,
        "metadata": {},
        "outputs": [],
        # splitlines(keepends=True) preserves each line's newline and, unlike
        # the previous split("\n") round-trip, does not leave a spurious ""
        # entry when the script file ends with a newline.
        "source": cell_code.splitlines(keepends=True),
    }

    nb["cells"].append(md_cell)
    nb["cells"].append(code_cell)

    with open(NOTEBOOK_PATH, "w", encoding="utf-8") as f:
        json.dump(nb, f, ensure_ascii=False, indent=1)
        f.write("\n")  # Jupyter saves notebooks with a trailing newline

    print(f"✓ Inserted comparison cell at end of notebook ({len(nb['cells'])} cells total)")
    print(f"  Notebook: {NOTEBOOK_PATH}")
|
|
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
|