{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro",
   "metadata": {},
   "source": [
    "# CommitmentOS Training Notebook\n",
    "\n",
    "This notebook reproduces GRPO training for CommitmentOS using TRL + LoRA."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5bc9c2fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install -q --upgrade pip\n",
    "%pip install -q \"openenv-core>=0.2.0\" trl transformers peft datasets torch accelerate bitsandbytes matplotlib pandas pydantic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "clone-and-test",
   "metadata": {},
   "outputs": [],
   "source": [
    "!git clone https://github.com/Jayant2304/commitment_os.git\n",
    "%cd commitment_os\n",
    "!python -m pytest tests/test_environment.py -q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "train-grpo",
   "metadata": {},
   "outputs": [],
   "source": [
    "!python training/train_grpo.py \\\n",
    "  --model Qwen/Qwen2.5-1.5B-Instruct \\\n",
    "  --epochs 2 \\\n",
    "  --lr 5e-6 \\\n",
    "  --batch_size 1 \\\n",
    "  --group_size 2 \\\n",
    "  --lora_rank 16 \\\n",
    "  --lora_alpha 32 \\\n",
    "  --output_dir ./training_output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "plot-metrics",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import matplotlib.pyplot as plt\n",
    "from pathlib import Path\n",
    "\n",
    "# Metrics log expected under the --output_dir passed to the training cell.\n",
    "p = Path('training_output/training_metrics.json')\n",
    "logs = json.loads(p.read_text())\n",
    "\n",
    "# Keep only log entries that carry both a step and the plotted metric.\n",
    "steps = [float(x['step']) for x in logs if 'step' in x and 'loss' in x]\n",
    "loss = [float(x['loss']) for x in logs if 'step' in x and 'loss' in x]\n",
    "r_steps = [float(x['step']) for x in logs if 'step' in x and 'reward' in x]\n",
    "rewards = [float(x['reward']) for x in logs if 'step' in x and 'reward' in x]\n",
    "\n",
    "# Loss curve: saved to loss_curve.png in the current directory, then shown.\n",
    "plt.figure(figsize=(8,5))\n",
    "plt.plot(steps, loss, marker='o')\n",
    "plt.title('CommitmentOS GRPO Loss vs Step')\n",
    "plt.xlabel('Step'); plt.ylabel('Loss'); plt.grid(alpha=0.3)\n",
    "plt.tight_layout(); plt.savefig('loss_curve.png', dpi=200); plt.show()\n",
    "\n",
    "# Reward curve: saved to reward_curve.png in the current directory, then shown.\n",
    "plt.figure(figsize=(8,5))\n",
    "plt.plot(r_steps, rewards, marker='o')\n",
    "plt.title('CommitmentOS GRPO Reward vs Step')\n",
    "plt.xlabel('Step'); plt.ylabel('Reward'); plt.grid(alpha=0.3)\n",
    "plt.tight_layout(); plt.savefig('reward_curve.png', dpi=200); plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e788b455",
   "metadata": {},
   "source": [
    "### Optional: zip `training_output` for download\n",
    "\n",
    "Run after training completes. On Colab, use **Files** sidebar or `files.download` for the zip.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b3c760a",
   "metadata": {},
   "outputs": [],
   "source": [
    "!cd /content/commitment_os && du -sh training_output && zip -r /content/training_output_only.zip training_output\n",
    "from google.colab import files\n",
    "\n",
    "files.download(\"/content/training_output_only.zip\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}