{
 "nbformat": 4,
 "nbformat_minor": 5,
 "metadata": {
  "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
  "language_info": {"name": "python", "version": "3.10.0"},
  "accelerator": "GPU",
  "colab": {"gpuType": "T4", "provenance": []}
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 🚀 Phase 2 — Model Training (Google Colab)\n",
    "\n",
    "**Run this notebook in Google Colab with a FREE T4 GPU:**\n",
    "1. Go to https://colab.research.google.com\n",
    "2. Upload this file\n",
    "3. Runtime → Change Runtime Type → T4 GPU\n",
    "4. Accept the Gemma licence at https://huggingface.co/google/gemma-2b and create an access token at https://huggingface.co/settings/tokens (Cell 3 asks for it; use a *write* token if you plan to run Cell 6)\n",
    "5. Run each cell one by one ⬇️"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["### Cell 1 — Install all required libraries"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── Install everything needed ──────────────────────────────\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "# NOTE(review): versions are unpinned; the SFTTrainer keyword arguments used in\n",
    "# Cell 4 depend on the installed trl release — pin a known-good set once confirmed.\n",
    "%pip install -q transformers datasets peft bitsandbytes trl accelerate huggingface_hub\n",
    "print('✅ All libraries installed!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["### Cell 2 — Re-create the dataset inside Colab"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from datasets import load_dataset\n",
    "\n",
    "NUM_SAMPLES = 4000        # rows kept for fine-tuning\n",
    "MAX_CONTEXT_CHARS = 300   # context is cut to this many characters\n",
    "MAX_RESPONSE_CHARS = 400  # response is cut to this many characters\n",
    "\n",
    "\n",
    "def convert(example):\n",
    "    '''Turn one Dolly row into a single training string.\n",
    "\n",
    "    Reads the instruction, context and response fields of the row and returns\n",
    "    a dict with a single text key: a Weak Prompt section (the instruction,\n",
    "    plus the truncated context when there is one) followed by an Improved\n",
    "    Prompt section built from a fixed template and the truncated response.\n",
    "    '''\n",
    "    weak = example['instruction'].strip()\n",
    "    # `or ''` also covers a context that is present but None.\n",
    "    ctx = (example.get('context') or '').strip()\n",
    "    resp = example['response'].strip()\n",
    "    weak_c = f'{weak}\\nContext: {ctx[:MAX_CONTEXT_CHARS]}' if ctx else weak\n",
    "    improved = (\n",
    "        f'Role: Expert AI Assistant\\n'\n",
    "        f'Tone: Clear, accurate, and helpful\\n'\n",
    "        f'Instructions: {weak}\\n'\n",
    "        f'Constraints: Be concise, factual, well-structured, avoid hallucination\\n'\n",
    "        f'Output: {resp[:MAX_RESPONSE_CHARS]}'\n",
    "    )\n",
    "    return {\n",
    "        'text': f'### Weak Prompt:\\n{weak_c}\\n\\n### Improved Prompt:\\n{improved}'\n",
    "    }\n",
    "\n",
    "\n",
    "print('Downloading and preparing dataset...')\n",
    "dataset = load_dataset('databricks/databricks-dolly-15k', split='train')\n",
    "# Keep the first NUM_SAMPLES rows *before* mapping so only those rows are converted.\n",
    "dataset = dataset.select(range(min(NUM_SAMPLES, len(dataset))))\n",
    "dataset = dataset.map(convert, remove_columns=dataset.column_names)\n",
    "os.makedirs('data', exist_ok=True)\n",
    "dataset.save_to_disk('data/prompt_dataset')\n",
    "print(f'✅ Dataset ready — {len(dataset)} samples')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["### Cell 3 — Load Gemma-2B with 4-bit quantization + LoRA"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "import torch\n",
    "from huggingface_hub import login\n",
    "from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed\n",
    "\n",
    "MODEL_NAME = 'google/gemma-2b'  # Change to 'google/gemma-7b' if you have Colab Pro\n",
    "SEED = 42\n",
    "\n",
    "# Gemma is a gated model, so authenticate before the first download.\n",
    "# The token comes from the HF_TOKEN environment variable or a hidden prompt;\n",
    "# it is never written into the notebook.\n",
    "HF_TOKEN = os.environ.get('HF_TOKEN') or getpass('Hugging Face token: ')\n",
    "login(token=HF_TOKEN)\n",
    "\n",
    "# Seed the random number generators before the LoRA weights are initialised.\n",
    "set_seed(SEED)\n",
    "\n",
    "print('Loading tokenizer...')\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
    "# Fall back to EOS only when the tokenizer has no pad token of its own, so a\n",
    "# dedicated pad token is not overwritten.\n",
    "if tokenizer.pad_token is None:\n",
    "    tokenizer.pad_token = tokenizer.eos_token\n",
    "\n",
    "print('Loading model with 4-bit quantization (saves GPU memory)...')\n",
    "bnb_config = BitsAndBytesConfig(\n",
    "    load_in_4bit=True,\n",
    "    bnb_4bit_quant_type='nf4',\n",
    "    bnb_4bit_compute_dtype=torch.float16,\n",
    "    bnb_4bit_use_double_quant=True\n",
    ")\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    MODEL_NAME,\n",
    "    quantization_config=bnb_config,\n",
    "    device_map='auto'\n",
    ")\n",
    "model = prepare_model_for_kbit_training(model)\n",
    "\n",
    "print('Applying LoRA adapters...')\n",
    "lora_config = LoraConfig(\n",
    "    r=16,\n",
    "    lora_alpha=32,\n",
    "    target_modules=['q_proj', 'v_proj'],\n",
    "    lora_dropout=0.05,\n",
    "    bias='none',\n",
    "    task_type='CAUSAL_LM'\n",
    ")\n",
    "model = get_peft_model(model, lora_config)\n",
    "model.print_trainable_parameters()\n",
    "print('✅ Model ready!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["### Cell 4 — Train the model (2–3 epochs, ~2–4 hours on free T4)"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_from_disk\n",
    "from transformers import TrainingArguments\n",
    "from trl import SFTTrainer\n",
    "\n",
    "# Reload the copy written by Cell 2 so this cell does not depend on Cell 2's variable.\n",
    "dataset = load_from_disk('data/prompt_dataset')\n",
    "\n",
    "# 2 sequences per device x 8 accumulation steps = 16 sequences per optimizer update.\n",
    "args = TrainingArguments(\n",
    "    output_dir='./prompt-optimizer-checkpoints',\n",
    "    num_train_epochs=2,\n",
    "    per_device_train_batch_size=2,\n",
    "    gradient_accumulation_steps=8,\n",
    "    learning_rate=2e-4,\n",
    "    fp16=True,\n",
    "    save_steps=200,\n",
    "    logging_steps=50,\n",
    "    warmup_steps=100,\n",
    "    lr_scheduler_type='cosine',\n",
    "    seed=SEED,\n",
    "    report_to='none'\n",
    ")\n",
    "\n",
    "# NOTE(review): tokenizer=, dataset_text_field= and max_seq_length= are accepted by\n",
    "# SFTTrainer only in some trl releases — confirm against the installed version.\n",
    "trainer = SFTTrainer(\n",
    "    model=model,\n",
    "    tokenizer=tokenizer,\n",
    "    train_dataset=dataset,\n",
    "    dataset_text_field='text',\n",
    "    args=args,\n",
    "    max_seq_length=512\n",
    ")\n",
    "\n",
    "print('🏋️ Starting training... (this will take 2–4 hours on free T4)')\n",
    "trainer.train()\n",
    "print('✅ Training complete!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["### Cell 5 — Save the LoRA adapter"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the adapter and the tokenizer files side by side into one folder.\n",
    "model.save_pretrained('./lora-adapter')\n",
    "tokenizer.save_pretrained('./lora-adapter')\n",
    "print('✅ LoRA adapter saved to ./lora-adapter')\n",
    "print('📦 Download the lora-adapter folder from Colab Files panel (left sidebar)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["### Cell 6 — Upload directly to HuggingFace (optional but recommended)"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import HfApi\n",
    "\n",
    "REPO_NAME = 'prompt-optimizer-lora'\n",
    "\n",
    "# Uses the credentials stored by login() in Cell 3; the token needs write access.\n",
    "api = HfApi()\n",
    "# Ask the Hub which account owns the token instead of hand-editing a username.\n",
    "HF_USERNAME = api.whoami()['name']\n",
    "\n",
    "api.create_repo(repo_id=f'{HF_USERNAME}/{REPO_NAME}', exist_ok=True)\n",
    "api.upload_folder(\n",
    "    folder_path='./lora-adapter',\n",
    "    repo_id=f'{HF_USERNAME}/{REPO_NAME}',\n",
    "    repo_type='model'\n",
    ")\n",
    "print(f'✅ Model uploaded → https://huggingface.co/{HF_USERNAME}/{REPO_NAME}')"
   ]
  }
 ]
}