{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 🤖 Auto-FineTune-Ops: One-Click Fine-Tuning Pipeline\n", "\n", "**Run this notebook on Google Colab (with GPU) or Kaggle to fine-tune your LLM!**\n", "\n", "This notebook combines all the agents:\n", "- **DataArchitectAgent**: Cleans and formats your data\n", "- **TrainingPilot**: Fine-tunes with Unsloth (ultra-fast LoRA)\n", "- **TheJudge**: Evaluates base vs fine-tuned with LLM-as-Judge\n", "\n", "---" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1️⃣ Setup - Install Dependencies" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%capture\n", "# Install Unsloth (must be first!)\n", "!pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"\n", "!pip install --no-deps trl peft accelerate bitsandbytes\n", "\n", "# Install other dependencies\n", "!pip install datasets transformers rich pandas openai anthropic" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2️⃣ Configuration\n", "\n", "Set your training goal and upload your data!"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "#@title ⚙️ Configuration\n",
  "#@markdown ### Training Settings\n",
  "GOAL = \"medical_assistant\" #@param {type:\"string\"}\n",
  "BASE_MODEL = \"unsloth/llama-3-8b-bnb-4bit\" #@param [\"unsloth/llama-3-8b-bnb-4bit\", \"unsloth/mistral-7b-bnb-4bit\", \"unsloth/gemma-7b-bnb-4bit\"]\n",
  "MAX_SEQ_LENGTH = 2048 #@param {type:\"integer\"}\n",
  "\n",
  "#@markdown ### Evaluation Settings\n",
  "RUN_EVALUATION = True #@param {type:\"boolean\"}\n",
  "JUDGE_MODEL = \"gpt-4o\" #@param [\"gpt-4o\", \"claude-3-5-sonnet-20241022\"]\n",
  "NUM_EVAL_SAMPLES = 20 #@param {type:\"integer\"}\n",
  "\n",
  "#@markdown ### API Keys (for evaluation only)\n",
  "#@markdown Leave blank to reuse a key that is already set in the environment; anything typed here is saved with the notebook.\n",
  "OPENAI_API_KEY = \"\" #@param {type:\"string\"}\n",
  "ANTHROPIC_API_KEY = \"\" #@param {type:\"string\"}\n",
  "\n",
  "import os\n",
  "import re\n",
  "\n",
  "# GOAL is interpolated into file paths (and a copy to Google Drive) by later cells.\n",
  "# Reject unusable values now rather than after a long training run.\n",
  "if not re.fullmatch(r\"[A-Za-z0-9_.-]+\", GOAL):\n",
  "    raise ValueError(\n",
  "        f\"GOAL={GOAL!r} cannot be used in file names: use only letters, digits, '_', '-' and '.'\"\n",
  "    )\n",
  "\n",
  "# A value typed into the form wins; otherwise fall back to a key already present in the\n",
  "# environment so the secret does not have to be stored in the notebook.\n",
  "OPENAI_API_KEY = OPENAI_API_KEY or os.environ.get(\"OPENAI_API_KEY\", \"\")\n",
  "ANTHROPIC_API_KEY = ANTHROPIC_API_KEY or os.environ.get(\"ANTHROPIC_API_KEY\", \"\")\n",
  "\n",
  "# Export the keys so client libraries that read the environment can find them.\n",
  "if OPENAI_API_KEY:\n",
  "    os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n",
  "if ANTHROPIC_API_KEY:\n",
  "    os.environ[\"ANTHROPIC_API_KEY\"] = ANTHROPIC_API_KEY\n",
  "\n",
  "print(f\"✅ Goal: {GOAL}\")\n",
  "print(f\"✅ Base Model: {BASE_MODEL}\")"
 ]
},
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "## 3️⃣ Upload Your Data\n",
  "\n",
  "Upload a CSV or JSON file with instruction-response pairs."
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "from google.colab import files\n",
  "import pandas as pd\n",
  "\n",
  "print(\"📂 Upload your dataset (CSV or JSON):\")\n",
  "uploaded = files.upload()\n",
  "\n",
  "# Guard against an empty result (e.g. the upload dialog was dismissed) so the user\n",
  "# gets an actionable message instead of a failure on the file-name lookup below.\n",
  "if not uploaded:\n",
  "    raise RuntimeError(\"No file was uploaded. Re-run this cell and choose a CSV, JSON or JSONL file.\")\n",
  "\n",
  "# Get the uploaded file name (first entry of the returned mapping)\n",
  "DATA_FILE = next(iter(uploaded))\n",
  "print(f\"\\n✅ Uploaded: {DATA_FILE}\")\n",
  "\n",
  "# Preview the data; the extension check is case-insensitive so 'DATA.CSV' also works\n",
  "data_ext = DATA_FILE.lower()\n",
  "if data_ext.endswith('.csv'):\n",
  "    df = pd.read_csv(DATA_FILE)\n",
  "else:\n",
  "    df = pd.read_json(DATA_FILE, lines=data_ext.endswith('.jsonl'))\n",
  "\n",
  "print(f\"\\n📊 Dataset shape: {df.shape}\")\n",
  "print(f\"📋 Columns: {list(df.columns)}\")\n",
  "df.head(3)"
 ]
},
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "## 4️⃣ Stage 1: Data Preparation (DataArchitectAgent)"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "import json\n",
  "import os\n",
  "import re\n",
  "from dataclasses import dataclass, field\n",
  "from typing import Optional, List, Dict, Tuple\n",
  "import pandas as pd\n",
  "from rich.console import Console\n",
  "from rich.table import Table\n",
  "\n",
  "console = Console()\n",
  "\n",
  "@dataclass\n",
  "class CleaningConfig:\n",
  "    \"\"\"Character-length limits and de-duplication switch applied by DataArchitectAgent.\"\"\"\n",
  "    min_instruction_length: int = 10\n",
  "    max_instruction_length: int = 2048\n",
  "    min_response_length: int = 20\n",
  "    max_response_length: int = 4096\n",
  "    remove_duplicates: bool = True\n",
  "\n",
  "class DataArchitectAgent:\n",
  "    \"\"\"Autonomous data preparation agent.\n",
  "\n",
  "    Detects the instruction/response columns of a DataFrame, drops unusable rows\n",
  "    and writes the remainder as JSONL training records.\n",
  "    \"\"\"\n",
  "\n",
  "    # Regex fragments searched for in the lower-cased column names.\n",
  "    INSTRUCTION_PATTERNS = [r'instruction', r'prompt', r'question', r'query', r'user', r'input_text']\n",
  "    OUTPUT_PATTERNS = [r'output', r'response', r'answer', r'completion', r'assistant', r'target']\n",
  "\n",
  "    def __init__(self, config=None):\n",
  "        \"\"\"Use ``config`` for the cleaning limits, or the CleaningConfig defaults when omitted.\"\"\"\n",
  "        self.config = config or CleaningConfig()\n",
  "\n",
  "    def _detect_columns(self, df):\n",
  "        \"\"\"Return ``(instruction_col, output_col)``; an entry is ``None`` when nothing matches.\n",
  "\n",
  "        Columns are scanned in DataFrame order and the first match for each role wins.\n",
  "        A column claimed as the instruction is never reused as the output, so a name\n",
  "        such as 'user_response' cannot end up filling both roles.\n",
  "        \"\"\"\n",
  "        instruction_col, output_col = None, None\n",
  "        for col in df.columns:\n",
  "            # Column labels are not guaranteed to be strings (e.g. integer headers).\n",
  "            col_lower = str(col).lower()\n",
  "            if instruction_col is None and any(re.search(p, col_lower) for p in self.INSTRUCTION_PATTERNS):\n",
  "                instruction_col = col\n",
  "                continue\n",
  "            if output_col is None and any(re.search(p, col_lower) for p in self.OUTPUT_PATTERNS):\n",
  "                output_col = col\n",
  "        return instruction_col, output_col\n",
  "\n",
  "    def process(self, df, goal, output_dir: Optional[str] = None) -> Tuple[str, int]:\n",
  "        \"\"\"Clean ``df`` and write it as JSONL training records.\n",
  "\n",
  "        Args:\n",
  "            df: DataFrame holding one instruction column and one response column.\n",
  "            goal: Label used in the system prompt and in the output file name.\n",
  "            output_dir: Directory for the JSONL file. Defaults to '/content' when that\n",
  "                directory exists, otherwise the current working directory.\n",
  "\n",
  "        Returns:\n",
  "            ``(output_path, number_of_records_written)``.\n",
  "\n",
  "        Raises:\n",
  "            ValueError: If the instruction or response column cannot be detected.\n",
  "        \"\"\"\n",
  "        console.print(\"[bold blue]🏗️ DATA ARCHITECT AGENT[/]\")\n",
  "\n",
  "        # Detect columns\n",
  "        inst_col, out_col = self._detect_columns(df)\n",
  "        console.print(f\"📌 Detected: instruction='{inst_col}', output='{out_col}'\")\n",
  "\n",
  "        if inst_col is None or out_col is None:\n",
  "            raise ValueError(\"Could not auto-detect columns. Please rename to 'instruction' and 'output'.\")\n",
  "\n",
  "        # Clean: drop rows with missing values, then coerce both columns to text so the\n",
  "        # length filters below also work when a column was parsed as numbers.\n",
  "        df_clean = df.dropna(subset=[inst_col, out_col]).copy()\n",
  "        df_clean[inst_col] = df_clean[inst_col].astype(str)\n",
  "        df_clean[out_col] = df_clean[out_col].astype(str)\n",
  "        if self.config.remove_duplicates:\n",
  "            df_clean = df_clean.drop_duplicates(subset=[inst_col])\n",
  "\n",
  "        # Length filters (both bounds inclusive)\n",
  "        cfg = self.config\n",
  "        inst_len = df_clean[inst_col].str.len()\n",
  "        out_len = df_clean[out_col].str.len()\n",
  "        df_clean = df_clean[\n",
  "            inst_len.between(cfg.min_instruction_length, cfg.max_instruction_length)\n",
  "            & out_len.between(cfg.min_response_length, cfg.max_response_length)\n",
  "        ]\n",
  "\n",
  "        console.print(f\"✅ Cleaned: {len(df_clean)} rows (from {len(df)})\")\n",
  "\n",
  "        # Format for training\n",
  "        system_prompt = f\"You are a specialized AI assistant for {goal}.\"\n",
  "        formatted = [\n",
  "            {\"instruction\": instruction, \"input\": \"\", \"output\": output, \"system\": system_prompt}\n",
  "            for instruction, output in zip(df_clean[inst_col], df_clean[out_col])\n",
  "        ]\n",
  "\n",
  "        # Save: keep the Colab location when it exists, otherwise stay runnable elsewhere.\n",
  "        if output_dir is None:\n",
  "            output_dir = \"/content\" if os.path.isdir(\"/content\") else os.getcwd()\n",
  "        output_path = os.path.join(output_dir, f\"{goal}_training.jsonl\")\n",
  "        with open(output_path, 'w', encoding='utf-8') as f:\n",
  "            for item in formatted:\n",
  "                f.write(json.dumps(item) + '\\n')\n",
  "\n",
  "        console.print(f\"💾 Saved to: {output_path}\")\n",
  "        return output_path, len(formatted)\n",
  "\n",
  "# Run Data Agent\n",
  "data_agent = 
DataArchitectAgent()\n",
  "TRAINING_DATA_PATH, DATASET_SIZE = data_agent.process(df, GOAL)\n",
  "print(f\"\\n✅ Dataset ready: {DATASET_SIZE} samples\")"
 ]
},
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "## 5️⃣ Stage 2: Fine-Tuning (TrainingPilot)"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "import torch\n",
  "\n",
  "# Fail fast with actionable messages before the heavy imports and the model download.\n",
  "if not torch.cuda.is_available():\n",
  "    raise RuntimeError(\"No CUDA GPU detected. In Colab: Runtime > Change runtime type > GPU, then re-run.\")\n",
  "if DATASET_SIZE == 0:\n",
  "    raise ValueError(\"The cleaned dataset is empty; relax the CleaningConfig limits or upload more data.\")\n",
  "\n",
  "# unsloth is imported before datasets/trl/transformers, as in the original cell.\n",
  "from unsloth import FastLanguageModel\n",
  "from datasets import load_dataset\n",
  "from trl import SFTTrainer\n",
  "from transformers import TrainingArguments\n",
  "\n",
  "print(f\"🚀 GPU: {torch.cuda.get_device_name(0)}\")\n",
  "print(f\"📊 VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")\n",
  "\n",
  "# Auto-configure hyperparameters based on dataset size:\n",
  "# smaller datasets get a lower LoRA rank, a higher learning rate and more epochs.\n",
  "if DATASET_SIZE < 1000:\n",
  "    LORA_RANK, LORA_ALPHA, LR, EPOCHS = 8, 16, 2e-4, 5\n",
  "elif DATASET_SIZE < 10000:\n",
  "    LORA_RANK, LORA_ALPHA, LR, EPOCHS = 16, 32, 1e-4, 3\n",
  "else:\n",
  "    LORA_RANK, LORA_ALPHA, LR, EPOCHS = 32, 64, 5e-5, 2\n",
  "\n",
  "print(f\"\\n⚙️ Auto-configured for {DATASET_SIZE} samples:\")\n",
  "print(f\"   LoRA Rank: {LORA_RANK}, Alpha: {LORA_ALPHA}\")\n",
  "print(f\"   Learning Rate: {LR}, Epochs: {EPOCHS}\")"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Load model with Unsloth (4-bit weights; dtype=None leaves the compute dtype to Unsloth)\n",
  "model, tokenizer = FastLanguageModel.from_pretrained(\n",
  "    model_name=BASE_MODEL,\n",
  "    max_seq_length=MAX_SEQ_LENGTH,\n",
  "    dtype=None,\n",
  "    load_in_4bit=True,\n",
  ")\n",
  "\n",
  "# Apply LoRA adapters to the attention and MLP projection layers\n",
  "model = FastLanguageModel.get_peft_model(\n",
  "    model,\n",
  "    r=LORA_RANK,\n",
  "    lora_alpha=LORA_ALPHA,\n",
  "    lora_dropout=0,\n",
  "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
  "    bias=\"none\",\n",
  "    use_gradient_checkpointing=\"unsloth\",\n",
  "    random_state=42,\n",
  ")\n",
  "\n",
  "print(\"✅ Model loaded with LoRA!\")"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Load dataset\n",
  "dataset = load_dataset('json', data_files=TRAINING_DATA_PATH, split='train')\n",
  "\n",
  "# Format prompts\n",
  "alpaca_template = \"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
  "\n",
  "### Instruction:\n",
  "{instruction}\n",
  "\n",
  "### Response:\n",
  "{output}\"\"\"\n",
  "\n",
  "# Every training example must end with the end-of-sequence token; without it the\n",
  "# model never learns where a response stops and keeps generating at inference time.\n",
  "EOS_TOKEN = tokenizer.eos_token\n",
  "\n",
  "def format_prompt(example):\n",
  "    \"\"\"Render one dataset record into the Alpaca training text, terminated by EOS.\"\"\"\n",
  "    return {\"text\": alpaca_template.format(**example) + EOS_TOKEN}\n",
  "\n",
  "dataset = dataset.map(format_prompt)\n",
  "print(f\"✅ Loaded {len(dataset)} training samples\")"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "from unsloth import is_bfloat16_supported\n",
  "\n",
  "# Pick the mixed-precision mode the GPU supports: bf16 where available, fp16 otherwise\n",
  "# (e.g. on a T4). Hard-coding fp16 fails on GPUs where the model was loaded in bfloat16.\n",
  "use_bf16 = is_bfloat16_supported()\n",
  "\n",
  "# Train!\n",
  "trainer = SFTTrainer(\n",
  "    model=model,\n",
  "    tokenizer=tokenizer,\n",
  "    train_dataset=dataset,\n",
  "    dataset_text_field=\"text\",\n",
  "    max_seq_length=MAX_SEQ_LENGTH,\n",
  "    args=TrainingArguments(\n",
  "        output_dir=f\"/content/{GOAL}_model\",\n",
  "        num_train_epochs=EPOCHS,\n",
  "        per_device_train_batch_size=4,\n",
  "        gradient_accumulation_steps=4,\n",
  "        learning_rate=LR,\n",
  "        warmup_ratio=0.03,\n",
  "        fp16=not use_bf16,\n",
  "        bf16=use_bf16,\n",
  "        logging_steps=10,\n",
  "        save_strategy=\"epoch\",\n",
  "        optim=\"adamw_8bit\",\n",
  "        seed=42,\n",
  "    ),\n",
  ")\n",
  "\n",
  "print(\"🏋️ Training started...\")\n",
  "trainer.train()\n",
  "\n",
  "# Save the trained adapter together with the tokenizer\n",
  "MODEL_PATH = f\"/content/{GOAL}_model_final\"\n",
  "trainer.save_model(MODEL_PATH)\n",
  "tokenizer.save_pretrained(MODEL_PATH)\n",
  "print(f\"\\n✅ Model saved to: {MODEL_PATH}\")"
 ]
},
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "## 6️⃣ Stage 3: Evaluation (TheJudge) - Optional"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "if RUN_EVALUATION and (OPENAI_API_KEY or ANTHROPIC_API_KEY):\n",
  "    print(\"⚖️ Running Model Arena evaluation...\")\n",
  "\n",
  "    # Simple evaluation - compare responses\n",
  "    FastLanguageModel.for_inference(model)\n",
  "\n",
  "    # Sample prompts from dataset\n",
  "    test_prompts = 
[dataset[i][\"instruction\"] for i in range(min(NUM_EVAL_SAMPLES, len(dataset)))]\n",
  "\n",
  "    print(f\"\\n📊 Evaluating on {len(test_prompts)} samples...\")\n",
  "    print(\"Note: Full arena evaluation requires loading base model separately.\")\n",
  "    print(\"For complete evaluation, use the full TheJudge agent locally.\")\n",
  "\n",
  "    # Quick test generation\n",
  "    test_prompt = test_prompts[0] if test_prompts else \"Hello, how are you?\"\n",
  "    # Build the prompt from the training template with an empty response slot, so that\n",
  "    # inference sees the same preamble and section headers the model was fine-tuned on.\n",
  "    prompt_text = alpaca_template.format(instruction=test_prompt, output=\"\")\n",
  "    inputs = tokenizer(prompt_text, return_tensors=\"pt\").to(\"cuda\")\n",
  "    # temperature only takes effect when sampling is enabled\n",
  "    outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7)\n",
  "    # Decode only the newly generated tokens instead of string-splitting the echoed prompt\n",
  "    prompt_len = inputs[\"input_ids\"].shape[1]\n",
  "    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)\n",
  "\n",
  "    print(f\"\\n📝 Sample generation:\")\n",
  "    print(f\"Prompt: {test_prompt[:100]}...\")\n",
  "    print(f\"Response: {response.strip()[:200]}...\")\n",
  "else:\n",
  "    print(\"⏭️ Skipping evaluation (no API key or disabled)\")"
 ]
},
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "## 7️⃣ Download Your Model"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Option 1: Save to Google Drive\n",
  "import shutil\n",
  "from google.colab import drive\n",
  "drive.mount('/content/drive')\n",
  "\n",
  "# Pure-Python copy: unlike an unquoted shell `cp`, it handles paths containing spaces,\n",
  "# and re-running the cell refreshes the existing folder instead of nesting a copy in it.\n",
  "DRIVE_MODEL_DIR = f\"/content/drive/MyDrive/{GOAL}_finetuned_model\"\n",
  "shutil.copytree(MODEL_PATH, DRIVE_MODEL_DIR, dirs_exist_ok=True)\n",
  "print(f\"✅ Model copied to Google Drive: /MyDrive/{GOAL}_finetuned_model\")"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Option 2: Push to HuggingFace Hub\n",
  "# Uncomment and fill in your details:\n",
  "\n",
  "# from huggingface_hub import login\n",
  "# login()  # prompts for your token; avoid pasting the token into the notebook\n",
  "# \n",
  "# model.push_to_hub(\"your-username/your-model-name\")\n",
  "# tokenizer.push_to_hub(\"your-username/your-model-name\")\n",
  "# print(\"✅ Pushed to HuggingFace Hub!\")"
 ]
},
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "## 🎉 Done!\n",
  "\n",
  "Your fine-tuned model is ready! 
You can:\n", "1. Download from Google Drive\n", "2. Push to HuggingFace Hub\n", "3. Use the FastAPI deployment script locally" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.13.1" } }, "nbformat": 4, "nbformat_minor": 0 }