{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# \ud83d\ude80 Stack 2.9 - Kaggle Training\n",
    "\n",
    "Free GPU training on Kaggle: LoRA fine-tuning of Qwen2.5-Coder-1.5B.\n",
    "\n",
    "\u23f1\ufe0f **Runtime:** 2-4 hours | \ud83d\udcbe **VRAM:** sized for a 16 GB T4 (fp16 LoRA, no bitsandbytes)\n",
    "\n",
    "**Setup:**\n",
    "1. Settings \u2192 Accelerator \u2192 GPU **T4**\n",
    "2. Run all cells in order\n",
    "3. Download merged model from Output tab when done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check GPU\n",
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clone repository (safe to re-run: an existing checkout is reused, never deleted)\n",
    "import json\n",
    "import os\n",
    "import shutil\n",
    "import subprocess\n",
    "\n",
    "os.chdir('/kaggle/working')\n",
    "REPO_URL = 'https://github.com/my-ai-stack/stack-2.9.git'\n",
    "REPO_DIR = '/kaggle/working/stack-2.9'\n",
    "# OUTPUT_DIR lives inside REPO_DIR, so deleting the checkout would also delete\n",
    "# checkpoints and merged models. That is why a re-run keeps the existing clone.\n",
    "OUTPUT_DIR = os.path.join(REPO_DIR, 'training_output')\n",
    "\n",
    "if os.path.isdir(os.path.join(REPO_DIR, '.git')):\n",
    "    print('Reusing existing checkout (training_output is preserved)')\n",
    "else:\n",
    "    if os.path.exists(REPO_DIR):\n",
    "        # Leftover from an interrupted clone: not a usable repository.\n",
    "        shutil.rmtree(REPO_DIR)\n",
    "    subprocess.run(['git', 'clone', REPO_URL, REPO_DIR], check=True)\n",
    "os.chdir(REPO_DIR)\n",
    "print('\u2705 Repo ready:', REPO_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save to Kaggle output (download before session ends!)\n",
    "# Kaggle sessions expire after 9 hours - download outputs immediately!\n",
    "\n",
    "# OUTPUT_DIR is defined once, in the clone cell; here we only create it.\n",
    "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
    "\n",
    "print(f\"\u2705 Output directory: {OUTPUT_DIR}\")\n",
    "print(\"\u26a0\ufe0f IMPORTANT: Download outputs from 'Output' tab before session expires!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install PyTorch (force CUDA 11.8 build for sm_60 Pascal GPU compatibility)\n",
    "# Kaggle sometimes assigns P100 (sm_60) which requires CUDA 11.x builds of PyTorch\n",
    "# %pip (not !pip) guarantees the packages land in the kernel's own environment.\n",
    "%pip uninstall -y torch torchvision torchaudio\n",
    "%pip install torch==2.2.0+cu118 torchvision==0.17.0+cu118 torchaudio==2.2.0+cu118 --index-url https://download.pytorch.org/whl/cu118\n",
    "print('\u2705 PyTorch ready')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install other dependencies (NO bitsandbytes \u2014 fp16 LoRA only)\n",
    "# numpy is constrained here so NumPy 2.x is never pulled in to begin with.\n",
    "%pip install -q transformers==4.40.0 peft==0.10.0 accelerate==0.34.0 datasets==3.0.0 pyyaml tqdm scipy \"numpy<2\"\n",
    "print('\u2705 Dependencies ready')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fix NumPy 2.0 compatibility (downgrade to <2.0)\n",
    "# Kept as a safety net in case the image ships NumPy 2.x preinstalled.\n",
    "%pip install -q \"numpy<2\" --force-reinstall\n",
    "print('\u2705 NumPy downgraded to <2.0')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare training data (auto-detect or synthetic fallback)\n",
    "REPO_TRAIN_DATA = os.path.join(REPO_DIR, 'training-data/final/train.jsonl')\n",
    "MINI_DATA_DIR = os.path.join(REPO_DIR, 'data_mini')\n",
    "MINI_DATA_FILE = os.path.join(MINI_DATA_DIR, 'train_mini.jsonl')\n",
    "SYNTHETIC_FILE = os.path.join(REPO_DIR, 'data/synthetic.jsonl')\n",
    "\n",
    "\n",
    "def has_data(path):\n",
    "    \"\"\"Return True when `path` is an existing, non-empty file.\"\"\"\n",
    "    return os.path.isfile(path) and os.path.getsize(path) > 0\n",
    "\n",
    "\n",
    "print('\ud83d\udd0d Data check')\n",
    "\n",
    "if os.path.exists(REPO_TRAIN_DATA) and not has_data(MINI_DATA_FILE):\n",
    "    os.makedirs(MINI_DATA_DIR, exist_ok=True)\n",
    "    print(' Building mini dataset (1K samples) from full data...')\n",
    "    # Explicit cd: the script path is relative to the repository root.\n",
    "    !cd {REPO_DIR} && python scripts/create_mini_dataset.py --size 1000 --output {MINI_DATA_FILE} --source {REPO_TRAIN_DATA}\n",
    "\n",
    "if has_data(MINI_DATA_FILE):\n",
    "    DATA_FILE = MINI_DATA_FILE\n",
    "    print(' Using mini dataset')\n",
    "else:\n",
    "    # Also reached when the mini-dataset script failed or wrote nothing,\n",
    "    # so DATA_FILE never points at a missing or empty file.\n",
    "    print(' Creating synthetic data (last resort)')\n",
    "    examples = [\n",
    "        {'instruction': 'Write a Python function to reverse a string', 'output': 'def reverse_string(s):\\n    return s[::-1]'},\n",
    "        {'instruction': 'Write a function to check if a number is prime', 'output': 'def is_prime(n):\\n    if n <= 1:\\n        return False\\n    for i in range(2, int(n**0.5) + 1):\\n        if n % i == 0:\\n            return False\\n    return True'},\n",
    "        {'instruction': 'Write a binary search function', 'output': 'def binary_search(arr, target):\\n    left, right = 0, len(arr) - 1\\n    while left <= right:\\n        mid = (left + right) // 2\\n        if arr[mid] == target:\\n            return mid\\n        elif arr[mid] < target:\\n            left = mid + 1\\n        else:\\n            right = mid - 1\\n    return -1'},\n",
    "    ]\n",
    "    samples = examples * 333\n",
    "    os.makedirs(os.path.dirname(SYNTHETIC_FILE), exist_ok=True)\n",
    "    with open(SYNTHETIC_FILE, 'w', encoding='utf-8') as f:\n",
    "        for s in samples:\n",
    "            f.write(json.dumps(s) + '\\n')\n",
    "    DATA_FILE = SYNTHETIC_FILE\n",
    "    print(f' Synthetic dataset: {len(samples)} examples')\n",
    "\n",
    "print(f'\\n\u2705 Data: {DATA_FILE}')\n",
    "!ls -lh {DATA_FILE}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate training configuration\n",
    "# fp16 LoRA without bitsandbytes/quantization (avoids CUDA 13 dependency issues).\n",
    "# bf16 stays off: Kaggle's T4/P100 GPUs predate native bfloat16 support.\n",
    "import yaml\n",
    "\n",
    "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
    "\n",
    "config = {\n",
    "    'model': {'name': 'Qwen/Qwen2.5-Coder-1.5B', 'trust_remote_code': True},\n",
    "    'data': {'input_path': DATA_FILE, 'max_length': 2048, 'train_split': 0.999},\n",
    "    'lora': {\n",
    "        'r': 8,\n",
    "        'lora_alpha': 16,\n",
    "        'dropout': 0.05,\n",
    "        'target_modules': ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],\n",
    "        'bias': 'none',\n",
    "        'task_type': 'CAUSAL_LM',\n",
    "    },\n",
    "    'training': {\n",
    "        'num_epochs': 1,\n",
    "        'batch_size': 1,\n",
    "        'gradient_accumulation': 4,\n",
    "        'learning_rate': 2e-4,\n",
    "        'warmup_steps': 50,\n",
    "        'weight_decay': 0.01,\n",
    "        'max_grad_norm': 1.0,\n",
    "        'logging_steps': 10,\n",
    "        'save_steps': 100,\n",
    "        'save_total_limit': 2,\n",
    "        'fp16': True,\n",
    "        'bf16': False,\n",
    "        'gradient_checkpointing': True,\n",
    "    },\n",
    "    'output': {'lora_dir': os.path.join(OUTPUT_DIR, 'lora'), 'logging_dir': os.path.join(OUTPUT_DIR, 'logs')},\n",
    "    'quantization': {'enabled': False},\n",
    "    'hardware': {'device': 'cuda', 'num_gpus': 1, 'use_4bit': False, 'use_8bit': False},\n",
    "}\n",
    "\n",
    "config_path = os.path.join(OUTPUT_DIR, 'train_config.yaml')\n",
    "with open(config_path, 'w') as f:\n",
    "    yaml.dump(config, f, default_flow_style=False)\n",
    "\n",
    "print(f'\u2705 Config: {config_path}')\n",
    "print(f\" Model: {config['model']['name']}\")\n",
    "print(f\" Data: {config['data']['input_path']}\")\n",
    "print(f\" bf16={config['training']['bf16']}, fp16={config['training']['fp16']}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train (standalone train_simple_nobnb.py - LoRA, no quantization)\n",
    "# The banner reports the precision actually requested in the config.\n",
    "training_cfg = config['training']\n",
    "if training_cfg['bf16']:\n",
    "    precision = 'bfloat16'\n",
    "elif training_cfg['fp16']:\n",
    "    precision = 'fp16'\n",
    "else:\n",
    "    precision = 'fp32'\n",
    "\n",
    "print('='*60)\n",
    "print(f'STARTING TRAINING ({precision}, no quantization)')\n",
    "print('='*60)\n",
    "\n",
    "!cd {REPO_DIR} && python train_simple_nobnb.py --config {config_path}\n",
    "\n",
    "# `!cmd` never raises; IPython stores the shell exit status in `_exit_code`.\n",
    "# Fail loudly so \"Run all\" does not go on to merge a missing adapter.\n",
    "if _exit_code != 0:\n",
    "    raise RuntimeError(f'Training failed with exit code {_exit_code}')\n",
    "\n",
    "print('\\n\u2705 Training step finished')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge LoRA adapter into final model\n",
    "lora_dir = os.path.join(OUTPUT_DIR, 'lora')\n",
    "merged_dir = os.path.join(OUTPUT_DIR, 'merged')\n",
    "base_model = config['model']['name']\n",
    "\n",
    "print('='*60)\n",
    "print('MERGING LORA ADAPTER')\n",
    "print('='*60)\n",
    "\n",
    "!cd {REPO_DIR} && python merge_simple.py \\\n",
    "    --base-model {base_model} \\\n",
    "    --adapter-path {lora_dir} \\\n",
    "    --output-path {merged_dir} \\\n",
    "    --use-safetensors\n",
    "\n",
    "# Stop here on failure instead of announcing a merge that did not happen.\n",
    "if _exit_code != 0:\n",
    "    raise RuntimeError(f'Merge failed with exit code {_exit_code}')\n",
    "\n",
    "print('\\n\u2705 Merge complete!')\n",
    "print(f'Merged model: {merged_dir}')\n",
    "!ls -lh {merged_dir}\n",
    "\n",
    "print(\"\\n\u26a0\ufe0f DOWNLOAD THE MODEL NOW: Go to Output tab and download 'merged' folder!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Push merged model to GitHub LFS (optional - for permanent storage)\n",
    "# This saves the model to your GitHub repo so you can download anytime.\n",
    "# Pushing over HTTPS needs credentials; without them the push fails and this\n",
    "# cell only reports it. Never hardcode a token in the notebook.\n",
    "\n",
    "repo_url = 'https://github.com/my-ai-stack/stack-2.9.git'\n",
    "local_repo = '/kaggle/working/stack-2.9-repo'\n",
    "model_subdir = 'models/stack-2.9-finetuned'\n",
    "target_dir = os.path.join(local_repo, model_subdir)\n",
    "\n",
    "\n",
    "def run_git(*args, **kwargs):\n",
    "    \"\"\"Run a git command inside `local_repo` without changing the notebook's cwd.\"\"\"\n",
    "    return subprocess.run(['git', '-C', local_repo, *args], **kwargs)\n",
    "\n",
    "\n",
    "if not os.path.exists(merged_dir):\n",
    "    print('\u26a0\ufe0f Merged model not found. Train first!')\n",
    "elif shutil.which('git-lfs') is None:\n",
    "    # Without LFS the weight files would be committed as regular blobs.\n",
    "    print('\u26a0\ufe0f git-lfs is not installed; skipping GitHub push.')\n",
    "    print(' You can still download from Kaggle Output tab.')\n",
    "else:\n",
    "    # Clone the repo if not already there\n",
    "    if not os.path.exists(local_repo):\n",
    "        subprocess.run(['git', 'clone', repo_url, local_repo], check=True)\n",
    "\n",
    "    # Configure Git LFS and track weight files BEFORE they are added;\n",
    "    # `git lfs install` alone does not route any file through LFS.\n",
    "    run_git('lfs', 'install', '--local', check=True)\n",
    "    run_git('lfs', 'track', '*.safetensors', '*.bin', check=True)\n",
    "\n",
    "    # Copy merged model to repo\n",
    "    shutil.copytree(merged_dir, target_dir, dirs_exist_ok=True)\n",
    "    print(f'\u2705 Copied model to {target_dir}')\n",
    "\n",
    "    # Commit\n",
    "    run_git('config', 'user.email', 'kaggle@kaggle.com', check=True)\n",
    "    run_git('config', 'user.name', 'Kaggle Auto-Push', check=True)\n",
    "    run_git('add', '.gitattributes', model_subdir, check=True)\n",
    "\n",
    "    # `git diff --cached --quiet` exits 0 when nothing is staged, e.g. when this\n",
    "    # cell is re-run; `git commit` would fail in that case.\n",
    "    if run_git('diff', '--cached', '--quiet').returncode == 0:\n",
    "        print('Nothing new to commit')\n",
    "    else:\n",
    "        run_git('commit', '-m', 'feat: add fine-tuned model from Kaggle', check=True)\n",
    "\n",
    "    # Push. GIT_TERMINAL_PROMPT=0 makes git fail fast instead of waiting for a\n",
    "    # username/password prompt that nobody can answer in a notebook.\n",
    "    push_env = {**os.environ, 'GIT_TERMINAL_PROMPT': '0'}\n",
    "    result = run_git('push', 'origin', 'main', capture_output=True, text=True, env=push_env)\n",
    "    if result.returncode == 0:\n",
    "        print('\u2705 Model pushed to GitHub!')\n",
    "    else:\n",
    "        print(f'\u26a0\ufe0f Push failed: {result.stderr}')\n",
    "        print(' You can still download from Kaggle Output tab.')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## \ud83d\udce5 Download Model\n",
    "\n",
    "1. Open **Output** tab on the right\n",
    "2. Find `stack-2.9/training_output/merged/`\n",
    "3. Select all files and **Download**\n",
    "\n",
    "\u26a0\ufe0f **Do this before Kaggle session ends!**"
   ]
  }
 ],
 "metadata": {
  "kaggle": {
   "accelerator": "gpu"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}