File size: 11,452 Bytes
bb61f7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65c52b2
 
 
 
 
 
a8f2981
 
65c52b2
a8f2981
 
 
65c52b2
a8f2981
 
65c52b2
 
bb61f7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65c52b2
 
a8f2981
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65c52b2
a8f2981
 
 
65c52b2
a8f2981
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65c52b2
a8f2981
bb61f7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# \ud83d\ude80 Stack 2.9 - Kaggle Training\n",
    "\n",
    "Free GPU training on Kaggle using Qwen2.5-Coder-1.5B.\n",
    "\n",
    "\u23f1\ufe0f **Runtime:** 2-4 hours  |  \ud83d\udcbe **VRAM:** ~14GB (fp16, no bitsandbytes)\n",
    "\n",
    "**Setup:**\n",
    "1. Settings \u2192 Accelerator \u2192 GPU **T4**\n",
    "2. Run all cells in order\n",
    "3. Download merged model from Output tab when done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show which GPU (if any) is attached to this session\n",
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fresh checkout of the training repository\n",
    "import os, shutil, subprocess\n",
    "\n",
    "REPO_DIR = '/kaggle/working/stack-2.9'\n",
    "OUTPUT_DIR = os.path.join(REPO_DIR, 'training_output')\n",
    "\n",
    "# Step out to the parent first - presumably so the kernel's working directory\n",
    "# is never the tree that is about to be deleted\n",
    "os.chdir('/kaggle/working')\n",
    "if os.path.exists(REPO_DIR):\n",
    "    # NOTE: training_output/ lives inside REPO_DIR, so it is removed as well\n",
    "    shutil.rmtree(REPO_DIR)\n",
    "\n",
    "clone_cmd = ['git', 'clone', 'https://github.com/my-ai-stack/stack-2.9.git', REPO_DIR]\n",
    "subprocess.run(clone_cmd, check=True)  # check=True: stop here if the clone fails\n",
    "os.chdir(REPO_DIR)\n",
    "print('\u2705 Repo ready:', REPO_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare the folder that will hold every training artifact\n",
    "# Kaggle sessions expire after 9 hours - download outputs immediately!\n",
    "\n",
    "# Recomputed from REPO_DIR (same value the clone cell assigns); no symlink is involved\n",
    "OUTPUT_DIR = os.path.join(REPO_DIR, 'training_output')\n",
    "os.makedirs(OUTPUT_DIR, exist_ok=True)  # no-op when the directory already exists\n",
    "\n",
    "print(f\"\u2705 Output directory: {OUTPUT_DIR}\")\n",
    "print(\"\u26a0\ufe0f IMPORTANT: Download outputs from 'Output' tab before session expires!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install PyTorch (force CUDA 11.8 build for sm_60 Pascal GPU compatibility)\n",
    "# Kaggle sometimes assigns P100 (sm_60) which requires CUDA 11.x builds of PyTorch\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel\n",
    "%pip uninstall -y torch torchvision torchaudio\n",
    "%pip install torch==2.2.0+cu118 torchvision==0.17.0+cu118 torchaudio==2.2.0+cu118 --index-url https://download.pytorch.org/whl/cu118\n",
    "print('\u2705 PyTorch ready')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install other dependencies (NO bitsandbytes \u2014 the model is trained unquantized)\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel\n",
    "%pip install -q transformers==4.40.0 peft==0.10.0 accelerate==0.34.0 datasets==3.0.0 pyyaml tqdm scipy numpy\n",
    "print('\u2705 Dependencies ready')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fix NumPy 2.0 compatibility (downgrade to <2.0)\n",
    "# Presumably needed because the pinned torch 2.2.0 wheels expect NumPy 1.x - TODO confirm\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel\n",
    "%pip install -q \"numpy<2\" --force-reinstall\n",
    "print('\u2705 NumPy downgraded to <2.0')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare training data (auto-detect or synthetic fallback)\n",
    "# Sets DATA_FILE, which the config cell reads as the training input path.\n",
    "import os, json\n",
    "\n",
    "REPO_TRAIN_DATA = os.path.join(REPO_DIR, 'training-data/final/train.jsonl')\n",
    "MINI_DATA_DIR = os.path.join(REPO_DIR, 'data_mini')\n",
    "MINI_DATA_FILE = os.path.join(MINI_DATA_DIR, 'train_mini.jsonl')\n",
    "SYNTHETIC_FILE = os.path.join(REPO_DIR, 'data/synthetic.jsonl')\n",
    "\n",
    "print('\ud83d\udd0d Data check')\n",
    "\n",
    "# Preference order: (1) sample from the full dataset, (2) reuse an existing mini file, (3) synthesize\n",
    "if os.path.exists(REPO_TRAIN_DATA):\n",
    "    os.makedirs(MINI_DATA_DIR, exist_ok=True)\n",
    "    if not os.path.exists(MINI_DATA_FILE):\n",
    "        print('   Building mini dataset (1K samples) from full data...')\n",
    "        # Relative script path: relies on the working directory being REPO_DIR\n",
    "        !python scripts/create_mini_dataset.py --size 1000 --output {MINI_DATA_FILE} --source {REPO_TRAIN_DATA}\n",
    "    DATA_FILE = MINI_DATA_FILE\n",
    "    print('   Using mini dataset')\n",
    "elif os.path.exists(MINI_DATA_FILE):\n",
    "    DATA_FILE = MINI_DATA_FILE\n",
    "    print('   Using existing mini dataset')\n",
    "else:\n",
    "    print('   Creating synthetic data (last resort)')\n",
    "    # The 'output' snippets must be correct code, since the model is fine-tuned on them.\n",
    "    # The trailing returns of is_prime / binary_search sit OUTSIDE the loop\n",
    "    # (they were previously indented into the loop body, which made both functions wrong).\n",
    "    examples = [\n",
    "        {'instruction': 'Write a Python function to reverse a string', 'output': 'def reverse_string(s):\\n    return s[::-1]'},\n",
    "        {'instruction': 'Write a function to check if a number is prime', 'output': 'def is_prime(n):\\n    if n <= 1:\\n        return False\\n    for i in range(2, int(n**0.5) + 1):\\n        if n % i == 0:\\n            return False\\n    return True'},\n",
    "        {'instruction': 'Write a binary search function', 'output': 'def binary_search(arr, target):\\n    left, right = 0, len(arr) - 1\\n    while left <= right:\\n        mid = (left + right) // 2\\n        if arr[mid] == target:\\n            return mid\\n        elif arr[mid] < target:\\n            left = mid + 1\\n        else:\\n            right = mid - 1\\n    return -1'},\n",
    "    ]\n",
    "    samples = examples * 333  # 3 examples x 333 repetitions = 999 rows\n",
    "    os.makedirs(os.path.dirname(SYNTHETIC_FILE), exist_ok=True)\n",
    "    with open(SYNTHETIC_FILE, 'w') as f:\n",
    "        for s in samples:\n",
    "            f.write(json.dumps(s) + '\\n')\n",
    "    DATA_FILE = SYNTHETIC_FILE\n",
    "    print(f'   Synthetic dataset: {len(samples)} examples')\n",
    "\n",
    "# Fail fast if the chosen file is missing (a `!` command that fails does not raise)\n",
    "assert os.path.exists(DATA_FILE), f'Training data not found: {DATA_FILE}'\n",
    "\n",
    "print(f'\\n\u2705 Data: {DATA_FILE}')\n",
    "!ls -lh {DATA_FILE}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the training configuration and write it to YAML\n",
    "# No bitsandbytes / quantization (the original note: avoids CUDA 13 dependency issues).\n",
    "# Precision is selected by the fp16/bf16 flags under 'training' (fp16 on, bf16 off).\n",
    "import yaml\n",
    "\n",
    "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
    "\n",
    "config = {\n",
    "    'model': {\n",
    "        'name': 'Qwen/Qwen2.5-Coder-1.5B',\n",
    "        'trust_remote_code': True,\n",
    "    },\n",
    "    'data': {\n",
    "        'input_path': DATA_FILE,\n",
    "        'max_length': 2048,\n",
    "        'train_split': 0.999,\n",
    "    },\n",
    "    'lora': {\n",
    "        'r': 8,\n",
    "        'lora_alpha': 16,\n",
    "        'dropout': 0.05,\n",
    "        'target_modules': ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],\n",
    "        'bias': 'none',\n",
    "        'task_type': 'CAUSAL_LM',\n",
    "    },\n",
    "    'training': {\n",
    "        'num_epochs': 1,\n",
    "        'batch_size': 1,\n",
    "        'gradient_accumulation': 4,\n",
    "        'learning_rate': 2e-4,\n",
    "        'warmup_steps': 50,\n",
    "        'weight_decay': 0.01,\n",
    "        'max_grad_norm': 1.0,\n",
    "        'logging_steps': 10,\n",
    "        'save_steps': 100,\n",
    "        'save_total_limit': 2,\n",
    "        'fp16': True,\n",
    "        'bf16': False,\n",
    "        'gradient_checkpointing': True,\n",
    "    },\n",
    "    'output': {\n",
    "        'lora_dir': os.path.join(OUTPUT_DIR, 'lora'),\n",
    "        'logging_dir': os.path.join(OUTPUT_DIR, 'logs'),\n",
    "    },\n",
    "    'quantization': {'enabled': False},\n",
    "    'hardware': {'device': 'cuda', 'num_gpus': 1, 'use_4bit': False, 'use_8bit': False},\n",
    "}\n",
    "\n",
    "# The training cell passes this path to the training script via --config\n",
    "config_path = os.path.join(OUTPUT_DIR, 'train_config.yaml')\n",
    "with open(config_path, 'w') as cfg_file:\n",
    "    yaml.dump(config, cfg_file, default_flow_style=False)\n",
    "\n",
    "print(f'\u2705 Config: {config_path}')\n",
    "print(f\"   Model: {config['model']['name']}\")\n",
    "print(f\"   Data: {config['data']['input_path']}\")\n",
    "print(f\"   bf16={config['training']['bf16']}, fp16={config['training']['fp16']}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train with the standalone train_simple_nobnb.py script (no quantization)\n",
    "train_cfg = config['training']\n",
    "# Report the precision the config requests instead of a hard-coded label.\n",
    "# NOTE(review): assumes train_simple_nobnb.py honours the fp16/bf16 flags - confirm\n",
    "precision = 'bf16' if train_cfg['bf16'] else ('fp16' if train_cfg['fp16'] else 'fp32')\n",
    "\n",
    "print('='*60)\n",
    "print(f'STARTING TRAINING ({precision}, no quantization)')\n",
    "print('='*60)\n",
    "\n",
    "!cd {REPO_DIR} && python train_simple_nobnb.py --config {config_path}\n",
    "\n",
    "# `!` never raises on failure; IPython stores the command's exit status in `_exit_code`.\n",
    "# Stop here so a failed run is not followed by a success message and a doomed merge.\n",
    "if _exit_code != 0:\n",
    "    raise RuntimeError(f'Training failed (exit code {_exit_code}) - see the log above')\n",
    "\n",
    "print('\\n\u2705 Training step finished')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge LoRA adapter into final model\n",
    "# lora_dir matches config['output']['lora_dir']; merged_dir is read again by the push cell.\n",
    "lora_dir = os.path.join(OUTPUT_DIR, 'lora')\n",
    "merged_dir = os.path.join(OUTPUT_DIR, 'merged')\n",
    "\n",
    "print('='*60)\n",
    "print('MERGING LORA ADAPTER')\n",
    "print('='*60)\n",
    "\n",
    "!cd {REPO_DIR} && python merge_simple.py \\\n",
    "    --base-model {config['model']['name']} \\\n",
    "    --adapter-path {lora_dir} \\\n",
    "    --output-path {merged_dir} \\\n",
    "    --use-safetensors\n",
    "\n",
    "# `!` never raises on failure; IPython stores the command's exit status in `_exit_code`.\n",
    "# Without this check the success message below was printed even when the merge failed.\n",
    "if _exit_code != 0:\n",
    "    raise RuntimeError(f'Merge failed (exit code {_exit_code}) - see the log above')\n",
    "\n",
    "print('\\n\u2705 Merge complete!')\n",
    "print(f'Merged model: {merged_dir}')\n",
    "!ls -lh {merged_dir}\n",
    "\n",
    "print(\"\\n\u26a0\ufe0f DOWNLOAD THE MODEL NOW: Go to Output tab and download 'merged' folder!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Push merged model to GitHub LFS (optional - for permanent storage)\n",
    "# This saves the model to your GitHub repo so you can download anytime\n",
    "import shutil\n",
    "import subprocess\n",
    "\n",
    "# Configure Git LFS; the `||` branch only runs when the install command fails\n",
    "# NOTE(review): no `git lfs track` pattern is added here, so weight files only go\n",
    "# through LFS if the repo's .gitattributes already tracks them - confirm\n",
    "!git lfs install 2>/dev/null || echo 'git lfs install failed - is git-lfs installed?'\n",
    "\n",
    "# Clone the repo if not already there\n",
    "repo_url = 'https://github.com/my-ai-stack/stack-2.9.git'\n",
    "local_repo = '/kaggle/working/stack-2.9-repo'\n",
    "\n",
    "if not os.path.exists(local_repo):\n",
    "    subprocess.run(['git', 'clone', repo_url, local_repo], check=True)\n",
    "\n",
    "# Copy merged model to repo\n",
    "target_dir = os.path.join(local_repo, 'models/stack-2.9-finetuned')\n",
    "os.makedirs(target_dir, exist_ok=True)\n",
    "\n",
    "if os.path.exists(merged_dir):\n",
    "    # One call copies the whole tree (files via copy2, existing directories reused)\n",
    "    shutil.copytree(merged_dir, target_dir, dirs_exist_ok=True)\n",
    "\n",
    "    print(f'\u2705 Copied model to {target_dir}')\n",
    "\n",
    "    # Run git with cwd=local_repo instead of os.chdir, so the notebook's working\n",
    "    # directory is unchanged for any cell that is re-run afterwards\n",
    "    subprocess.run(['git', 'config', 'user.email', 'kaggle@kaggle.com'], cwd=local_repo, check=True)\n",
    "    subprocess.run(['git', 'config', 'user.name', 'Kaggle Auto-Push'], cwd=local_repo, check=True)\n",
    "    subprocess.run(['git', 'add', 'models/stack-2.9-finetuned/'], cwd=local_repo, check=True)\n",
    "\n",
    "    # `git diff --cached --quiet` exits 0 when nothing is staged; committing in that\n",
    "    # state would fail, so skip the commit on a re-run with an unchanged model\n",
    "    staged = subprocess.run(['git', 'diff', '--cached', '--quiet'], cwd=local_repo)\n",
    "    if staged.returncode != 0:\n",
    "        subprocess.run(['git', 'commit', '-m', 'feat: add fine-tuned model from Kaggle'], cwd=local_repo, check=True)\n",
    "    else:\n",
    "        print('   Nothing new to commit')\n",
    "\n",
    "    # Push (needs write credentials such as a GitHub token); failure is reported, not raised\n",
    "    result = subprocess.run(['git', 'push', 'origin', 'main'], cwd=local_repo, capture_output=True, text=True)\n",
    "    if result.returncode == 0:\n",
    "        print('\u2705 Model pushed to GitHub!')\n",
    "    else:\n",
    "        print(f'\u26a0\ufe0f Push failed: {result.stderr}')\n",
    "        print('   You can still download from Kaggle Output tab.')\n",
    "else:\n",
    "    print('\u26a0\ufe0f Merged model not found. Train first!')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## \ud83d\udce5 Download Model\n",
    "\n",
    "1. Open **Output** tab on the right\n",
    "2. Find `stack-2.9/training_output/merged/`\n",
    "3. Select all files and **Download**\n",
    "\n",
    "\u26a0\ufe0f **Do this before Kaggle session ends!**"
   ]
  }
 ],
 "metadata": {
  "kaggle": {
   "accelerator": "gpu"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}