walidsobhie-code Claude Opus 4.6 committed on
Commit
1c64613
·
1 Parent(s): 9ce0c00

feat: add Kaggle training notebook

Browse files

- 9-step notebook for Kaggle GPU training
- Uses Kaggle P100 (16GB VRAM)
- Downloads model, trains LoRA, merges model

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. kaggle_train_stack29.ipynb +213 -0
kaggle_train_stack29.ipynb ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 🚀 Stack 2.9 - Kaggle Training Notebook\n",
8
+ "\n",
9
+ "**Free GPU training on Kaggle**\n",
10
+ "\n",
11
+ "This notebook trains a LoRA adapter for Stack 2.9 on **Qwen2.5-Coder-7B** using Kaggle's free GPU.\n",
12
+ "\n",
13
+ "- ⏱️ **Expected runtime:** 2-4 hours\n",
14
+ "- 💾 **VRAM needed:** ~16GB (Kaggle P100 has 16GB)\n",
15
+ "\n",
16
+ "---\n",
17
+ "\n",
18
+ "**Instructions:**\n",
19
+ "1. Kaggle → New Notebook\n",
20
+ "2. Add this notebook's code OR clone from GitHub\n",
21
+ "3. Enable GPU (Settings → Accelerator → GPU P100)\n",
22
+ "4. Run cells in order\n",
23
+ "\n",
24
+ "---"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "# Check GPU\n",
34
+ "!nvidia-smi"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "# STEP 1: Clone the repo\n",
44
+ "import os\n",
45
+ "import shutil\n",
46
+ "\n",
47
+ "REPO_DIR = \"/kaggle/working/stack-2.9\"\n",
48
+ "\n",
49
+ "# Remove old if exists\n",
50
+ "if os.path.exists(REPO_DIR):\n",
51
+ " shutil.rmtree(REPO_DIR)\n",
52
+ "\n",
53
+ "!git clone https://github.com/my-ai-stack/stack-2.9.git {REPO_DIR}\n",
54
+ "\n",
55
+ "os.chdir(REPO_DIR)\n",
56
+ "print(f\"✅ Working in: {os.getcwd()}\")"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "# STEP 2: Install dependencies\n",
66
+ "!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
67
+ "!pip install -q transformers peft accelerate datasets pyyaml tqdm scipy bitsandbytes\n",
68
+ "print(\"✅ Dependencies installed\")"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "metadata": {},
75
+ "outputs": [],
76
+ "source": [
77
+ "# STEP 3: Download Base Model\n",
78
+ "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
79
+ "\n",
80
+ "MODEL_NAME = \"Qwen/Qwen2.5-Coder-7B\"\n",
81
+ "MODEL_DIR = os.path.join(REPO_DIR, \"base_model_qwen7b\")\n",
82
+ "\n",
83
+ "if not os.path.exists(os.path.join(MODEL_DIR, \"config.json\")):\n",
84
+ " print(f\"Downloading {MODEL_NAME}...\")\n",
85
+ " print(\"This takes ~10-15 minutes...\")\n",
86
+ " tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
87
+ " tokenizer.save_pretrained(MODEL_DIR)\n",
88
+ " model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
89
+ " model.save_pretrained(MODEL_DIR)\n",
90
+ " print(\"✅ Model downloaded!\")\n",
91
+ "else:\n",
92
+ " print(\"✅ Model already exists\")\n",
93
+ "\n",
94
+ "!ls -lh {MODEL_DIR} | head -5"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": null,
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "# STEP 4: Setup paths and config\n",
104
+ "import yaml\n",
105
+ "\n",
106
+ "config_path = os.path.join(REPO_DIR, \"stack/training/train_config_local.yaml\")\n",
107
+ "\n",
108
+ "with open(config_path, 'r') as f:\n",
109
+ " config = yaml.safe_load(f)\n",
110
+ "\n",
111
+ "# Update for Kaggle GPU\n",
112
+ "config['model']['name'] = MODEL_DIR\n",
113
+ "config['hardware']['device'] = \"cuda\"\n",
114
+ "config['hardware']['num_gpus'] = 1\n",
115
+ "\n",
116
+ "OUTPUT_DIR = os.path.join(REPO_DIR, \"training_output\")\n",
117
+ "config['output']['lora_dir'] = os.path.join(OUTPUT_DIR, \"lora\")\n",
118
+ "config['output']['merged_dir'] = os.path.join(OUTPUT_DIR, \"merged\")\n",
119
+ "\n",
120
+ "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
121
+ "updated_config = os.path.join(OUTPUT_DIR, \"train_config.yaml\")\n",
122
+ "\n",
123
+ "with open(updated_config, 'w') as f:\n",
124
+ " yaml.dump(config, f)\n",
125
+ "\n",
126
+ "print(f\"✅ Config saved to: {updated_config}\")\n",
127
+ "print(f\" Device: {config['hardware']['device']}\")"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "# STEP 5: Train LoRA\n",
137
+ "import sys\n",
138
+ "sys.path.insert(0, os.path.join(REPO_DIR, \"stack/training\"))\n",
139
+ "\n",
140
+ "print(\"=\"*60)\n",
141
+ "print(\"STARTING TRAINING\")\n",
142
+ "print(\"=\"*60)\n",
143
+ "\n",
144
+ "from train_lora import train_lora\n",
145
+ "trainer = train_lora(updated_config)\n",
146
+ "\n",
147
+ "print(\"=\"*60)\n",
148
+ "print(\"TRAINING COMPLETED\")\n",
149
+ "print(\"=\"*60)"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": null,
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": [
158
+ "# STEP 6: Merge and save\n",
159
+ "import sys\n",
160
+ "sys.path.insert(0, os.path.join(REPO_DIR, \"stack/training\"))\n",
161
+ "from merge_adapter import merge_adapter\n",
162
+ "\n",
163
+ "merged_dir = os.path.join(OUTPUT_DIR, \"merged\")\n",
164
+ "os.makedirs(merged_dir, exist_ok=True)\n",
165
+ "\n",
166
+ "merge_config = {\n",
167
+ " 'model': {'name': MODEL_DIR, 'trust_remote_code': True},\n",
168
+ " 'output': {'lora_dir': os.path.join(OUTPUT_DIR, 'lora'), 'merged_dir': merged_dir},\n",
169
+ " 'quantization': {'enabled': False}\n",
170
+ "}\n",
171
+ "\n",
172
+ "merge_cfg_path = os.path.join(OUTPUT_DIR, \"merge_config.yaml\")\n",
173
+ "with open(merge_cfg_path, 'w') as f:\n",
174
+ " yaml.dump(merge_config, f)\n",
175
+ "\n",
176
+ "merge_adapter(merge_cfg_path, os.path.join(OUTPUT_DIR, \"lora\"), merged_dir)\n",
177
+ "\n",
178
+ "print(f\"✅ Model saved to: {merged_dir}\")\n",
179
+ "!ls -lh {merged_dir}"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": null,
185
+ "metadata": {},
186
+ "outputs": [],
187
+ "source": [
188
+ "# STEP 7: Show where the trained model is and how to download it\n",
189
+ "# The model is saved at OUTPUT_DIR/merged/\n",
190
+ "# You can download it from the Kaggle outputs\n",
191
+ "\n",
192
+ "print(\"Training complete!\")\n",
193
+ "print(f\"Model saved at: {merged_dir}\")\n",
194
+ "print(\"\\nTo download:\")\n",
195
+ "print(\"1. Click 'Output' tab in Kaggle\")\n",
196
+ "print(\"2. Download the files from training_output/merged/\")"
197
+ ]
198
+ }
199
+ ],
200
+ "metadata": {
201
+ "kaggle": {
202
+ "accelerator": "gpu",
203
+ "dataSources": [],
204
+ "kernelSpec": {
205
+ "displayName": "Python 3",
206
+ "language": "python",
207
+ "name": "python3"
208
+ }
209
+ }
210
+ },
211
+ "nbformat": 4,
212
+ "nbformat_minor": 0
213
+ }