walidsobhie-code committed on
Commit
058de92
·
1 Parent(s): 8f0e2c5

fix: update Kaggle notebook with proper data handling and training configuration

Browse files

- Fix data path issue: now correctly finds and uses training data
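- Use repo's training-data/final/train.jsonl as the source dataset when it is available
- Create a mini dataset (1K samples) from the full data and train on it; otherwise fall back to an existing mini dataset, and fail with a clear error if neither exists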
- Removed broken model download step (the training config now references the Hub model ID Qwen/Qwen2.5-Coder-7B directly)
- Proper config YAML generation with correct train_file path
- Better error handling and progress messages
- Simplified training and merge steps with proper imports

Files changed (1) hide show
  1. kaggle_train_stack29.ipynb +147 -77
kaggle_train_stack29.ipynb CHANGED
@@ -4,7 +4,7 @@
4
  "cell_type": "markdown",
5
  "metadata": {},
6
  "source": [
7
- "# 🚀 Stack 2.9 - Kaggle Training Notebook\n",
8
  "\n",
9
  "**Free GPU training on Kaggle**\n",
10
  "\n",
@@ -40,7 +40,31 @@
40
  "execution_count": null,
41
  "metadata": {},
42
  "outputs": [],
43
- "source": "# STEP 2: Clone repo and setup paths\nimport os\nimport shutil\nimport subprocess\n\n# Change to a valid directory first (in case we're in a deleted folder)\nos.chdir(\"/kaggle/working\")\n\nREPO_DIR = \"/kaggle/working/stack-2.9\"\nMODEL_DIR = os.path.join(REPO_DIR, \"base_model_qwen7b\")\nOUTPUT_DIR = os.path.join(REPO_DIR, \"training_output\")\n\n# Remove old repo if exists (force fresh clone)\nif os.path.exists(REPO_DIR):\n shutil.rmtree(REPO_DIR)\n\n# Clone fresh (now includes the input_path fix)\nsubprocess.run([\"git\", \"clone\", \"https://github.com/my-ai-stack/stack-2.9.git\", REPO_DIR], check=True)\nos.chdir(REPO_DIR)\n\nprint(f\"✅ Working in: {os.getcwd()}\")\nprint(f\" MODEL_DIR: {MODEL_DIR}\")\nprint(f\" OUTPUT_DIR: {OUTPUT_DIR}\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  },
45
  {
46
  "cell_type": "code",
@@ -62,26 +86,35 @@
62
  "metadata": {},
63
  "outputs": [],
64
  "source": [
65
- "# STEP 4: Download model (if not exists)\n",
66
- "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
67
  "import os\n",
68
  "\n",
69
- "if os.path.exists(os.path.join(MODEL_DIR, \"config.json\")):\n",
70
- " print(\"✅ Model already exists, skipping download!\")\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  "else:\n",
72
- " print(\"⬇️ Downloading model (Qwen2.5-Coder-7B)...\")\n",
73
- " print(\"This takes ~10-15 minutes...\")\n",
74
- " \n",
75
- " tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-Coder-7B\", trust_remote_code=True)\n",
76
- " tokenizer.save_pretrained(MODEL_DIR)\n",
77
- " \n",
78
- " model = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen2.5-Coder-7B\", trust_remote_code=True)\n",
79
- " model.save_pretrained(MODEL_DIR)\n",
80
- " \n",
81
- " print(\"✅ Model downloaded!\")\n",
82
- "\n",
83
- "print(\"\\nModel files:\")\n",
84
- "os.listdir(MODEL_DIR)"
85
  ]
86
  },
87
  {
@@ -90,43 +123,72 @@
90
  "metadata": {},
91
  "outputs": [],
92
  "source": [
93
- "# STEP 5: Create config with train_dir and eval_dir\n",
94
  "import yaml\n",
95
  "import os\n",
96
  "\n",
97
  "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
98
  "\n",
99
  "config = {\n",
100
- " 'model': {'name': MODEL_DIR, 'trust_remote_code': True, 'torch_dtype': 'float16'},\n",
 
 
 
 
101
  " 'data': {\n",
102
- " 'input_path': os.path.join(REPO_DIR, 'data/final/train.jsonl'),\n",
103
- " 'train_dir': None,\n",
104
- " 'eval_dir': None,\n",
105
  " 'max_length': 2048,\n",
106
- " 'train_split': 0.9,\n",
107
- " 'test_split': 0.1\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  " },\n",
109
- " 'lora': {'r': 16, 'alpha': 32, 'dropout': 0.05,\n",
110
- " 'target_modules': ['q_proj', 'k_proj', 'v_proj', 'o_proj'],\n",
111
- " 'bias': 'none', 'task_type': 'CAUSAL_LM'},\n",
112
- " 'training': {'num_epochs': 1, 'batch_size': 2, 'gradient_accumulation': 4,\n",
113
- " 'learning_rate': 2e-4, 'warmup_steps': 50, 'weight_decay': 0.01,\n",
114
- " 'max_grad_norm': 1.0, 'logging_steps': 5, 'eval_steps': 100,\n",
115
- " 'save_steps': 200, 'save_total_limit': 2, 'fp16': True, 'bf16': False,\n",
116
- " 'gradient_checkpointing': True},\n",
117
- " 'output': {'lora_dir': os.path.join(OUTPUT_DIR, 'lora'),\n",
118
- " 'merged_dir': os.path.join(OUTPUT_DIR, 'merged')},\n",
119
- " 'quantization': {'enabled': False},\n",
120
- " 'hardware': {'device': 'cuda', 'num_gpus': 1, 'use_4bit': False, 'use_8bit': False}\n",
121
  "}\n",
122
  "\n",
123
  "config_path = os.path.join(OUTPUT_DIR, \"train_config.yaml\")\n",
124
  "with open(config_path, 'w') as f:\n",
125
- " yaml.dump(config, f)\n",
126
  "\n",
127
  "print(f\"✅ Config saved to: {config_path}\")\n",
128
- "print(f\" Device: {config['hardware']['device']}\")\n",
129
- "print(f\" Data: {config['data']['input_path']}\")"
 
 
 
 
130
  ]
131
  },
132
  {
@@ -137,18 +199,28 @@
137
  "source": [
138
  "# STEP 6: Train LoRA\n",
139
  "import sys\n",
140
- "sys.path.insert(0, os.path.join(REPO_DIR, \"stack/training\"))\n",
141
  "\n",
142
  "print(\"=\"*60)\n",
143
  "print(\"STARTING TRAINING\")\n",
144
  "print(\"=\"*60)\n",
 
 
 
145
  "\n",
146
- "from train_lora import train_lora\n",
147
- "trainer = train_lora(config_path)\n",
148
  "\n",
149
- "print(\"=\"*60)\n",
150
- "print(\"TRAINING COMPLETED!\")\n",
151
- "print(\"=\"*60)"
 
 
 
 
 
 
 
152
  ]
153
  },
154
  {
@@ -157,43 +229,41 @@
157
  "metadata": {},
158
  "outputs": [],
159
  "source": [
160
- "# STEP 7: Merge model\n",
161
  "import sys\n",
162
- "sys.path.insert(0, os.path.join(REPO_DIR, \"stack/training\"))\n",
163
- "from merge_adapter import merge_adapter\n",
164
  "\n",
165
- "merged_dir = os.path.join(OUTPUT_DIR, \"merged\")\n",
 
166
  "os.makedirs(merged_dir, exist_ok=True)\n",
167
  "\n",
168
- "merge_config = {\n",
169
- " 'model': {'name': MODEL_DIR, 'trust_remote_code': True},\n",
170
- " 'output': {'lora_dir': os.path.join(OUTPUT_DIR, 'lora'), 'merged_dir': merged_dir},\n",
171
- " 'quantization': {'enabled': False}\n",
172
- "}\n",
173
- "\n",
174
- "merge_cfg_path = os.path.join(OUTPUT_DIR, \"merge_config.yaml\")\n",
175
- "with open(merge_cfg_path, 'w') as f:\n",
176
- " yaml.dump(merge_config, f)\n",
177
  "\n",
178
- "merge_adapter(merge_cfg_path, os.path.join(OUTPUT_DIR, \"lora\"), merged_dir)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  "\n",
180
- "print(f\"✅ Merged model saved to: {merged_dir}\")\n",
181
- "print(\"Files:\", os.listdir(merged_dir))"
182
- ]
183
- },
184
- {
185
- "cell_type": "code",
186
- "execution_count": null,
187
- "metadata": {},
188
- "outputs": [],
189
- "source": [
190
- "# STEP 8: Done!\n",
191
  "print(\"=\"*60)\n",
192
- "print(\"🎉 TRAINING COMPLETE!\")\n",
193
  "print(\"=\"*60)\n",
194
- "print(f\"LoRA adapter: {os.path.join(OUTPUT_DIR, 'lora')}\")\n",
195
- "print(f\"Merged model: {os.path.join(OUTPUT_DIR, 'merged')}\")\n",
196
- "print(\"\\n📥 Download from: Kaggle → Output tab\")"
197
  ]
198
  }
199
  ],
@@ -210,4 +280,4 @@
210
  },
211
  "nbformat": 4,
212
  "nbformat_minor": 0
213
- }
 
4
  "cell_type": "markdown",
5
  "metadata": {},
6
  "source": [
7
+ "# 🚀 Stack 2.9 - Kaggle Training\n",
8
  "\n",
9
  "**Free GPU training on Kaggle**\n",
10
  "\n",
 
40
  "execution_count": null,
41
  "metadata": {},
42
  "outputs": [],
43
+ "source": [
44
+ "# STEP 2: Clone repo and setup paths\n",
45
+ "import os\n",
46
+ "import shutil\n",
47
+ "import subprocess\n",
48
+ "\n",
49
+ "# Change to a valid directory first (in case we're in a deleted folder)\n",
50
+ "os.chdir(\"/kaggle/working\")\n",
51
+ "\n",
52
+ "REPO_DIR = \"/kaggle/working/stack-2.9\"\n",
53
+ "MODEL_DIR = os.path.join(REPO_DIR, \"base_model_qwen7b\")\n",
54
+ "OUTPUT_DIR = os.path.join(REPO_DIR, \"training_output\")\n",
55
+ "\n",
56
+ "# Remove old repo if exists (force fresh clone)\n",
57
+ "if os.path.exists(REPO_DIR):\n",
58
+ " shutil.rmtree(REPO_DIR)\n",
59
+ "\n",
60
+ "# Clone fresh (now includes the input_path fix)\n",
61
+ "subprocess.run([\"git\", \"clone\", \"https://github.com/my-ai-stack/stack-2.9.git\", REPO_DIR], check=True)\n",
62
+ "os.chdir(REPO_DIR)\n",
63
+ "\n",
64
+ "print(f\"✅ Working in: {os.getcwd()}\")\n",
65
+ "print(f\" MODEL_DIR: {MODEL_DIR}\")\n",
66
+ "print(f\" OUTPUT_DIR: {OUTPUT_DIR}\")"
67
+ ]
68
  },
69
  {
70
  "cell_type": "code",
 
86
  "metadata": {},
87
  "outputs": [],
88
  "source": [
89
+ "# STEP 4: Prepare training data\n",
 
90
  "import os\n",
91
  "\n",
92
+ "# Check what training data is available\n",
93
+ "REPO_TRAIN_DATA = os.path.join(REPO_DIR, \"training-data/final/train.jsonl\")\n",
94
+ "MINI_DATA_DIR = os.path.join(REPO_DIR, \"data_mini\")\n",
95
+ "MINI_DATA_FILE = os.path.join(MINI_DATA_DIR, \"train_mini.jsonl\")\n",
96
+ "\n",
97
+ "print(\"🔍 Checking for training data...\")\n",
98
+ "if os.path.exists(REPO_TRAIN_DATA):\n",
99
+ " print(f\" Found full dataset: {REPO_TRAIN_DATA}\")\n",
100
+ " # Create mini subset (1K samples) for faster training\n",
101
+ " os.makedirs(MINI_DATA_DIR, exist_ok=True)\n",
102
+ " if not os.path.exists(MINI_DATA_FILE):\n",
103
+ " print(\" Creating mini dataset (1000 samples)...\")\n",
104
+ " import subprocess\n",
105
+ " subprocess.run([\"python\", os.path.join(REPO_DIR, \"scripts/create_mini_dataset.py\"),\n",
106
+ " \"--size\", \"1000\", \"--output\", MINI_DATA_FILE, \"--source\", REPO_TRAIN_DATA], check=True)\n",
107
+ " DATA_FILE = MINI_DATA_FILE\n",
108
  "else:\n",
109
+ " print(\" Full dataset not found, checking for existing mini dataset...\")\n",
110
+ " if os.path.exists(MINI_DATA_FILE):\n",
111
+ " DATA_FILE = MINI_DATA_FILE\n",
112
+ " print(f\" Using existing mini dataset: {MINI_DATA_FILE}\")\n",
113
+ " else:\n",
114
+ " raise FileNotFoundError(\"No training data found! Please ensure training-data/final/train.jsonl exists in the repo.\")\n",
115
+ "\n",
116
+ "print(f\"\\n✅ Using training data: {DATA_FILE}\")\n",
117
+ "print(f\" Size: {os.path.getsize(DATA_FILE) / 1024:.1f} KB\")"
 
 
 
 
118
  ]
119
  },
120
  {
 
123
  "metadata": {},
124
  "outputs": [],
125
  "source": [
126
+ "# STEP 5: Prepare config for training\n",
127
  "import yaml\n",
128
  "import os\n",
129
  "\n",
130
  "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
131
  "\n",
132
  "config = {\n",
133
+ " 'model': {\n",
134
+ " 'name': 'Qwen/Qwen2.5-Coder-7B',\n",
135
+ " 'trust_remote_code': True,\n",
136
+ " 'torch_dtype': 'float16'\n",
137
+ " },\n",
138
  " 'data': {\n",
139
+ " 'train_file': DATA_FILE, # USE THE ACTUAL DATA FILE PATH\n",
 
 
140
  " 'max_length': 2048,\n",
141
+ " 'train_split': 1.0 # Use all data for training\n",
142
+ " },\n",
143
+ " 'lora': {\n",
144
+ " 'r': 16,\n",
145
+ " 'alpha': 32,\n",
146
+ " 'dropout': 0.05,\n",
147
+ " 'target_modules': ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],\n",
148
+ " 'bias': 'none',\n",
149
+ " 'task_type': 'CAUSAL_LM'\n",
150
+ " },\n",
151
+ " 'training': {\n",
152
+ " 'num_epochs': 1,\n",
153
+ " 'batch_size': 2,\n",
154
+ " 'gradient_accumulation': 4,\n",
155
+ " 'learning_rate': 2e-4,\n",
156
+ " 'warmup_steps': 50,\n",
157
+ " 'weight_decay': 0.01,\n",
158
+ " 'max_grad_norm': 1.0,\n",
159
+ " 'logging_steps': 10,\n",
160
+ " 'save_steps': 100,\n",
161
+ " 'save_total_limit': 2,\n",
162
+ " 'fp16': True,\n",
163
+ " 'bf16': False,\n",
164
+ " 'gradient_checkpointing': True\n",
165
+ " },\n",
166
+ " 'output': {\n",
167
+ " 'lora_dir': os.path.join(OUTPUT_DIR, 'lora'),\n",
168
+ " 'logging_dir': os.path.join(OUTPUT_DIR, 'logs')\n",
169
  " },\n",
170
+ " 'quantization': {\n",
171
+ " 'enabled': False\n",
172
+ " },\n",
173
+ " 'hardware': {\n",
174
+ " 'device': 'cuda',\n",
175
+ " 'num_gpus': 1,\n",
176
+ " 'use_4bit': False,\n",
177
+ " 'use_8bit': False\n",
178
+ " }\n",
 
 
 
179
  "}\n",
180
  "\n",
181
  "config_path = os.path.join(OUTPUT_DIR, \"train_config.yaml\")\n",
182
  "with open(config_path, 'w') as f:\n",
183
+ " yaml.dump(config, f, default_flow_style=False)\n",
184
  "\n",
185
  "print(f\"✅ Config saved to: {config_path}\")\n",
186
+ "print(\"\\nConfig summary:\")\n",
187
+ "print(f\" Model: {config['model']['name']}\")\n",
188
+ "print(f\" Data: {config['data']['train_file']}\")\n",
189
+ "print(f\" LoRA rank: {config['lora']['r']}\")\n",
190
+ "print(f\" Batch size: {config['training']['batch_size']}\")\n",
191
+ "print(f\" Epochs: {config['training']['num_epochs']}\")"
192
  ]
193
  },
194
  {
 
199
  "source": [
200
  "# STEP 6: Train LoRA\n",
201
  "import sys\n",
202
+ "sys.path.insert(0, os.path.join(REPO_DIR, \"stack_2_9_training\"))\n",
203
  "\n",
204
  "print(\"=\"*60)\n",
205
  "print(\"STARTING TRAINING\")\n",
206
  "print(\"=\"*60)\n",
207
+ "print(f\"Config: {config_path}\")\n",
208
+ "print(f\"Checkpoint dir: {config['output']['lora_dir']}\")\n",
209
+ "print(\"=\"*60 + \"\\n\")\n",
210
  "\n",
211
+ "# Import and run training\n",
212
+ "from stack_2_9_training.train_lora import train_lora\n",
213
  "\n",
214
+ "try:\n",
215
+ " trainer = train_lora(config_path)\n",
216
+ " print(\"\\n\" + \"=\"*60)\n",
217
+ " print(\"TRAINING COMPLETED SUCCESSFULLY\")\n",
218
+ " print(\"=\"*60)\n",
219
+ "except Exception as e:\n",
220
+ " print(f\"\\n❌ Training failed: {e}\")\n",
221
+ " import traceback\n",
222
+ " traceback.print_exc()\n",
223
+ " raise"
224
  ]
225
  },
226
  {
 
229
  "metadata": {},
230
  "outputs": [],
231
  "source": [
232
+ "# STEP 7: Merge LoRA adapter with base model\n",
233
  "import sys\n",
234
+ "sys.path.insert(0, os.path.join(REPO_DIR, \"stack_2_9_training\"))\n",
235
+ "from stack_2_9_training.merge_adapter import merge_adapter\n",
236
  "\n",
237
+ "lora_dir = config['output']['lora_dir']\n",
238
+ "merged_dir = os.path.join(OUTPUT_DIR, 'merged')\n",
239
  "os.makedirs(merged_dir, exist_ok=True)\n",
240
  "\n",
241
+ "print(\"=\"*60)\n",
242
+ "print(\"MERGING LORA ADAPTER\")\n",
243
+ "print(\"=\"*60)\n",
244
+ "print(f\"LoRA adapter: {lora_dir}\")\n",
245
+ "print(f\"Output: {merged_dir}\")\n",
 
 
 
 
246
  "\n",
247
+ "try:\n",
248
+ " merge_adapter(\n",
249
+ " base_model_name_or_path=config['model']['name'],\n",
250
+ " adapter_path=lora_dir,\n",
251
+ " output_path=merged_dir,\n",
252
+ " use_safetensors=True\n",
253
+ " )\n",
254
+ " print(\"\\n✅ Merge completed!\")\n",
255
+ " print(f\"Merged model files: {os.listdir(merged_dir)}\")\n",
256
+ "except Exception as e:\n",
257
+ " print(f\"\\n❌ Merge failed: {e}\")\n",
258
+ " import traceback\n",
259
+ " traceback.print_exc()\n",
260
+ " raise\n",
261
  "\n",
 
 
 
 
 
 
 
 
 
 
 
262
  "print(\"=\"*60)\n",
263
+ "print(\"🎉 ALL DONE!\")\n",
264
  "print(\"=\"*60)\n",
265
+ "print(f\"\\n📦 Merged model ready at: {merged_dir}\")\n",
266
+ "print(\"\\n⏳ Download the 'merged' folder from Kaggle's Output panel before the session ends!\")"
 
267
  ]
268
  }
269
  ],
 
280
  },
281
  "nbformat": 4,
282
  "nbformat_minor": 0
283
+ }