walidsobhie-code Claude Opus 4.6 committed on
Commit
78417b9
·
1 Parent(s): f268fb6

fix: update Kaggle notebook with self-contained cells

Browse files

- Each cell defines its own paths (no NameError)
- Auto-downloads model only if not exists
- 9 steps: clone → install → model → config → train → merge → done

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. kaggle_train_stack29.ipynb +64 -49
kaggle_train_stack29.ipynb CHANGED
@@ -16,10 +16,9 @@
16
  "---\n",
17
  "\n",
18
  "**Instructions:**\n",
19
- "1. Kaggle → New Notebook\n",
20
- "2. Add this notebook's code OR clone from GitHub\n",
21
- "3. Enable GPU (Settings → Accelerator → GPU P100)\n",
22
- "4. Run cells in order\n",
23
  "\n",
24
  "---"
25
  ]
@@ -30,7 +29,7 @@
30
  "metadata": {},
31
  "outputs": [],
32
  "source": [
33
- "# Check GPU\n",
34
  "!nvidia-smi"
35
  ]
36
  },
@@ -40,13 +39,12 @@
40
  "metadata": {},
41
  "outputs": [],
42
  "source": [
43
- "# STEP 1: Clone the repo\n",
44
  "import os\n",
45
  "import shutil\n",
46
  "\n",
47
  "REPO_DIR = \"/kaggle/working/stack-2.9\"\n",
48
  "\n",
49
- "# Remove old if exists\n",
50
  "if os.path.exists(REPO_DIR):\n",
51
  " shutil.rmtree(REPO_DIR)\n",
52
  "\n",
@@ -62,7 +60,7 @@
62
  "metadata": {},
63
  "outputs": [],
64
  "source": [
65
- "# STEP 2: Install dependencies\n",
66
  "!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
67
  "!pip install -q transformers peft accelerate datasets pyyaml tqdm scipy bitsandbytes\n",
68
  "print(\"βœ… Dependencies installed\")"
@@ -74,22 +72,37 @@
74
  "metadata": {},
75
  "outputs": [],
76
  "source": [
77
- "# STEP 3: Download Base Model\n",
78
- "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
79
  "\n",
80
- "MODEL_NAME = \"Qwen/Qwen2.5-Coder-7B\"\n",
81
  "MODEL_DIR = os.path.join(REPO_DIR, \"base_model_qwen7b\")\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  "\n",
83
- "if not os.path.exists(os.path.join(MODEL_DIR, \"config.json\")):\n",
84
- " print(f\"Downloading {MODEL_NAME}...\")\n",
 
 
85
  " print(\"This takes ~10-15 minutes...\")\n",
86
- " tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
87
  " tokenizer.save_pretrained(MODEL_DIR)\n",
88
- " model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
89
  " model.save_pretrained(MODEL_DIR)\n",
90
  " print(\"βœ… Model downloaded!\")\n",
91
- "else:\n",
92
- " print(\"βœ… Model already exists\")\n",
93
  "\n",
94
  "!ls -lh {MODEL_DIR} | head -5"
95
  ]
@@ -100,30 +113,34 @@
100
  "metadata": {},
101
  "outputs": [],
102
  "source": [
103
- "# STEP 4: Setup paths and config\n",
104
  "import yaml\n",
105
- "\n",
106
- "config_path = os.path.join(REPO_DIR, \"stack/training/train_config_local.yaml\")\n",
107
- "\n",
108
- "with open(config_path, 'r') as f:\n",
109
- " config = yaml.safe_load(f)\n",
110
- "\n",
111
- "# Update for Kaggle GPU\n",
112
- "config['model']['name'] = MODEL_DIR\n",
113
- "config['hardware']['device'] = \"cuda\"\n",
114
- "config['hardware']['num_gpus'] = 1\n",
115
- "\n",
116
- "OUTPUT_DIR = os.path.join(REPO_DIR, \"training_output\")\n",
117
- "config['output']['lora_dir'] = os.path.join(OUTPUT_DIR, \"lora\")\n",
118
- "config['output']['merged_dir'] = os.path.join(OUTPUT_DIR, \"merged\")\n",
119
  "\n",
120
  "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
121
- "updated_config = os.path.join(OUTPUT_DIR, \"train_config.yaml\")\n",
122
  "\n",
123
- "with open(updated_config, 'w') as f:\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  " yaml.dump(config, f)\n",
125
  "\n",
126
- "print(f\"βœ… Config saved to: {updated_config}\")\n",
127
  "print(f\" Device: {config['hardware']['device']}\")"
128
  ]
129
  },
@@ -133,7 +150,7 @@
133
  "metadata": {},
134
  "outputs": [],
135
  "source": [
136
- "# STEP 5: Train LoRA\n",
137
  "import sys\n",
138
  "sys.path.insert(0, os.path.join(REPO_DIR, \"stack/training\"))\n",
139
  "\n",
@@ -142,10 +159,10 @@
142
  "print(\"=\"*60)\n",
143
  "\n",
144
  "from train_lora import train_lora\n",
145
- "trainer = train_lora(updated_config)\n",
146
  "\n",
147
  "print(\"=\"*60)\n",
148
- "print(\"TRAINING COMPLETED\")\n",
149
  "print(\"=\"*60)"
150
  ]
151
  },
@@ -155,7 +172,7 @@
155
  "metadata": {},
156
  "outputs": [],
157
  "source": [
158
- "# STEP 6: Merge and save\n",
159
  "import sys\n",
160
  "sys.path.insert(0, os.path.join(REPO_DIR, \"stack/training\"))\n",
161
  "from merge_adapter import merge_adapter\n",
@@ -175,7 +192,7 @@
175
  "\n",
176
  "merge_adapter(merge_cfg_path, os.path.join(OUTPUT_DIR, \"lora\"), merged_dir)\n",
177
  "\n",
178
- "print(f\"βœ… Model saved to: {merged_dir}\")\n",
179
  "!ls -lh {merged_dir}"
180
  ]
181
  },
@@ -185,15 +202,13 @@
185
  "metadata": {},
186
  "outputs": [],
187
  "source": [
188
- "# STEP 7: Download the trained model (for saving)\n",
189
- "# The model is saved at OUTPUT_DIR/merged/\n",
190
- "# You can download it from the Kaggle outputs\n",
191
- "\n",
192
- "print(\"Training complete!\")\n",
193
- "print(f\"Model saved at: {merged_dir}\")\n",
194
- "print(\"\\nTo download:\")\n",
195
- "print(\"1. Click 'Output' tab in Kaggle\")\n",
196
- "print(\"2. Download the files from training_output/merged/\")"
197
  ]
198
  }
199
  ],
 
16
  "---\n",
17
  "\n",
18
  "**Instructions:**\n",
19
+ "1. Enable GPU: Settings → Accelerator → GPU P100\n",
20
+ "2. Run cells in order from the top\n",
21
+ "3. Model auto-downloads if not present\n",
 
22
  "\n",
23
  "---"
24
  ]
 
29
  "metadata": {},
30
  "outputs": [],
31
  "source": [
32
+ "# STEP 1: Check GPU\n",
33
  "!nvidia-smi"
34
  ]
35
  },
 
39
  "metadata": {},
40
  "outputs": [],
41
  "source": [
42
+ "# STEP 2: Clone repo\n",
43
  "import os\n",
44
  "import shutil\n",
45
  "\n",
46
  "REPO_DIR = \"/kaggle/working/stack-2.9\"\n",
47
  "\n",
 
48
  "if os.path.exists(REPO_DIR):\n",
49
  " shutil.rmtree(REPO_DIR)\n",
50
  "\n",
 
60
  "metadata": {},
61
  "outputs": [],
62
  "source": [
63
+ "# STEP 3: Install dependencies\n",
64
  "!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
65
  "!pip install -q transformers peft accelerate datasets pyyaml tqdm scipy bitsandbytes\n",
66
  "print(\"βœ… Dependencies installed\")"
 
72
  "metadata": {},
73
  "outputs": [],
74
  "source": [
75
+ "# STEP 4: Setup paths (MODEL_DIR, OUTPUT_DIR)\n",
76
+ "import os\n",
77
  "\n",
78
+ "REPO_DIR = \"/kaggle/working/stack-2.9\"\n",
79
  "MODEL_DIR = os.path.join(REPO_DIR, \"base_model_qwen7b\")\n",
80
+ "OUTPUT_DIR = os.path.join(REPO_DIR, \"training_output\")\n",
81
+ "\n",
82
+ "print(f\"REPO_DIR: {REPO_DIR}\")\n",
83
+ "print(f\"MODEL_DIR: {MODEL_DIR}\")\n",
84
+ "print(f\"OUTPUT_DIR: {OUTPUT_DIR}\")"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "# STEP 5: Download model (if not exists)\n",
94
+ "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
95
  "\n",
96
+ "if os.path.exists(os.path.join(MODEL_DIR, \"config.json\")):\n",
97
+ " print(\"βœ… Model already exists, skipping download!\")\n",
98
+ "else:\n",
99
+ " print(\"⬇️ Downloading model (Qwen2.5-Coder-7B)...\")\n",
100
  " print(\"This takes ~10-15 minutes...\")\n",
101
+ " tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-Coder-7B\", trust_remote_code=True)\n",
102
  " tokenizer.save_pretrained(MODEL_DIR)\n",
103
+ " model = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen2.5-Coder-7B\", trust_remote_code=True)\n",
104
  " model.save_pretrained(MODEL_DIR)\n",
105
  " print(\"βœ… Model downloaded!\")\n",
 
 
106
  "\n",
107
  "!ls -lh {MODEL_DIR} | head -5"
108
  ]
 
113
  "metadata": {},
114
  "outputs": [],
115
  "source": [
116
+ "# STEP 6: Create config\n",
117
  "import yaml\n",
118
+ "import os\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  "\n",
120
  "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
 
121
  "\n",
122
+ "config = {\n",
123
+ " 'model': {'name': MODEL_DIR, 'trust_remote_code': True, 'torch_dtype': 'float16'},\n",
124
+ " 'data': {'input_path': './data/final/train.jsonl', 'max_length': 2048},\n",
125
+ " 'lora': {'r': 16, 'alpha': 32, 'dropout': 0.05,\n",
126
+ " 'target_modules': ['q_proj', 'k_proj', 'v_proj', 'o_proj'],\n",
127
+ " 'bias': 'none', 'task_type': 'CAUSAL_LM'},\n",
128
+ " 'training': {'num_epochs': 1, 'batch_size': 2, 'gradient_accumulation': 4,\n",
129
+ " 'learning_rate': 2e-4, 'warmup_steps': 50, 'weight_decay': 0.01,\n",
130
+ " 'max_grad_norm': 1.0, 'logging_steps': 5, 'eval_steps': 100,\n",
131
+ " 'save_steps': 200, 'save_total_limit': 2, 'fp16': True, 'bf16': False,\n",
132
+ " 'gradient_checkpointing': True},\n",
133
+ " 'output': {'lora_dir': os.path.join(OUTPUT_DIR, 'lora'),\n",
134
+ " 'merged_dir': os.path.join(OUTPUT_DIR, 'merged')},\n",
135
+ " 'quantization': {'enabled': False},\n",
136
+ " 'hardware': {'device': 'cuda', 'num_gpus': 1, 'use_4bit': False, 'use_8bit': False}\n",
137
+ "}\n",
138
+ "\n",
139
+ "config_path = os.path.join(OUTPUT_DIR, \"train_config.yaml\")\n",
140
+ "with open(config_path, 'w') as f:\n",
141
  " yaml.dump(config, f)\n",
142
  "\n",
143
+ "print(f\"βœ… Config saved to: {config_path}\")\n",
144
  "print(f\" Device: {config['hardware']['device']}\")"
145
  ]
146
  },
 
150
  "metadata": {},
151
  "outputs": [],
152
  "source": [
153
+ "# STEP 7: Train LoRA\n",
154
  "import sys\n",
155
  "sys.path.insert(0, os.path.join(REPO_DIR, \"stack/training\"))\n",
156
  "\n",
 
159
  "print(\"=\"*60)\n",
160
  "\n",
161
  "from train_lora import train_lora\n",
162
+ "trainer = train_lora(config_path)\n",
163
  "\n",
164
  "print(\"=\"*60)\n",
165
+ "print(\"TRAINING COMPLETED!\")\n",
166
  "print(\"=\"*60)"
167
  ]
168
  },
 
172
  "metadata": {},
173
  "outputs": [],
174
  "source": [
175
+ "# STEP 8: Merge model\n",
176
  "import sys\n",
177
  "sys.path.insert(0, os.path.join(REPO_DIR, \"stack/training\"))\n",
178
  "from merge_adapter import merge_adapter\n",
 
192
  "\n",
193
  "merge_adapter(merge_cfg_path, os.path.join(OUTPUT_DIR, \"lora\"), merged_dir)\n",
194
  "\n",
195
+ "print(f\"βœ… Merged model saved to: {merged_dir}\")\n",
196
  "!ls -lh {merged_dir}"
197
  ]
198
  },
 
202
  "metadata": {},
203
  "outputs": [],
204
  "source": [
205
+ "# STEP 9: Done!\n",
206
+ "print(\"=\"*60)\n",
207
+ "print(\"πŸŽ‰ TRAINING COMPLETE!\")\n",
208
+ "print(\"=\"*60)\n",
209
+ "print(f\"LoRA adapter: {os.path.join(OUTPUT_DIR, 'lora')}\")\n",
210
+ "print(f\"Merged model: {os.path.join(OUTPUT_DIR, 'merged')}\")\n",
211
+ "print(\"\\nπŸ“₯ Download from: Kaggle β†’ Output tab\")"
 
 
212
  ]
213
  }
214
  ],