walidsobhie-code committed on
Commit
d863fcd
·
1 Parent(s): 51896e7

feat: standalone training and merge scripts for Kaggle

Browse files

- Added train_simple.py: self-contained training without package install
- Added merge_simple.py: simple LoRA merge utility
- Updated Kaggle notebook (kaggle_train_stack29_final.ipynb) to use standalone scripts
- Removes dependency on pip install -e . (broken pyproject.toml)
- Synthetic data fallback ensures training works without large datasets
- Should finally work on a fresh Kaggle GPU session

Files changed (3) hide show
  1. kaggle_train_stack29_final.ipynb +198 -0
  2. merge_simple.py +64 -0
  3. train_simple.py +197 -0
kaggle_train_stack29_final.ipynb ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 🚀 Stack 2.9 - Kaggle Training\n",
8
+ "\n",
9
+ "Free GPU training on Kaggle using Qwen2.5-Coder-7B.\n",
10
+ "\n",
11
+ "⏱️ **Runtime:** 2-4 hours | 💾 **VRAM:** ~16GB\n",
12
+ "\n",
13
+ "**Setup:**\n",
14
+ "1. Settings → Accelerator → GPU **T4**\n",
15
+ "2. Run all cells in order\n",
16
+ "3. Download merged model from Output tab when done"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "# Check GPU\n",
26
+ "!nvidia-smi"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "# Clone repository\n",
36
+ "import os, shutil, subprocess\n",
37
+ "\n",
38
+ "os.chdir('/kaggle/working')\n",
39
+ "REPO_DIR = '/kaggle/working/stack-2.9'\n",
40
+ "OUTPUT_DIR = os.path.join(REPO_DIR, 'training_output')\n",
41
+ "\n",
42
+ "if os.path.exists(REPO_DIR):\n",
43
+ " shutil.rmtree(REPO_DIR)\n",
44
+ "subprocess.run(['git', 'clone', 'https://github.com/my-ai-stack/stack-2.9.git', REPO_DIR], check=True)\n",
45
+ "os.chdir(REPO_DIR)\n",
46
+ "print('✅ Repo ready:', REPO_DIR)"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "# Install dependencies (PyTorch first, then the training libraries)\n",
56
+ "!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
57
+ "!pip install -q transformers==4.40.0 peft==0.10.0 accelerate==0.34.0 datasets==3.0.0 pyyaml tqdm scipy bitsandbytes==0.43.0\n",
58
+ "print('✅ Dependencies ready')"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "metadata": {},
65
+ "outputs": [],
66
+ "source": [
67
+ "# Prepare training data (auto-detect or synthetic fallback)\n",
68
+ "import os, json\n",
69
+ "\n",
70
+ "REPO_TRAIN_DATA = os.path.join(REPO_DIR, 'training-data/final/train.jsonl')\n",
71
+ "MINI_DATA_DIR = os.path.join(REPO_DIR, 'data_mini')\n",
72
+ "MINI_DATA_FILE = os.path.join(MINI_DATA_DIR, 'train_mini.jsonl')\n",
73
+ "SYNTHETIC_FILE = os.path.join(REPO_DIR, 'data/synthetic.jsonl')\n",
74
+ "\n",
75
+ "print('🔍 Data check')\n",
76
+ "\n",
77
+ "if os.path.exists(REPO_TRAIN_DATA):\n",
78
+ " os.makedirs(MINI_DATA_DIR, exist_ok=True)\n",
79
+ " if not os.path.exists(MINI_DATA_FILE):\n",
80
+ " print(' Building mini dataset (1K samples) from full data...')\n",
81
+ " !python scripts/create_mini_dataset.py --size 1000 --output {MINI_DATA_FILE} --source {REPO_TRAIN_DATA}\n",
82
+ " DATA_FILE = MINI_DATA_FILE\n",
83
+ " print(' Using mini dataset')\n",
84
+ "elif os.path.exists(MINI_DATA_FILE):\n",
85
+ " DATA_FILE = MINI_DATA_FILE\n",
86
+ " print(' Using existing mini dataset')\n",
87
+ "else:\n",
88
+ " print(' Creating synthetic data (last resort)')\n",
89
+ " examples = [\n",
90
+ " {'instruction': 'Write a Python function to reverse a string', 'output': 'def reverse_string(s):\\n return s[::-1]'},\n",
91
+ " {'instruction': 'Write a function to check if a number is prime', 'output': 'def is_prime(n):\\n if n <= 1:\\n return False\\n for i in range(2, int(n**0.5) + 1):\\n if n % i == 0:\\n return False\\n return True'},\n",
92
+ " {'instruction': 'Write a binary search function', 'output': 'def binary_search(arr, target):\\n left, right = 0, len(arr) - 1\\n while left <= right:\\n mid = (left + right) // 2\\n if arr[mid] == target:\\n return mid\\n elif arr[mid] < target:\\n left = mid + 1\\n else:\\n right = mid - 1\\n return -1'},\n",
93
+ " ]\n",
94
+ " samples = examples * 333\n",
95
+ " os.makedirs(os.path.dirname(SYNTHETIC_FILE), exist_ok=True)\n",
96
+ " with open(SYNTHETIC_FILE, 'w') as f:\n",
97
+ " for s in samples:\n",
98
+ " f.write(json.dumps(s) + '\\n')\n",
99
+ " DATA_FILE = SYNTHETIC_FILE\n",
100
+ " print(f' Synthetic dataset: {len(samples)} examples')\n",
101
+ "\n",
102
+ "print(f'\\n✅ Data: {DATA_FILE}')\n",
103
+ "!ls -lh {DATA_FILE}"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "# Generate training configuration\n",
113
+ "import yaml\n",
114
+ "\n",
115
+ "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
116
+ "\n",
117
+ "config = {\n",
118
+ " 'model': {'name': 'Qwen/Qwen2.5-Coder-7B', 'trust_remote_code': True, 'torch_dtype': 'float16'},\n",
119
+ " 'data': {'input_path': DATA_FILE, 'max_length': 2048, 'train_split': 1.0},\n",
120
+ " 'lora': {'r': 16, 'alpha': 32, 'dropout': 0.05, 'target_modules': ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'], 'bias': 'none', 'task_type': 'CAUSAL_LM'},\n",
121
+ " 'training': {'num_epochs': 1, 'batch_size': 2, 'gradient_accumulation': 4, 'learning_rate': 2e-4, 'warmup_steps': 50, 'weight_decay': 0.01, 'max_grad_norm': 1.0, 'logging_steps': 10, 'save_steps': 100, 'save_total_limit': 2, 'fp16': True, 'bf16': False, 'gradient_checkpointing': True},\n",
122
+ " 'output': {'lora_dir': os.path.join(OUTPUT_DIR, 'lora'), 'logging_dir': os.path.join(OUTPUT_DIR, 'logs')},\n",
123
+ " 'quantization': {'enabled': False},\n",
124
+ " 'hardware': {'device': 'cuda', 'num_gpus': 1, 'use_4bit': False, 'use_8bit': False}\n",
125
+ "}\n",
126
+ "\n",
127
+ "config_path = os.path.join(OUTPUT_DIR, 'train_config.yaml')\n",
128
+ "with open(config_path, 'w') as f:\n",
129
+ " yaml.dump(config, f, default_flow_style=False)\n",
130
+ "\n",
131
+ "print(f'✅ Config: {config_path}')\n",
132
+ "print(f\" Model: {config['model']['name']}\")\n",
133
+ "print(f\" Data: {config['data']['input_path']}\")"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": null,
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "# Train (using standalone train_simple.py - no package install needed)\n",
143
+ "print('='*60)\n",
144
+ "print('STARTING TRAINING')\n",
145
+ "print('='*60)\n",
146
+ "\n",
147
+ "!cd {REPO_DIR} && python train_simple.py --config {config_path}\n",
148
+ "\n",
149
+ "print('\\n✅ Training step finished')"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": null,
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": [
158
+ "# Merge LoRA adapter into final model\n",
159
+ "lora_dir = os.path.join(OUTPUT_DIR, 'lora')\n",
160
+ "merged_dir = os.path.join(OUTPUT_DIR, 'merged')\n",
161
+ "\n",
162
+ "print('='*60)\n",
163
+ "print('MERGING LORA ADAPTER')\n",
164
+ "print('='*60)\n",
165
+ "\n",
166
+ "!cd {REPO_DIR} && python merge_simple.py \\\n",
167
+ " --base-model {config['model']['name']} \\\n",
168
+ " --adapter-path {lora_dir} \\\n",
169
+ " --output-path {merged_dir} \\\n",
170
+ " --use-safetensors\n",
171
+ "\n",
172
+ "print('\\n✅ Merge complete!')\n",
173
+ "print(f'Merged model: {merged_dir}')\n",
174
+ "!ls -lh {merged_dir}"
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "markdown",
179
+ "metadata": {},
180
+ "source": [
181
+ "## 📥 Download Model\n",
182
+ "\n",
183
+ "1. Open **Output** tab on the right\n",
184
+ "2. Find `training_output/merged/`\n",
185
+ "3. Select all files and **Download**\n",
186
+ "\n",
187
+ "⚠️ **Do this before Kaggle session ends!**"
188
+ ]
189
+ }
190
+ ],
191
+ "metadata": {
192
+ "kaggle": {
193
+ "accelerator": "gpu"
194
+ }
195
+ },
196
+ "nbformat": 4,
197
+ "nbformat_minor": 0
198
+ }
merge_simple.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simple LoRA merge script.
4
+ Usage: python merge_simple.py --base-model Qwen/Qwen2.5-Coder-7B --adapter-path adapters/lora --output-path merged_model
5
+ """
6
+
7
+ import argparse
8
+ import os
9
+ from pathlib import Path
10
+
11
+ import torch
12
+ from peft import PeftModel
13
+ from transformers import AutoModelForCausalLM, AutoTokenizer
14
+
15
+
16
def main():
    """Merge a LoRA adapter into its base model and save the merged model.

    Command-line flags:
        --base-model       Base model name or path (required).
        --adapter-path     LoRA adapter directory (required).
        --output-path      Output directory for the merged model (required).
        --use-safetensors  Save the merged weights in safetensors format.

    The tokenizer of the base model is saved next to the merged weights, and
    the names of the files found in the output directory are printed at the
    end.
    """
    rule = "=" * 60

    cli = argparse.ArgumentParser()
    required_flags = (
        ("--base-model", "Base model name or path"),
        ("--adapter-path", "LoRA adapter directory"),
        ("--output-path", "Output directory for merged model"),
    )
    for flag, description in required_flags:
        cli.add_argument(flag, type=str, required=True, help=description)
    cli.add_argument("--use-safetensors", action="store_true", help="Use safetensors format")
    args = cli.parse_args()

    print(rule)
    print("Merging LoRA Adapter")
    print(rule)
    print(f"Base model: {args.base_model}")
    print(f"Adapter: {args.adapter_path}")
    print(f"Output: {args.output_path}")

    # Base weights are loaded in half precision and placed automatically.
    print("Loading base model...")
    load_kwargs = {
        "torch_dtype": torch.float16,
        "device_map": "auto",
        "trust_remote_code": True,
    }
    model = AutoModelForCausalLM.from_pretrained(args.base_model, **load_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(args.base_model, trust_remote_code=True)

    # Attach the adapter, then fold its weights into the base model.
    print("Loading LoRA adapter...")
    model = PeftModel.from_pretrained(model, args.adapter_path)

    print("Merging weights...")
    model = model.merge_and_unload()

    # Write the merged model and tokenizer side by side.
    destination = args.output_path
    os.makedirs(destination, exist_ok=True)
    print(f"Saving to {destination}...")
    model.save_pretrained(destination, safe_serialization=args.use_safetensors)
    tokenizer.save_pretrained(destination)

    print(rule)
    print("✅ Merge complete!")
    print(rule)
    saved = list(Path(destination).glob("*"))
    print(f"Files saved ({len(saved)}):")
    for entry in saved:
        print(f" {entry.name}")


if __name__ == "__main__":
    main()
train_simple.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simple standalone training script for Stack 2.9.
4
+ No package installation required — just run: python train_simple.py --config train_config.yaml
5
+ """
6
+
7
+ import argparse
8
+ import os
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ import yaml
13
+ from datasets import load_dataset
14
+ from transformers import (
15
+ AutoModelForCausalLM,
16
+ AutoTokenizer,
17
+ BitsAndBytesConfig,
18
+ TrainingArguments,
19
+ Trainer,
20
+ DataCollatorForLanguageModeling
21
+ )
22
+ from peft import LoraConfig, get_peft_model, TaskType
23
+ import torch
24
+
25
+
26
def load_config(config_path: str) -> dict:
    """Read the YAML training configuration stored at ``config_path``.

    The file is decoded as UTF-8 regardless of the platform locale and parsed
    with ``yaml.safe_load``.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed configuration mapping.

    Raises:
        ValueError: If the file is empty or its top level is not a mapping.
            Failing here gives a clear message instead of an obscure
            ``TypeError`` when the caller later indexes the config.
    """
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    if not isinstance(config, dict):
        raise ValueError(
            f"Config file {config_path!r} must contain a YAML mapping, "
            f"got {type(config).__name__}"
        )
    return config
29
+
30
+
31
def load_model_and_tokenizer(model_name: str, trust_remote_code: bool = True, use_4bit: bool = False):
    """Load the base causal language model and its tokenizer.

    Args:
        model_name: Model name or local path passed to ``from_pretrained``.
        trust_remote_code: Forwarded to both ``from_pretrained`` calls.
        use_4bit: When true, load the model with a 4-bit NF4 bitsandbytes
            configuration (float16 compute, double quantization).

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    quantization_config = None
    if use_4bit:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code)
    # The data pipeline in this script pads every example to max_length, which
    # fails on tokenizers that ship without a pad token. Reuse EOS in that case.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        torch_dtype=torch.float16,
        trust_remote_code=trust_remote_code,
        device_map="auto",
    )
    return model, tokenizer
52
+
53
+
54
def _format_example(instruction, output) -> str:
    """Render one record as training text.

    Uses the instruction/response template when both fields are present,
    otherwise falls back to whichever field is non-empty (or "" if neither).
    """
    if instruction and output:
        return f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
    return output or instruction or ""


def load_data(data_path: str, tokenizer, max_length: int = 2048, train_split: float = 0.9, seed: int = 42):
    """Load a JSON/JSONL dataset, tokenize it, and split it into train/eval.

    Each record is expected to provide ``instruction`` and/or ``output``
    fields; a missing column is treated as empty strings for every row.

    Args:
        data_path: Path to the JSON/JSONL file.
        tokenizer: Tokenizer used to encode the formatted text.
        max_length: Every example is truncated/padded to this many tokens.
        train_split: Fraction of rows used for training. A float of 1.0 or
            more means "train on everything": no split is attempted and the
            returned eval dataset is empty.
        seed: Seed for the train/eval shuffle so the split is reproducible.

    Returns:
        A ``(train_dataset, eval_dataset)`` tuple; ``eval_dataset`` may be
        empty.
    """
    raw_dataset = load_dataset("json", data_files=data_path, split="train")
    columns = raw_dataset.column_names
    has_instruction = "instruction" in columns
    has_output = "output" in columns

    def tokenize_function(examples):
        # Size the stand-in for a missing column from the real batch. A
        # one-element default would make zip() truncate the batch to a single
        # row and silently drop the rest of the data.
        batch_size = len(examples[columns[0]]) if columns else 0
        blanks = [""] * batch_size
        instructions = examples["instruction"] if has_instruction else blanks
        outputs = examples["output"] if has_output else blanks

        texts = [_format_example(instr, out) for instr, out in zip(instructions, outputs)]

        tokenized = tokenizer(texts, truncation=True, max_length=max_length, padding="max_length")
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized

    tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, remove_columns=columns)

    # train_test_split() rejects float sizes outside the open interval (0, 1),
    # so a "use all data" setting of 1.0 has to bypass the split entirely.
    if isinstance(train_split, float) and train_split >= 1.0:
        return tokenized_dataset, tokenized_dataset.select(range(0))

    split = tokenized_dataset.train_test_split(train_size=train_split, seed=seed)
    return split["train"], split["test"]
78
+
79
+
80
def train(config: dict):
    """Fine-tune the configured base model with LoRA and save the adapter.

    Args:
        config: Parsed training configuration. The sections ``model``,
            ``data``, ``lora``, ``training``, ``output`` and ``hardware`` are
            read; see the individual lookups below for the keys used.

    Returns:
        The ``Trainer`` instance after training has finished and the adapter
        has been saved to ``config["output"]["lora_dir"]``.
    """
    model_config = config["model"]
    data_config = config["data"]
    lora_config = config["lora"]
    training_config = config["training"]
    output_config = config["output"]
    hardware_config = config["hardware"]

    use_fp16 = training_config.get("fp16", True)
    use_gradient_checkpointing = training_config.get("gradient_checkpointing", True)

    # Load model and tokenizer
    print(f"Loading model: {model_config['name']}")
    model, tokenizer = load_model_and_tokenizer(
        model_name=model_config["name"],
        trust_remote_code=model_config.get("trust_remote_code", True),
        use_4bit=hardware_config.get("use_4bit", False),
    )

    # Load data
    print(f"Loading dataset: {data_config['input_path']}")
    train_dataset, eval_dataset = load_data(
        data_path=data_config["input_path"],
        tokenizer=tokenizer,
        max_length=data_config.get("max_length", 2048),
        train_split=data_config.get("train_split", 0.9),
    )
    # Evaluation is only meaningful when there is at least one eval sample.
    has_eval = eval_dataset is not None and len(eval_dataset) > 0
    print(f" Train samples: {len(train_dataset)}")
    print(f" Eval samples: {len(eval_dataset) if has_eval else 0}")

    # With a frozen base model, gradient checkpointing needs the embedding
    # outputs to require grad, otherwise the checkpointed segments produce no
    # gradient for the adapter weights.
    if use_gradient_checkpointing:
        model.enable_input_require_grads()

    # Apply LoRA. The config file uses the short keys "alpha"/"dropout";
    # LoraConfig itself names these parameters lora_alpha/lora_dropout.
    peft_config = LoraConfig(
        r=lora_config["r"],
        lora_alpha=lora_config["alpha"],
        lora_dropout=lora_config["dropout"],
        target_modules=lora_config["target_modules"],
        bias=lora_config["bias"],
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, peft_config)

    # The base model is loaded in float16, so the adapter weights may inherit
    # that dtype. fp16 mixed-precision training cannot unscale float16
    # gradients, so keep the trainable parameters in float32.
    if use_fp16:
        for param in model.parameters():
            if param.requires_grad:
                param.data = param.data.float()

    model.print_trainable_parameters()

    # Training arguments
    output_dir = output_config["lora_dir"]
    os.makedirs(output_dir, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=training_config["num_epochs"],
        per_device_train_batch_size=training_config["batch_size"],
        gradient_accumulation_steps=training_config["gradient_accumulation"],
        learning_rate=training_config["learning_rate"],
        warmup_steps=training_config.get("warmup_steps", 100),
        weight_decay=training_config.get("weight_decay", 0.01),
        max_grad_norm=training_config.get("max_grad_norm", 1.0),
        logging_steps=training_config.get("logging_steps", 10),
        save_steps=training_config.get("save_steps", 100),
        save_total_limit=training_config.get("save_total_limit", 2),
        fp16=use_fp16,
        bf16=training_config.get("bf16", False),
        gradient_checkpointing=use_gradient_checkpointing,
        evaluation_strategy="steps" if has_eval else "no",
        eval_steps=training_config.get("eval_steps", 100) if has_eval else None,
        report_to="none",  # No WandB
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset if has_eval else None,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    print("=" * 60)
    print("Starting training...")
    print("=" * 60)
    trainer.train()
    print("Training completed!")

    # Save final adapter
    trainer.save_model(output_dir)
    print(f"✅ Adapter saved to {output_dir}")

    return trainer
166
+
167
+
168
def main():
    """Command-line entry point: parse ``--config``, load it, and run training.

    Prints a banner and a short summary of the loaded configuration. Any
    exception raised by training is reported with its traceback and the
    process exits with status 1.
    """
    rule = "=" * 60

    cli = argparse.ArgumentParser()
    cli.add_argument("--config", type=str, required=True, help="Path to YAML config")
    args = cli.parse_args()

    for line in (rule, "Stack 2.9 Simple Training", rule):
        print(line)

    config = load_config(args.config)
    model_name = config['model']['name']
    print(f"Config loaded: {args.config}")
    print(f"Model: {model_name}")
    data_path = config['data']['input_path']
    print(f"Data: {data_path}")

    try:
        train(config)
        print("\n" + rule)
        print("✅ TRAINING SUCCESS")
        print(rule)
    except Exception as e:
        # Top-level boundary: report the failure in full, then signal it to
        # the caller through the exit status.
        print("\n" + rule)
        print(f"❌ TRAINING FAILED: {e}")
        print(rule)
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()