walidsobhie-code committed on
Commit
bfe21f8
·
1 Parent(s): d863fcd

fix: remove bitsandbytes dependency to fix Kaggle CUDA compatibility

Browse files

- Created train_simple_nobnb.py: same as train_simple but without bitsandbytes
- Updated merge_simple.py: sets PEFT_DISABLE_LOFTQ=1 to avoid bitsandbytes import
- Updated Kaggle notebook (v5) to use no-bitsandbytes training
- Dependencies: removed bitsandbytes from pip install
- This fixes the CUDA setup failures on Kaggle's CUDA 12.8 environment

Training now uses pure float16 (no 4-bit quantization). Should work on T4.

kaggle_train_stack29_v5.ipynb ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 🚀 Stack 2.9 - Kaggle Training\n",
8
+ "\n",
9
+ "Free GPU training on Kaggle using Qwen2.5-Coder-7B.\n",
10
+ "\n",
11
+ "⏱️ **Runtime:** 2-4 hours | 💾 **VRAM:** ~16GB\n",
12
+ "\n",
13
+ "**Setup:**\n",
14
+ "1. Settings → Accelerator → GPU **T4**\n",
15
+ "2. Run all cells in order\n",
16
+ "3. Download merged model from Output tab when done"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "# Check GPU\n",
26
+ "!nvidia-smi"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "# Clone repository\n",
36
+ "import os, shutil, subprocess\n",
37
+ "\n",
38
+ "os.chdir('/kaggle/working')\n",
39
+ "REPO_DIR = '/kaggle/working/stack-2.9'\n",
40
+ "OUTPUT_DIR = os.path.join(REPO_DIR, 'training_output')\n",
41
+ "\n",
42
+ "if os.path.exists(REPO_DIR):\n",
43
+ " shutil.rmtree(REPO_DIR)\n",
44
+ "subprocess.run(['git', 'clone', 'https://github.com/my-ai-stack/stack-2.9.git', REPO_DIR], check=True)\n",
45
+ "os.chdir(REPO_DIR)\n",
46
+ "print('✅ Repo ready:', REPO_DIR)"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "# Install dependencies (no bitsandbytes - avoids CUDA compatibility issues)\n",
56
+ "!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
57
+ "!pip install -q transformers==4.40.0 peft==0.10.0 accelerate==0.34.0 datasets==3.0.0 pyyaml tqdm scipy\n",
58
+ "print('✅ Dependencies ready')"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "metadata": {},
65
+ "outputs": [],
66
+ "source": [
67
+ "# Prepare training data (auto-detect or synthetic fallback)\n",
68
+ "import os, json\n",
69
+ "\n",
70
+ "REPO_TRAIN_DATA = os.path.join(REPO_DIR, 'training-data/final/train.jsonl')\n",
71
+ "MINI_DATA_DIR = os.path.join(REPO_DIR, 'data_mini')\n",
72
+ "MINI_DATA_FILE = os.path.join(MINI_DATA_DIR, 'train_mini.jsonl')\n",
73
+ "SYNTHETIC_FILE = os.path.join(REPO_DIR, 'data/synthetic.jsonl')\n",
74
+ "\n",
75
+ "print('🔍 Data check')\n",
76
+ "\n",
77
+ "if os.path.exists(REPO_TRAIN_DATA):\n",
78
+ " os.makedirs(MINI_DATA_DIR, exist_ok=True)\n",
79
+ " if not os.path.exists(MINI_DATA_FILE):\n",
80
+ " print(' Building mini dataset (1K samples) from full data...')\n",
81
+ " !python scripts/create_mini_dataset.py --size 1000 --output {MINI_DATA_FILE} --source {REPO_TRAIN_DATA}\n",
82
+ " DATA_FILE = MINI_DATA_FILE\n",
83
+ " print(' Using mini dataset')\n",
84
+ "elif os.path.exists(MINI_DATA_FILE):\n",
85
+ " DATA_FILE = MINI_DATA_FILE\n",
86
+ " print(' Using existing mini dataset')\n",
87
+ "else:\n",
88
+ " print(' Creating synthetic data (last resort)')\n",
89
+ " examples = [\n",
90
+ " {'instruction': 'Write a Python function to reverse a string', 'output': 'def reverse_string(s):\\n return s[::-1]'},\n",
91
+ " {'instruction': 'Write a function to check if a number is prime', 'output': 'def is_prime(n):\\n if n <= 1:\\n return False\\n for i in range(2, int(n**0.5) + 1):\\n if n % i == 0:\\n return False\\n return True'},\n",
92
+ " {'instruction': 'Write a binary search function', 'output': 'def binary_search(arr, target):\\n left, right = 0, len(arr) - 1\\n while left <= right:\\n mid = (left + right) // 2\\n if arr[mid] == target:\\n return mid\\n elif arr[mid] < target:\\n left = mid + 1\\n else:\\n right = mid - 1\\n return -1'},\n",
93
+ " ]\n",
94
+ " samples = examples * 333\n",
95
+ " os.makedirs(os.path.dirname(SYNTHETIC_FILE), exist_ok=True)\n",
96
+ " with open(SYNTHETIC_FILE, 'w') as f:\n",
97
+ " for s in samples:\n",
98
+ " f.write(json.dumps(s) + '\\n')\n",
99
+ " DATA_FILE = SYNTHETIC_FILE\n",
100
+ " print(f' Synthetic dataset: {len(samples)} examples')\n",
101
+ "\n",
102
+ "print(f'\\n✅ Data: {DATA_FILE}')\n",
103
+ "!ls -lh {DATA_FILE}"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "# Generate training configuration\n",
113
+ "import yaml\n",
114
+ "\n",
115
+ "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
116
+ "\n",
117
+ "config = {\n",
118
+ " 'model': {'name': 'Qwen/Qwen2.5-Coder-7B', 'trust_remote_code': True, 'torch_dtype': 'float16'},\n",
119
+ " 'data': {'input_path': DATA_FILE, 'max_length': 2048, 'train_split': 1.0},\n",
120
+ " 'lora': {'r': 16, 'alpha': 32, 'dropout': 0.05, 'target_modules': ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'], 'bias': 'none', 'task_type': 'CAUSAL_LM'},\n",
121
+ " 'training': {'num_epochs': 1, 'batch_size': 2, 'gradient_accumulation': 4, 'learning_rate': 2e-4, 'warmup_steps': 50, 'weight_decay': 0.01, 'max_grad_norm': 1.0, 'logging_steps': 10, 'save_steps': 100, 'save_total_limit': 2, 'fp16': True, 'bf16': False, 'gradient_checkpointing': True},\n",
122
+ " 'output': {'lora_dir': os.path.join(OUTPUT_DIR, 'lora'), 'logging_dir': os.path.join(OUTPUT_DIR, 'logs')},\n",
123
+ " 'quantization': {'enabled': False},\n",
124
+ " 'hardware': {'device': 'cuda', 'num_gpus': 1, 'use_4bit': False, 'use_8bit': False}\n",
125
+ "}\n",
126
+ "\n",
127
+ "config_path = os.path.join(OUTPUT_DIR, 'train_config.yaml')\n",
128
+ "with open(config_path, 'w') as f:\n",
129
+ " yaml.dump(config, f, default_flow_style=False)\n",
130
+ "\n",
131
+ "print(f'✅ Config: {config_path}')\n",
132
+ "print(f\" Model: {config['model']['name']}\")\n",
133
+ "print(f\" Data: {config['data']['input_path']}\")"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": null,
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "# Train (using standalone train_simple_nobnb.py)\n",
143
+ "print('='*60)\n",
144
+ "print('STARTING TRAINING')\n",
145
+ "print('='*60)\n",
146
+ "\n",
147
+ "!cd {REPO_DIR} && python train_simple_nobnb.py --config {config_path}\n",
148
+ "\n",
149
+ "print('\\n✅ Training step finished')"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": null,
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": [
158
+ "# Merge LoRA adapter into final model\n",
159
+ "lora_dir = os.path.join(OUTPUT_DIR, 'lora')\n",
160
+ "merged_dir = os.path.join(OUTPUT_DIR, 'merged')\n",
161
+ "\n",
162
+ "print('='*60)\n",
163
+ "print('MERGING LORA ADAPTER')\n",
164
+ "print('='*60)\n",
165
+ "\n",
166
+ "!cd {REPO_DIR} && python merge_simple.py \\\n",
167
+ " --base-model {config['model']['name']} \\\n",
168
+ " --adapter-path {lora_dir} \\\n",
169
+ " --output-path {merged_dir} \\\n",
170
+ " --use-safetensors\n",
171
+ "\n",
172
+ "print('\\n✅ Merge complete!')\n",
173
+ "print(f'Merged model: {merged_dir}')\n",
174
+ "!ls -lh {merged_dir}"
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "markdown",
179
+ "metadata": {},
180
+ "source": [
181
+ "## 📥 Download Model\n",
182
+ "\n",
183
+ "1. Open **Output** tab on the right\n",
184
+ "2. Find `training_output/merged/`\n",
185
+ "3. Select all files and **Download**\n",
186
+ "\n",
187
+ "⚠️ **Do this before Kaggle session ends!**"
188
+ ]
189
+ }
190
+ ],
191
+ "metadata": {
192
+ "kaggle": {
193
+ "accelerator": "gpu"
194
+ }
195
+ },
196
+ "nbformat": 4,
197
+ "nbformat_minor": 0
198
+ }
merge_simple.py CHANGED
@@ -9,6 +9,8 @@ import os
9
  from pathlib import Path
10
 
11
  import torch
 
 
12
  from peft import PeftModel
13
  from transformers import AutoModelForCausalLM, AutoTokenizer
14
 
 
9
  from pathlib import Path
10
 
11
  import torch
12
+ # Disable LoFTQ to avoid bitsandbytes import
13
+ os.environ['PEFT_DISABLE_LOFTQ'] = '1'
14
  from peft import PeftModel
15
  from transformers import AutoModelForCausalLM, AutoTokenizer
16
 
train_simple_nobnb.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simple standalone training script for Stack 2.9.
4
+ No bitsandbytes dependency — uses pure float16.
5
+ """
6
+
7
+ import argparse
8
+ import os
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ import yaml
13
+ from datasets import load_dataset
14
+ from transformers import (
15
+ AutoModelForCausalLM,
16
+ AutoTokenizer,
17
+ TrainingArguments,
18
+ Trainer,
19
+ DataCollatorForLanguageModeling
20
+ )
21
+ from peft import LoraConfig, get_peft_model, TaskType
22
+ import torch
23
+
24
+
25
def load_config(config_path: str) -> dict:
    """Read the YAML training configuration stored at ``config_path``.

    Args:
        config_path: Filesystem path of the YAML file to parse.

    Returns:
        The object produced by ``yaml.safe_load`` for that file.
    """
    with open(config_path, 'r') as config_file:
        parsed = yaml.safe_load(config_file)
    return parsed
28
+
29
+
30
def load_model_and_tokenizer(model_name: str, trust_remote_code: bool = True):
    """Load the base model in float16 (no quantization) together with its tokenizer.

    Args:
        model_name: Model identifier or path handed to ``from_pretrained``.
        trust_remote_code: Forwarded unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code)

    # load_data() tokenizes with padding="max_length", which needs a pad token.
    # Tokenizers that ship without one fall back to the EOS token so padding
    # does not fail; tokenizers that already define a pad token are untouched.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        trust_remote_code=trust_remote_code,
        device_map="auto",
    )
    return model, tokenizer
40
+
41
+
42
def load_data(data_path: str, tokenizer, max_length: int = 2048, train_split: float = 0.9):
    """Load a JSON/JSONL dataset, tokenize it and split it into train/eval parts.

    Each record is rendered as an instruction/response prompt when both the
    ``instruction`` and ``output`` fields are non-empty; otherwise whichever
    field is present is used on its own (or an empty string if neither is).

    Args:
        data_path: Path given to ``load_dataset("json", data_files=...)``.
        tokenizer: Tokenizer used to encode the rendered texts.
        max_length: Every sample is truncated and padded to this many tokens.
        train_split: Passed to ``train_test_split(train_size=...)``. A float
            of ``1.0`` or more means "train on everything": no split is made
            and the eval part is empty.

    Returns:
        A ``(train_dataset, eval_dataset)`` tuple. ``eval_dataset`` has zero
        rows when ``train_split`` requests the whole dataset for training.

    Raises:
        ValueError: If the dataset has neither an ``instruction`` nor an
            ``output`` column.
    """
    raw_dataset = load_dataset("json", data_files=data_path, split="train")

    has_instruction = "instruction" in raw_dataset.column_names
    has_output = "output" in raw_dataset.column_names
    if not (has_instruction or has_output):
        raise ValueError(
            f"{data_path} has neither an 'instruction' nor an 'output' column"
        )

    def tokenize_function(examples):
        # A missing column is replaced by one blank per row. Defaulting to a
        # single-element list would make zip() truncate the batch to one row.
        present = examples["instruction"] if has_instruction else examples["output"]
        blanks = [""] * len(present)
        instructions = examples["instruction"] if has_instruction else blanks
        outputs = examples["output"] if has_output else blanks

        texts = []
        for instr, out in zip(instructions, outputs):
            if instr and out:
                texts.append(f"### Instruction:\n{instr}\n\n### Response:\n{out}")
            else:
                # Prefer the output, then the instruction, then an empty text.
                texts.append(out or instr or "")

        tokenized = tokenizer(texts, truncation=True, max_length=max_length, padding="max_length")
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized

    tokenized_dataset = raw_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=raw_dataset.column_names,
    )

    # train_test_split() rejects a float train_size outside (0, 1), so a
    # "use everything" request is answered without splitting. The empty eval
    # dataset keeps len() and truthiness checks in callers working.
    # Integer values are absolute sample counts and are passed through as-is.
    if isinstance(train_split, float) and train_split >= 1.0:
        return tokenized_dataset, tokenized_dataset.select(range(0))

    split = tokenized_dataset.train_test_split(train_size=train_split)
    return split["train"], split["test"]
65
+
66
+
67
def train(config: dict):
    """Fine-tune the configured base model with a LoRA adapter and save it.

    Args:
        config: Parsed configuration with the sections ``model``, ``data``,
            ``lora``, ``training`` and ``output``. The keys read from each
            section are the ones accessed in the body below.

    Returns:
        The ``Trainer`` instance after training has finished and the adapter
        has been saved to ``config["output"]["lora_dir"]``.
    """
    model_config = config["model"]
    data_config = config["data"]
    lora_config = config["lora"]
    training_config = config["training"]
    output_config = config["output"]

    use_fp16 = training_config.get("fp16", True)
    use_gradient_checkpointing = training_config.get("gradient_checkpointing", True)

    # Load model and tokenizer
    print(f"Loading model: {model_config['name']}")
    model, tokenizer = load_model_and_tokenizer(
        model_name=model_config["name"],
        trust_remote_code=model_config.get("trust_remote_code", True),
    )

    # Load data
    print(f"Loading dataset: {data_config['input_path']}")
    train_dataset, eval_dataset = load_data(
        data_path=data_config["input_path"],
        tokenizer=tokenizer,
        max_length=data_config.get("max_length", 2048),
        train_split=data_config.get("train_split", 0.9),
    )
    # The eval split may be missing or empty (e.g. train_split == 1.0).
    has_eval = eval_dataset is not None and len(eval_dataset) > 0
    print(f" Train samples: {len(train_dataset)}")
    print(f" Eval samples: {len(eval_dataset) if has_eval else 0}")

    # With a frozen base model, checkpointed segments need inputs that
    # require grad; otherwise no gradient reaches the adapter weights.
    if use_gradient_checkpointing:
        model.enable_input_require_grads()

    # Apply LoRA. The keyword names are lora_alpha / lora_dropout; LoraConfig
    # has no `alpha` or `dropout` parameter and rejects them with a TypeError.
    peft_config = LoraConfig(
        r=lora_config["r"],
        lora_alpha=lora_config["alpha"],
        lora_dropout=lora_config["dropout"],
        target_modules=lora_config["target_modules"],
        bias=lora_config["bias"],
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, peft_config)

    # The base model is loaded in float16. Under fp16 mixed precision the
    # gradient scaler refuses to unscale float16 gradients, so the trainable
    # (adapter) parameters are promoted to float32; frozen weights stay fp16.
    if use_fp16:
        for param in model.parameters():
            if param.requires_grad:
                param.data = param.data.float()

    model.print_trainable_parameters()

    # Training arguments
    output_dir = output_config["lora_dir"]
    os.makedirs(output_dir, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=training_config["num_epochs"],
        per_device_train_batch_size=training_config["batch_size"],
        gradient_accumulation_steps=training_config["gradient_accumulation"],
        learning_rate=training_config["learning_rate"],
        warmup_steps=training_config.get("warmup_steps", 100),
        weight_decay=training_config.get("weight_decay", 0.01),
        max_grad_norm=training_config.get("max_grad_norm", 1.0),
        logging_steps=training_config.get("logging_steps", 10),
        save_steps=training_config.get("save_steps", 100),
        save_total_limit=training_config.get("save_total_limit", 2),
        fp16=use_fp16,
        bf16=training_config.get("bf16", False),
        gradient_checkpointing=use_gradient_checkpointing,
        evaluation_strategy="steps" if has_eval else "no",
        eval_steps=training_config.get("eval_steps", 100) if has_eval else None,
        report_to="none",
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset if has_eval else None,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    print("="*60)
    print("Starting training...")
    print("="*60)
    trainer.train()
    print("Training completed!")

    # Save final adapter
    trainer.save_model(output_dir)
    print(f"✅ Adapter saved to {output_dir}")

    return trainer
151
+
152
+
153
def main():
    """Command-line entry point.

    Parses ``--config``, loads the YAML configuration, prints a short summary
    and runs ``train``. Any exception raised by training is reported together
    with its traceback and the process exits with status 1.
    """
    separator = "="*60

    cli = argparse.ArgumentParser()
    cli.add_argument("--config", type=str, required=True, help="Path to YAML config")
    options = cli.parse_args()

    for line in (separator, "Stack 2.9 Simple Training", separator):
        print(line)

    config = load_config(options.config)
    print(f"Config loaded: {options.config}")
    print(f"Model: {config['model']['name']}")
    print(f"Data: {config['data']['input_path']}")

    try:
        train(config)
        print("\n" + separator)
        print("✅ TRAINING SUCCESS")
        print(separator)
    except Exception as e:
        # Top-level boundary: report the failure in full, then signal it to
        # the calling shell through a non-zero exit status.
        print("\n" + separator)
        print(f"❌ TRAINING FAILED: {e}")
        print(separator)
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()