Trouter-Library committed on
Commit
8a776ef
·
verified ·
1 Parent(s): e62d89f

Create autotrain_v15.py

Browse files
Files changed (1) hide show
  1. autotrain_v15.py +379 -0
autotrain_v15.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Helion-V1.5 AutoTrain Script
3
+ Enhanced training with better error handling and AutoTrain compatibility
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import json
9
+ import logging
10
+ import traceback
11
+ from pathlib import Path
12
+ from dataclasses import dataclass
13
+ from typing import Optional, Dict
14
+
15
# Configure root logging once at import time: every record goes both to a log
# file in the current working directory and to stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: messages in this script contain emoji, which the
        # platform default encoding (e.g. cp1252 on Windows) cannot represent.
        logging.FileHandler('helion_v15_training.log', encoding='utf-8'),
        logging.StreamHandler(sys.stdout),
    ],
)
# Module-level logger used by everything in this file.
logger = logging.getLogger(__name__)
24
+
25
+
26
@dataclass
class HelionV15Config:
    """Configuration for Helion-V1.5 training.

    All fields have defaults, so ``HelionV15Config()`` is valid; however
    ``dataset_name`` must be set to a real dataset before training.
    """

    # Name of the model being produced.
    model_name: str = "DeepXR/Helion-V1.5"
    # Hub id of the pretrained checkpoint that is fine-tuned.
    base_model: str = "meta-llama/Llama-2-7b-hf"
    # Dataset identifier passed to ``datasets.load_dataset``; None until set.
    dataset_name: Optional[str] = None
    # Local directory for checkpoints and the final saved model.
    output_dir: str = "./helion-v1.5-output"
    # Hub repository the trained model is uploaded to.
    hub_model_id: str = "DeepXR/Helion-V1.5"

    # Training params
    num_epochs: int = 3
    batch_size: int = 4  # per-device batch size
    gradient_accumulation: int = 8
    learning_rate: float = 2e-5
    warmup_steps: int = 100
    max_seq_length: int = 4096  # tokenizer truncation / padding length

    # LoRA config
    lora_r: int = 64
    lora_alpha: int = 128
    lora_dropout: float = 0.05

    # AutoTrain specific
    use_autotrain: bool = True
    autotrain_backend: str = "local"  # or "spaces"

    # HuggingFace token; when None the trainer falls back to the HF_TOKEN
    # environment variable.
    hf_token: Optional[str] = None
54
+
55
+
56
class HelionV15Trainer:
    """Enhanced trainer for Helion-V1.5 with AutoTrain support.

    Typical use is ``HelionV15Trainer(config).run_pipeline()``, which verifies
    the environment, loads a 4-bit quantized base model with LoRA adapters,
    tokenizes the dataset, trains, and uploads the result to the Hub.
    """

    def __init__(self, config: HelionV15Config):
        """Store the configuration and resolve the HuggingFace token.

        Args:
            config: Training configuration.

        Raises:
            ValueError: If no token is given in ``config.hf_token`` nor in
                the ``HF_TOKEN`` environment variable.
        """
        self.config = config
        self.hf_token = config.hf_token or os.getenv("HF_TOKEN")

        if not self.hf_token:
            raise ValueError("HuggingFace token required. Set HF_TOKEN environment variable.")

        # Populated by prepare_model() / load_dataset().
        self.tokenizer = None
        self.model = None
        self.dataset = None
        self.tokenized_dataset = None

    def verify_setup(self) -> bool:
        """Verify all prerequisites.

        Runs every check (none is skipped when an earlier one fails), logs a
        pass/fail line per check, and returns True only if all passed.
        """
        logger.info("Verifying setup for Helion-V1.5...")

        checks = {
            "CUDA Available": self._check_cuda(),
            "HuggingFace Token": self._check_token(),
            "Base Model Access": self._check_base_model(),
            "Disk Space": self._check_disk_space()
        }

        for check, result in checks.items():
            status = "✅" if result else "❌"
            logger.info(f"{status} {check}")

        return all(checks.values())

    def _check_cuda(self) -> bool:
        """Check CUDA availability; log each visible GPU."""
        try:
            import torch

            if not torch.cuda.is_available():
                return False
            gpu_count = torch.cuda.device_count()
            logger.info(f"Found {gpu_count} GPU(s)")
            for i in range(gpu_count):
                logger.info(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
            return True
        except Exception as e:
            # Best-effort check: a missing/broken torch install counts as
            # "no CUDA", but the reason is logged instead of being hidden.
            logger.error(f"CUDA check failed: {e}")
            return False

    def _check_token(self) -> bool:
        """Verify HuggingFace token by asking the Hub who it belongs to."""
        try:
            from huggingface_hub import HfApi
            api = HfApi(token=self.hf_token)
            user_info = api.whoami()
            logger.info(f"Logged in as: {user_info['name']}")
            return True
        except Exception as e:
            logger.error(f"Token validation failed: {e}")
            return False

    def _check_base_model(self) -> bool:
        """Check that the configured base model is reachable with the token."""
        try:
            from huggingface_hub import HfApi
            api = HfApi(token=self.hf_token)
            api.model_info(self.config.base_model)
            return True
        except Exception as e:
            logger.error(f"Cannot access base model: {e}")
            return False

    def _check_disk_space(self, required_gb: int = 50) -> bool:
        """Check available disk space where the output directory will live.

        Args:
            required_gb: Minimum free space, in GiB, for the check to pass.

        Returns:
            True if at least ``required_gb`` GiB are free, False otherwise
            (including when the free space cannot be determined).
        """
        try:
            import shutil

            # The output directory usually does not exist before the first
            # run, and disk_usage() fails on a missing path. Probe the
            # nearest existing ancestor instead; it sits on the same
            # filesystem the directory will be created on.
            probe = Path(self.config.output_dir).resolve()
            while not probe.exists() and probe != probe.parent:
                probe = probe.parent

            stat = shutil.disk_usage(probe)
            available_gb = stat.free / (1024 ** 3)
            logger.info(f"Available disk space: {available_gb:.2f} GB")
            return available_gb >= required_gb
        except Exception as e:
            logger.error(f"Disk space check failed: {e}")
            return False

    def prepare_model(self):
        """Load and prepare model for training.

        Loads the tokenizer and the 4-bit quantized base model, registers the
        chat special tokens, and wraps the model with LoRA adapters. Sets
        ``self.tokenizer`` and ``self.model``.
        """
        import torch
        from transformers import (
            AutoTokenizer,
            AutoModelForCausalLM,
            BitsAndBytesConfig
        )
        from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

        logger.info("Loading tokenizer...")
        # SECURITY NOTE: trust_remote_code=True executes Python code shipped
        # in the model repository. Only use base models you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.base_model,
            token=self.hf_token,
            trust_remote_code=True
        )

        # Add special tokens
        special_tokens = {
            "additional_special_tokens": ["<|system|>", "<|user|>", "<|assistant|>"]
        }
        self.tokenizer.add_special_tokens(special_tokens)

        if self.tokenizer.pad_token is None:
            # Padding to max_length needs a pad token; some tokenizers have
            # no unk token either, so fall back to eos in that case.
            self.tokenizer.pad_token = self.tokenizer.unk_token or self.tokenizer.eos_token

        logger.info("Loading base model with quantization...")

        # QLoRA quantization config
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            self.config.base_model,
            quantization_config=bnb_config,
            device_map="auto",
            token=self.hf_token,
            trust_remote_code=True
        )

        # Resize embeddings for new tokens
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Prepare for k-bit training
        self.model = prepare_model_for_kbit_training(self.model)

        # LoRA configuration
        # NOTE(review): only the attention/MLP projections are adapted, so
        # the embedding rows added for the new special tokens are presumably
        # left untrained. Consider `modules_to_save` for the embedding and
        # output layers if those tokens matter -- confirm with PEFT docs.
        peft_config = LoraConfig(
            r=self.config.lora_r,
            lora_alpha=self.config.lora_alpha,
            lora_dropout=self.config.lora_dropout,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj"
            ]
        )

        self.model = get_peft_model(self.model, peft_config)
        self.model.print_trainable_parameters()

        logger.info("✅ Model prepared successfully")

    def load_dataset(self):
        """Load training dataset and tokenize its ``text`` column.

        Sets ``self.dataset`` and ``self.tokenized_dataset``.

        Raises:
            ValueError: If no dataset name is configured, or the train split
                has no ``text`` column.
            RuntimeError: If called before ``prepare_model()``.
        """
        from datasets import load_dataset

        if not self.config.dataset_name:
            raise ValueError("dataset_name must be set before loading the dataset")
        if getattr(self, "tokenizer", None) is None:
            raise RuntimeError("Tokenizer not loaded; call prepare_model() first")

        logger.info(f"Loading dataset: {self.config.dataset_name}")

        self.dataset = load_dataset(
            self.config.dataset_name,
            token=self.hf_token
        )

        logger.info(f"Dataset loaded: {self.dataset}")

        train_columns = self.dataset["train"].column_names
        if "text" not in train_columns:
            # Fail early with a readable message instead of silently
            # tokenizing an empty batch.
            raise ValueError(
                f"Dataset must have a 'text' column; found columns: {train_columns}"
            )

        # Preprocessing function
        def preprocess(examples):
            model_inputs = self.tokenizer(
                examples["text"],
                max_length=self.config.max_seq_length,
                truncation=True,
                padding="max_length"
            )
            # Causal LM objective: the labels are the inputs themselves.
            model_inputs["labels"] = model_inputs["input_ids"].copy()
            return model_inputs

        logger.info("Preprocessing dataset...")
        self.tokenized_dataset = self.dataset.map(
            preprocess,
            batched=True,
            remove_columns=train_columns
        )

        logger.info("✅ Dataset ready")

    def train(self):
        """Train the model and save it (with the tokenizer) to ``output_dir``.

        Periodic evaluation and best-checkpoint reloading are enabled only
        when the dataset has a ``validation`` split.

        Raises:
            RuntimeError: If the model or the tokenized dataset is missing.
        """
        import inspect

        from transformers import (
            TrainingArguments,
            Trainer,
            DataCollatorForLanguageModeling
        )

        if getattr(self, "model", None) is None:
            raise RuntimeError("Model not loaded; call prepare_model() first")
        if getattr(self, "tokenized_dataset", None) is None:
            raise RuntimeError("Dataset not loaded; call load_dataset() first")

        logger.info("Setting up training...")

        eval_dataset = self.tokenized_dataset.get("validation")
        has_eval = eval_dataset is not None
        if not has_eval:
            logger.warning("No 'validation' split found; evaluation is disabled")

        training_kwargs = dict(
            output_dir=self.config.output_dir,
            num_train_epochs=self.config.num_epochs,
            per_device_train_batch_size=self.config.batch_size,
            per_device_eval_batch_size=self.config.batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation,
            learning_rate=self.config.learning_rate,
            weight_decay=0.01,
            warmup_steps=self.config.warmup_steps,
            logging_steps=10,
            save_steps=500,
            save_total_limit=3,
            fp16=False,
            bf16=True,
            optim="adamw_torch",
            gradient_checkpointing=True,
            lr_scheduler_type="cosine",
            report_to=["tensorboard"],
            push_to_hub=False
        )

        # The keyword was renamed from `evaluation_strategy` to
        # `eval_strategy` in newer transformers releases; use whichever the
        # installed version accepts.
        args_params = inspect.signature(TrainingArguments.__init__).parameters
        strategy_key = "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"
        training_kwargs[strategy_key] = "steps" if has_eval else "no"

        if has_eval:
            # These settings all require an eval dataset.
            training_kwargs.update(
                eval_steps=500,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                greater_is_better=False
            )

        training_args = TrainingArguments(**training_kwargs)

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False
        )

        # Likewise, Trainer's `tokenizer` argument was renamed to
        # `processing_class` in newer transformers releases.
        trainer_params = inspect.signature(Trainer.__init__).parameters
        tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.tokenized_dataset["train"],
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            **{tokenizer_key: self.tokenizer}
        )

        logger.info("🚀 Starting training...")
        trainer.train()

        logger.info("✅ Training complete!")

        # Save final model
        logger.info("Saving model...")
        trainer.save_model(self.config.output_dir)
        self.tokenizer.save_pretrained(self.config.output_dir)

        logger.info(f"Model saved to {self.config.output_dir}")

    def push_to_hub(self):
        """Upload the contents of ``output_dir`` to the HuggingFace Hub.

        Creates the (public) repository if needed. Intermediate
        ``checkpoint-*`` directories are not uploaded.
        """
        from huggingface_hub import HfApi

        logger.info(f"Pushing model to {self.config.hub_model_id}...")

        api = HfApi(token=self.hf_token)

        # Create repo
        api.create_repo(
            self.config.hub_model_id,
            exist_ok=True,
            private=False
        )

        # Upload files
        api.upload_folder(
            folder_path=self.config.output_dir,
            repo_id=self.config.hub_model_id,
            repo_type="model",
            # Trainer checkpoints hold optimizer state and duplicate weights;
            # the final model is saved at the top level of output_dir.
            ignore_patterns=["checkpoint-*/*"]
        )

        logger.info("✅ Model pushed to Hub!")

    def run_pipeline(self) -> bool:
        """Run complete training pipeline.

        Returns:
            True on success; False if setup verification fails or any step
            raises (the exception is logged with its traceback, not
            propagated).
        """
        try:
            logger.info("="*60)
            logger.info("Helion-V1.5 Training Pipeline")
            logger.info("="*60)

            if not self.verify_setup():
                logger.error("Setup verification failed")
                return False

            self.prepare_model()
            self.load_dataset()
            self.train()
            self.push_to_hub()

            logger.info("="*60)
            logger.info("✅ Training pipeline completed successfully!")
            logger.info("="*60)
            return True

        except Exception as e:
            # logger.exception appends the active traceback to the record.
            logger.exception(f"Training failed: {e}")
            return False
343
+
344
+
345
def main():
    """Main entry point.

    Parses command-line arguments, builds the training configuration, runs
    the pipeline, and exits with status 0 on success or 1 on failure.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Train Helion-V1.5")
    parser.add_argument("--base-model", default="meta-llama/Llama-2-7b-hf")
    parser.add_argument("--dataset", required=True)
    parser.add_argument("--output-dir", default="./helion-v1.5-output")
    parser.add_argument("--hub-model-id", default="DeepXR/Helion-V1.5")
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--batch-size", type=int, default=4)
    parser.add_argument("--learning-rate", type=float, default=2e-5)
    parser.add_argument("--token", help="HuggingFace token")

    args = parser.parse_args()

    config = HelionV15Config(
        base_model=args.base_model,
        dataset_name=args.dataset,
        output_dir=args.output_dir,
        hub_model_id=args.hub_model_id,
        num_epochs=args.epochs,
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        hf_token=args.token
    )

    try:
        trainer = HelionV15Trainer(config)
    except ValueError as e:
        # Raised when no token is available; report it cleanly instead of
        # dumping a traceback. Exit status stays 1, as for any failure.
        logger.error(f"Configuration error: {e}")
        sys.exit(1)

    success = trainer.run_pipeline()

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()