| | import os
|
| | import torch
|
| | import gc
|
| | from concurrent.futures import ThreadPoolExecutor, as_completed
|
| | from functools import partial
|
| | import psutil
|
| | import multiprocessing as mp
|
| | from datasets import load_dataset, Dataset, DatasetDict
|
| | from transformers import (
|
| | AutoTokenizer,
|
| | AutoModelForCausalLM,
|
| | TrainingArguments,
|
| | Trainer,
|
| | DataCollatorForLanguageModeling,
|
| | GPT2TokenizerFast
|
| | )
|
| | import shutil
|
| | from typing import Dict, Any, List
|
| | import warnings
|
| | import platform
|
| | import traceback
|
| | from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
| | import json
|
| | import tempfile
|
| | from datetime import datetime
|
| | warnings.filterwarnings("ignore")
|
| |
|
| |
|
| |
|
# --- Base model and output locations ---
MODEL_NAME = "zxc4wewewe/blackthinking"
OUTPUT_DIR = "./offsec_model"          # per-dataset checkpoints + reports
MERGED_MODELS_DIR = "./merged_models"  # LoRA-merged full models

# --- Core training hyperparameters (memory-constrained defaults) ---
MAX_LENGTH = 512              # max tokens per example after truncation
BATCH_SIZE = 1                # per-device batch size
GRADIENT_ACCUMULATION = 8     # effective batch = BATCH_SIZE * 8
EPOCHS = 3
LEARNING_RATE = 2e-5
SAVE_STEPS = 100
EVAL_STEPS = 100
LOGGING_STEPS = 50

# --- LoRA (parameter-efficient fine-tuning) settings ---
USE_LORA = True
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.1

# Hugging Face dataset repos to fine-tune on, tried in order.
DATASET_SOURCES = [
    "huihui-ai/Guilherme34_uncensor-v2",
    "zxc4wewewe/offsec",
]

# --- Dataset processing ---
NUM_WORKERS = min(2, mp.cpu_count())   # worker cap for light machines
BATCH_SIZE_TOKENIZATION = 50           # examples per tokenizer map batch
|
| |
|
| |
|
class TrainingAnalyzer:
    """Analyzes training progress and system resources.

    Records a wall-clock start time at construction and provides
    snapshot-style analysis helpers. Every method swallows exceptions
    and returns {} so analysis can never abort the training pipeline.
    """

    def __init__(self):
        # Anchor for elapsed-time computation in analyze_training().
        self.start_time = datetime.now()
        # NOTE(review): these counters are initialized but never updated
        # anywhere in this file -- confirm whether they are dead state.
        self.training_metrics = {
            "total_samples": 0,
            "processed_samples": 0,
            "training_time": 0,
            "peak_memory": 0,
            "gpu_memory": 0,
        }

    def analyze_system(self):
        """Snapshot host CPU/RAM and (if available) CUDA GPU state.

        Returns:
            Dict of system facts, or {} on any failure.
        """
        try:
            memory = psutil.virtual_memory()
            gpu_memory = 0
            if torch.cuda.is_available():
                # Currently *allocated* (not reserved) CUDA memory, in GiB.
                gpu_memory = torch.cuda.memory_allocated() / (1024**3)

            return {
                "cpu_cores": mp.cpu_count(),
                "total_memory_gb": memory.total / (1024**3),
                "available_memory_gb": memory.available / (1024**3),
                "memory_usage_percent": memory.percent,
                "gpu_memory_gb": gpu_memory,
                "cuda_available": torch.cuda.is_available(),
                "cuda_version": torch.version.cuda,
                "pytorch_version": torch.__version__,
            }
        except Exception as e:
            print(f"β οΈ System analysis failed: {e}")
            return {}

    def analyze_dataset(self, dataset):
        """Summarize per-split sample counts and column names.

        Args:
            dataset: mapping of split name -> split data (e.g. DatasetDict).

        Returns:
            Dict keyed by split name, or {} when dataset is falsy / on error.
        """
        if not dataset:
            return {}

        try:
            analysis = {}
            for split_name, split_data in dataset.items():
                # Only sized splits are analyzable; streaming splits lack len().
                if hasattr(split_data, '__len__'):
                    analysis[split_name] = {
                        "num_samples": len(split_data),
                        "columns": split_data.column_names if hasattr(split_data, 'column_names') else [],
                    }

            return analysis
        except Exception as e:
            print(f"β οΈ Dataset analysis failed: {e}")
            return {}

    def analyze_training(self, trainer, train_result):
        """Collect timing, memory and loss metrics after a training run.

        Args:
            trainer: the Trainer instance (currently unused here).
            train_result: TrainOutput returned by trainer.train().

        Returns:
            Metrics dict, or {} on failure.
        """
        try:
            current_time = datetime.now()
            # Elapsed since this analyzer was constructed -- not since
            # trainer.train() began.
            training_time = (current_time - self.start_time).total_seconds()

            memory = psutil.virtual_memory()
            peak_memory = memory.used / (1024**3)
            gpu_memory = 0
            if torch.cuda.is_available():
                gpu_memory = torch.cuda.memory_allocated() / (1024**3)

            return {
                "training_time_seconds": training_time,
                "training_time_minutes": training_time / 60,
                "peak_memory_gb": peak_memory,
                "peak_gpu_memory_gb": gpu_memory,
                "final_loss": getattr(train_result, 'training_loss', 'unknown'),
                "total_steps": getattr(train_result, 'global_step', 0),
                "samples_per_second": train_result.metrics.get('train_samples_per_second', 0) if train_result.metrics else 0,
            }
        except Exception as e:
            print(f"β οΈ Training analysis failed: {e}")
            return {}

    def generate_report(self, system_info, dataset_info, training_info):
        """Render the three analysis dicts into a human-readable report string."""
        report = f"""
{'='*60}
TRAINING ANALYSIS REPORT
{'='*60}

SYSTEM INFORMATION:
- CPU Cores: {system_info.get('cpu_cores', 'unknown')}
- Total Memory: {system_info.get('total_memory_gb', 0):.1f} GB
- Available Memory: {system_info.get('available_memory_gb', 0):.1f} GB
- Memory Usage: {system_info.get('memory_usage_percent', 0):.1f}%
- CUDA Available: {system_info.get('cuda_available', False)}
- CUDA Version: {system_info.get('cuda_version', 'unknown')}
- PyTorch Version: {system_info.get('pytorch_version', 'unknown')}
- GPU Memory Used: {system_info.get('gpu_memory_gb', 0):.2f} GB

DATASET ANALYSIS:
"""

        # One line per split, plus an indented column listing when known.
        for split_name, split_info in dataset_info.items():
            report += f"- {split_name.upper()}: {split_info.get('num_samples', 0)} samples\n"
            if split_info.get('columns'):
                report += f"  Columns: {', '.join(split_info['columns'])}\n"

        report += f"""
TRAINING PERFORMANCE:
- Training Time: {training_info.get('training_time_minutes', 0):.2f} minutes
- Final Loss: {training_info.get('final_loss', 'unknown')}
- Total Steps: {training_info.get('total_steps', 0)}
- Samples/Second: {training_info.get('samples_per_second', 0):.2f}
- Peak Memory: {training_info.get('peak_memory_gb', 0):.2f} GB
- Peak GPU Memory: {training_info.get('peak_gpu_memory_gb', 0):.2f} GB

TRAINING CONFIGURATION:
- Model: {MODEL_NAME}
- Batch Size: {BATCH_SIZE}
- Gradient Accumulation: {GRADIENT_ACCUMULATION}
- Learning Rate: {LEARNING_RATE}
- Epochs: {EPOCHS}
- LoRA Enabled: {USE_LORA}
- Max Length: {MAX_LENGTH}

{'='*60}
END REPORT
{'='*60}
"""

        return report
|
| |
|
| |
|
def safe_makedirs(path):
    """Create *path* (with parents) if missing; report success as a bool."""
    try:
        os.makedirs(path, exist_ok=True)
    except Exception as e:
        print(f"β οΈ Failed to create directory {path}: {e}")
        return False
    return True
|
| |
|
def cleanup_gpu_memory():
    """Release cached CUDA memory and run Python garbage collection.

    Bug fix: gc.collect() now runs unconditionally -- previously it sat
    inside the CUDA guard, so CPU-only machines never collected the large
    model objects this pipeline drops between datasets.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
|
| |
|
def load_tokenizer_robust(model_name):
    """Load tokenizer with multiple fallback strategies.

    Tries, in order: the target repo (with then without remote code),
    the stock GPT-2 fast tokenizer, and finally a tiny hand-built
    tokenizer. Guarantees a pad token on whatever loads.

    Returns:
        A tokenizer instance, or None if every strategy fails.
    """
    print(f"π Loading tokenizer for: {model_name}")

    # Ordered from most- to least-desirable; each is a thunk so failures
    # are cheap and only the chosen strategy does real work.
    strategies = [
        lambda: AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True),
        lambda: AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=False),
        lambda: GPT2TokenizerFast.from_pretrained("gpt2"),
        lambda: create_minimal_tokenizer(),
    ]

    for i, strategy in enumerate(strategies, 1):
        try:
            tokenizer = strategy()

            # Ensure a pad token exists: reuse EOS when available,
            # otherwise register a dedicated <|pad|> special token.
            if tokenizer.pad_token is None:
                if tokenizer.eos_token:
                    tokenizer.pad_token = tokenizer.eos_token
                else:
                    tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

            print(f"β Tokenizer loaded (strategy {i})")
            return tokenizer
        except Exception as e:
            # Truncate potentially huge HF error messages.
            print(f"β οΈ Strategy {i} failed: {str(e)[:100]}...")

    print("β All tokenizer strategies failed")
    return None
|
| |
|
def create_minimal_tokenizer():
    """Create an absolute minimal character-level tokenizer of last resort.

    Builds a tiny tokenizer.json (4 special tokens plus ASCII letters,
    digits, whitespace and basic punctuation), loads it through
    PreTrainedTokenizerFast, and removes the temp file.

    Returns:
        A PreTrainedTokenizerFast instance, or None if construction fails.

    Bug fix: the catch-all was a bare `except:` which also swallowed
    SystemExit/KeyboardInterrupt; narrowed to Exception.
    """
    try:
        from transformers import PreTrainedTokenizerFast
        import json

        # Reserved special tokens occupy ids 0-3.
        vocab = {
            "<|pad|>": 0,
            "</s>": 1,
            "<s>": 2,
            "<|unk|>": 3,
        }

        # Single characters fill the remaining vocabulary from id 4 on.
        for i, char in enumerate("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 \n\t.,!?-", start=4):
            vocab[char] = i

        tokenizer_json = {
            "version": "1.0",
            "model": {
                "type": "BPE",
                "vocab": vocab,
                "merges": []  # no merges: pure character-level tokenization
            }
        }

        # PreTrainedTokenizerFast loads from a file path, so round-trip
        # through a named temp file and delete it once loaded.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            json.dump(tokenizer_json, f)
            temp_path = f.name

        tokenizer = PreTrainedTokenizerFast(tokenizer_file=temp_path)
        tokenizer.pad_token = "<|pad|>"
        tokenizer.eos_token = "</s>"
        tokenizer.bos_token = "<s>"

        os.unlink(temp_path)
        return tokenizer
    except Exception:
        # Best-effort helper: callers treat None as "strategy failed".
        return None
|
| |
|
def load_dataset_fallback():
    """Load dataset with comprehensive fallbacks.

    Tries each repo in DATASET_SOURCES in order; if none loads, builds a
    tiny in-memory dummy dataset so the pipeline can still run end to end.

    Returns:
        A DatasetDict containing train/test splits, or None on total failure.
    """
    print("π₯ Loading dataset...")

    for dataset_name in DATASET_SOURCES:
        try:
            print(f"π Trying: {dataset_name}")
            dataset = load_dataset(dataset_name, streaming=False)
            print(f"β Loaded: {dataset_name}")

            # Normalize to train/test: when neither split exists, carve a
            # deterministic 90/10 split out of the first available split.
            if "train" not in dataset and "test" not in dataset:
                keys = list(dataset.keys())
                if keys:
                    main_split = dataset[keys[0]]
                    dataset = main_split.train_test_split(test_size=0.1, seed=42)
                    print(f"β Created train/test split")
                else:
                    # Empty DatasetDict: try the next source.
                    continue

            return dataset
        except Exception as e:
            print(f"β οΈ Failed: {str(e)[:100]}...")

    # Last resort: a tiny synthetic prompt/response dataset so downstream
    # code paths can still be exercised offline.
    print("π Creating dummy dataset...")
    try:
        dummy_data = {
            "train": [
                {"prompt": "What is AI?", "response": "Artificial Intelligence is computer systems performing human tasks."},
                {"prompt": "How to code?", "response": "Start with basics like variables, loops, functions."},
            ] * 10,
            "test": [
                {"prompt": "Define ML", "response": "Machine Learning enables computers to learn from data."},
            ] * 3,
        }

        dataset = DatasetDict({
            split: Dataset.from_list(data)
            for split, data in dummy_data.items()
        })

        print("β Created dummy dataset")
        return dataset
    except Exception as e:
        print(f"β Dummy dataset failed: {e}")
        return None
|
| |
|
def normalize_example(example):
    """Normalize a raw dataset example into a {"prompt", "response"} pair.

    Handles three input shapes, in priority order:
      1. already has "prompt"/"response" keys;
      2. chat-style {"messages": [{"role", "content"}, ...]} -- the last
         user/human turn becomes the prompt, the last assistant/bot turn
         the response;
      3. free text under "text"/"content", split on an "Assistant:"
         marker when present, otherwise head/tail slices.
    Empty values are replaced with the placeholder "default" so
    downstream tokenization never sees an empty string.

    Bug fix: the final catch-all was a bare `except:` (also swallowing
    SystemExit/KeyboardInterrupt); narrowed to Exception.
    """
    if not example:
        return {"prompt": "default", "response": "default"}

    try:
        # Case 1: canonical format already present.
        if "prompt" in example and "response" in example:
            return {
                "prompt": str(example.get("prompt", "")).strip() or "default",
                "response": str(example.get("response", "")).strip() or "default",
            }

        # Case 2: chat transcript.
        if "messages" in example and isinstance(example["messages"], list):
            prompt, response = "", ""
            for msg in example["messages"]:
                if isinstance(msg, dict):
                    role, content = str(msg.get("role", "")), str(msg.get("content", ""))
                    if role.lower() in ["user", "human"]:
                        prompt = content
                    elif role.lower() in ["assistant", "bot"]:
                        response = content
            return {"prompt": prompt or "default", "response": response or "default"}

        # Case 3: free text; split on an "Assistant:" marker if present.
        text = str(example.get("text", example.get("content", "default")))
        if "Assistant:" in text:
            parts = text.split("Assistant:", 1)
            return {"prompt": parts[0].replace("User:", "").strip() or "default",
                    "response": parts[1].strip() or "default"}

        # Fallback: head of the text as prompt, tail as response.
        return {"prompt": text[:200] or "default",
                "response": (text[-200:] if len(text) > 200 else text) or "default"}
    except Exception:
        return {"prompt": "default", "response": "default"}
|
| |
|
def tokenize_function(examples, tokenizer):
    """Tokenize a batch of prompt/response pairs for causal-LM training.

    Args:
        examples: batch dict with parallel "prompt" and "response" lists.
        tokenizer: tokenizer callable; must expose eos_token and may
            expose pad_token_id.

    Returns:
        Dict with input_ids, attention_mask and labels; pad positions in
        labels are replaced with -100 so the loss ignores them. On any
        failure a 3-token dummy batch of matching size is returned so the
        pipeline can continue.
    """
    try:
        full_texts = [
            f"{prompt}\n\n{response}{tokenizer.eos_token}"
            for prompt, response in zip(examples["prompt"], examples["response"])
        ]

        result = tokenizer(
            full_texts,
            truncation=True,
            max_length=MAX_LENGTH,
            padding=False,
            return_tensors=None,
        )

        # Perf fix: the pad-token lookup was re-evaluated (hasattr +
        # attribute access) for every token; resolve it once per batch.
        # None means "never mask", which matches the old behavior when
        # pad_token_id was missing or None.
        pad_id = getattr(tokenizer, 'pad_token_id', None)
        result["labels"] = [
            [-100 if (pad_id is not None and token_id == pad_id) else token_id
             for token_id in labels]
            for labels in result["input_ids"]
        ]

        return result
    except Exception as e:
        print(f"β οΈ Tokenization error: {e}")
        return {
            "input_ids": [[1, 2, 3]] * len(examples["prompt"]),
            "attention_mask": [[1, 1, 1]] * len(examples["prompt"]),
            "labels": [[1, 2, 3]] * len(examples["prompt"]),
        }
|
| |
|
def process_dataset(dataset, tokenizer):
    """Normalize and tokenize every split of *dataset*.

    Each split is mapped through normalize_example and then batch-
    tokenized. A split that fails is replaced by a small dummy split so
    training can proceed.

    Args:
        dataset: DatasetDict-like mapping of split name -> Dataset.
        tokenizer: shared tokenizer used by tokenize_function.

    Returns:
        DatasetDict of tokenized splits, or None when inputs are falsy.

    Bug fix: the inner fallback used a bare `except:` which also caught
    SystemExit/KeyboardInterrupt; narrowed to Exception.
    """
    if not dataset or not tokenizer:
        return None

    print("β‘ Processing dataset...")

    processed_splits = {}
    for split_name in dataset.keys():
        try:
            print(f"π Processing {split_name} ({len(dataset[split_name])} samples)...")

            # Step 1: map every raw example to {"prompt", "response"}.
            normalized = dataset[split_name].map(
                normalize_example,
                remove_columns=dataset[split_name].column_names,
                num_proc=1,
            )

            # Step 2: batch tokenization; cache disabled because the
            # lambda closure is not hashable across runs.
            tokenized = normalized.map(
                lambda x: tokenize_function(x, tokenizer),
                batched=True,
                batch_size=BATCH_SIZE_TOKENIZATION,
                num_proc=1,
                remove_columns=["prompt", "response"],
                load_from_cache_file=False
            )

            processed_splits[split_name] = tokenized
            print(f"β {split_name}: {len(tokenized)} samples")

        except Exception as e:
            print(f"β οΈ {split_name} failed: {e}")

            # Fallback: substitute a tiny dummy split so downstream code
            # always has something to train/evaluate on.
            try:
                dummy_tokens = tokenizer("test\n\ntest", return_tensors=None)
                dummy_tokens["labels"] = dummy_tokens["input_ids"].copy()
                processed_splits[split_name] = Dataset.from_list([dummy_tokens] * min(10, len(dataset[split_name])))
            except Exception:
                processed_splits[split_name] = Dataset.from_list([
                    {"input_ids": [1], "attention_mask": [1], "labels": [1]}
                ] * 5)

    return DatasetDict(processed_splits) if processed_splits else None
|
| |
|
def load_model(model_name, tokenizer, use_lora=True):
    """Load model with LoRA support.

    Tries progressively less demanding loading strategies (8-bit, fp16,
    plain CPU). On success, optionally wraps the model with a LoRA
    adapter and resizes its embeddings to match the tokenizer.

    Args:
        model_name: HF repo or local path.
        tokenizer: tokenizer used to size the embedding matrix (may be None).
        use_lora: call-site switch; combined with module-level USE_LORA.

    Returns:
        The loaded model, or None if every strategy fails.
    """
    print("π§ Loading model...")

    # Ordered from most memory-efficient to most compatible.
    strategies = [
        {
            "name": "8-bit + LoRA",
            "params": {
                "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
                "device_map": "auto" if torch.cuda.is_available() else None,
                "trust_remote_code": True,
                "low_cpu_mem_usage": True,
                "load_in_8bit": True,
            }
        },
        {
            "name": "float16",
            "params": {
                "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
                "device_map": "auto" if torch.cuda.is_available() else None,
                "trust_remote_code": True,
                "low_cpu_mem_usage": True,
            }
        },
        {
            "name": "CPU fallback",
            "params": {
                "low_cpu_mem_usage": True,
            }
        }
    ]

    for strategy in strategies:
        try:
            print(f"π {strategy['name']}...")
            model = AutoModelForCausalLM.from_pretrained(model_name, **strategy["params"])

            # LoRA is applied only when both the call-site flag and the
            # module-level USE_LORA switch are on; failure here is
            # non-fatal and the full-parameter model is used instead.
            if use_lora and USE_LORA:
                try:
                    model = prepare_model_for_kbit_training(model)
                    lora_config = LoraConfig(
                        r=LORA_R,
                        lora_alpha=LORA_ALPHA,
                        target_modules=["q_proj", "v_proj"],
                        lora_dropout=LORA_DROPOUT,
                        bias="none",
                        task_type="CAUSAL_LM"
                    )
                    model = get_peft_model(model, lora_config)
                    print("β LoRA applied")
                except Exception as e:
                    print(f"β οΈ LoRA failed: {e}")

            # Keep the embedding matrix in sync with the tokenizer vocab
            # (a pad token may have been added by load_tokenizer_robust).
            if tokenizer:
                try:
                    model.resize_token_embeddings(len(tokenizer))
                except Exception as e:
                    print(f"β οΈ Embedding resize failed: {e}")

            print(f"β Model loaded ({strategy['name']})")
            return model
        except Exception as e:
            # Truncate potentially huge HF error messages.
            print(f"β οΈ {strategy['name']} failed: {str(e)[:100]}...")

    print("β All model strategies failed")
    return None
|
| |
|
def setup_training(model, tokenizer, tokenized_dataset, dataset_name):
    """Build a Trainer (and its TrainingArguments) for one dataset.

    Args:
        model: the (possibly LoRA-wrapped) causal LM to train.
        tokenizer: tokenizer used for collation and saving.
        tokenized_dataset: DatasetDict with "train" (and optionally "test").
        dataset_name: HF repo name; slashes become underscores in paths.

    Returns:
        (trainer, output_dir) on success, (None, None) on failure.

    Bug fix: the early-exit paths previously returned a bare None while
    the exception path returned (None, None), so callers unpacking the
    result could crash; all failure paths now return the 2-tuple.
    """
    if not model or not tokenizer or not tokenized_dataset:
        return None, None

    print(f"βοΈ Setting up training for {dataset_name}...")

    try:
        train_dataset = tokenized_dataset.get("train")
        # Fall back to evaluating on train data when no test split exists.
        eval_dataset = tokenized_dataset.get("test") or tokenized_dataset.get("train")

        if not train_dataset or len(train_dataset) == 0:
            print("β No training data")
            return None, None

        # Cap dataset sizes to keep runs fast and memory-bounded.
        max_samples = 50
        if len(train_dataset) > max_samples:
            train_dataset = train_dataset.select(range(max_samples))
        if eval_dataset and len(eval_dataset) > 10:
            eval_dataset = eval_dataset.select(range(min(10, len(eval_dataset))))

        output_dir = os.path.join(OUTPUT_DIR, dataset_name.replace("/", "_"))
        safe_makedirs(output_dir)

        training_args = TrainingArguments(
            output_dir=output_dir,

            # Batch geometry.
            num_train_epochs=EPOCHS,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION,

            # Optimizer schedule.
            learning_rate=LEARNING_RATE,
            weight_decay=0.01,
            warmup_ratio=0.1,
            lr_scheduler_type="linear",

            # Logging / checkpointing.
            logging_dir=os.path.join(output_dir, "logs"),
            logging_steps=LOGGING_STEPS,
            save_strategy="steps",
            save_steps=SAVE_STEPS,
            save_total_limit=2,

            eval_strategy="steps" if eval_dataset else "no",
            eval_steps=EVAL_STEPS if eval_dataset else None,

            # Memory-conscious settings for small GPUs / CPU hosts.
            fp16=torch.cuda.is_available(),
            bf16=False,
            dataloader_num_workers=1,
            dataloader_pin_memory=False,
            remove_unused_columns=False,

            optim="adamw_torch",
            dataloader_drop_last=True,
            gradient_checkpointing=True,

            report_to="none",
            run_name=f"training_{dataset_name}",
            tf32=False,
        )

        # Causal-LM collator (mlm=False); padding to multiples of 8 keeps
        # tensor shapes hardware-friendly.
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
            pad_to_multiple_of=8,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            processing_class=tokenizer,
            callbacks=[]
        )

        print("β Training setup complete")
        return trainer, output_dir
    except Exception as e:
        print(f"β Training setup failed: {e}")
        return None, None
|
| |
|
def train_model(trainer, dataset_name):
    """Run trainer.train() and persist the model, state and tokenizer.

    Args:
        trainer: a configured transformers Trainer (or None).
        dataset_name: label used only for log messages.

    Returns:
        (success, final_model_dir, train_result); (False, None, None) on
        any failure.

    Bug fix: `trainer.tokenizer` is deprecated (and removed in recent
    transformers) when the Trainer is built with `processing_class`, as
    setup_training does -- saving would raise AttributeError. Prefer
    processing_class and fall back to the legacy attribute.
    """
    if not trainer:
        return False, None, None

    print(f"π Training {dataset_name}...")

    try:
        train_result = trainer.train()

        output_dir = trainer.args.output_dir
        final_model_dir = os.path.join(output_dir, "final_model")
        safe_makedirs(final_model_dir)

        print("πΎ Saving model...")
        trainer.save_model(final_model_dir)
        trainer.save_state()

        print("πΎ Saving tokenizer...")
        processor = getattr(trainer, "processing_class", None) or getattr(trainer, "tokenizer", None)
        if processor is not None:
            processor.save_pretrained(final_model_dir)

        print(f"β Training complete for {dataset_name}")
        return True, final_model_dir, train_result

    except Exception as e:
        print(f"β Training failed: {e}")
        traceback.print_exc()
        return False, None, None
|
| |
|
def merge_model(base_model_path, adapter_path, dataset_name):
    """Merge LoRA adapter weights into the base model and save the result.

    Args:
        base_model_path: HF repo or local path of the base model.
        adapter_path: directory containing the trained PEFT adapter.
        dataset_name: used (slashes -> underscores) for the output folder.

    Returns:
        (True, merged_path) on success. If merging fails, the raw adapter
        files are copied aside and (True, fallback_path) is returned;
        (False, None) only when that fallback copy also fails.

    Bug fix: the tokenizer fallback used a bare `except:` which also
    swallowed SystemExit/KeyboardInterrupt; narrowed to Exception.
    """
    print(f"π Merging {dataset_name}...")

    try:
        output_path = os.path.join(MERGED_MODELS_DIR, dataset_name.replace("/", "_"))
        safe_makedirs(output_path)

        # Prefer the tokenizer saved alongside the adapter; fall back to
        # the robust loader against the base model.
        try:
            tokenizer = AutoTokenizer.from_pretrained(adapter_path)
        except Exception:
            tokenizer = load_tokenizer_robust(base_model_path)

        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_path,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )

        # The adapter may have been trained after new special tokens were
        # added; align the embedding size with the tokenizer before merging.
        current_vocab_size = len(tokenizer)
        model_vocab_size = base_model.get_input_embeddings().weight.size(0)
        if current_vocab_size != model_vocab_size:
            base_model.resize_token_embeddings(current_vocab_size)

        # Fold the LoRA deltas into the base weights and drop the wrapper.
        merged_model = PeftModel.from_pretrained(base_model, adapter_path)
        merged_model = merged_model.merge_and_unload()

        merged_model.save_pretrained(output_path)
        tokenizer.save_pretrained(output_path)

        print(f"β {dataset_name} merged successfully")
        cleanup_gpu_memory()
        return True, output_path

    except Exception as e:
        print(f"β Merging {dataset_name} failed: {e}")

        # Fallback: preserve the raw adapter files so training output is
        # never lost even when merging is impossible.
        try:
            fallback_path = os.path.join(MERGED_MODELS_DIR, dataset_name.replace("/", "_") + "_adapter_only")
            safe_makedirs(fallback_path)

            adapter_files = os.listdir(adapter_path)
            for file in adapter_files:
                src = os.path.join(adapter_path, file)
                dst = os.path.join(fallback_path, file)
                if os.path.isfile(src):
                    shutil.copy2(src, dst)

            print(f"β οΈ {dataset_name} adapter copied (merging failed)")
            return True, fallback_path
        except Exception as e2:
            print(f"β Fallback also failed: {e2}")
            return False, None
|
| |
|
def save_analysis_report(analyzer, system_info, dataset_info, training_info, dataset_name):
    """Persist the rendered report and raw metric dicts for one dataset.

    Writes training_analysis.txt (human-readable) and
    training_metrics.json (machine-readable) under OUTPUT_DIR. Returns
    True on success, False on any failure.
    """
    try:
        text_report = analyzer.generate_report(system_info, dataset_info, training_info)

        report_dir = os.path.join(OUTPUT_DIR, dataset_name.replace("/", "_"))
        safe_makedirs(report_dir)

        with open(os.path.join(report_dir, "training_analysis.txt"), "w") as handle:
            handle.write(text_report)

        payload = {
            "system": system_info,
            "dataset": dataset_info,
            "training": training_info,
        }
        with open(os.path.join(report_dir, "training_metrics.json"), "w") as handle:
            json.dump(payload, handle, indent=2)

        print(f"π Analysis saved for {dataset_name}")
        return True
    except Exception as e:
        print(f"β οΈ Failed to save analysis: {e}")
        return False
|
| |
|
| |
|
def main():
    """Main training pipeline with automatic model merging.

    For each repo in DATASET_SOURCES: load data, normalize + tokenize,
    load a fresh (LoRA-wrapped) model, train, analyze, save a report and
    -- when LoRA is enabled -- merge the adapter back into the base
    model.

    Returns:
        List of per-dataset result dicts, or None when the shared
        tokenizer cannot be loaded.
    """
    print("π STARTING AUTOMATED TRAINING PIPELINE")
    print(f"π§ Model: {MODEL_NAME}")
    print(f"π― LoRA: {USE_LORA} | Batch: {BATCH_SIZE} | Epochs: {EPOCHS}")
    print(f"π₯οΈ System: {platform.system()} | CUDA: {torch.cuda.is_available()}")

    analyzer = TrainingAnalyzer()

    safe_makedirs(OUTPUT_DIR)
    safe_makedirs(MERGED_MODELS_DIR)

    # One tokenizer is shared across all datasets; nothing downstream can
    # run without it, so this failure is fatal.
    print("\nπ€ LOADING SHARED TOKENIZER...")
    tokenizer = load_tokenizer_robust(MODEL_NAME)
    if not tokenizer:
        print("β CRITICAL: Tokenizer loading failed")
        return

    print(f"β Tokenizer loaded (vocab: {len(tokenizer)})")

    system_info = analyzer.analyze_system()
    print(f"π System: {system_info.get('total_memory_gb', 0):.1f}GB RAM, {system_info.get('cpu_cores', 0)} cores")

    results = []
    total_training_time = 0

    for dataset_name in DATASET_SOURCES:
        print(f"\n{'='*60}")
        print(f"π― PROCESSING DATASET: {dataset_name}")
        print(f"{'='*60}")

        # NOTE(review): load_dataset_fallback() takes no arguments and
        # walks DATASET_SOURCES itself, so every iteration of this loop
        # may load the same first-available dataset -- confirm intent.
        dataset = load_dataset_fallback()
        if not dataset:
            print(f"β Failed to load {dataset_name}")
            continue

        dataset_info = analyzer.analyze_dataset(dataset)
        print(f"π Dataset analysis: {dataset_info}")

        tokenized_dataset = process_dataset(dataset, tokenizer)
        if not tokenized_dataset:
            print(f"β Failed to process {dataset_name}")
            continue

        # A fresh model per dataset so each gets its own adapter.
        model = load_model(MODEL_NAME, tokenizer, use_lora=True)
        if not model:
            print(f"β Failed to load model for {dataset_name}")
            continue

        # setup_training may return None or (None, None) on failure;
        # both shapes are handled here.
        setup_result = setup_training(model, tokenizer, tokenized_dataset, dataset_name)
        if not setup_result or setup_result[0] is None:
            print(f"β Failed to setup training for {dataset_name}")
            continue

        trainer, model_dir = setup_result

        success, final_model_dir, train_result = train_model(trainer, dataset_name)
        if not success:
            print(f"β Training failed for {dataset_name}")
            continue

        training_info = analyzer.analyze_training(trainer, train_result)
        total_training_time += training_info.get('training_time_minutes', 0)

        save_analysis_report(analyzer, system_info, dataset_info, training_info, dataset_name)

        # Merge the LoRA adapter into base weights when enabled; record
        # the outcome either way.
        if USE_LORA and success:
            merge_success, merged_path = merge_model(MODEL_NAME, final_model_dir, dataset_name)

            results.append({
                "dataset": dataset_name,
                "training_time": training_info.get('training_time_minutes', 0),
                "final_loss": training_info.get('final_loss', 'unknown'),
                "model_saved": final_model_dir,
                "model_merged": merged_path if merge_success else None,
                "success": True
            })
        else:
            results.append({
                "dataset": dataset_name,
                "training_time": training_info.get('training_time_minutes', 0),
                "final_loss": training_info.get('final_loss', 'unknown'),
                "model_saved": final_model_dir,
                "model_merged": None,
                "success": success
            })

        # Free GPU/host memory before the next dataset's model load.
        cleanup_gpu_memory()
        print(f"β {dataset_name} processing complete\n")

    # Final summary across all datasets.
    print(f"\n{'='*60}")
    print("π FINAL TRAINING SUMMARY")
    print(f"{'='*60}")

    successful_trainings = sum(1 for r in results if r['success'])
    successful_merges = sum(1 for r in results if r.get('model_merged'))

    print(f"β Total Datasets Processed: {len(results)}")
    print(f"β Successful Trainings: {successful_trainings}")
    print(f"β Successful Merges: {successful_merges}")
    print(f"β±οΈ Total Training Time: {total_training_time:.2f} minutes")

    for result in results:
        status = "β " if result['success'] else "β"
        merge_status = "π" if result.get('model_merged') else "βοΈ"
        print(f"{status} {result['dataset']}: {result['training_time']:.1f}min | Loss: {result['final_loss']} {merge_status}")

    print(f"\nπ Models saved in: {OUTPUT_DIR}")
    print(f"π Merged models in: {MERGED_MODELS_DIR}")
    print(f"{'='*60}")

    return results
|
| |
|
| |
|
| | if __name__ == "__main__":
|
| | print("π STARTING AUTOMATED TRAINING...")
|
| |
|
| | try:
|
| | results = main()
|
| |
|
| | if results:
|
| | print("π TRAINING PIPELINE COMPLETED SUCCESSFULLY!")
|
| | else:
|
| | print("β οΈ TRAINING COMPLETED WITH ISSUES")
|
| |
|
| | except KeyboardInterrupt:
|
| | print("\nπ TRAINING STOPPED BY USER")
|
| | except Exception as e:
|
| | print(f"π₯ UNEXPECTED ERROR: {str(e)}")
|
| | traceback.print_exc()
|
| | print("β οΈ CONTINUING DESPITE ERROR...")
|
| |
|
| | print("π TRAINING PROCESS FINISHED")
|
| |
|