import os import torch import gc from concurrent.futures import ThreadPoolExecutor, as_completed from functools import partial import psutil import multiprocessing as mp from datasets import load_dataset, Dataset, DatasetDict from transformers import ( AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, GPT2TokenizerFast ) import shutil from typing import Dict, Any, List import warnings import platform import traceback from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training import json import tempfile from datetime import datetime warnings.filterwarnings("ignore") # ─── Configuration ─────────────────────────────────────────────────────────── MODEL_NAME = "zxc4wewewe/blackthinking" OUTPUT_DIR = "./offsec_model" MERGED_MODELS_DIR = "./merged_models" MAX_LENGTH = 512 BATCH_SIZE = 1 GRADIENT_ACCUMULATION = 8 EPOCHS = 3 LEARNING_RATE = 2e-5 SAVE_STEPS = 100 EVAL_STEPS = 100 LOGGING_STEPS = 50 # LoRA Configuration USE_LORA = True LORA_R = 8 LORA_ALPHA = 16 LORA_DROPOUT = 0.1 # Dataset Configuration DATASET_SOURCES = [ "huihui-ai/Guilherme34_uncensor-v2", "zxc4wewewe/offsec", ] # System Configuration NUM_WORKERS = min(2, mp.cpu_count()) BATCH_SIZE_TOKENIZATION = 50 # ─── Analyzer Class ────────────────────────────────────────────────────────── class TrainingAnalyzer: """Analyzes training progress and system resources""" def __init__(self): self.start_time = datetime.now() self.training_metrics = { "total_samples": 0, "processed_samples": 0, "training_time": 0, "peak_memory": 0, "gpu_memory": 0, } def analyze_system(self): """Analyze system resources""" try: memory = psutil.virtual_memory() gpu_memory = 0 if torch.cuda.is_available(): gpu_memory = torch.cuda.memory_allocated() / (1024**3) return { "cpu_cores": mp.cpu_count(), "total_memory_gb": memory.total / (1024**3), "available_memory_gb": memory.available / (1024**3), "memory_usage_percent": memory.percent, "gpu_memory_gb": gpu_memory, "cuda_available": torch.cuda.is_available(), "cuda_version": torch.version.cuda, "pytorch_version": torch.__version__, } except Exception as e: print(f"⚠️ System analysis failed: {e}") return {} def analyze_dataset(self, dataset): """Analyze dataset characteristics""" if not dataset: return {} try: analysis = {} for split_name, split_data in dataset.items(): if hasattr(split_data, '__len__'): analysis[split_name] = { "num_samples": len(split_data), "columns": split_data.column_names if hasattr(split_data, 'column_names') else [], } return analysis except Exception as e: print(f"⚠️ Dataset analysis failed: {e}") return {} def analyze_training(self, trainer, train_result): """Analyze training results""" try: current_time = datetime.now() training_time = (current_time - self.start_time).total_seconds() memory = psutil.virtual_memory() peak_memory = memory.used / (1024**3) gpu_memory = 0 if torch.cuda.is_available(): gpu_memory = torch.cuda.memory_allocated() / (1024**3) return { "training_time_seconds": training_time, "training_time_minutes": training_time / 60, "peak_memory_gb": peak_memory, "peak_gpu_memory_gb": gpu_memory, "final_loss": getattr(train_result, 'training_loss', 'unknown'), "total_steps": getattr(train_result, 'global_step', 0), "samples_per_second": train_result.metrics.get('train_samples_per_second', 0) if train_result.metrics else 0, } except Exception as e: print(f"⚠️ Training analysis failed: {e}") return {} def generate_report(self, system_info, dataset_info, training_info): """Generate comprehensive training report""" report = f""" {'='*60} TRAINING ANALYSIS REPORT {'='*60} SYSTEM INFORMATION: - CPU Cores: {system_info.get('cpu_cores', 'unknown')} - Total Memory: {system_info.get('total_memory_gb', 0):.1f} GB - Available Memory: {system_info.get('available_memory_gb', 0):.1f} GB - Memory Usage: {system_info.get('memory_usage_percent', 0):.1f}% - CUDA Available: {system_info.get('cuda_available', False)} - CUDA Version: {system_info.get('cuda_version', 'unknown')} - PyTorch Version: {system_info.get('pytorch_version', 'unknown')} - GPU Memory Used: {system_info.get('gpu_memory_gb', 0):.2f} GB DATASET ANALYSIS: """ for split_name, split_info in dataset_info.items(): report += f"- {split_name.upper()}: {split_info.get('num_samples', 0)} samples\n" if split_info.get('columns'): report += f" Columns: {', '.join(split_info['columns'])}\n" report += f""" TRAINING PERFORMANCE: - Training Time: {training_info.get('training_time_minutes', 0):.2f} minutes - Final Loss: {training_info.get('final_loss', 'unknown')} - Total Steps: {training_info.get('total_steps', 0)} - Samples/Second: {training_info.get('samples_per_second', 0):.2f} - Peak Memory: {training_info.get('peak_memory_gb', 0):.2f} GB - Peak GPU Memory: {training_info.get('peak_gpu_memory_gb', 0):.2f} GB TRAINING CONFIGURATION: - Model: {MODEL_NAME} - Batch Size: {BATCH_SIZE} - Gradient Accumulation: {GRADIENT_ACCUMULATION} - Learning Rate: {LEARNING_RATE} - Epochs: {EPOCHS} - LoRA Enabled: {USE_LORA} - Max Length: {MAX_LENGTH} {'='*60} END REPORT {'='*60} """ return report # ─── Utility Functions ─────────────────────────────────────────────────────── def safe_makedirs(path): """Safely create directories""" try: os.makedirs(path, exist_ok=True) return True except Exception as e: print(f"⚠️ Failed to create directory {path}: {e}") return False def cleanup_gpu_memory(): """Clean up GPU memory""" if torch.cuda.is_available(): torch.cuda.empty_cache() gc.collect() def load_tokenizer_robust(model_name): """Load tokenizer with multiple fallback strategies""" print(f"🔄 Loading tokenizer for: {model_name}") strategies = [ lambda: AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True), lambda: AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=False), lambda: GPT2TokenizerFast.from_pretrained("gpt2"), lambda: create_minimal_tokenizer(), ] for i, strategy in enumerate(strategies, 1): try: tokenizer = strategy() # Add missing special tokens if tokenizer.pad_token is None: if tokenizer.eos_token: tokenizer.pad_token = tokenizer.eos_token else: tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) print(f"✅ Tokenizer loaded (strategy {i})") return tokenizer except Exception as e: print(f"⚠️ Strategy {i} failed: {str(e)[:100]}...") print("❌ All tokenizer strategies failed") return None def create_minimal_tokenizer(): """Create absolute minimal tokenizer""" try: from transformers import PreTrainedTokenizerFast import json vocab = { "<|pad|>": 0, "": 1, "": 2, "<|unk|>": 3, } for i, char in enumerate("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 \n\t.,!?-", start=4): vocab[char] = i tokenizer_json = { "version": "1.0", "model": { "type": "BPE", "vocab": vocab, "merges": [] } } with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: json.dump(tokenizer_json, f) temp_path = f.name tokenizer = PreTrainedTokenizerFast(tokenizer_file=temp_path) tokenizer.pad_token = "<|pad|>" tokenizer.eos_token = "" tokenizer.bos_token = "" os.unlink(temp_path) return tokenizer except: return None def load_dataset_fallback(): """Load dataset with comprehensive fallbacks""" print("📥 Loading dataset...") for dataset_name in DATASET_SOURCES: try: print(f"🔄 Trying: {dataset_name}") dataset = load_dataset(dataset_name, streaming=False) print(f"✅ Loaded: {dataset_name}") # Ensure proper splits if "train" not in dataset and "test" not in dataset: keys = list(dataset.keys()) if keys: main_split = dataset[keys[0]] dataset = main_split.train_test_split(test_size=0.1, seed=42) print(f"✅ Created train/test split") else: continue return dataset except Exception as e: print(f"⚠️ Failed: {str(e)[:100]}...") # Create dummy dataset print("🔄 Creating dummy dataset...") try: dummy_data = { "train": [ {"prompt": "What is AI?", "response": "Artificial Intelligence is computer systems performing human tasks."}, {"prompt": "How to code?", "response": "Start with basics like variables, loops, functions."}, ] * 10, "test": [ {"prompt": "Define ML", "response": "Machine Learning enables computers to learn from data."}, ] * 3, } dataset = DatasetDict({ split: Dataset.from_list(data) for split, data in dummy_data.items() }) print("✅ Created dummy dataset") return dataset except Exception as e: print(f"❌ Dummy dataset failed: {e}") return None def normalize_example(example): """Normalize example format""" if not example: return {"prompt": "default", "response": "default"} try: if "prompt" in example and "response" in example: return { "prompt": str(example.get("prompt", "")).strip() or "default", "response": str(example.get("response", "")).strip() or "default", } if "messages" in example and isinstance(example["messages"], list): prompt, response = "", "" for msg in example["messages"]: if isinstance(msg, dict): role, content = str(msg.get("role", "")), str(msg.get("content", "")) if role.lower() in ["user", "human"]: prompt = content elif role.lower() in ["assistant", "bot"]: response = content return {"prompt": prompt or "default", "response": response or "default"} text = str(example.get("text", example.get("content", "default"))) if "Assistant:" in text: parts = text.split("Assistant:", 1) return {"prompt": parts[0].replace("User:", "").strip() or "default", "response": parts[1].strip() or "default"} return {"prompt": text[:200] or "default", "response": (text[-200:] if len(text) > 200 else text) or "default"} except: return {"prompt": "default", "response": "default"} def tokenize_function(examples, tokenizer): """Tokenize examples safely""" try: full_texts = [ f"{prompt}\n\n{response}{tokenizer.eos_token}" for prompt, response in zip(examples["prompt"], examples["response"]) ] result = tokenizer( full_texts, truncation=True, max_length=MAX_LENGTH, padding=False, return_tensors=None, ) result["labels"] = [ [-100 if (hasattr(tokenizer, 'pad_token_id') and token_id == tokenizer.pad_token_id) else token_id for token_id in labels] for labels in result["input_ids"] ] return result except Exception as e: print(f"⚠️ Tokenization error: {e}") return { "input_ids": [[1, 2, 3]] * len(examples["prompt"]), "attention_mask": [[1, 1, 1]] * len(examples["prompt"]), "labels": [[1, 2, 3]] * len(examples["prompt"]), } def process_dataset(dataset, tokenizer): """Process dataset efficiently""" if not dataset or not tokenizer: return None print("⚡ Processing dataset...") processed_splits = {} for split_name in dataset.keys(): try: print(f"🔄 Processing {split_name} ({len(dataset[split_name])} samples)...") # Normalize normalized = dataset[split_name].map( normalize_example, remove_columns=dataset[split_name].column_names, num_proc=1, ) # Tokenize tokenized = normalized.map( lambda x: tokenize_function(x, tokenizer), batched=True, batch_size=BATCH_SIZE_TOKENIZATION, num_proc=1, remove_columns=["prompt", "response"], load_from_cache_file=False ) processed_splits[split_name] = tokenized print(f"✅ {split_name}: {len(tokenized)} samples") except Exception as e: print(f"⚠️ {split_name} failed: {e}") # Create minimal fallback try: dummy_tokens = tokenizer("test\n\ntest", return_tensors=None) dummy_tokens["labels"] = dummy_tokens["input_ids"].copy() processed_splits[split_name] = Dataset.from_list([dummy_tokens] * min(10, len(dataset[split_name]))) except: processed_splits[split_name] = Dataset.from_list([ {"input_ids": [1], "attention_mask": [1], "labels": [1]} ] * 5) return DatasetDict(processed_splits) if processed_splits else None def load_model(model_name, tokenizer, use_lora=True): """Load model with LoRA support""" print("🧠 Loading model...") strategies = [ { "name": "8-bit + LoRA", "params": { "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32, "device_map": "auto" if torch.cuda.is_available() else None, "trust_remote_code": True, "low_cpu_mem_usage": True, "load_in_8bit": True, } }, { "name": "float16", "params": { "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32, "device_map": "auto" if torch.cuda.is_available() else None, "trust_remote_code": True, "low_cpu_mem_usage": True, } }, { "name": "CPU fallback", "params": { "low_cpu_mem_usage": True, } } ] for strategy in strategies: try: print(f"🔄 {strategy['name']}...") model = AutoModelForCausalLM.from_pretrained(model_name, **strategy["params"]) # Apply LoRA if requested if use_lora and USE_LORA: try: model = prepare_model_for_kbit_training(model) lora_config = LoraConfig( r=LORA_R, lora_alpha=LORA_ALPHA, target_modules=["q_proj", "v_proj"], lora_dropout=LORA_DROPOUT, bias="none", task_type="CAUSAL_LM" ) model = get_peft_model(model, lora_config) print("✅ LoRA applied") except Exception as e: print(f"⚠️ LoRA failed: {e}") # Resize embeddings if tokenizer: try: model.resize_token_embeddings(len(tokenizer)) except Exception as e: print(f"⚠️ Embedding resize failed: {e}") print(f"✅ Model loaded ({strategy['name']})") return model except Exception as e: print(f"⚠️ {strategy['name']} failed: {str(e)[:100]}...") print("❌ All model strategies failed") return None def setup_training(model, tokenizer, tokenized_dataset, dataset_name): """Setup training configuration""" if not model or not tokenizer or not tokenized_dataset: return None print(f"⚙️ Setting up training for {dataset_name}...") try: train_dataset = tokenized_dataset.get("train") eval_dataset = tokenized_dataset.get("test") or tokenized_dataset.get("train") if not train_dataset or len(train_dataset) == 0: print("❌ No training data") return None # Limit samples for efficiency max_samples = 50 if len(train_dataset) > max_samples: train_dataset = train_dataset.select(range(max_samples)) if eval_dataset and len(eval_dataset) > 10: eval_dataset = eval_dataset.select(range(min(10, len(eval_dataset)))) output_dir = os.path.join(OUTPUT_DIR, dataset_name.replace("/", "_")) safe_makedirs(output_dir) training_args = TrainingArguments( output_dir=output_dir, num_train_epochs=EPOCHS, per_device_train_batch_size=BATCH_SIZE, per_device_eval_batch_size=BATCH_SIZE, gradient_accumulation_steps=GRADIENT_ACCUMULATION, learning_rate=LEARNING_RATE, weight_decay=0.01, warmup_ratio=0.1, lr_scheduler_type="linear", logging_dir=os.path.join(output_dir, "logs"), logging_steps=LOGGING_STEPS, save_strategy="steps", save_steps=SAVE_STEPS, save_total_limit=2, eval_strategy="steps" if eval_dataset else "no", eval_steps=EVAL_STEPS if eval_dataset else None, fp16=torch.cuda.is_available(), bf16=False, dataloader_num_workers=1, dataloader_pin_memory=False, remove_unused_columns=False, optim="adamw_torch", dataloader_drop_last=True, gradient_checkpointing=True, report_to="none", run_name=f"training_{dataset_name}", tf32=False, ) data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8, ) trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=data_collator, processing_class=tokenizer, callbacks=[] ) print("✅ Training setup complete") return trainer, output_dir except Exception as e: print(f"❌ Training setup failed: {e}") return None, None def train_model(trainer, dataset_name): """Execute training and save results""" if not trainer: return False, None, None print(f"🏃 Training {dataset_name}...") try: train_result = trainer.train() # Save final model output_dir = trainer.args.output_dir final_model_dir = os.path.join(output_dir, "final_model") safe_makedirs(final_model_dir) print("💾 Saving model...") trainer.save_model(final_model_dir) trainer.save_state() print("💾 Saving tokenizer...") trainer.tokenizer.save_pretrained(final_model_dir) print(f"✅ Training complete for {dataset_name}") return True, final_model_dir, train_result except Exception as e: print(f"❌ Training failed: {e}") traceback.print_exc() return False, None, None def merge_model(base_model_path, adapter_path, dataset_name): """Merge LoRA weights with base model""" print(f"🔗 Merging {dataset_name}...") try: output_path = os.path.join(MERGED_MODELS_DIR, dataset_name.replace("/", "_")) safe_makedirs(output_path) # Load tokenizer from adapter try: tokenizer = AutoTokenizer.from_pretrained(adapter_path) except: tokenizer = load_tokenizer_robust(base_model_path) # Load base model base_model = AutoModelForCausalLM.from_pretrained( base_model_path, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map="auto" if torch.cuda.is_available() else None, trust_remote_code=True, low_cpu_mem_usage=True ) # Resize embeddings to match tokenizer current_vocab_size = len(tokenizer) model_vocab_size = base_model.get_input_embeddings().weight.size(0) if current_vocab_size != model_vocab_size: base_model.resize_token_embeddings(current_vocab_size) # Load and merge LoRA adapter merged_model = PeftModel.from_pretrained(base_model, adapter_path) merged_model = merged_model.merge_and_unload() # Save merged model merged_model.save_pretrained(output_path) tokenizer.save_pretrained(output_path) print(f"✅ {dataset_name} merged successfully") cleanup_gpu_memory() return True, output_path except Exception as e: print(f"❌ Merging {dataset_name} failed: {e}") # Fallback: copy adapter files try: fallback_path = os.path.join(MERGED_MODELS_DIR, dataset_name.replace("/", "_") + "_adapter_only") safe_makedirs(fallback_path) adapter_files = os.listdir(adapter_path) for file in adapter_files: src = os.path.join(adapter_path, file) dst = os.path.join(fallback_path, file) if os.path.isfile(src): shutil.copy2(src, dst) print(f"⚠️ {dataset_name} adapter copied (merging failed)") return True, fallback_path except Exception as e2: print(f"❌ Fallback also failed: {e2}") return False, None def save_analysis_report(analyzer, system_info, dataset_info, training_info, dataset_name): """Save analysis report""" try: report = analyzer.generate_report(system_info, dataset_info, training_info) report_dir = os.path.join(OUTPUT_DIR, dataset_name.replace("/", "_")) safe_makedirs(report_dir) report_path = os.path.join(report_dir, "training_analysis.txt") with open(report_path, "w") as f: f.write(report) # Save metrics as JSON metrics_path = os.path.join(report_dir, "training_metrics.json") with open(metrics_path, "w") as f: json.dump({ "system": system_info, "dataset": dataset_info, "training": training_info }, f, indent=2) print(f"📋 Analysis saved for {dataset_name}") return True except Exception as e: print(f"⚠️ Failed to save analysis: {e}") return False # ─── Main Training Pipeline ─────────────────────────────────────────────────── def main(): """Main training pipeline with automatic model merging""" print("🚀 STARTING AUTOMATED TRAINING PIPELINE") print(f"🔧 Model: {MODEL_NAME}") print(f"🎯 LoRA: {USE_LORA} | Batch: {BATCH_SIZE} | Epochs: {EPOCHS}") print(f"🖥️ System: {platform.system()} | CUDA: {torch.cuda.is_available()}") # Initialize analyzer analyzer = TrainingAnalyzer() # Create directories safe_makedirs(OUTPUT_DIR) safe_makedirs(MERGED_MODELS_DIR) # Load tokenizer (shared across all training) print("\n🔤 LOADING SHARED TOKENIZER...") tokenizer = load_tokenizer_robust(MODEL_NAME) if not tokenizer: print("❌ CRITICAL: Tokenizer loading failed") return print(f"✅ Tokenizer loaded (vocab: {len(tokenizer)})") # Analyze system system_info = analyzer.analyze_system() print(f"📊 System: {system_info.get('total_memory_gb', 0):.1f}GB RAM, {system_info.get('cpu_cores', 0)} cores") # Process each dataset results = [] total_training_time = 0 for dataset_name in DATASET_SOURCES: print(f"\n{'='*60}") print(f"🎯 PROCESSING DATASET: {dataset_name}") print(f"{'='*60}") # 1. Load dataset dataset = load_dataset_fallback() if not dataset: print(f"❌ Failed to load {dataset_name}") continue # 2. Analyze dataset dataset_info = analyzer.analyze_dataset(dataset) print(f"📊 Dataset analysis: {dataset_info}") # 3. Process dataset tokenized_dataset = process_dataset(dataset, tokenizer) if not tokenized_dataset: print(f"❌ Failed to process {dataset_name}") continue # 4. Load model model = load_model(MODEL_NAME, tokenizer, use_lora=True) if not model: print(f"❌ Failed to load model for {dataset_name}") continue # 5. Setup training setup_result = setup_training(model, tokenizer, tokenized_dataset, dataset_name) if not setup_result or setup_result[0] is None: print(f"❌ Failed to setup training for {dataset_name}") continue trainer, model_dir = setup_result # 6. Train model success, final_model_dir, train_result = train_model(trainer, dataset_name) if not success: print(f"❌ Training failed for {dataset_name}") continue # 7. Analyze training training_info = analyzer.analyze_training(trainer, train_result) total_training_time += training_info.get('training_time_minutes', 0) # 8. Save analysis report save_analysis_report(analyzer, system_info, dataset_info, training_info, dataset_name) # 9. Merge model (if LoRA was used) if USE_LORA and success: merge_success, merged_path = merge_model(MODEL_NAME, final_model_dir, dataset_name) # Store results results.append({ "dataset": dataset_name, "training_time": training_info.get('training_time_minutes', 0), "final_loss": training_info.get('final_loss', 'unknown'), "model_saved": final_model_dir, "model_merged": merged_path if merge_success else None, "success": True }) else: results.append({ "dataset": dataset_name, "training_time": training_info.get('training_time_minutes', 0), "final_loss": training_info.get('final_loss', 'unknown'), "model_saved": final_model_dir, "model_merged": None, "success": success }) # Cleanup memory cleanup_gpu_memory() print(f"✅ {dataset_name} processing complete\n") # Generate final summary print(f"\n{'='*60}") print("📊 FINAL TRAINING SUMMARY") print(f"{'='*60}") successful_trainings = sum(1 for r in results if r['success']) successful_merges = sum(1 for r in results if r.get('model_merged')) print(f"✅ Total Datasets Processed: {len(results)}") print(f"✅ Successful Trainings: {successful_trainings}") print(f"✅ Successful Merges: {successful_merges}") print(f"⏱️ Total Training Time: {total_training_time:.2f} minutes") for result in results: status = "✅" if result['success'] else "❌" merge_status = "🔗" if result.get('model_merged') else "⏭️" print(f"{status} {result['dataset']}: {result['training_time']:.1f}min | Loss: {result['final_loss']} {merge_status}") print(f"\n📂 Models saved in: {OUTPUT_DIR}") print(f"🔗 Merged models in: {MERGED_MODELS_DIR}") print(f"{'='*60}") return results # ─── Execute Training ─────────────────────────────────────────────────────── if __name__ == "__main__": print("🏁 STARTING AUTOMATED TRAINING...") try: results = main() if results: print("🎊 TRAINING PIPELINE COMPLETED SUCCESSFULLY!") else: print("⚠️ TRAINING COMPLETED WITH ISSUES") except KeyboardInterrupt: print("\n🛑 TRAINING STOPPED BY USER") except Exception as e: print(f"💥 UNEXPECTED ERROR: {str(e)}") traceback.print_exc() print("⚠️ CONTINUING DESPITE ERROR...") print("🏁 TRAINING PROCESS FINISHED")