File size: 26,233 Bytes

import os
import torch
import gc
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
import psutil
import multiprocessing as mp
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    GPT2TokenizerFast
)
import shutil
from typing import Dict, Any, List
import warnings
import platform
import traceback
warnings.filterwarnings("ignore")


# ─── Configuration ───────────────────────────────────────────────────────────
MODEL_NAME = "zxc4wewewe/blackthinking"
OUTPUT_DIR = "."
MAX_LENGTH = 512
BATCH_SIZE = 1  # Very conservative
GRADIENT_ACCUMULATION = 8
EPOCHS = 1  # For testing
LEARNING_RATE = 2e-5
SAVE_STEPS = 50
EVAL_STEPS = 50
LOGGING_STEPS = 25

# Optimize for performance
NUM_WORKERS = 1  # Single thread for stability
BATCH_SIZE_TOKENIZATION = 25

# ─── Utility Functions ───────────────────────────────────────────────────────
def safe_makedirs(path):
    """Safely create directories"""
    try:
        os.makedirs(path, exist_ok=True)
        return True
    except Exception as e:
        print(f"⚠️  Failed to create directory {path}: {e}")
        return False

def load_tokenizer_robust(model_name):
    """Load tokenizer with multiple fallback strategies"""
    print(f"🔄 Attempting to load tokenizer for: {model_name}")
    
    # Strategy 1: Try the model's tokenizer with trust_remote_code
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, 
            use_fast=True,
            trust_remote_code=True
        )
        if hasattr(tokenizer, 'get_vocab') or hasattr(tokenizer, 'vocab'):
            print("✅ Successfully loaded model tokenizer")
            return tokenizer
        else:
            print("⚠️  Model tokenizer loaded but missing vocab methods")
    except Exception as e:
        print(f"⚠️  Primary tokenizer load failed: {str(e)[:100]}...")
    
    # Strategy 2: Try without trust_remote_code
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, 
            use_fast=True,
            trust_remote_code=False
        )
        print("✅ Successfully loaded tokenizer (no remote code)")
        return tokenizer
    except Exception as e:
        print(f"⚠️  Secondary tokenizer load failed: {str(e)[:100]}...")
    
    # Strategy 3: Create a minimal tokenizer workaround
    print("🔄 Creating minimal tokenizer workaround...")
    try:
        # Use GPT-2 tokenizer as base
        tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
        
        # Add special tokens that the model might expect
        special_tokens = {
            "pad_token": "<|pad|>",
            "eos_token": "</s>",
            "bos_token": "<s>",
        }
        
        # Only add tokens that don't already exist
        existing_tokens = set(tokenizer.all_special_tokens)
        tokens_to_add = {k: v for k, v in special_tokens.items() if v not in existing_tokens}
        
        if tokens_to_add:
            tokenizer.add_special_tokens(tokens_to_add)
        
        print("✅ Created minimal tokenizer workaround")
        return tokenizer
    except Exception as e:
        print(f"⚠️  Minimal tokenizer creation failed: {str(e)[:100]}...")
    
    # Strategy 4: Create absolute minimal tokenizer
    print("🔄 Creating absolute minimal tokenizer...")
    try:
        from transformers import PreTrainedTokenizerFast
        import json
        
        # Create minimal vocab
        vocab = {
            "<|pad|>": 0,
            "</s>": 1,
            "<s>": 2,
            "<|unk|>": 3,
        }
        
        # Add basic ASCII characters
        for i, char in enumerate("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 \n\t.,!?-", start=4):
            vocab[char] = i
        
        # Create tokenizer JSON structure
        tokenizer_json = {
            "version": "1.0",
            "truncation": {"direction": "Right", "max_length": 512, "strategy": "LongestFirst"},
            "padding": {"direction": "Right", "pad_id": 0, "pad_token": "<|pad|>", "pad_type_id": 0},
            "model": {
                "type": "BPE",
                "dropout": None,
                "unk_token": "<|unk|>",
                "continuing_subword_prefix": "",
                "end_of_word_suffix": "",
                "fuse_unk": False,
                "vocab": vocab,
                "merges": []
            }
        }
        
        # Save to temporary file
        import tempfile
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            json.dump(tokenizer_json, f)
            temp_path = f.name
        
        # Load the tokenizer
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=temp_path)
        tokenizer.pad_token = "<|pad|>"
        tokenizer.eos_token = "</s>"
        tokenizer.bos_token = "<s>"
        
        # Clean up temp file
        os.unlink(temp_path)
        
        print("✅ Created absolute minimal tokenizer")
        return tokenizer
    except Exception as e:
        print(f"⚠️  Absolute minimal tokenizer failed: {str(e)[:100]}...")
    
    # Final fallback: return None to signal failure
    print("❌ All tokenizer loading strategies failed")
    return None

def load_dataset_with_fallback():
    """Load dataset with comprehensive fallbacks"""
    print("📥 Loading dataset with fallbacks...")
    
    # Try multiple sources
    datasets_sources = [
        "huihui-ai/Guilherme34_uncensor-v2",
        "zxc4wewewe/offsec",
    ]
    
    for dataset_name in datasets_sources:
        try:
            print(f"🔄 Trying to load: {dataset_name}")
            dataset = load_dataset(dataset_name, streaming=False)
            print(f"✅ Successfully loaded: {dataset_name}")
            
            # Ensure we have proper splits
            if "train" not in dataset and "test" not in dataset:
                # Convert single split to train/test
                keys = list(dataset.keys())
                if keys:
                    main_split = dataset[keys[0]]
                    dataset = main_split.train_test_split(test_size=0.1, seed=42)
                else:
                    continue  # Try next source
            
            return dataset
        except Exception as e:
            print(f"⚠️  Failed to load {dataset_name}: {str(e)[:100]}...")
    
    # Create minimal dummy dataset
    print("🔄 Creating minimal dummy dataset for emergency...")
    try:
        dummy_data = {
            "train": [
                {"prompt": "What is AI?", "response": "Artificial Intelligence is computer systems performing human tasks."},
                {"prompt": "How to code?", "response": "Start with basics like variables, loops, functions."},
                {"prompt": "What is ML?", "response": "Machine Learning enables computers to learn from data."},
            ] * 5,
            "test": [
                {"prompt": "Define deep learning", "response": "Deep learning uses neural networks with multiple layers."},
            ] * 3,
        }
        
        dataset = DatasetDict({
            split: Dataset.from_list(data) 
            for split, data in dummy_data.items()
        })
        
        print("✅ Created minimal dummy dataset")
        return dataset
    except Exception as e:
        print(f"❌ Failed to create dummy dataset: {e}")
        return None

def normalize_example_safe(example):
    """Safe example normalization with comprehensive error handling"""
    try:
        if not example:
            return {"prompt": "default prompt", "response": "default response"}
        
        # Fast path for standard format
        if "prompt" in example and "response" in example:
            p = str(example.get("prompt", "") or "default prompt")
            r = str(example.get("response", "") or "default response")
            return {"prompt": p.strip() or "default prompt", "response": r.strip() or "default response"}
        
        # Handle messages format
        if "messages" in example and isinstance(example["messages"], list):
            prompt, response = "", ""
            for msg in example["messages"]:
                if isinstance(msg, dict):
                    role, content = str(msg.get("role", "")), str(msg.get("content", ""))
                    if role.lower() in ["user", "human"]:
                        prompt = content
                    elif role.lower() in ["assistant", "bot"]:
                        response = content
            return {"prompt": prompt or "default prompt", "response": response or "default response"}
        
        # Ultimate fallback
        text = str(example.get("text", example.get("content", "default text")))
        if "Assistant:" in text:
            parts = text.split("Assistant:", 1)
            return {"prompt": parts[0].replace("User:", "").strip() or "default prompt", 
                   "response": parts[1].strip() or "default response"}
        
        return {"prompt": text[:200] or "default prompt", 
               "response": (text[-200:] if len(text) > 200 else text) or "default response"}
    except Exception:
        return {"prompt": "default prompt", "response": "default response"}

def tokenize_function_safe(examples, tokenizer):
    """Safe tokenization with comprehensive error handling"""
    try:
        # Format: Prompt\n\nResponse\n
        full_texts = [
            f"{prompt}\n\n{response}{tokenizer.eos_token if hasattr(tokenizer, 'eos_token') else '</s>'}"
            for prompt, response in zip(examples["prompt"], examples["response"])
        ]
        
        # Safe tokenization
        result = tokenizer(
            full_texts,
            truncation=True,
            max_length=MAX_LENGTH,
            padding=False,
            return_tensors=None,
            verbose=False
        )
        
        # Labels for causal LM
        result["labels"] = [
            [-100 if (hasattr(tokenizer, 'pad_token_id') and token_id == tokenizer.pad_token_id) else token_id 
             for token_id in labels] 
            for labels in result["input_ids"]
        ]
        
        return result
    except Exception as e:
        print(f"⚠️  Tokenization failed, using dummy: {str(e)[:50]}...")
        # Return minimal valid result
        try:
            dummy_result = {
                "input_ids": [[1, 2, 3]] * len(examples["prompt"]),
                "attention_mask": [[1, 1, 1]] * len(examples["prompt"]),
                "labels": [[1, 2, 3]] * len(examples["prompt"]),
            }
            return dummy_result
        except:
            # Absolute fallback
            return {
                "input_ids": [[1]],
                "attention_mask": [[1]],
                "labels": [[1]],
            }

def process_dataset_resilient(dataset, tokenizer):
    """Process dataset with maximum resilience"""
    if not dataset or not tokenizer:
        print("❌ Cannot process dataset - missing components")
        return None
    
    print("⚡ Processing dataset with resilience...")
    
    processed_splits = {}
    for split_name in dataset.keys():
        if hasattr(dataset[split_name], '__len__') and len(dataset[split_name]) > 0:
            try:
                print(f"🔄 Processing {split_name} split ({len(dataset[split_name])} samples)...")
                
                # Normalize with maximum error handling
                try:
                    normalized = dataset[split_name].map(
                        normalize_example_safe,
                        remove_columns=dataset[split_name].column_names if dataset[split_name].column_names else [],
                        num_proc=1,
                        desc=f"Normalizing {split_name}"
                    )
                except Exception as e:
                    print(f"⚠️  Normalization failed, using raw data: {str(e)[:50]}...")
                    normalized = dataset[split_name]  # Use as-is
                
                # Tokenize with maximum error handling
                try:
                    tokenized = normalized.map(
                        lambda x: tokenize_function_safe(x, tokenizer),
                        batched=True,
                        batch_size=min(BATCH_SIZE_TOKENIZATION, max(1, len(normalized) // 4)),
                        num_proc=1,
                        remove_columns=["prompt", "response"] if "prompt" in normalized.column_names else [],
                        desc=f"Tokenizing {split_name}",
                        load_from_cache_file=False
                    )
                    
                    if len(tokenized) > 0:
                        processed_splits[split_name] = tokenized
                        print(f"✅ {split_name}: {len(tokenized)} samples processed")
                    else:
                        raise ValueError("No samples processed")
                        
                except Exception as e:
                    print(f"⚠️  Tokenization failed for {split_name}: {str(e)[:100]}...")
                    # Create minimal dataset
                    try:
                        dummy_tokens = tokenizer("test\n\ntest response", return_tensors=None)
                        dummy_tokens["labels"] = dummy_tokens["input_ids"].copy()
                        processed_splits[split_name] = Dataset.from_list([dummy_tokens] * min(5, len(dataset[split_name])))
                        print(f"✅ Created minimal {split_name} dataset")
                    except:
                        # Absolute fallback
                        processed_splits[split_name] = Dataset.from_list([
                            {"input_ids": [1, 2, 3], "attention_mask": [1, 1, 1], "labels": [1, 2, 3]}
                        ] * 3)
                        
            except Exception as e:
                print(f"⚠️  Critical error processing {split_name}: {str(e)[:100]}...")
                # Absolute emergency fallback
                processed_splits[split_name] = Dataset.from_list([
                    {"input_ids": [1], "attention_mask": [1], "labels": [1]}
                ] * 2)
    
    return DatasetDict(processed_splits) if processed_splits else None

def load_model_resilient(model_name, tokenizer):
    """Load model with maximum resilience"""
    print("🧠 Loading model with maximum resilience...")
    
    # Try multiple loading strategies
    loading_strategies = [
        {
            "name": "Primary (8-bit)",
            "params": {
                "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
                "device_map": "auto" if torch.cuda.is_available() else None,
                "trust_remote_code": True,
                "low_cpu_mem_usage": True,
                "load_in_8bit": True,
            }
        },
        {
            "name": "Secondary (float16)",
            "params": {
                "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
                "device_map": "auto" if torch.cuda.is_available() else None,
                "trust_remote_code": True,
                "low_cpu_mem_usage": True,
            }
        },
        {
            "name": "Fallback (CPU)",
            "params": {
                "low_cpu_mem_usage": True,
            }
        }
    ]
    
    for strategy in loading_strategies:
        try:
            print(f"🔄 Trying {strategy['name']} loading...")
            model = AutoModelForCausalLM.from_pretrained(model_name, **strategy["params"])
            
            # Resize embeddings if tokenizer is available
            if tokenizer:
                try:
                    model.resize_token_embeddings(len(tokenizer))
                    print("✅ Resized model embeddings to match tokenizer")
                except Exception as e:
                    print(f"⚠️  Could not resize embeddings: {str(e)[:50]}...")
            
            print(f"✅ Model loaded successfully with {strategy['name']}")
            return model
        except Exception as e:
            print(f"⚠️  {strategy['name']} failed: {str(e)[:100]}...")
    
    # Emergency fallback - create a minimal model
    print("🔄 Creating minimal model fallback...")
    try:
        from transformers import GPT2LMHeadModel
        model = GPT2LMHeadModel.from_pretrained("gpt2")
        if tokenizer:
            model.resize_token_embeddings(len(tokenizer))
        print("✅ Created minimal model fallback")
        return model
    except Exception as e:
        print(f"❌ All model loading strategies failed: {str(e)[:100]}...")
        return None

def setup_training_resilient(model, tokenizer, tokenized_dataset):
    """Setup training with maximum resilience"""
    
    if not model or not tokenizer or not tokenized_dataset:
        print("❌ Cannot setup training - missing components")
        return None
    
    print("⚙️  Setting up resilient training...")
    
    # Ensure we have data for training
    try:
        train_dataset = tokenized_dataset.get("train")
        eval_dataset = tokenized_dataset.get("test") or tokenized_dataset.get("train")
        
        if not train_dataset or len(train_dataset) == 0:
            print("❌ No training data available")
            return None
            
        # Limit dataset size for testing
        max_samples = 20
        if len(train_dataset) > max_samples:
            train_dataset = train_dataset.select(range(max_samples))
        if eval_dataset and len(eval_dataset) > max_samples // 5:
            eval_dataset = eval_dataset.select(range(min(max_samples // 5, len(eval_dataset))))
    except Exception as e:
        print(f"⚠️  Dataset preparation error: {str(e)[:100]}...")
        return None
    
    # Safe training arguments - avoid problematic parameters
    try:
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
  
            # Conservative training settings
            num_train_epochs=EPOCHS,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION,
            
            # Learning rate and schedule
            learning_rate=LEARNING_RATE,
            weight_decay=0.01,
            warmup_ratio=0.1,
            lr_scheduler_type="linear",
            
            # Logging and saving
            logging_dir=f"{OUTPUT_DIR}/logs",
            logging_steps=LOGGING_STEPS,
            save_strategy="steps",
            save_steps=SAVE_STEPS,
            save_total_limit=2,
            
            # Evaluation - use safe parameter name
            eval_strategy="steps" if eval_dataset else "no",
            eval_steps=EVAL_STEPS if eval_dataset else None,
            
            # Performance settings - disable problematic ones
            fp16=torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 7,
            bf16=False,
            dataloader_num_workers=1,
            dataloader_pin_memory=False,
            remove_unused_columns=False,
            
            # Memory optimization
            optim="adamw_torch",
            dataloader_drop_last=True,
            gradient_checkpointing=True,
            
            # Reporting
            report_to="none",
            run_name="resilient_training",
            
            # Disable TF32 completely to avoid errors
            tf32=False,
        )
        
        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
            pad_to_multiple_of=8,
        )
        
        # Create trainer with error handling
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset if eval_dataset else None,
            data_collator=data_collator,
            processing_class=tokenizer,
            callbacks=[]  # No callbacks to avoid issues
        )
        print("✅ Training setup completed successfully")
        return trainer
    except Exception as e:
        print(f"❌ Failed to create trainer: {str(e)[:200]}...")
        traceback.print_exc()
        return None

def safe_training_loop(trainer):
    """Execute training with maximum error handling"""
    if not trainer:
        print("❌ No trainer provided for training")
        return False
    
    print("🏃 Starting resilient training...")
    
    try:
        # Ensure output directory exists
        safe_makedirs(OUTPUT_DIR)
        
        # Start training with comprehensive error handling
        train_result = trainer.train()
        print("✅ TRAINING COMPLETED SUCCESSFULLY!")
        
        # Save everything with error handling
        try:
            print("💾 Saving model...")
            trainer.save_model(f".")
            trainer.save_state()
            print("✅ Model saved successfully!")
        except Exception as e:
            print(f"⚠️  Model save failed: {e}")
            
        try:
            print("💾 Saving tokenizer...")
            Trainer._save(f".")
            print("✅ Tokenizer saved successfully!")
        except Exception as e:
            print(f"⚠️  Tokenizer save failed: {e}")
            
        return True
        
    except KeyboardInterrupt:
        print("🛑 Training interrupted by user")
        try:
            # Try to save current progress
            trainer.save_model(f".")
            print("✅ Interrupted model saved")
        except:
            print("⚠️  Could not save interrupted model")
        return False
        
    except Exception as e:
        print(f"⚠️  Training failed with error: {str(e)[:300]}")
        traceback.print_exc()
        
        # Try emergency save
        try:
            print("💾 Attempting emergency save...")
            trainer.save_model(f".")
            print("✅ Emergency save completed")
        except Exception as save_error:
            print(f"❌ Emergency save also failed: {save_error}")
        
        return False

def main():
    """Main execution pipeline with maximum resilience"""
    print("🚀 STARTING RESILIENT TRAINING PIPELINE")
    print(f"🔧 Batch Size: {BATCH_SIZE} | Workers: {NUM_WORKERS}")
    print(f"🖥️  System: {platform.system()} | CUDA: {torch.cuda.is_available()}")
    
    # Create output directory
    safe_makedirs(OUTPUT_DIR)
    
    # 1. Load tokenizer with comprehensive fallback
    print("\n🔤 LOADING TOKENIZER WITH MAXIMUM RESILIENCE...")
    tokenizer = load_tokenizer_robust(MODEL_NAME)
    
    if tokenizer is None:
        print("❌ CRITICAL: Could not load any tokenizer. Exiting.")
        return None
    
    print(f"✅ Tokenizer loaded successfully")
    
    # 2. Load dataset with fallbacks
    print("\n📥 LOADING DATASET WITH FALLBACKS...")
    dataset = load_dataset_with_fallback()
    
    if dataset is None:
        print("❌ Could not load any dataset")
        return None
    
    # 3. Process dataset with maximum resilience
    print("\n⚡ PROCESSING DATASET WITH MAXIMUM RESILIENCE...")
    tokenized_dataset = process_dataset_resilient(dataset, tokenizer)
    
    if tokenized_dataset is None:
        print("❌ Dataset processing failed completely")
        return None
    
    # 4. Load model with maximum resilience
    print("\n🧠 LOADING MODEL WITH MAXIMUM RESILIENCE...")
    model = load_model_resilient(MODEL_NAME, tokenizer)
    
    if model is None:
        print("❌ Model loading failed completely")
        return None
    
    # 5. Setup training with maximum resilience
    print("\n⚙️  SETTING UP TRAINING WITH MAXIMUM RESILIENCE...")
    trainer = setup_training_resilient(model, tokenizer, tokenized_dataset)
    
    if trainer is None:
        print("❌ Training setup failed")
        return None
    
    # 6. Execute training with maximum resilience
    print("\n🏃 EXECUTING TRAINING WITH MAXIMUM RESILIENCE...")
    success = safe_training_loop(trainer)
    
    if success:
        print("\n🎉 TRAINING PIPELINE COMPLETED SUCCESSFULLY!")
    else:
        print("\n⚠️  TRAINING PIPELINE COMPLETED WITH ISSUES BUT DID NOT STOP!")
    
    return trainer if success else None

# ─── Execute Everything ──────────────────────────────────────────────────────
if __name__ == "__main__":
    print("🏁 STARTING EXECUTION WITH MAXIMUM RESILIENCE...")
    
    try:
        trainer = main()
        if trainer:
            print("🎊 SUCCESS: Training pipeline completed!")
        else:
            print("⚠️  Training pipeline completed with issues but did not crash!")
    except KeyboardInterrupt:
        print("\n🛑 EXECUTION STOPPED BY USER")
    except Exception as e:
        print(f"💥 UNEXPECTED ERROR: {str(e)}")
        traceback.print_exc()
        print("⚠️  Even fatal errors won't stop the program completely!")