import warnings
# Installed before the third-party imports further down so that FutureWarnings
# attributed to huggingface_hub modules are filtered from the very first import.
warnings.filterwarnings('ignore', category=FutureWarning, module='huggingface_hub')

# Handle OpenMP threading issues
# NOTE(review): this is set before `import torch` below; presumably the OpenMP
# runtime only reads the variable when it is first loaded - confirm before
# moving this line.
import os
os.environ['OMP_NUM_THREADS'] = '1'

# NOTE(review): the string literal below follows executable statements, so
# Python evaluates and discards it; it is NOT stored as the module docstring
# (module __doc__ stays None).
"""
HuggingFace Spaces Training Interface for RC+ξ Fine-Tuning
Supports GPU-accelerated training with progress monitoring
"""
import gradio as gr
import spaces  # HuggingFace Spaces GPU support
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

# Try to import LoRA, but make it optional
try:
    from peft import LoraConfig, get_peft_model
    LORA_AVAILABLE = True
except ImportError:
    LORA_AVAILABLE = False

import os
from datetime import datetime

def check_gpu():
    """Return a one-line, human-readable CUDA availability status.

    When CUDA is available the string names device 0 and its total memory in
    decimal gigabytes (bytes / 1e9); otherwise it warns that training will be
    slow.
    """
    if not torch.cuda.is_available():
        return "❌ No GPU - Training will be slow"

    # Only the first CUDA device is reported.
    device_index = 0
    name = torch.cuda.get_device_name(device_index)
    total_gb = torch.cuda.get_device_properties(device_index).total_memory / 1e9
    return f"✅ GPU Available: {name} ({total_gb:.1f}GB)"

def train_model(
    model_name: str,
    dataset_file,
    num_epochs: int,
    batch_size: int,
    learning_rate: float,
    max_length: int
):
    """Gradio click handler: validate the upload and stream the training log.

    Args:
        model_name: Hub id of the base model to fine-tune.
        dataset_file: Value of the ``gr.File`` input. May be ``None`` (nothing
            uploaded), an object exposing the path as ``.name``, or a plain
            path string.
        num_epochs: Number of training epochs.
        batch_size: Requested per-device batch size.
        learning_rate: Optimizer learning rate.
        max_length: Maximum tokenized sequence length.

    Yields:
        The cumulative log text so far. Gradio replaces the output component's
        value with each yielded item, so yielding only the newest message would
        leave a single line visible in the log box.
    """
    # Fail fast before reserving a GPU when no file was uploaded.
    if dataset_file is None:
        yield "❌ No dataset uploaded. Please upload a JSONL training file first.\n"
        return

    # Extract the file path from the Gradio file object (or pass a str through).
    dataset_path = getattr(dataset_file, 'name', dataset_file)

    # Accumulate messages from the GPU-decorated generator into one log string.
    log_text = ""
    for message in train_model_gpu(
        model_name, dataset_path, num_epochs, batch_size, learning_rate, max_length
    ):
        log_text += message
        yield log_text

def _build_prompt(instruction, context, response):
    """Render one example with the Instruction / Input / Response template.

    The ``### Input:`` section is omitted when ``context`` is falsy.
    """
    if context:
        return (
            f"### Instruction:\n{instruction}\n\n"
            f"### Input:\n{context}\n\n"
            f"### Response:\n{response}"
        )
    return f"### Instruction:\n{instruction}\n\n### Response:\n{response}"


def _mean_eval_loss(model, dataloader):
    """Return the mean per-batch loss over ``dataloader`` (0 if it is empty).

    Puts the model in eval mode and runs under ``torch.no_grad()``; the caller
    is responsible for switching back to train mode.
    """
    model.eval()
    loss_sum = 0.0
    batch_count = 0
    with torch.no_grad():
        for eval_batch in dataloader:
            eval_batch = {k: v.to(model.device) for k, v in eval_batch.items()}
            loss_sum += model(**eval_batch).loss.item()
            batch_count += 1
    return loss_sum / batch_count if batch_count else 0


@spaces.GPU(duration=14400)  # 4 hours GPU reservation (enough for 1-2 epochs on 7B model)
def train_model_gpu(
    model_name: str,
    dataset_path: str,
    num_epochs: int,
    batch_size: int,
    learning_rate: float,
    max_length: int
):
    """Fine-tune a causal LM on an instruction dataset, streaming progress.

    Pipeline: load the JSON/JSONL dataset, validate its fields, load the model
    and tokenizer, optionally wrap the model with LoRA, tokenize, split 90/10
    into train/eval, run a manual training loop with gradient accumulation,
    evaluate after every epoch, then save model and tokenizer to a timestamped
    ``./rc_xi_trained_*`` directory.

    Args:
        model_name: Hub id passed to ``from_pretrained``.
        dataset_path: Path to a JSON/JSONL file whose rows have the keys
            ``instruction``, ``input`` and ``output``.
        num_epochs: Number of passes over the training split.
        batch_size: Requested per-device batch size (halved, minimum 1, when
            the GPU reports less than 16 GB total memory).
        learning_rate: Learning rate handed to ``TrainingArguments``.
        max_length: Sequences are truncated and padded to this many tokens.

    Yields:
        Incremental log messages (each one a newline-terminated chunk). Errors
        are reported as messages rather than raised.
    """
    # Function-scope imports keep this block self-contained.
    import time
    import traceback

    # UI sliders may deliver whole numbers as floats; range() and the
    # tokenizer need real ints.
    num_epochs = int(num_epochs)
    batch_size = int(batch_size)
    max_length = int(max_length)
    grad_accum_steps = 8

    yield f"🚀 Starting training at {datetime.now().strftime('%H:%M:%S')}\n"
    yield f"📊 GPU Status: {check_gpu()}\n"

    try:
        # Load dataset
        yield f"\n📁 Loading dataset from {dataset_path}...\n"

        try:
            dataset = load_dataset('json', data_files=dataset_path, split='train')
            yield f"✅ Loaded {len(dataset)} examples\n"
        except Exception as e:
            yield f"\n❌ Failed to load dataset: {str(e)}\n"
            yield "💡 Make sure your JSONL file has this format:\n"
            yield '{\n  "instruction": "...",\n  "input": "...",\n  "output": "..."\n}\n'
            return

        # Validate dataset structure
        if len(dataset) == 0:
            yield "\n❌ Dataset is empty!\n"
            return

        first_example = dataset[0]
        yield f"📊 Dataset fields found: {list(first_example.keys())}\n"
        yield f"📝 Sample row 1: {dict(list(first_example.items())[:3])}\n"

        # Only the first row is inspected for the required keys.
        required_fields = ["instruction", "input", "output"]
        missing_fields = [f for f in required_fields if f not in first_example]

        if missing_fields:
            yield f"\n⚠️ Expected fields not found: {missing_fields}\n"
            yield "💡 Common field name alternatives:\n"
            yield "   • 'instruction' could be: 'prompt', 'question', 'task'\n"
            yield "   • 'input' could be: 'context', 'example', 'text'\n"
            yield "   • 'output' could be: 'response', 'answer', 'completion'\n"
            yield f"\n❌ Cannot proceed without: {missing_fields}\n"
            yield "✅ Please upload JSONL with: instruction, input, output\n\n"
            yield "📋 Sample JSONL format:\n"
            yield '{"instruction": "Q: What is AI?", "input": "", "output": "AI is artificial intelligence..."}\n'
            yield '{"instruction": "Summarize", "input": "Long text...", "output": "Summary..."}\n'
            return

        yield "✅ Dataset structure valid\n"

        # Load model and tokenizer
        yield f"\n🤖 Loading model: {model_name}...\n"

        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        # Try loading with device_map, fall back to manual device placement
        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=model_dtype,
                device_map="auto",
                trust_remote_code=True
            )
        except ValueError as e:
            # Fall back only if the error is about device_map; re-raise others.
            if 'device_map' not in str(e):
                raise
            yield "⚠️ Model doesn't support device_map='auto', using manual placement\n"
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=model_dtype,
                trust_remote_code=True
            )
            if torch.cuda.is_available():
                model = model.to('cuda')

        # Enable gradient checkpointing to reduce memory usage
        if hasattr(model, 'gradient_checkpointing_enable'):
            model.gradient_checkpointing_enable()

        # Apply LoRA for memory-efficient training
        yield "🎯 Applying LoRA (Low-Rank Adaptation) for efficient training...\n"

        lora_applied = False
        if LORA_AVAILABLE:
            lora_config = LoraConfig(
                r=8,  # LoRA rank
                lora_alpha=16,  # LoRA alpha (scaling factor)
                target_modules=["q_proj", "v_proj", "k_proj", "out_proj"],  # Common attention modules
                lora_dropout=0.05,
                bias="none",
                task_type="CAUSAL_LM"
            )

            try:
                # With a frozen base model, gradient checkpointing needs the
                # embedding output to require grad or no gradient reaches the
                # adapter weights.
                if hasattr(model, 'enable_input_require_grads'):
                    model.enable_input_require_grads()
                peft_model = get_peft_model(model, lora_config)
                # Returns a (trainable, total) tuple.
                trainable, total = peft_model.get_nb_trainable_parameters()
            except Exception as e:
                yield f"⚠️ LoRA not applicable to this model, continuing without: {str(e)}\n"
            else:
                model = peft_model
                lora_applied = True
                yield f"✅ LoRA applied: Only {trainable:,} trainable parameters (vs {total:,} total)\n"
        else:
            yield "⚠️ PEFT library not available. Training without LoRA (full fine-tuning)\n"
            yield "💡 Consider using smaller batch size or reduce epochs to save memory\n"

        # Enable flash attention 2 when the model exposes the hook (best effort).
        if hasattr(model, 'enable_flash_attention_2'):
            try:
                model.enable_flash_attention_2()
                yield "⚡ Flash Attention 2 enabled for memory efficiency\n"
            except Exception:
                pass  # Flash attention not available, continue without it

        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = tokenizer.eos_token_id

        total_params = sum(p.numel() for p in model.parameters()) / 1e9
        yield f"✅ Model loaded: {total_params:.2f}B parameters\n"
        # Report LoRA only when it was actually applied, not merely importable.
        if lora_applied:
            yield "💾 Memory optimization: Gradient checkpointing + LoRA + reduced precision enabled\n"
        else:
            yield "💾 Memory optimization: Gradient checkpointing + reduced precision enabled\n"

        # Tokenize dataset
        yield "\n🔤 Tokenizing dataset...\n"

        def tokenize_function(examples):
            texts = [
                _build_prompt(inst, inp, out)
                for inst, inp, out in zip(
                    examples["instruction"], examples["input"], examples["output"]
                )
            ]
            return tokenizer(
                texts,
                truncation=True,
                max_length=max_length,
                padding="max_length"
            )

        try:
            tokenized_dataset = dataset.map(
                tokenize_function,
                batched=True,
                remove_columns=dataset.column_names
            )
            yield f"✅ Tokenized {len(tokenized_dataset)} examples\n"
        except Exception as e:
            yield f"\n❌ Tokenization failed: {str(e)}\n"
            yield "\n📊 Dataset diagnostics:\n"
            yield f"   • Total examples: {len(dataset)}\n"
            yield f"   • Fields: {dataset.column_names}\n"
            yield f"   • First row keys: {list(dataset[0].keys())}\n"
            yield "\n💡 Common issues:\n"
            yield "   • Null/None values in instruction, input, or output\n"
            yield "   • Non-string values (numbers, objects, arrays)\n"
            yield "   • Invalid UTF-8 encoding\n"
            yield "   • Empty strings in required fields\n"
            yield f"\n📋 Error details:\n{traceback.format_exc()}\n"
            return

        # Split dataset
        split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
        train_dataset = split["train"]
        eval_dataset = split["test"]

        yield f"📊 Train: {len(train_dataset)} | Eval: {len(eval_dataset)}\n"

        # Training arguments
        yield "\n⚙️ Configuring training...\n"

        output_dir = f"./rc_xi_trained_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # Halve the batch size on GPUs reporting less than 16 GB total memory.
        adjusted_batch_size = batch_size
        if torch.cuda.is_available():
            total_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
            if total_memory_gb < 16:
                adjusted_batch_size = max(1, batch_size // 2)
                yield f"⚠️ GPU memory limited ({total_memory_gb:.1f}GB). Reducing batch size to {adjusted_batch_size}\n"

        # NOTE: the manual loop below does not step a learning-rate scheduler,
        # and step-based eval/save settings are not acted on by it. The loop
        # does use the batch sizes, learning rate, optimizer choice,
        # accumulation factor and gradient-clipping norm configured here.
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=adjusted_batch_size,
            per_device_eval_batch_size=adjusted_batch_size,
            gradient_accumulation_steps=grad_accum_steps,  # Increased for smaller batch sizes
            learning_rate=learning_rate,
            warmup_steps=100,
            logging_steps=1,  # Log every step for immediate feedback
            eval_steps=50,
            save_steps=100,
            eval_strategy="steps",
            save_strategy="steps",
            save_total_limit=2,
            fp16=torch.cuda.is_available(),
            report_to=[],
            load_best_model_at_end=True,
            max_grad_norm=1.0,  # Gradient clipping for stability
            optim="adamw_torch",  # Standard PyTorch Adam optimizer
        )

        yield "✅ Training configured\n"
        yield f"   • Epochs: {num_epochs}\n"
        yield f"   • Batch size: {adjusted_batch_size}\n"
        yield f"   • Gradient accumulation: {grad_accum_steps}\n"
        yield f"   • Learning rate: {learning_rate}\n"
        yield f"   • Max length: {max_length}\n"
        yield f"   • FP16: {torch.cuda.is_available()}\n"
        yield "   • Optimizer: adamw_torch\n"

        # Causal-LM collator (mlm=False).
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False
        )

        # Trainer is used for dataloaders, optimizer construction and saving;
        # the loop itself is manual so progress can be streamed per step.
        yield "\n🏋️ Initializing trainer...\n"

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
        )

        yield "✅ Trainer initialized. Starting training loop...\n"
        yield "⏳ First step may take 30-60 seconds (loading data, first forward/backward pass)...\n\n"

        try:
            # Trainer only builds its optimizer inside Trainer.train(); without
            # this call trainer.optimizer is still None in a manual loop.
            trainer.create_optimizer()
            optimizer = trainer.optimizer

            # Build the dataloaders once instead of on every step.
            train_loader = trainer.get_train_dataloader()
            eval_loader = trainer.get_eval_dataloader()
            batches_per_epoch = len(train_loader)
            total_steps = batches_per_epoch * num_epochs

            start_time = time.time()
            step = 0
            avg_epoch_loss = 0.0
            optimizer.zero_grad()

            for epoch in range(num_epochs):
                yield f"\n📅 EPOCH {epoch + 1}/{num_epochs}\n"
                yield f"{'='*50}\n"

                model.train()
                epoch_loss = 0.0
                steps_in_epoch = 0

                for batch in train_loader:
                    step += 1
                    steps_in_epoch += 1

                    # Move batch to the model's device
                    batch = {k: v.to(model.device) for k, v in batch.items()}

                    # Forward pass
                    loss = model(**batch).loss

                    # Scale so the accumulated gradient is the mean over the
                    # accumulation window, not the sum.
                    (loss / grad_accum_steps).backward()

                    # Step at the end of each window and at the end of the epoch.
                    window_full = steps_in_epoch % grad_accum_steps == 0
                    if window_full or steps_in_epoch == batches_per_epoch:
                        torch.nn.utils.clip_grad_norm_(
                            model.parameters(), training_args.max_grad_norm
                        )
                        optimizer.step()
                        optimizer.zero_grad()

                    epoch_loss += loss.item()

                    # Progress line for every micro-batch.
                    elapsed = time.time() - start_time
                    speed = step / max(elapsed, 0.1)
                    avg_loss = epoch_loss / steps_in_epoch
                    # speed > 0 because step >= 1; no lower clamp, so slow
                    # runs get a realistic ETA.
                    remaining = max(total_steps - step, 0) / speed
                    yield (
                        f"Step {step}/{total_steps} | "
                        f"Loss: {avg_loss:.4f} | "
                        f"Speed: {speed:.1f} steps/s | "
                        f"ETA: {int(remaining//60)}m {int(remaining%60)}s\n"
                    )

                # Epoch summary
                avg_epoch_loss = epoch_loss / max(steps_in_epoch, 1)
                yield f"\n✅ Epoch {epoch + 1} complete - Avg Loss: {avg_epoch_loss:.4f}\n"

                # Evaluate after every epoch, including the first.
                yield "📊 Running evaluation...\n"
                avg_eval_loss = _mean_eval_loss(model, eval_loader)
                yield f"✅ Eval Loss: {avg_eval_loss:.4f}\n\n"

            total_time = time.time() - start_time
        except Exception as e:
            error_msg = str(e).lower()
            yield f"\n❌ Training failed: {str(e)}\n"

            if 'out of memory' in error_msg:
                yield "\n💾 CUDA out of memory. Clearing cache...\n"
                torch.cuda.empty_cache()

            yield f"\n📋 Full error:\n{traceback.format_exc()}\n"
            return

        yield "\n💾 Saving model...\n"

        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)

        yield f"✅ Model saved to {output_dir}\n"

        # Results: completion is announced only after the save succeeded.
        yield "\n" + "=" * 50 + "\n"
        yield "🎉 TRAINING COMPLETE!\n"
        yield "=" * 50 + "\n"
        yield f"📊 Training Loss: {avg_epoch_loss:.4f}\n"
        yield f"⏱️ Training Time: {total_time:.1f}s\n"
        yield f"💾 Model saved to: {output_dir}\n"
        yield "\n✨ Your RC+ξ model is ready!\n"

    except RuntimeError as e:
        error_details = traceback.format_exc()
        error_msg = str(e).lower()

        # Only genuine OOM messages get the OOM guidance; other CUDA/runtime
        # errors are reported as-is.
        if 'out of memory' in error_msg:
            yield "\n❌ OUT OF MEMORY ERROR\n"
            yield "\nTrying recovery strategies...\n"
            torch.cuda.empty_cache()
            yield "\n💡 Solutions:\n"
            yield "   1. ✅ Memory cleared. Try again with reduced settings:\n"
            yield "      • Reduce 'Batch Size' to 1\n"
            yield "      • Reduce 'Max Sequence Length' to 256\n"
            yield "      • Reduce 'Training Epochs' to 1\n"
            yield "   2. Upgrade to A10G GPU (24GB) in Settings → Hardware\n"
            yield "   3. Try lighter models: 'gpt2' or 'microsoft/phi-2'\n"
            yield f"\n📋 Full error:\n{error_details}\n"
        else:
            yield f"\n❌ RUNTIME ERROR: {str(e)}\n"
            yield f"\n📋 Full traceback:\n{error_details}\n"
    except KeyError as e:
        yield f"\n❌ MISSING FIELD ERROR: {str(e)}\n"
        yield "\n💡 Your dataset is missing a required field.\n"
        yield "✅ Required fields: instruction, input, output\n"
        yield f"\n📋 Full traceback:\n{traceback.format_exc()}\n"
    except ValueError as e:
        yield f"\n❌ VALUE ERROR: {str(e)}\n"
        yield "\n💡 Check that:\n"
        yield "   • Dataset file is valid JSON/JSONL format\n"
        yield "   • No empty or null values in fields\n"
        yield "   • Text encoding is correct (UTF-8)\n"
        yield f"\n📋 Full traceback:\n{traceback.format_exc()}\n"
    except Exception as e:
        error_details = traceback.format_exc()
        yield f"\n❌ UNEXPECTED ERROR: {str(e)}\n"
        yield f"\n📋 Full traceback:\n{error_details}\n"
        yield "\n💡 Diagnostics:\n"
        yield "   • Check dataset format (JSONL with instruction/input/output)\n"
        yield "   • Try with gpt2 model (smallest, most stable)\n"
        yield "   • Check HuggingFace Space logs for system errors\n"

# Gradio Interface
# Builds the whole UI at import time: a header, a two-column row (training
# inputs on the left, streamed log on the right), a footer, and the click
# wiring. Components appear in the order they are created inside each context.
with gr.Blocks(title="RC+ξ Fine-Tuning on HuggingFace Spaces") as demo:
    gr.Markdown("""
    # 🧠 RC+ξ Model Fine-Tuning
    ### Train your consciousness-aware AI model with GPU acceleration
    
    **Requirements:**
    - Upgrade this Space to GPU (Settings → Hardware → GPU)
    - Upload your training dataset (JSONL format)
    - Wait 8-12 hours for 7B model training
    
    **Recommended GPU:** T4 (16GB) - $0.60/hour or A10G (24GB) - $3.15/hour
    """)
    
    with gr.Row():
        # Left column: training inputs.
        with gr.Column():
            # check_gpu() is called once, while the UI is being built; the
            # textbox is read-only and is not refreshed afterwards.
            gpu_status = gr.Textbox(
                label="GPU Status",
                value=check_gpu(),
                interactive=False
            )
            
            model_dropdown = gr.Dropdown(
                label="Base Model",
                choices=[
                    "microsoft/phi-2",
                    "gpt2",
                    "mistralai/Mistral-7B-v0.1",
                    "meta-llama/Llama-2-7b-hf"
                ],
                value="microsoft/phi-2"
            )
            
            # Upload widget restricted to .jsonl files.
            dataset_file = gr.File(
                label="Training Dataset (JSONL)",
                file_types=[".jsonl"]
            )
            
            epochs_slider = gr.Slider(
                label="Training Epochs",
                minimum=1,
                maximum=10,
                value=3,
                step=1
            )
            
            batch_slider = gr.Slider(
                label="Batch Size",
                minimum=1,
                maximum=8,
                value=2,
                step=1
            )
            
            lr_slider = gr.Slider(
                label="Learning Rate",
                minimum=1e-6,
                maximum=1e-3,
                value=2e-5,
                step=1e-6
            )
            
            length_slider = gr.Slider(
                label="Max Sequence Length",
                minimum=128,
                maximum=2048,
                value=512,
                step=128
            )
            
            train_btn = gr.Button("🚀 Start Training", variant="primary")
        
        # Right column: read-only log fed by the training callback.
        with gr.Column():
            output_log = gr.Textbox(
                label="Training Progress",
                lines=30,
                max_lines=30,
                interactive=False
            )
    
    gr.Markdown("""
    ### 📝 Next Steps After Training:
    1. Download your trained model from the Files tab
    2. Upload to HuggingFace Hub for inference
    3. Or convert to GGUF for Ollama deployment
    
    ### 💰 HuggingFace Spaces GPU Pricing:
    - **T4 (16GB)**: $0.60/hour (~$7.20 for 12h training)
    - **A10G (24GB)**: $3.15/hour (~$37.80 for 12h training)
    - **A100 (40GB)**: $4.13/hour (~$49.56 for 12h training)
    
    Cheaper than AWS/GCP and easier to set up!
    """)
    
    # The inputs list order must match train_model's positional parameters:
    # model_name, dataset_file, num_epochs, batch_size, learning_rate, max_length.
    # train_model is a generator; the values it yields are sent to output_log.
    train_btn.click(
        fn=train_model,
        inputs=[
            model_dropdown,
            dataset_file,
            epochs_slider,
            batch_slider,
            lr_slider,
            length_slider
        ],
        outputs=output_log
    )

# Launch only when run as a script, not when the module is imported.
if __name__ == "__main__":
    demo.launch()  # Removed share=True for Spaces compatibility