"""
ULTIMATE LoRA Fine-Tuning Demo - Covers ALL Project Requirements
Group 6: Model Adaptation, Efficient Fine-Tuning & Deployment of LLMs
"""

import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import time
import psutil
import os

# Page configuration
st.set_page_config(
    page_title="LoRA Fine-Tuning Complete Demo",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        text-align: center;
        background: linear-gradient(120deg, #1f77b4, #00cc88);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        margin-bottom: 0.5rem;
    }
    .sub-header {
        text-align: center;
        color: #666;
        margin-bottom: 2rem;
        font-size: 1.1rem;
    }
    .metric-card {
        background: #f0f2f6;
        padding: 1rem;
        border-radius: 10px;
        border-left: 4px solid #1f77b4;
    }
    .model-box {
        padding: 1.5rem;
        border-radius: 10px;
        margin: 1rem 0;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .base-model {
        background-color: #fff5f5;
        border-left: 4px solid #ff4b4b;
    }
    .finetuned-model {
        background-color: #f0fff4;
        border-left: 4px solid #00cc88;
    }
    .theory-box {
        background: #e8f4f8;
        padding: 1.5rem;
        border-radius: 10px;
        margin: 1rem 0;
        border-left: 4px solid #1f77b4;
    }
</style>
""", unsafe_allow_html=True)

# Title
st.markdown('<div class="main-header">🚀 Complete LoRA Fine-Tuning Demo</div>', unsafe_allow_html=True)
st.markdown('<div class="sub-header">Parameter-Efficient Fine-Tuning & Deployment Showcase</div>',
            unsafe_allow_html=True)

# Sidebar Navigation
with st.sidebar:
    st.header("📚 Navigation")
    page = st.radio(
        "Select Section:",
        ["🎯 Live Demo", "📊 Theory & Concepts", "⚙️ Technical Details", "🚀 Deployment Info"],
        label_visibility="collapsed"
    )

    st.divider()

    if page == "🎯 Live Demo":
        st.header("⚙️ Model Settings")

        device_option = st.selectbox(
            "Inference Device",
            ["Auto (GPU if available)", "Force CPU", "Force GPU"],
            help="Compare CPU vs GPU inference speed"
        )

        use_quantization = st.checkbox(
            "Use 8-bit Quantization",
            value=False,
            help="Reduces memory usage, slightly slower"
        )

        temperature = st.slider("Temperature", 0.1, 1.0, 0.3, 0.1)
        max_length = st.slider("Max Length", 50, 400, 200, 10)
        top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)

        st.divider()

        st.header("📊 Quick Stats")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Base Model", "82M params")
            st.metric("Adapter Size", "~3 MB")
        with col2:
            st.metric("Trainable", "0.4%")
            st.metric("Training Time", "~30 min")


# Cache model loading
@st.cache_resource
def load_models(use_quantization=False, device_option="Auto"):
    """Load base model and fine-tuned model"""

    base_model_name = "distilgpt2"
    adapter_path = "./models/lora_adapters"

    # Determine device
    if device_option == "Force CPU":
        device = "cpu"
    elif device_option == "Force GPU":
        device = "cuda" if torch.cuda.is_available() else "cpu"
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    with st.spinner("🔄 Loading models..."):
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        tokenizer.pad_token = tokenizer.eos_token

        # Quantization config
        if use_quantization and device == "cuda":
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_threshold=6.0
            )
            base_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                quantization_config=quantization_config,
                device_map="auto"
            )
            finetuned_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                quantization_config=quantization_config,
                device_map="auto"
            )
            finetuned_model = PeftModel.from_pretrained(finetuned_model, adapter_path)
        else:
            # Standard loading
            base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
            finetuned_model = AutoModelForCausalLM.from_pretrained(base_model_name)
            finetuned_model = PeftModel.from_pretrained(finetuned_model, adapter_path)

            base_model.to(device)
            finetuned_model.to(device)

    return tokenizer, base_model, finetuned_model, device


def get_model_size_mb(model):
    """Calculate model size in MB"""
    param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
    return (param_size + buffer_size) / (1024 ** 2)


def generate_response(model, tokenizer, prompt, device, temperature, max_length, top_p):
    """Generate response from a model"""
    formatted_input = f"### Instruction:\n{prompt}\n\n### Code:\n"
    inputs = tokenizer(formatted_input, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


# =============================================================================
# PAGE 1: LIVE DEMO
# =============================================================================
if page == "🎯 Live Demo":
    # Load models
    try:
        tokenizer, base_model, finetuned_model, device = load_models(
            use_quantization=use_quantization if 'use_quantization' in dir() else False,
            device_option=device_option if 'device_option' in dir() else "Auto"
        )

        # Show device info
        device_emoji = "🚀" if device == "cuda" else "🐢"
        if device == "cuda":
            st.success(f"{device_emoji} Running on GPU: {torch.cuda.get_device_name(0)}")
        else:
            st.info(f"{device_emoji} Running on CPU (slower but works!)")

        # Show quantization status
        if use_quantization and device == "cuda":
            st.info("⚡ 8-bit quantization enabled - Lower memory usage!")

    except Exception as e:
        st.error(f"❌ Error loading models: {str(e)}")
        st.stop()

    # Sample prompts
    st.header("💬 Try the Demo")

    sample_prompts = [
        "Write a Python function to calculate factorial",
        "Create a function to check if a string is palindrome",
        "Write code to merge two sorted lists",
        "Implement a function to find the largest element in a list",
        "Create a Python function to check if a number is prime",
        "Write code to reverse a linked list",
        "Implement binary search algorithm in Python"
    ]

    col1, col2 = st.columns([3, 1])
    with col1:
        use_sample = st.selectbox("Select prompt or write custom:", ["Custom"] + sample_prompts)
    with col2:
        st.write("")
        st.write("")

    if use_sample == "Custom":
        user_instruction = st.text_area(
            "Enter your instruction:",
            height=100,
            placeholder="e.g., Write a Python function to sort a dictionary by values"
        )
    else:
        user_instruction = use_sample
        st.info(f"💡 Prompt: {user_instruction}")

    # Generate button
    if st.button("🚀 Generate Responses", type="primary", use_container_width=True):
        if user_instruction.strip():

            col_base, col_finetuned = st.columns(2)

            with col_base:
                st.markdown('<div class="model-box base-model">', unsafe_allow_html=True)
                st.subheader("🔴 Base Model (Untrained)")

                with st.spinner("Generating..."):
                    start_time = time.time()
                    base_response = generate_response(
                        base_model, tokenizer, user_instruction, device,
                        temperature, max_length, top_p
                    )
                    base_time = time.time() - start_time

                st.code(base_response, language="python")
                st.caption(f"⏱️ Generation time: {base_time:.3f}s")
                st.markdown('</div>', unsafe_allow_html=True)

            with col_finetuned:
                st.markdown('<div class="model-box finetuned-model">', unsafe_allow_html=True)
                st.subheader("🟢 Fine-tuned Model (+ LoRA)")

                with st.spinner("Generating..."):
                    start_time = time.time()
                    finetuned_response = generate_response(
                        finetuned_model, tokenizer, user_instruction, device,
                        temperature, max_length, top_p
                    )
                    finetuned_time = time.time() - start_time

                st.code(finetuned_response, language="python")
                st.caption(f"⏱️ Generation time: {finetuned_time:.3f}s")
                st.markdown('</div>', unsafe_allow_html=True)

            # Performance Analysis
            st.divider()
            st.subheader("📊 Performance Analysis")

            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.metric("Base Response", f"{len(base_response.split())} words")
            with col2:
                st.metric("Fine-tuned Response", f"{len(finetuned_response.split())} words")
            with col3:
                speed_diff = ((base_time - finetuned_time) / base_time) * 100
                st.metric("Speed Difference", f"{speed_diff:+.1f}%")
            with col4:
                st.metric("Device", device.upper())

            st.success("✅ Notice: Base model produces gibberish, fine-tuned generates actual Python code!")

        else:
            st.warning("⚠️ Please enter an instruction!")

# =============================================================================
# PAGE 2: THEORY & CONCEPTS
# =============================================================================
elif page == "📊 Theory & Concepts":
    st.header("📚 Theory & Key Concepts")

    tab1, tab2, tab3, tab4 = st.tabs([
        "🎓 Pre-training vs Fine-tuning",
        "🔧 LoRA & PEFT",
        "⚡ Training vs Inference",
        "📏 Trade-offs"
    ])

    with tab1:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("Pre-training vs Fine-tuning")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("### 🏗️ Pre-training")
            st.markdown("""
            - **Task**: Learn general language understanding
            - **Data**: Massive unlabeled text (billions of tokens)
            - **Cost**: Extremely expensive ($$$$$)
            - **Time**: Weeks to months
            - **Example**: GPT, BERT, LLaMA training
            - **Goal**: General purpose model
            """)

        with col2:
            st.markdown("### 🎯 Fine-tuning")
            st.markdown("""
            - **Task**: Adapt to specific domain/task
            - **Data**: Smaller labeled dataset (thousands)
            - **Cost**: Much cheaper ($$)
            - **Time**: Hours to days
            - **Example**: Code generation, Q&A, summarization
            - **Goal**: Specialized model
            """)

        st.divider()

        st.markdown("### 📊 Our Project: Transfer Learning")
        st.info("""
        **We started with**: Pre-trained `distilgpt2` (general language model)  
        **We fine-tuned on**: Python code instructions (5000 samples)  
        **Result**: Model now generates Python code instead of general text!

        This is **Transfer Learning** - leveraging pre-trained knowledge for new tasks.
        """)
        st.markdown('</div>', unsafe_allow_html=True)

    with tab2:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("LoRA: Low-Rank Adaptation")

        col1, col2 = st.columns([1, 1])

        with col1:
            st.markdown("### 🔴 Full Fine-tuning (Expensive)")
            st.markdown("""
            ```
            Total Parameters: 82M
            Trainable: 82M (100%)
            Memory: High
            Time: Long
            GPU: Required (expensive)
            Checkpoint: 320 MB
            ```
            **Problems**:
            - ❌ Expensive GPUs needed
            - ❌ Long training time
            - ❌ Large model checkpoints
            - ❌ Risk of catastrophic forgetting
            """)

        with col2:
            st.markdown("### 🟢 LoRA Fine-tuning (Efficient)")
            st.markdown("""
            ```
            Total Parameters: 82M
            Trainable: 295K (0.36%)
            Memory: Low
            Time: Fast
            GPU: Optional (Colab free tier OK)
            Checkpoint: 3 MB
            ```
            **Advantages**:
            - ✅ Train on free GPUs
            - ✅ Fast training (~30 min)
            - ✅ Tiny adapter files
            - ✅ Preserve base model knowledge
            """)

        st.divider()

        st.markdown("### 🧮 How LoRA Works")
        st.markdown("""
        Instead of updating all weights `W`, LoRA adds small adapter matrices:

        ```
        W_new = W_frozen + ΔW
        where ΔW = B × A  (low-rank decomposition)
        ```

        **Our Configuration**:
        - `r = 16` (rank - controls adapter capacity)
        - `alpha = 32` (scaling factor)
        - Target modules: Attention layers only
        - Result: 99.6% fewer trainable parameters!
        """)
        st.markdown('</div>', unsafe_allow_html=True)

    with tab3:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("Training vs Inference")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("### 🏋️ Training Phase")
            st.markdown("""
            **What happens**:
            - Forward pass through model
            - Calculate loss (prediction error)
            - Backward propagation (gradients)
            - Update weights (only LoRA adapters)

            **Requirements**:
            - GPU highly recommended
            - More memory needed
            - Longer time
            - Batch processing

            **Our Training**:
            - Dataset: 5000 Python code examples
            - Time: ~30 minutes (Colab T4 GPU)
            - Memory: ~8 GB VRAM
            - Output: 3 MB adapter file
            """)

        with col2:
            st.markdown("### 🚀 Inference Phase")
            st.markdown("""
            **What happens**:
            - Load base model + adapters
            - Forward pass only (no backprop)
            - Generate predictions
            - No weight updates

            **Requirements**:
            - CPU works (slower)
            - GPU faster (optional)
            - Less memory
            - Real-time response

            **Our Deployment**:
            - Works on: CPU or GPU
            - Load time: ~10-30 seconds
            - Inference: ~1-3 seconds per response
            - Memory: ~2 GB RAM
            """)

        st.markdown('</div>', unsafe_allow_html=True)

    with tab4:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("Trade-offs & Optimization")

        st.markdown("### ⚖️ Key Trade-offs")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("#### 📏 Model Size vs Accuracy")
            st.markdown("""
            **Larger models**:
            - ✅ Better accuracy
            - ✅ More capacity
            - ❌ Slower inference
            - ❌ More memory

            **Smaller models**:
            - ✅ Faster inference
            - ✅ Less memory
            - ❌ Lower accuracy
            - ❌ Less capacity
            """)

        with col2:
            st.markdown("#### ⚡ Speed vs Quality")
            st.markdown("""
            **Higher quality**:
            - More parameters
            - Longer sequences
            - Lower temperature
            - ❌ Slower

            **Higher speed**:
            - Fewer parameters
            - Shorter sequences
            - Quantization
            - ❌ Potentially lower quality
            """)

        st.divider()

        st.markdown("### 🔢 Quantization")
        st.markdown("""
        **What**: Reduce precision of model weights (32-bit → 8-bit)

        **Benefits**:
        - 75% less memory usage
        - Faster inference on some hardware
        - Enables larger models on limited hardware

        **Cost**:
        - Slight accuracy loss (~1-2%)
        - Requires calibration

        **Try it**: Enable "8-bit quantization" in the sidebar on Demo page!
        """)

        st.markdown('</div>', unsafe_allow_html=True)

# =============================================================================
# PAGE 3: TECHNICAL DETAILS
# =============================================================================
elif page == "⚙️ Technical Details":
    st.header("⚙️ Technical Implementation")

    col1, col2 = st.columns(2)

    with col1:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 📦 Model Architecture")
        st.markdown("""
        **Base Model**: distilgpt2
        - Type: Causal Language Model
        - Parameters: 82M
        - Layers: 6 transformer blocks
        - Hidden size: 768
        - Attention heads: 12
        - Vocabulary: 50,257 tokens
        """)
        st.markdown('</div>', unsafe_allow_html=True)

        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 🔧 LoRA Configuration")
        st.markdown("""
        ```python
        LoraConfig(
            r=16,                    # Rank
            lora_alpha=32,           # Scaling
            target_modules=["c_attn"], # Attention only
            lora_dropout=0.05,
            task_type="CAUSAL_LM"
        )
        ```

        **Trainable Parameters**: 294,912 (0.36%)
        **Adapter Size**: ~3 MB
        """)
        st.markdown('</div>', unsafe_allow_html=True)

    with col2:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 📊 Dataset")
        st.markdown("""
        **Name**: Python Code Instructions (18k Alpaca)
        **Source**: `iamtarun/python_code_instructions_18k_alpaca`
        **Used**: 5000 samples
        - Training: 4500 samples
        - Validation: 500 samples

        **Format**:
        ```
        Instruction: Write Python code for X
        Code: def function()...
        ```
        """)
        st.markdown('</div>', unsafe_allow_html=True)

        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 🏋️ Training Hyperparameters")
        st.markdown("""
        ```python
        Epochs: 4
        Batch size: 2 (per device)
        Gradient accumulation: 4
        Learning rate: 3e-4
        Max sequence length: 512
        Optimizer: AdamW
        Scheduler: Linear warmup
        ```

        **Training Time**: ~30 minutes (T4 GPU)
        **Final Loss**: ~2.5
        """)
        st.markdown('</div>', unsafe_allow_html=True)

    st.divider()

    st.markdown("### 🛠️ Tools & Libraries Used")

    col1, col2, col3 = st.columns(3)

    with col1:
        st.markdown("""
        **Training**:
        - 🤗 Transformers
        - 🎯 PEFT (LoRA)
        - 🚀 Accelerate
        - 📊 Datasets
        - 🔥 PyTorch
        """)

    with col2:
        st.markdown("""
        **Deployment**:
        - 🌐 Streamlit
        - 🤗 Hugging Face Hub
        - ⚡ bitsandbytes (quantization)
        - 💾 safetensors
        """)

    with col3:
        st.markdown("""
        **Infrastructure**:
        - 📓 Google Colab (training)
        - 💻 Local deployment
        - ☁️ Hugging Face Spaces (optional)
        - 🔒 Git LFS (model versioning)
        """)

# =============================================================================
# PAGE 4: DEPLOYMENT INFO
# =============================================================================
else:  # Deployment Info
    st.header("🚀 Deployment Options")

    tab1, tab2, tab3 = st.tabs(["💻 Local", "☁️ Cloud", "📊 Comparison"])

    with tab1:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.markdown("### 💻 Local Deployment (Current)")

        st.markdown("""
        **Advantages**:
        - ✅ Full control
        - ✅ No API costs
        - ✅ Data privacy
        - ✅ Works offline
        - ✅ Fast iteration

        **Requirements**:
        - Python 3.8+
        - 2-4 GB RAM
        - Optional: NVIDIA GPU

        **Setup**:
        ```bash
        pip install streamlit transformers peft torch
        streamlit run app.py
        ```

        **Best for**: Development, testing, demos
        """)
        st.markdown('</div>', unsafe_allow_html=True)

    with tab2:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.markdown("### ☁️ Cloud Deployment")

        st.markdown("#### 🤗 Hugging Face Spaces (Recommended)")
        st.markdown("""
        **Features**:
        - ✅ Free tier available
        - ✅ Auto-deploys from Git
        - ✅ Public URL
        - ✅ No server management
        - ✅ Built-in CI/CD

        **Setup**:
        1. Create account on huggingface.co
        2. Create new Space (Streamlit)
        3. Upload: app.py, requirements.txt, models/
        4. Auto-deploys!

        **URL**: `https://huggingface.co/spaces/YOUR_USERNAME/lora-demo`
        """)

        st.divider()

        st.markdown("#### Other Options")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("""
            **Streamlit Cloud**:
            - Free for public apps
            - GitHub integration
            - Easy deployment
            - Resource limits
            """)

        with col2:
            st.markdown("""
            **AWS/GCP/Azure**:
            - Full control
            - Scalable
            - More expensive
            - Requires devops
            """)

        st.markdown('</div>', unsafe_allow_html=True)

    with tab3:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.markdown("### 📊 Deployment Comparison")

        comparison_data = {
            "Feature": ["Cost", "Setup Time", "Control", "Scalability", "Maintenance", "Best For"],
            "Local": ["Free", "5 mins", "Full", "Limited", "Manual", "Development"],
            "HF Spaces": ["Free", "10 mins", "Medium", "Auto", "Minimal", "Demos"],
            "Cloud (AWS)": ["$$$", "1-2 hours", "Full", "High", "Manual", "Production"]
        }

        st.table(comparison_data)

        st.divider()

        st.markdown("### 🎯 CPU vs GPU Inference")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("""
            **CPU Inference**:
            - Speed: 2-5 seconds/response
            - Cost: $0 (uses existing hardware)
            - Memory: ~2 GB RAM
            - Best for: Low-traffic apps, development
            """)

        with col2:
            st.markdown("""
            **GPU Inference**:
            - Speed: 0.5-2 seconds/response
            - Cost: $0.50-2/hour (cloud)
            - Memory: ~4-8 GB VRAM
            - Best for: High-traffic, real-time apps
            """)

        st.info("💡 **Tip**: Start with CPU deployment, upgrade to GPU only if needed!")

        st.markdown('</div>', unsafe_allow_html=True)

# Footer
st.divider()
st.markdown("""
<div style="text-align: center; color: #666; padding: 1rem;">
    <p><strong>🎓 Group 6: Model Adaptation, Efficient Fine-Tuning & Deployment of LLMs</strong></p>
    <p>Built with Streamlit • Transformers • PEFT • PyTorch</p>
</div>
""", unsafe_allow_html=True)