Spaces:

Lubabah0
/

lora-code-generation-demo

Build error

File size: 25,628 Bytes

adfb728

"""

ULTIMATE LoRA Fine-Tuning Demo - Covers ALL Project Requirements

Group 6: Model Adaptation, Efficient Fine-Tuning & Deployment of LLMs

"""

import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import time
import psutil
import os

# Page configuration
st.set_page_config(
    page_title="LoRA Fine-Tuning Complete Demo",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS
st.markdown("""

<style>

    .main-header {

        font-size: 2.5rem;

        font-weight: bold;

        text-align: center;

        background: linear-gradient(120deg, #1f77b4, #00cc88);

        -webkit-background-clip: text;

        -webkit-text-fill-color: transparent;

        margin-bottom: 0.5rem;

    }

    .sub-header {

        text-align: center;

        color: #666;

        margin-bottom: 2rem;

        font-size: 1.1rem;

    }

    .metric-card {

        background: #f0f2f6;

        padding: 1rem;

        border-radius: 10px;

        border-left: 4px solid #1f77b4;

    }

    .model-box {

        padding: 1.5rem;

        border-radius: 10px;

        margin: 1rem 0;

        box-shadow: 0 2px 4px rgba(0,0,0,0.1);

    }

    .base-model {

        background-color: #fff5f5;

        border-left: 4px solid #ff4b4b;

    }

    .finetuned-model {

        background-color: #f0fff4;

        border-left: 4px solid #00cc88;

    }

    .theory-box {

        background: #e8f4f8;

        padding: 1.5rem;

        border-radius: 10px;

        margin: 1rem 0;

        border-left: 4px solid #1f77b4;

    }

</style>

""", unsafe_allow_html=True)

# Title
st.markdown('<div class="main-header">🚀 Complete LoRA Fine-Tuning Demo</div>', unsafe_allow_html=True)
st.markdown('<div class="sub-header">Parameter-Efficient Fine-Tuning & Deployment Showcase</div>',
            unsafe_allow_html=True)

# Sidebar Navigation
with st.sidebar:
    st.header("📚 Navigation")
    page = st.radio(
        "Select Section:",
        ["🎯 Live Demo", "📊 Theory & Concepts", "⚙️ Technical Details", "🚀 Deployment Info"],
        label_visibility="collapsed"
    )

    st.divider()

    if page == "🎯 Live Demo":
        st.header("⚙️ Model Settings")

        device_option = st.selectbox(
            "Inference Device",
            ["Auto (GPU if available)", "Force CPU", "Force GPU"],
            help="Compare CPU vs GPU inference speed"
        )

        use_quantization = st.checkbox(
            "Use 8-bit Quantization",
            value=False,
            help="Reduces memory usage, slightly slower"
        )

        temperature = st.slider("Temperature", 0.1, 1.0, 0.3, 0.1)
        max_length = st.slider("Max Length", 50, 400, 200, 10)
        top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)

        st.divider()

        st.header("📊 Quick Stats")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Base Model", "82M params")
            st.metric("Adapter Size", "~3 MB")
        with col2:
            st.metric("Trainable", "0.4%")
            st.metric("Training Time", "~30 min")


# Cache model loading
@st.cache_resource
def load_models(use_quantization=False, device_option="Auto"):
    """Load base model and fine-tuned model"""

    base_model_name = "distilgpt2"
    adapter_path = "./models/lora_adapters"

    # Determine device
    if device_option == "Force CPU":
        device = "cpu"
    elif device_option == "Force GPU":
        device = "cuda" if torch.cuda.is_available() else "cpu"
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    with st.spinner("🔄 Loading models..."):
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        tokenizer.pad_token = tokenizer.eos_token

        # Quantization config
        if use_quantization and device == "cuda":
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_threshold=6.0
            )
            base_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                quantization_config=quantization_config,
                device_map="auto"
            )
            finetuned_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                quantization_config=quantization_config,
                device_map="auto"
            )
            finetuned_model = PeftModel.from_pretrained(finetuned_model, adapter_path)
        else:
            # Standard loading
            base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
            finetuned_model = AutoModelForCausalLM.from_pretrained(base_model_name)
            finetuned_model = PeftModel.from_pretrained(finetuned_model, adapter_path)

            base_model.to(device)
            finetuned_model.to(device)

    return tokenizer, base_model, finetuned_model, device


def get_model_size_mb(model):
    """Calculate model size in MB"""
    param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
    return (param_size + buffer_size) / (1024 ** 2)


def generate_response(model, tokenizer, prompt, device, temperature, max_length, top_p):
    """Generate response from a model"""
    formatted_input = f"### Instruction:\n{prompt}\n\n### Code:\n"
    inputs = tokenizer(formatted_input, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


# =============================================================================
# PAGE 1: LIVE DEMO
# =============================================================================
if page == "🎯 Live Demo":
    # Load models
    try:
        tokenizer, base_model, finetuned_model, device = load_models(
            use_quantization=use_quantization if 'use_quantization' in dir() else False,
            device_option=device_option if 'device_option' in dir() else "Auto"
        )

        # Show device info
        device_emoji = "🚀" if device == "cuda" else "🐢"
        if device == "cuda":
            st.success(f"{device_emoji} Running on GPU: {torch.cuda.get_device_name(0)}")
        else:
            st.info(f"{device_emoji} Running on CPU (slower but works!)")

        # Show quantization status
        if use_quantization and device == "cuda":
            st.info("⚡ 8-bit quantization enabled - Lower memory usage!")

    except Exception as e:
        st.error(f"❌ Error loading models: {str(e)}")
        st.stop()

    # Sample prompts
    st.header("💬 Try the Demo")

    sample_prompts = [
        "Write a Python function to calculate factorial",
        "Create a function to check if a string is palindrome",
        "Write code to merge two sorted lists",
        "Implement a function to find the largest element in a list",
        "Create a Python function to check if a number is prime",
        "Write code to reverse a linked list",
        "Implement binary search algorithm in Python"
    ]

    col1, col2 = st.columns([3, 1])
    with col1:
        use_sample = st.selectbox("Select prompt or write custom:", ["Custom"] + sample_prompts)
    with col2:
        st.write("")
        st.write("")

    if use_sample == "Custom":
        user_instruction = st.text_area(
            "Enter your instruction:",
            height=100,
            placeholder="e.g., Write a Python function to sort a dictionary by values"
        )
    else:
        user_instruction = use_sample
        st.info(f"💡 Prompt: {user_instruction}")

    # Generate button
    if st.button("🚀 Generate Responses", type="primary", use_container_width=True):
        if user_instruction.strip():

            col_base, col_finetuned = st.columns(2)

            with col_base:
                st.markdown('<div class="model-box base-model">', unsafe_allow_html=True)
                st.subheader("🔴 Base Model (Untrained)")

                with st.spinner("Generating..."):
                    start_time = time.time()
                    base_response = generate_response(
                        base_model, tokenizer, user_instruction, device,
                        temperature, max_length, top_p
                    )
                    base_time = time.time() - start_time

                st.code(base_response, language="python")
                st.caption(f"⏱️ Generation time: {base_time:.3f}s")
                st.markdown('</div>', unsafe_allow_html=True)

            with col_finetuned:
                st.markdown('<div class="model-box finetuned-model">', unsafe_allow_html=True)
                st.subheader("🟢 Fine-tuned Model (+ LoRA)")

                with st.spinner("Generating..."):
                    start_time = time.time()
                    finetuned_response = generate_response(
                        finetuned_model, tokenizer, user_instruction, device,
                        temperature, max_length, top_p
                    )
                    finetuned_time = time.time() - start_time

                st.code(finetuned_response, language="python")
                st.caption(f"⏱️ Generation time: {finetuned_time:.3f}s")
                st.markdown('</div>', unsafe_allow_html=True)

            # Performance Analysis
            st.divider()
            st.subheader("📊 Performance Analysis")

            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.metric("Base Response", f"{len(base_response.split())} words")
            with col2:
                st.metric("Fine-tuned Response", f"{len(finetuned_response.split())} words")
            with col3:
                speed_diff = ((base_time - finetuned_time) / base_time) * 100
                st.metric("Speed Difference", f"{speed_diff:+.1f}%")
            with col4:
                st.metric("Device", device.upper())

            st.success("✅ Notice: Base model produces gibberish, fine-tuned generates actual Python code!")

        else:
            st.warning("⚠️ Please enter an instruction!")

# =============================================================================
# PAGE 2: THEORY & CONCEPTS
# =============================================================================
elif page == "📊 Theory & Concepts":
    st.header("📚 Theory & Key Concepts")

    tab1, tab2, tab3, tab4 = st.tabs([
        "🎓 Pre-training vs Fine-tuning",
        "🔧 LoRA & PEFT",
        "⚡ Training vs Inference",
        "📏 Trade-offs"
    ])

    with tab1:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("Pre-training vs Fine-tuning")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("### 🏗️ Pre-training")
            st.markdown("""

            - **Task**: Learn general language understanding

            - **Data**: Massive unlabeled text (billions of tokens)

            - **Cost**: Extremely expensive ($$$$$)

            - **Time**: Weeks to months

            - **Example**: GPT, BERT, LLaMA training

            - **Goal**: General purpose model

            """)

        with col2:
            st.markdown("### 🎯 Fine-tuning")
            st.markdown("""

            - **Task**: Adapt to specific domain/task

            - **Data**: Smaller labeled dataset (thousands)

            - **Cost**: Much cheaper ($$)

            - **Time**: Hours to days

            - **Example**: Code generation, Q&A, summarization

            - **Goal**: Specialized model

            """)

        st.divider()

        st.markdown("### 📊 Our Project: Transfer Learning")
        st.info("""

        **We started with**: Pre-trained `distilgpt2` (general language model)  

        **We fine-tuned on**: Python code instructions (5000 samples)  

        **Result**: Model now generates Python code instead of general text!



        This is **Transfer Learning** - leveraging pre-trained knowledge for new tasks.

        """)
        st.markdown('</div>', unsafe_allow_html=True)

    with tab2:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("LoRA: Low-Rank Adaptation")

        col1, col2 = st.columns([1, 1])

        with col1:
            st.markdown("### 🔴 Full Fine-tuning (Expensive)")
            st.markdown("""

            ```

            Total Parameters: 82M

            Trainable: 82M (100%)

            Memory: High

            Time: Long

            GPU: Required (expensive)

            Checkpoint: 320 MB

            ```

            **Problems**:

            - ❌ Expensive GPUs needed

            - ❌ Long training time

            - ❌ Large model checkpoints

            - ❌ Risk of catastrophic forgetting

            """)

        with col2:
            st.markdown("### 🟢 LoRA Fine-tuning (Efficient)")
            st.markdown("""

            ```

            Total Parameters: 82M

            Trainable: 295K (0.36%)

            Memory: Low

            Time: Fast

            GPU: Optional (Colab free tier OK)

            Checkpoint: 3 MB

            ```

            **Advantages**:

            - ✅ Train on free GPUs

            - ✅ Fast training (~30 min)

            - ✅ Tiny adapter files

            - ✅ Preserve base model knowledge

            """)

        st.divider()

        st.markdown("### 🧮 How LoRA Works")
        st.markdown("""

        Instead of updating all weights `W`, LoRA adds small adapter matrices:



        ```

        W_new = W_frozen + ΔW

        where ΔW = B × A  (low-rank decomposition)

        ```



        **Our Configuration**:

        - `r = 16` (rank - controls adapter capacity)

        - `alpha = 32` (scaling factor)

        - Target modules: Attention layers only

        - Result: 99.6% fewer trainable parameters!

        """)
        st.markdown('</div>', unsafe_allow_html=True)

    with tab3:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("Training vs Inference")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("### 🏋️ Training Phase")
            st.markdown("""

            **What happens**:

            - Forward pass through model

            - Calculate loss (prediction error)

            - Backward propagation (gradients)

            - Update weights (only LoRA adapters)



            **Requirements**:

            - GPU highly recommended

            - More memory needed

            - Longer time

            - Batch processing



            **Our Training**:

            - Dataset: 5000 Python code examples

            - Time: ~30 minutes (Colab T4 GPU)

            - Memory: ~8 GB VRAM

            - Output: 3 MB adapter file

            """)

        with col2:
            st.markdown("### 🚀 Inference Phase")
            st.markdown("""

            **What happens**:

            - Load base model + adapters

            - Forward pass only (no backprop)

            - Generate predictions

            - No weight updates



            **Requirements**:

            - CPU works (slower)

            - GPU faster (optional)

            - Less memory

            - Real-time response



            **Our Deployment**:

            - Works on: CPU or GPU

            - Load time: ~10-30 seconds

            - Inference: ~1-3 seconds per response

            - Memory: ~2 GB RAM

            """)

        st.markdown('</div>', unsafe_allow_html=True)

    with tab4:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("Trade-offs & Optimization")

        st.markdown("### ⚖️ Key Trade-offs")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("#### 📏 Model Size vs Accuracy")
            st.markdown("""

            **Larger models**:

            - ✅ Better accuracy

            - ✅ More capacity

            - ❌ Slower inference

            - ❌ More memory



            **Smaller models**:

            - ✅ Faster inference

            - ✅ Less memory

            - ❌ Lower accuracy

            - ❌ Less capacity

            """)

        with col2:
            st.markdown("#### ⚡ Speed vs Quality")
            st.markdown("""

            **Higher quality**:

            - More parameters

            - Longer sequences

            - Lower temperature

            - ❌ Slower



            **Higher speed**:

            - Fewer parameters

            - Shorter sequences

            - Quantization

            - ❌ Potentially lower quality

            """)

        st.divider()

        st.markdown("### 🔢 Quantization")
        st.markdown("""

        **What**: Reduce precision of model weights (32-bit → 8-bit)



        **Benefits**:

        - 75% less memory usage

        - Faster inference on some hardware

        - Enables larger models on limited hardware



        **Cost**:

        - Slight accuracy loss (~1-2%)

        - Requires calibration



        **Try it**: Enable "8-bit quantization" in the sidebar on Demo page!

        """)

        st.markdown('</div>', unsafe_allow_html=True)

# =============================================================================
# PAGE 3: TECHNICAL DETAILS
# =============================================================================
elif page == "⚙️ Technical Details":
    st.header("⚙️ Technical Implementation")

    col1, col2 = st.columns(2)

    with col1:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 📦 Model Architecture")
        st.markdown("""

        **Base Model**: distilgpt2

        - Type: Causal Language Model

        - Parameters: 82M

        - Layers: 6 transformer blocks

        - Hidden size: 768

        - Attention heads: 12

        - Vocabulary: 50,257 tokens

        """)
        st.markdown('</div>', unsafe_allow_html=True)

        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 🔧 LoRA Configuration")
        st.markdown("""

        ```python

        LoraConfig(

            r=16,                    # Rank

            lora_alpha=32,           # Scaling

            target_modules=["c_attn"], # Attention only

            lora_dropout=0.05,

            task_type="CAUSAL_LM"

        )

        ```



        **Trainable Parameters**: 294,912 (0.36%)

        **Adapter Size**: ~3 MB

        """)
        st.markdown('</div>', unsafe_allow_html=True)

    with col2:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 📊 Dataset")
        st.markdown("""

        **Name**: Python Code Instructions (18k Alpaca)

        **Source**: `iamtarun/python_code_instructions_18k_alpaca`

        **Used**: 5000 samples

        - Training: 4500 samples

        - Validation: 500 samples



        **Format**:

        ```

        Instruction: Write Python code for X

        Code: def function()...

        ```

        """)
        st.markdown('</div>', unsafe_allow_html=True)

        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 🏋️ Training Hyperparameters")
        st.markdown("""

        ```python

        Epochs: 4

        Batch size: 2 (per device)

        Gradient accumulation: 4

        Learning rate: 3e-4

        Max sequence length: 512

        Optimizer: AdamW

        Scheduler: Linear warmup

        ```



        **Training Time**: ~30 minutes (T4 GPU)

        **Final Loss**: ~2.5

        """)
        st.markdown('</div>', unsafe_allow_html=True)

    st.divider()

    st.markdown("### 🛠️ Tools & Libraries Used")

    col1, col2, col3 = st.columns(3)

    with col1:
        st.markdown("""

        **Training**:

        - 🤗 Transformers

        - 🎯 PEFT (LoRA)

        - 🚀 Accelerate

        - 📊 Datasets

        - 🔥 PyTorch

        """)

    with col2:
        st.markdown("""

        **Deployment**:

        - 🌐 Streamlit

        - 🤗 Hugging Face Hub

        - ⚡ bitsandbytes (quantization)

        - 💾 safetensors

        """)

    with col3:
        st.markdown("""

        **Infrastructure**:

        - 📓 Google Colab (training)

        - 💻 Local deployment

        - ☁️ Hugging Face Spaces (optional)

        - 🔒 Git LFS (model versioning)

        """)

# =============================================================================
# PAGE 4: DEPLOYMENT INFO
# =============================================================================
else:  # Deployment Info
    st.header("🚀 Deployment Options")

    tab1, tab2, tab3 = st.tabs(["💻 Local", "☁️ Cloud", "📊 Comparison"])

    with tab1:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.markdown("### 💻 Local Deployment (Current)")

        st.markdown("""

        **Advantages**:

        - ✅ Full control

        - ✅ No API costs

        - ✅ Data privacy

        - ✅ Works offline

        - ✅ Fast iteration



        **Requirements**:

        - Python 3.8+

        - 2-4 GB RAM

        - Optional: NVIDIA GPU



        **Setup**:

        ```bash

        pip install streamlit transformers peft torch

        streamlit run app.py

        ```



        **Best for**: Development, testing, demos

        """)
        st.markdown('</div>', unsafe_allow_html=True)

    with tab2:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.markdown("### ☁️ Cloud Deployment")

        st.markdown("#### 🤗 Hugging Face Spaces (Recommended)")
        st.markdown("""

        **Features**:

        - ✅ Free tier available

        - ✅ Auto-deploys from Git

        - ✅ Public URL

        - ✅ No server management

        - ✅ Built-in CI/CD



        **Setup**:

        1. Create account on huggingface.co

        2. Create new Space (Streamlit)

        3. Upload: app.py, requirements.txt, models/

        4. Auto-deploys!



        **URL**: `https://huggingface.co/spaces/YOUR_USERNAME/lora-demo`

        """)

        st.divider()

        st.markdown("#### Other Options")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("""

            **Streamlit Cloud**:

            - Free for public apps

            - GitHub integration

            - Easy deployment

            - Resource limits

            """)

        with col2:
            st.markdown("""

            **AWS/GCP/Azure**:

            - Full control

            - Scalable

            - More expensive

            - Requires devops

            """)

        st.markdown('</div>', unsafe_allow_html=True)

    with tab3:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.markdown("### 📊 Deployment Comparison")

        comparison_data = {
            "Feature": ["Cost", "Setup Time", "Control", "Scalability", "Maintenance", "Best For"],
            "Local": ["Free", "5 mins", "Full", "Limited", "Manual", "Development"],
            "HF Spaces": ["Free", "10 mins", "Medium", "Auto", "Minimal", "Demos"],
            "Cloud (AWS)": ["$$$", "1-2 hours", "Full", "High", "Manual", "Production"]
        }

        st.table(comparison_data)

        st.divider()

        st.markdown("### 🎯 CPU vs GPU Inference")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("""

            **CPU Inference**:

            - Speed: 2-5 seconds/response

            - Cost: $0 (uses existing hardware)

            - Memory: ~2 GB RAM

            - Best for: Low-traffic apps, development

            """)

        with col2:
            st.markdown("""

            **GPU Inference**:

            - Speed: 0.5-2 seconds/response

            - Cost: $0.50-2/hour (cloud)

            - Memory: ~4-8 GB VRAM

            - Best for: High-traffic, real-time apps

            """)

        st.info("💡 **Tip**: Start with CPU deployment, upgrade to GPU only if needed!")

        st.markdown('</div>', unsafe_allow_html=True)

# Footer
st.divider()
st.markdown("""

<div style="text-align: center; color: #666; padding: 1rem;">

    <p><strong>🎓 Group 6: Model Adaptation, Efficient Fine-Tuning & Deployment of LLMs</strong></p>

    <p>Built with Streamlit • Transformers • PEFT • PyTorch</p>

</div>

""", unsafe_allow_html=True)