""" ULTIMATE LoRA Fine-Tuning Demo - Covers ALL Project Requirements Group 6: Model Adaptation, Efficient Fine-Tuning & Deployment of LLMs """ import streamlit as st import torch from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig from peft import PeftModel import time import psutil import os # Page configuration st.set_page_config( page_title="LoRA Fine-Tuning Complete Demo", page_icon="🤖", layout="wide", initial_sidebar_state="expanded" ) # Custom CSS st.markdown(""" """, unsafe_allow_html=True) # Title st.markdown('
🚀 Complete LoRA Fine-Tuning Demo
', unsafe_allow_html=True) st.markdown('
Parameter-Efficient Fine-Tuning & Deployment Showcase
', unsafe_allow_html=True) # Sidebar Navigation with st.sidebar: st.header("📚 Navigation") page = st.radio( "Select Section:", ["🎯 Live Demo", "📊 Theory & Concepts", "⚙️ Technical Details", "🚀 Deployment Info"], label_visibility="collapsed" ) st.divider() if page == "🎯 Live Demo": st.header("⚙️ Model Settings") device_option = st.selectbox( "Inference Device", ["Auto (GPU if available)", "Force CPU", "Force GPU"], help="Compare CPU vs GPU inference speed" ) use_quantization = st.checkbox( "Use 8-bit Quantization", value=False, help="Reduces memory usage, slightly slower" ) temperature = st.slider("Temperature", 0.1, 1.0, 0.3, 0.1) max_length = st.slider("Max Length", 50, 400, 200, 10) top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05) st.divider() st.header("📊 Quick Stats") col1, col2 = st.columns(2) with col1: st.metric("Base Model", "82M params") st.metric("Adapter Size", "~3 MB") with col2: st.metric("Trainable", "0.4%") st.metric("Training Time", "~30 min") # Cache model loading @st.cache_resource def load_models(use_quantization=False, device_option="Auto"): """Load base model and fine-tuned model""" base_model_name = "distilgpt2" adapter_path = "./models/lora_adapters" # Determine device if device_option == "Force CPU": device = "cpu" elif device_option == "Force GPU": device = "cuda" if torch.cuda.is_available() else "cpu" else: device = "cuda" if torch.cuda.is_available() else "cpu" with st.spinner("🔄 Loading models..."): # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(base_model_name) tokenizer.pad_token = tokenizer.eos_token # Quantization config if use_quantization and device == "cuda": quantization_config = BitsAndBytesConfig( load_in_8bit=True, llm_int8_threshold=6.0 ) base_model = AutoModelForCausalLM.from_pretrained( base_model_name, quantization_config=quantization_config, device_map="auto" ) finetuned_model = AutoModelForCausalLM.from_pretrained( base_model_name, quantization_config=quantization_config, device_map="auto" ) finetuned_model = PeftModel.from_pretrained(finetuned_model, adapter_path) else: # Standard loading base_model = AutoModelForCausalLM.from_pretrained(base_model_name) finetuned_model = AutoModelForCausalLM.from_pretrained(base_model_name) finetuned_model = PeftModel.from_pretrained(finetuned_model, adapter_path) base_model.to(device) finetuned_model.to(device) return tokenizer, base_model, finetuned_model, device def get_model_size_mb(model): """Calculate model size in MB""" param_size = sum(p.nelement() * p.element_size() for p in model.parameters()) buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers()) return (param_size + buffer_size) / (1024 ** 2) def generate_response(model, tokenizer, prompt, device, temperature, max_length, top_p): """Generate response from a model""" formatted_input = f"### Instruction:\n{prompt}\n\n### Code:\n" inputs = tokenizer(formatted_input, return_tensors="pt", padding=True) inputs = {k: v.to(device) for k, v in inputs.items()} with torch.no_grad(): outputs = model.generate( **inputs, max_length=max_length, temperature=temperature, top_p=top_p, do_sample=True, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id, eos_token_id=tokenizer.eos_token_id ) response = tokenizer.decode(outputs[0], skip_special_tokens=True) return response # ============================================================================= # PAGE 1: LIVE DEMO # ============================================================================= if page == "🎯 Live Demo": # Load models try: tokenizer, base_model, finetuned_model, device = load_models( use_quantization=use_quantization if 'use_quantization' in dir() else False, device_option=device_option if 'device_option' in dir() else "Auto" ) # Show device info device_emoji = "🚀" if device == "cuda" else "🐢" if device == "cuda": st.success(f"{device_emoji} Running on GPU: {torch.cuda.get_device_name(0)}") else: st.info(f"{device_emoji} Running on CPU (slower but works!)") # Show quantization status if use_quantization and device == "cuda": st.info("⚡ 8-bit quantization enabled - Lower memory usage!") except Exception as e: st.error(f"❌ Error loading models: {str(e)}") st.stop() # Sample prompts st.header("💬 Try the Demo") sample_prompts = [ "Write a Python function to calculate factorial", "Create a function to check if a string is palindrome", "Write code to merge two sorted lists", "Implement a function to find the largest element in a list", "Create a Python function to check if a number is prime", "Write code to reverse a linked list", "Implement binary search algorithm in Python" ] col1, col2 = st.columns([3, 1]) with col1: use_sample = st.selectbox("Select prompt or write custom:", ["Custom"] + sample_prompts) with col2: st.write("") st.write("") if use_sample == "Custom": user_instruction = st.text_area( "Enter your instruction:", height=100, placeholder="e.g., Write a Python function to sort a dictionary by values" ) else: user_instruction = use_sample st.info(f"💡 Prompt: {user_instruction}") # Generate button if st.button("🚀 Generate Responses", type="primary", use_container_width=True): if user_instruction.strip(): col_base, col_finetuned = st.columns(2) with col_base: st.markdown('
', unsafe_allow_html=True) st.subheader("🔴 Base Model (Untrained)") with st.spinner("Generating..."): start_time = time.time() base_response = generate_response( base_model, tokenizer, user_instruction, device, temperature, max_length, top_p ) base_time = time.time() - start_time st.code(base_response, language="python") st.caption(f"⏱️ Generation time: {base_time:.3f}s") st.markdown('
', unsafe_allow_html=True) with col_finetuned: st.markdown('
', unsafe_allow_html=True) st.subheader("🟢 Fine-tuned Model (+ LoRA)") with st.spinner("Generating..."): start_time = time.time() finetuned_response = generate_response( finetuned_model, tokenizer, user_instruction, device, temperature, max_length, top_p ) finetuned_time = time.time() - start_time st.code(finetuned_response, language="python") st.caption(f"⏱️ Generation time: {finetuned_time:.3f}s") st.markdown('
', unsafe_allow_html=True) # Performance Analysis st.divider() st.subheader("📊 Performance Analysis") col1, col2, col3, col4 = st.columns(4) with col1: st.metric("Base Response", f"{len(base_response.split())} words") with col2: st.metric("Fine-tuned Response", f"{len(finetuned_response.split())} words") with col3: speed_diff = ((base_time - finetuned_time) / base_time) * 100 st.metric("Speed Difference", f"{speed_diff:+.1f}%") with col4: st.metric("Device", device.upper()) st.success("✅ Notice: Base model produces gibberish, fine-tuned generates actual Python code!") else: st.warning("⚠️ Please enter an instruction!") # ============================================================================= # PAGE 2: THEORY & CONCEPTS # ============================================================================= elif page == "📊 Theory & Concepts": st.header("📚 Theory & Key Concepts") tab1, tab2, tab3, tab4 = st.tabs([ "🎓 Pre-training vs Fine-tuning", "🔧 LoRA & PEFT", "⚡ Training vs Inference", "📏 Trade-offs" ]) with tab1: st.markdown('
', unsafe_allow_html=True) st.subheader("Pre-training vs Fine-tuning") col1, col2 = st.columns(2) with col1: st.markdown("### 🏗️ Pre-training") st.markdown(""" - **Task**: Learn general language understanding - **Data**: Massive unlabeled text (billions of tokens) - **Cost**: Extremely expensive ($$$$$) - **Time**: Weeks to months - **Example**: GPT, BERT, LLaMA training - **Goal**: General purpose model """) with col2: st.markdown("### 🎯 Fine-tuning") st.markdown(""" - **Task**: Adapt to specific domain/task - **Data**: Smaller labeled dataset (thousands) - **Cost**: Much cheaper ($$) - **Time**: Hours to days - **Example**: Code generation, Q&A, summarization - **Goal**: Specialized model """) st.divider() st.markdown("### 📊 Our Project: Transfer Learning") st.info(""" **We started with**: Pre-trained `distilgpt2` (general language model) **We fine-tuned on**: Python code instructions (5000 samples) **Result**: Model now generates Python code instead of general text! This is **Transfer Learning** - leveraging pre-trained knowledge for new tasks. """) st.markdown('
', unsafe_allow_html=True) with tab2: st.markdown('
', unsafe_allow_html=True) st.subheader("LoRA: Low-Rank Adaptation") col1, col2 = st.columns([1, 1]) with col1: st.markdown("### 🔴 Full Fine-tuning (Expensive)") st.markdown(""" ``` Total Parameters: 82M Trainable: 82M (100%) Memory: High Time: Long GPU: Required (expensive) Checkpoint: 320 MB ``` **Problems**: - ❌ Expensive GPUs needed - ❌ Long training time - ❌ Large model checkpoints - ❌ Risk of catastrophic forgetting """) with col2: st.markdown("### 🟢 LoRA Fine-tuning (Efficient)") st.markdown(""" ``` Total Parameters: 82M Trainable: 295K (0.36%) Memory: Low Time: Fast GPU: Optional (Colab free tier OK) Checkpoint: 3 MB ``` **Advantages**: - ✅ Train on free GPUs - ✅ Fast training (~30 min) - ✅ Tiny adapter files - ✅ Preserve base model knowledge """) st.divider() st.markdown("### 🧮 How LoRA Works") st.markdown(""" Instead of updating all weights `W`, LoRA adds small adapter matrices: ``` W_new = W_frozen + ΔW where ΔW = B × A (low-rank decomposition) ``` **Our Configuration**: - `r = 16` (rank - controls adapter capacity) - `alpha = 32` (scaling factor) - Target modules: Attention layers only - Result: 99.6% fewer trainable parameters! """) st.markdown('
', unsafe_allow_html=True) with tab3: st.markdown('
', unsafe_allow_html=True) st.subheader("Training vs Inference") col1, col2 = st.columns(2) with col1: st.markdown("### 🏋️ Training Phase") st.markdown(""" **What happens**: - Forward pass through model - Calculate loss (prediction error) - Backward propagation (gradients) - Update weights (only LoRA adapters) **Requirements**: - GPU highly recommended - More memory needed - Longer time - Batch processing **Our Training**: - Dataset: 5000 Python code examples - Time: ~30 minutes (Colab T4 GPU) - Memory: ~8 GB VRAM - Output: 3 MB adapter file """) with col2: st.markdown("### 🚀 Inference Phase") st.markdown(""" **What happens**: - Load base model + adapters - Forward pass only (no backprop) - Generate predictions - No weight updates **Requirements**: - CPU works (slower) - GPU faster (optional) - Less memory - Real-time response **Our Deployment**: - Works on: CPU or GPU - Load time: ~10-30 seconds - Inference: ~1-3 seconds per response - Memory: ~2 GB RAM """) st.markdown('
', unsafe_allow_html=True) with tab4: st.markdown('
', unsafe_allow_html=True) st.subheader("Trade-offs & Optimization") st.markdown("### ⚖️ Key Trade-offs") col1, col2 = st.columns(2) with col1: st.markdown("#### 📏 Model Size vs Accuracy") st.markdown(""" **Larger models**: - ✅ Better accuracy - ✅ More capacity - ❌ Slower inference - ❌ More memory **Smaller models**: - ✅ Faster inference - ✅ Less memory - ❌ Lower accuracy - ❌ Less capacity """) with col2: st.markdown("#### ⚡ Speed vs Quality") st.markdown(""" **Higher quality**: - More parameters - Longer sequences - Lower temperature - ❌ Slower **Higher speed**: - Fewer parameters - Shorter sequences - Quantization - ❌ Potentially lower quality """) st.divider() st.markdown("### 🔢 Quantization") st.markdown(""" **What**: Reduce precision of model weights (32-bit → 8-bit) **Benefits**: - 75% less memory usage - Faster inference on some hardware - Enables larger models on limited hardware **Cost**: - Slight accuracy loss (~1-2%) - Requires calibration **Try it**: Enable "8-bit quantization" in the sidebar on Demo page! """) st.markdown('
', unsafe_allow_html=True) # ============================================================================= # PAGE 3: TECHNICAL DETAILS # ============================================================================= elif page == "⚙️ Technical Details": st.header("⚙️ Technical Implementation") col1, col2 = st.columns(2) with col1: st.markdown('
', unsafe_allow_html=True) st.markdown("### 📦 Model Architecture") st.markdown(""" **Base Model**: distilgpt2 - Type: Causal Language Model - Parameters: 82M - Layers: 6 transformer blocks - Hidden size: 768 - Attention heads: 12 - Vocabulary: 50,257 tokens """) st.markdown('
', unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True) st.markdown("### 🔧 LoRA Configuration") st.markdown(""" ```python LoraConfig( r=16, # Rank lora_alpha=32, # Scaling target_modules=["c_attn"], # Attention only lora_dropout=0.05, task_type="CAUSAL_LM" ) ``` **Trainable Parameters**: 294,912 (0.36%) **Adapter Size**: ~3 MB """) st.markdown('
', unsafe_allow_html=True) with col2: st.markdown('
', unsafe_allow_html=True) st.markdown("### 📊 Dataset") st.markdown(""" **Name**: Python Code Instructions (18k Alpaca) **Source**: `iamtarun/python_code_instructions_18k_alpaca` **Used**: 5000 samples - Training: 4500 samples - Validation: 500 samples **Format**: ``` Instruction: Write Python code for X Code: def function()... ``` """) st.markdown('
', unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True) st.markdown("### 🏋️ Training Hyperparameters") st.markdown(""" ```python Epochs: 4 Batch size: 2 (per device) Gradient accumulation: 4 Learning rate: 3e-4 Max sequence length: 512 Optimizer: AdamW Scheduler: Linear warmup ``` **Training Time**: ~30 minutes (T4 GPU) **Final Loss**: ~2.5 """) st.markdown('
', unsafe_allow_html=True) st.divider() st.markdown("### 🛠️ Tools & Libraries Used") col1, col2, col3 = st.columns(3) with col1: st.markdown(""" **Training**: - 🤗 Transformers - 🎯 PEFT (LoRA) - 🚀 Accelerate - 📊 Datasets - 🔥 PyTorch """) with col2: st.markdown(""" **Deployment**: - 🌐 Streamlit - 🤗 Hugging Face Hub - ⚡ bitsandbytes (quantization) - 💾 safetensors """) with col3: st.markdown(""" **Infrastructure**: - 📓 Google Colab (training) - 💻 Local deployment - ☁️ Hugging Face Spaces (optional) - 🔒 Git LFS (model versioning) """) # ============================================================================= # PAGE 4: DEPLOYMENT INFO # ============================================================================= else: # Deployment Info st.header("🚀 Deployment Options") tab1, tab2, tab3 = st.tabs(["💻 Local", "☁️ Cloud", "📊 Comparison"]) with tab1: st.markdown('
', unsafe_allow_html=True) st.markdown("### 💻 Local Deployment (Current)") st.markdown(""" **Advantages**: - ✅ Full control - ✅ No API costs - ✅ Data privacy - ✅ Works offline - ✅ Fast iteration **Requirements**: - Python 3.8+ - 2-4 GB RAM - Optional: NVIDIA GPU **Setup**: ```bash pip install streamlit transformers peft torch streamlit run app.py ``` **Best for**: Development, testing, demos """) st.markdown('
', unsafe_allow_html=True) with tab2: st.markdown('
', unsafe_allow_html=True) st.markdown("### ☁️ Cloud Deployment") st.markdown("#### 🤗 Hugging Face Spaces (Recommended)") st.markdown(""" **Features**: - ✅ Free tier available - ✅ Auto-deploys from Git - ✅ Public URL - ✅ No server management - ✅ Built-in CI/CD **Setup**: 1. Create account on huggingface.co 2. Create new Space (Streamlit) 3. Upload: app.py, requirements.txt, models/ 4. Auto-deploys! **URL**: `https://huggingface.co/spaces/YOUR_USERNAME/lora-demo` """) st.divider() st.markdown("#### Other Options") col1, col2 = st.columns(2) with col1: st.markdown(""" **Streamlit Cloud**: - Free for public apps - GitHub integration - Easy deployment - Resource limits """) with col2: st.markdown(""" **AWS/GCP/Azure**: - Full control - Scalable - More expensive - Requires devops """) st.markdown('
', unsafe_allow_html=True) with tab3: st.markdown('
', unsafe_allow_html=True) st.markdown("### 📊 Deployment Comparison") comparison_data = { "Feature": ["Cost", "Setup Time", "Control", "Scalability", "Maintenance", "Best For"], "Local": ["Free", "5 mins", "Full", "Limited", "Manual", "Development"], "HF Spaces": ["Free", "10 mins", "Medium", "Auto", "Minimal", "Demos"], "Cloud (AWS)": ["$$$", "1-2 hours", "Full", "High", "Manual", "Production"] } st.table(comparison_data) st.divider() st.markdown("### 🎯 CPU vs GPU Inference") col1, col2 = st.columns(2) with col1: st.markdown(""" **CPU Inference**: - Speed: 2-5 seconds/response - Cost: $0 (uses existing hardware) - Memory: ~2 GB RAM - Best for: Low-traffic apps, development """) with col2: st.markdown(""" **GPU Inference**: - Speed: 0.5-2 seconds/response - Cost: $0.50-2/hour (cloud) - Memory: ~4-8 GB VRAM - Best for: High-traffic, real-time apps """) st.info("💡 **Tip**: Start with CPU deployment, upgrade to GPU only if needed!") st.markdown('
', unsafe_allow_html=True) # Footer st.divider() st.markdown("""

🎓 Group 6: Model Adaptation, Efficient Fine-Tuning & Deployment of LLMs

Built with Streamlit • Transformers • PEFT • PyTorch

""", unsafe_allow_html=True)