# Hugging Face upload-page residue (not code), preserved as a comment:
# uploader "Lubabah0" — "Upload 10 files" — commit adfb728 (verified)
"""
ULTIMATE LoRA Fine-Tuning Demo - Covers ALL Project Requirements
Group 6: Model Adaptation, Efficient Fine-Tuning & Deployment of LLMs
"""
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import time
import psutil
import os
# Page configuration -- must be the first Streamlit command executed.
st.set_page_config(
    page_title="LoRA Fine-Tuning Complete Demo",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Custom CSS: gradient main header, colored comparison boxes for the
# base vs fine-tuned model outputs, and info/theory cards.
st.markdown("""
<style>
.main-header {
font-size: 2.5rem;
font-weight: bold;
text-align: center;
background: linear-gradient(120deg, #1f77b4, #00cc88);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 0.5rem;
}
.sub-header {
text-align: center;
color: #666;
margin-bottom: 2rem;
font-size: 1.1rem;
}
.metric-card {
background: #f0f2f6;
padding: 1rem;
border-radius: 10px;
border-left: 4px solid #1f77b4;
}
.model-box {
padding: 1.5rem;
border-radius: 10px;
margin: 1rem 0;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.base-model {
background-color: #fff5f5;
border-left: 4px solid #ff4b4b;
}
.finetuned-model {
background-color: #f0fff4;
border-left: 4px solid #00cc88;
}
.theory-box {
background: #e8f4f8;
padding: 1.5rem;
border-radius: 10px;
margin: 1rem 0;
border-left: 4px solid #1f77b4;
}
</style>
""", unsafe_allow_html=True)
# Title banner (styled by the CSS classes injected above).
st.markdown('<div class="main-header">🚀 Complete LoRA Fine-Tuning Demo</div>', unsafe_allow_html=True)
st.markdown('<div class="sub-header">Parameter-Efficient Fine-Tuning & Deployment Showcase</div>',
            unsafe_allow_html=True)
# Sidebar Navigation -- `page` selects which of the four sections renders.
with st.sidebar:
    st.header("📚 Navigation")
    page = st.radio(
        "Select Section:",
        ["🎯 Live Demo", "📊 Theory & Concepts", "⚙️ Technical Details", "🚀 Deployment Info"],
        label_visibility="collapsed"
    )
    st.divider()
    # Generation settings are only rendered (and the variables below only
    # defined) when the Live Demo page is selected.
    if page == "🎯 Live Demo":
        st.header("⚙️ Model Settings")
        device_option = st.selectbox(
            "Inference Device",
            ["Auto (GPU if available)", "Force CPU", "Force GPU"],
            help="Compare CPU vs GPU inference speed"
        )
        use_quantization = st.checkbox(
            "Use 8-bit Quantization",
            value=False,
            help="Reduces memory usage, slightly slower"
        )
        # st.slider(label, min, max, default, step)
        temperature = st.slider("Temperature", 0.1, 1.0, 0.3, 0.1)
        max_length = st.slider("Max Length", 50, 400, 200, 10)
        top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)
        st.divider()
        st.header("📊 Quick Stats")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Base Model", "82M params")
            st.metric("Adapter Size", "~3 MB")
        with col2:
            st.metric("Trainable", "0.4%")
            st.metric("Training Time", "~30 min")
# Cache model loading so Streamlit reruns don't reload weights from disk.
@st.cache_resource
def load_models(use_quantization=False, device_option="Auto"):
    """Load the base model and the LoRA fine-tuned model.

    Args:
        use_quantization: load both models in 8-bit via bitsandbytes
            (only takes effect when a CUDA device is available).
        device_option: "Force CPU" pins inference to CPU; "Force GPU" and
            any other value both auto-select CUDA when available.

    Returns:
        (tokenizer, base_model, finetuned_model, device) where device is
        the torch device string actually used ("cuda" or "cpu").
    """
    base_model_name = "distilgpt2"
    adapter_path = "./models/lora_adapters"
    # Determine device. "Force GPU" still falls back to CPU when no CUDA
    # device exists, so it resolves identically to "Auto".
    if device_option == "Force CPU":
        device = "cpu"
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    with st.spinner("🔄 Loading models..."):
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        # GPT-2 has no pad token; reuse EOS so padding works in tokenizer calls.
        tokenizer.pad_token = tokenizer.eos_token
        if use_quantization and device == "cuda":
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_threshold=6.0
            )
            base_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                quantization_config=quantization_config,
                device_map="auto"
            )
            finetuned_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                quantization_config=quantization_config,
                device_map="auto"
            )
            finetuned_model = PeftModel.from_pretrained(finetuned_model, adapter_path)
            # BUG FIX: 8-bit models are already placed by device_map="auto",
            # and calling .to(device) on a bitsandbytes-quantized model raises
            # in transformers -- so device placement is skipped on this path.
        else:
            # Standard (full-precision) loading; move to the chosen device.
            base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
            finetuned_model = AutoModelForCausalLM.from_pretrained(base_model_name)
            finetuned_model = PeftModel.from_pretrained(finetuned_model, adapter_path)
            base_model.to(device)
            finetuned_model.to(device)
    return tokenizer, base_model, finetuned_model, device
def get_model_size_mb(model):
    """Return the in-memory footprint of *model* in megabytes.

    Counts both learnable parameters and registered buffers
    (e.g. batch-norm running statistics).
    """
    total_bytes = 0
    for tensor in model.parameters():
        total_bytes += tensor.nelement() * tensor.element_size()
    for tensor in model.buffers():
        total_bytes += tensor.nelement() * tensor.element_size()
    return total_bytes / (1024 ** 2)
def generate_response(model, tokenizer, prompt, device, temperature, max_length, top_p):
    """Run one sampled generation for *prompt* and return the decoded text.

    The prompt is wrapped in the instruction template used during
    fine-tuning; the returned string includes the prompt prefix.
    """
    formatted_input = f"### Instruction:\n{prompt}\n\n### Code:\n"
    encoded = tokenizer(formatted_input, return_tensors="pt", padding=True)
    # Move every input tensor onto the inference device.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    # Inference only -- no gradients needed.
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(generated[0], skip_special_tokens=True)
# =============================================================================
# PAGE 1: LIVE DEMO
# =============================================================================
if page == "🎯 Live Demo":
    # Load models. The sidebar defines device_option / use_quantization
    # whenever this page is selected, so they can be used directly
    # (the previous `'name' in dir()` guards were redundant and fragile).
    try:
        tokenizer, base_model, finetuned_model, device = load_models(
            use_quantization=use_quantization,
            device_option=device_option
        )
        # Show device info
        device_emoji = "🚀" if device == "cuda" else "🐢"
        if device == "cuda":
            st.success(f"{device_emoji} Running on GPU: {torch.cuda.get_device_name(0)}")
        else:
            st.info(f"{device_emoji} Running on CPU (slower but works!)")
        # Show quantization status
        if use_quantization and device == "cuda":
            st.info("⚡ 8-bit quantization enabled - Lower memory usage!")
    except Exception as e:
        st.error(f"❌ Error loading models: {str(e)}")
        st.stop()
    # Sample prompts
    st.header("💬 Try the Demo")
    sample_prompts = [
        "Write a Python function to calculate factorial",
        "Create a function to check if a string is palindrome",
        "Write code to merge two sorted lists",
        "Implement a function to find the largest element in a list",
        "Create a Python function to check if a number is prime",
        "Write code to reverse a linked list",
        "Implement binary search algorithm in Python"
    ]
    col1, col2 = st.columns([3, 1])
    with col1:
        use_sample = st.selectbox("Select prompt or write custom:", ["Custom"] + sample_prompts)
    with col2:
        # Vertical spacing so the column lines up with the selectbox.
        st.write("")
        st.write("")
    if use_sample == "Custom":
        user_instruction = st.text_area(
            "Enter your instruction:",
            height=100,
            placeholder="e.g., Write a Python function to sort a dictionary by values"
        )
    else:
        user_instruction = use_sample
        st.info(f"💡 Prompt: {user_instruction}")
    # Generate button: run both models side-by-side on the same prompt.
    if st.button("🚀 Generate Responses", type="primary", use_container_width=True):
        if user_instruction.strip():
            col_base, col_finetuned = st.columns(2)
            with col_base:
                st.markdown('<div class="model-box base-model">', unsafe_allow_html=True)
                st.subheader("🔴 Base Model (Untrained)")
                with st.spinner("Generating..."):
                    start_time = time.time()
                    base_response = generate_response(
                        base_model, tokenizer, user_instruction, device,
                        temperature, max_length, top_p
                    )
                    base_time = time.time() - start_time
                st.code(base_response, language="python")
                st.caption(f"⏱️ Generation time: {base_time:.3f}s")
                st.markdown('</div>', unsafe_allow_html=True)
            with col_finetuned:
                st.markdown('<div class="model-box finetuned-model">', unsafe_allow_html=True)
                st.subheader("🟢 Fine-tuned Model (+ LoRA)")
                with st.spinner("Generating..."):
                    start_time = time.time()
                    finetuned_response = generate_response(
                        finetuned_model, tokenizer, user_instruction, device,
                        temperature, max_length, top_p
                    )
                    finetuned_time = time.time() - start_time
                st.code(finetuned_response, language="python")
                st.caption(f"⏱️ Generation time: {finetuned_time:.3f}s")
                st.markdown('</div>', unsafe_allow_html=True)
            # Performance Analysis
            st.divider()
            st.subheader("📊 Performance Analysis")
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Base Response", f"{len(base_response.split())} words")
            with col2:
                st.metric("Fine-tuned Response", f"{len(finetuned_response.split())} words")
            with col3:
                # BUG FIX: guard against ZeroDivisionError when the base
                # generation completes in (rounded) zero time.
                if base_time > 0:
                    speed_diff = ((base_time - finetuned_time) / base_time) * 100
                    st.metric("Speed Difference", f"{speed_diff:+.1f}%")
                else:
                    st.metric("Speed Difference", "n/a")
            with col4:
                st.metric("Device", device.upper())
            st.success("✅ Notice: Base model produces gibberish, fine-tuned generates actual Python code!")
        else:
            st.warning("⚠️ Please enter an instruction!")
# =============================================================================
# PAGE 2: THEORY & CONCEPTS
# =============================================================================
# Static educational content rendered as four tabs of markdown; no model
# interaction happens on this page.
elif page == "📊 Theory & Concepts":
    st.header("📚 Theory & Key Concepts")
    tab1, tab2, tab3, tab4 = st.tabs([
        "🎓 Pre-training vs Fine-tuning",
        "🔧 LoRA & PEFT",
        "⚡ Training vs Inference",
        "📏 Trade-offs"
    ])
    # Tab 1: pre-training vs fine-tuning side-by-side comparison.
    with tab1:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("Pre-training vs Fine-tuning")
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("### 🏗️ Pre-training")
            st.markdown("""
- **Task**: Learn general language understanding
- **Data**: Massive unlabeled text (billions of tokens)
- **Cost**: Extremely expensive ($$$$$)
- **Time**: Weeks to months
- **Example**: GPT, BERT, LLaMA training
- **Goal**: General purpose model
""")
        with col2:
            st.markdown("### 🎯 Fine-tuning")
            st.markdown("""
- **Task**: Adapt to specific domain/task
- **Data**: Smaller labeled dataset (thousands)
- **Cost**: Much cheaper ($$)
- **Time**: Hours to days
- **Example**: Code generation, Q&A, summarization
- **Goal**: Specialized model
""")
        st.divider()
        st.markdown("### 📊 Our Project: Transfer Learning")
        st.info("""
**We started with**: Pre-trained `distilgpt2` (general language model)
**We fine-tuned on**: Python code instructions (5000 samples)
**Result**: Model now generates Python code instead of general text!
This is **Transfer Learning** - leveraging pre-trained knowledge for new tasks.
""")
        st.markdown('</div>', unsafe_allow_html=True)
    # Tab 2: LoRA / PEFT explanation (full fine-tuning vs adapter training).
    with tab2:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("LoRA: Low-Rank Adaptation")
        col1, col2 = st.columns([1, 1])
        with col1:
            st.markdown("### 🔴 Full Fine-tuning (Expensive)")
            st.markdown("""
```
Total Parameters: 82M
Trainable: 82M (100%)
Memory: High
Time: Long
GPU: Required (expensive)
Checkpoint: 320 MB
```
**Problems**:
- ❌ Expensive GPUs needed
- ❌ Long training time
- ❌ Large model checkpoints
- ❌ Risk of catastrophic forgetting
""")
        with col2:
            st.markdown("### 🟢 LoRA Fine-tuning (Efficient)")
            st.markdown("""
```
Total Parameters: 82M
Trainable: 295K (0.36%)
Memory: Low
Time: Fast
GPU: Optional (Colab free tier OK)
Checkpoint: 3 MB
```
**Advantages**:
- ✅ Train on free GPUs
- ✅ Fast training (~30 min)
- ✅ Tiny adapter files
- ✅ Preserve base model knowledge
""")
        st.divider()
        st.markdown("### 🧮 How LoRA Works")
        st.markdown("""
Instead of updating all weights `W`, LoRA adds small adapter matrices:
```
W_new = W_frozen + ΔW
where ΔW = B × A (low-rank decomposition)
```
**Our Configuration**:
- `r = 16` (rank - controls adapter capacity)
- `alpha = 32` (scaling factor)
- Target modules: Attention layers only
- Result: 99.6% fewer trainable parameters!
""")
        st.markdown('</div>', unsafe_allow_html=True)
    # Tab 3: training vs inference phases.
    with tab3:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("Training vs Inference")
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("### 🏋️ Training Phase")
            st.markdown("""
**What happens**:
- Forward pass through model
- Calculate loss (prediction error)
- Backward propagation (gradients)
- Update weights (only LoRA adapters)
**Requirements**:
- GPU highly recommended
- More memory needed
- Longer time
- Batch processing
**Our Training**:
- Dataset: 5000 Python code examples
- Time: ~30 minutes (Colab T4 GPU)
- Memory: ~8 GB VRAM
- Output: 3 MB adapter file
""")
        with col2:
            st.markdown("### 🚀 Inference Phase")
            st.markdown("""
**What happens**:
- Load base model + adapters
- Forward pass only (no backprop)
- Generate predictions
- No weight updates
**Requirements**:
- CPU works (slower)
- GPU faster (optional)
- Less memory
- Real-time response
**Our Deployment**:
- Works on: CPU or GPU
- Load time: ~10-30 seconds
- Inference: ~1-3 seconds per response
- Memory: ~2 GB RAM
""")
        st.markdown('</div>', unsafe_allow_html=True)
    # Tab 4: trade-offs and quantization overview.
    with tab4:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("Trade-offs & Optimization")
        st.markdown("### ⚖️ Key Trade-offs")
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("#### 📏 Model Size vs Accuracy")
            st.markdown("""
**Larger models**:
- ✅ Better accuracy
- ✅ More capacity
- ❌ Slower inference
- ❌ More memory
**Smaller models**:
- ✅ Faster inference
- ✅ Less memory
- ❌ Lower accuracy
- ❌ Less capacity
""")
        with col2:
            st.markdown("#### ⚡ Speed vs Quality")
            st.markdown("""
**Higher quality**:
- More parameters
- Longer sequences
- Lower temperature
- ❌ Slower
**Higher speed**:
- Fewer parameters
- Shorter sequences
- Quantization
- ❌ Potentially lower quality
""")
        st.divider()
        st.markdown("### 🔢 Quantization")
        st.markdown("""
**What**: Reduce precision of model weights (32-bit → 8-bit)
**Benefits**:
- 75% less memory usage
- Faster inference on some hardware
- Enables larger models on limited hardware
**Cost**:
- Slight accuracy loss (~1-2%)
- Requires calibration
**Try it**: Enable "8-bit quantization" in the sidebar on Demo page!
""")
        st.markdown('</div>', unsafe_allow_html=True)
# =============================================================================
# PAGE 3: TECHNICAL DETAILS
# =============================================================================
# Static cards describing the architecture, LoRA config, dataset,
# hyperparameters, and tooling; no model interaction.
elif page == "⚙️ Technical Details":
    st.header("⚙️ Technical Implementation")
    col1, col2 = st.columns(2)
    with col1:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 📦 Model Architecture")
        st.markdown("""
**Base Model**: distilgpt2
- Type: Causal Language Model
- Parameters: 82M
- Layers: 6 transformer blocks
- Hidden size: 768
- Attention heads: 12
- Vocabulary: 50,257 tokens
""")
        st.markdown('</div>', unsafe_allow_html=True)
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 🔧 LoRA Configuration")
        st.markdown("""
```python
LoraConfig(
r=16, # Rank
lora_alpha=32, # Scaling
target_modules=["c_attn"], # Attention only
lora_dropout=0.05,
task_type="CAUSAL_LM"
)
```
**Trainable Parameters**: 294,912 (0.36%)
**Adapter Size**: ~3 MB
""")
        st.markdown('</div>', unsafe_allow_html=True)
    with col2:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 📊 Dataset")
        st.markdown("""
**Name**: Python Code Instructions (18k Alpaca)
**Source**: `iamtarun/python_code_instructions_18k_alpaca`
**Used**: 5000 samples
- Training: 4500 samples
- Validation: 500 samples
**Format**:
```
Instruction: Write Python code for X
Code: def function()...
```
""")
        st.markdown('</div>', unsafe_allow_html=True)
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 🏋️ Training Hyperparameters")
        st.markdown("""
```python
Epochs: 4
Batch size: 2 (per device)
Gradient accumulation: 4
Learning rate: 3e-4
Max sequence length: 512
Optimizer: AdamW
Scheduler: Linear warmup
```
**Training Time**: ~30 minutes (T4 GPU)
**Final Loss**: ~2.5
""")
        st.markdown('</div>', unsafe_allow_html=True)
    st.divider()
    st.markdown("### 🛠️ Tools & Libraries Used")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.markdown("""
**Training**:
- 🤗 Transformers
- 🎯 PEFT (LoRA)
- 🚀 Accelerate
- 📊 Datasets
- 🔥 PyTorch
""")
    with col2:
        st.markdown("""
**Deployment**:
- 🌐 Streamlit
- 🤗 Hugging Face Hub
- ⚡ bitsandbytes (quantization)
- 💾 safetensors
""")
    with col3:
        st.markdown("""
**Infrastructure**:
- 📓 Google Colab (training)
- 💻 Local deployment
- ☁️ Hugging Face Spaces (optional)
- 🔒 Git LFS (model versioning)
""")
# =============================================================================
# PAGE 4: DEPLOYMENT INFO
# =============================================================================
# Fallback branch: reached when page == "🚀 Deployment Info" (the only
# remaining radio option). Static deployment guidance in three tabs.
else:  # Deployment Info
    st.header("🚀 Deployment Options")
    tab1, tab2, tab3 = st.tabs(["💻 Local", "☁️ Cloud", "📊 Comparison"])
    # Tab 1: running the app locally.
    with tab1:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.markdown("### 💻 Local Deployment (Current)")
        st.markdown("""
**Advantages**:
- ✅ Full control
- ✅ No API costs
- ✅ Data privacy
- ✅ Works offline
- ✅ Fast iteration
**Requirements**:
- Python 3.8+
- 2-4 GB RAM
- Optional: NVIDIA GPU
**Setup**:
```bash
pip install streamlit transformers peft torch
streamlit run app.py
```
**Best for**: Development, testing, demos
""")
        st.markdown('</div>', unsafe_allow_html=True)
    # Tab 2: cloud hosting options.
    with tab2:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.markdown("### ☁️ Cloud Deployment")
        st.markdown("#### 🤗 Hugging Face Spaces (Recommended)")
        st.markdown("""
**Features**:
- ✅ Free tier available
- ✅ Auto-deploys from Git
- ✅ Public URL
- ✅ No server management
- ✅ Built-in CI/CD
**Setup**:
1. Create account on huggingface.co
2. Create new Space (Streamlit)
3. Upload: app.py, requirements.txt, models/
4. Auto-deploys!
**URL**: `https://huggingface.co/spaces/YOUR_USERNAME/lora-demo`
""")
        st.divider()
        st.markdown("#### Other Options")
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("""
**Streamlit Cloud**:
- Free for public apps
- GitHub integration
- Easy deployment
- Resource limits
""")
        with col2:
            st.markdown("""
**AWS/GCP/Azure**:
- Full control
- Scalable
- More expensive
- Requires devops
""")
        st.markdown('</div>', unsafe_allow_html=True)
    # Tab 3: tabular comparison + CPU vs GPU guidance.
    with tab3:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.markdown("### 📊 Deployment Comparison")
        # Column-oriented dict rendered by st.table.
        comparison_data = {
            "Feature": ["Cost", "Setup Time", "Control", "Scalability", "Maintenance", "Best For"],
            "Local": ["Free", "5 mins", "Full", "Limited", "Manual", "Development"],
            "HF Spaces": ["Free", "10 mins", "Medium", "Auto", "Minimal", "Demos"],
            "Cloud (AWS)": ["$$$", "1-2 hours", "Full", "High", "Manual", "Production"]
        }
        st.table(comparison_data)
        st.divider()
        st.markdown("### 🎯 CPU vs GPU Inference")
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("""
**CPU Inference**:
- Speed: 2-5 seconds/response
- Cost: $0 (uses existing hardware)
- Memory: ~2 GB RAM
- Best for: Low-traffic apps, development
""")
        with col2:
            st.markdown("""
**GPU Inference**:
- Speed: 0.5-2 seconds/response
- Cost: $0.50-2/hour (cloud)
- Memory: ~4-8 GB VRAM
- Best for: High-traffic, real-time apps
""")
        st.info("💡 **Tip**: Start with CPU deployment, upgrade to GPU only if needed!")
        st.markdown('</div>', unsafe_allow_html=True)
# Footer -- rendered on every page, below whichever section was selected.
st.divider()
st.markdown("""
<div style="text-align: center; color: #666; padding: 1rem;">
<p><strong>🎓 Group 6: Model Adaptation, Efficient Fine-Tuning & Deployment of LLMs</strong></p>
<p>Built with Streamlit • Transformers • PEFT • PyTorch</p>
</div>
""", unsafe_allow_html=True)