# Hugging Face upload-page residue (not code), preserved as a comment:
# uploader "Lubabah0" — "Upload 10 files" — commit adfb728 (verified)
"""
ULTIMATE LoRA Fine-Tuning Demo - Covers ALL Project Requirements
Group 6: Model Adaptation, Efficient Fine-Tuning & Deployment of LLMs
"""
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import time
import psutil
import os
# Page configuration -- must be the first Streamlit command executed.
st.set_page_config(
    page_title="LoRA Fine-Tuning Complete Demo",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Custom CSS: gradient main header, colored comparison boxes for the
# base vs fine-tuned model outputs, and info/theory cards.
st.markdown("""
<style>
.main-header {
font-size: 2.5rem;
font-weight: bold;
text-align: center;
background: linear-gradient(120deg, #1f77b4, #00cc88);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 0.5rem;
}
.sub-header {
text-align: center;
color: #666;
margin-bottom: 2rem;
font-size: 1.1rem;
}
.metric-card {
background: #f0f2f6;
padding: 1rem;
border-radius: 10px;
border-left: 4px solid #1f77b4;
}
.model-box {
padding: 1.5rem;
border-radius: 10px;
margin: 1rem 0;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.base-model {
background-color: #fff5f5;
border-left: 4px solid #ff4b4b;
}
.finetuned-model {
background-color: #f0fff4;
border-left: 4px solid #00cc88;
}
.theory-box {
background: #e8f4f8;
padding: 1.5rem;
border-radius: 10px;
margin: 1rem 0;
border-left: 4px solid #1f77b4;
}
</style>
""", unsafe_allow_html=True)
# Title banner (styled by the CSS classes injected above).
st.markdown('<div class="main-header">🚀 Complete LoRA Fine-Tuning Demo</div>', unsafe_allow_html=True)
st.markdown('<div class="sub-header">Parameter-Efficient Fine-Tuning & Deployment Showcase</div>',
            unsafe_allow_html=True)
# Sidebar Navigation -- `page` selects which of the four sections renders.
with st.sidebar:
    st.header("📚 Navigation")
    page = st.radio(
        "Select Section:",
        ["🎯 Live Demo", "📊 Theory & Concepts", "⚙️ Technical Details", "🚀 Deployment Info"],
        label_visibility="collapsed"
    )
    st.divider()
    # Generation settings are only rendered (and the variables below only
    # defined) when the Live Demo page is selected.
    if page == "🎯 Live Demo":
        st.header("⚙️ Model Settings")
        device_option = st.selectbox(
            "Inference Device",
            ["Auto (GPU if available)", "Force CPU", "Force GPU"],
            help="Compare CPU vs GPU inference speed"
        )
        use_quantization = st.checkbox(
            "Use 8-bit Quantization",
            value=False,
            help="Reduces memory usage, slightly slower"
        )
        # st.slider(label, min, max, default, step)
        temperature = st.slider("Temperature", 0.1, 1.0, 0.3, 0.1)
        max_length = st.slider("Max Length", 50, 400, 200, 10)
        top_p = st.slider("Top P", 0.1, 1.0, 0.95, 0.05)
        st.divider()
        st.header("📊 Quick Stats")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Base Model", "82M params")
            st.metric("Adapter Size", "~3 MB")
        with col2:
            st.metric("Trainable", "0.4%")
            st.metric("Training Time", "~30 min")
# Cache model loading so Streamlit reruns don't reload weights from disk.
@st.cache_resource
def load_models(use_quantization=False, device_option="Auto"):
    """Load the base model and the LoRA fine-tuned model.

    Args:
        use_quantization: load both models in 8-bit via bitsandbytes
            (only takes effect when a CUDA device is available).
        device_option: "Force CPU" pins inference to CPU; "Force GPU" and
            any other value both auto-select CUDA when available.

    Returns:
        (tokenizer, base_model, finetuned_model, device) where device is
        the torch device string actually used ("cuda" or "cpu").
    """
    base_model_name = "distilgpt2"
    adapter_path = "./models/lora_adapters"
    # Determine device. "Force GPU" still falls back to CPU when no CUDA
    # device exists, so it resolves identically to "Auto".
    if device_option == "Force CPU":
        device = "cpu"
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    with st.spinner("🔄 Loading models..."):
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        # GPT-2 has no pad token; reuse EOS so padding works in tokenizer calls.
        tokenizer.pad_token = tokenizer.eos_token
        if use_quantization and device == "cuda":
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_threshold=6.0
            )
            base_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                quantization_config=quantization_config,
                device_map="auto"
            )
            finetuned_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                quantization_config=quantization_config,
                device_map="auto"
            )
            finetuned_model = PeftModel.from_pretrained(finetuned_model, adapter_path)
            # BUG FIX: 8-bit models are already placed by device_map="auto",
            # and calling .to(device) on a bitsandbytes-quantized model raises
            # in transformers -- so device placement is skipped on this path.
        else:
            # Standard (full-precision) loading; move to the chosen device.
            base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
            finetuned_model = AutoModelForCausalLM.from_pretrained(base_model_name)
            finetuned_model = PeftModel.from_pretrained(finetuned_model, adapter_path)
            base_model.to(device)
            finetuned_model.to(device)
    return tokenizer, base_model, finetuned_model, device
def get_model_size_mb(model):
    """Return the in-memory footprint of *model* in megabytes.

    Counts both learnable parameters and registered buffers
    (e.g. batch-norm running statistics).
    """
    total_bytes = 0
    for tensor in model.parameters():
        total_bytes += tensor.nelement() * tensor.element_size()
    for tensor in model.buffers():
        total_bytes += tensor.nelement() * tensor.element_size()
    return total_bytes / (1024 ** 2)
def generate_response(model, tokenizer, prompt, device, temperature, max_length, top_p):
    """Run one sampled generation for *prompt* and return the decoded text.

    The prompt is wrapped in the instruction template used during
    fine-tuning; the returned string includes the prompt prefix.
    """
    formatted_input = f"### Instruction:\n{prompt}\n\n### Code:\n"
    encoded = tokenizer(formatted_input, return_tensors="pt", padding=True)
    # Move every input tensor onto the inference device.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    # Inference only -- no gradients needed.
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(generated[0], skip_special_tokens=True)
# =============================================================================
# PAGE 1: LIVE DEMO
# =============================================================================
if page == "🎯 Live Demo":
    # Load models. The sidebar defines device_option / use_quantization
    # whenever this page is selected, so they can be used directly
    # (the previous `'name' in dir()` guards were redundant and fragile).
    try:
        tokenizer, base_model, finetuned_model, device = load_models(
            use_quantization=use_quantization,
            device_option=device_option
        )
        # Show device info
        device_emoji = "🚀" if device == "cuda" else "🐢"
        if device == "cuda":
            st.success(f"{device_emoji} Running on GPU: {torch.cuda.get_device_name(0)}")
        else:
            st.info(f"{device_emoji} Running on CPU (slower but works!)")
        # Show quantization status
        if use_quantization and device == "cuda":
            st.info("⚡ 8-bit quantization enabled - Lower memory usage!")
    except Exception as e:
        st.error(f"❌ Error loading models: {str(e)}")
        st.stop()
    # Sample prompts
    st.header("💬 Try the Demo")
    sample_prompts = [
        "Write a Python function to calculate factorial",
        "Create a function to check if a string is palindrome",
        "Write code to merge two sorted lists",
        "Implement a function to find the largest element in a list",
        "Create a Python function to check if a number is prime",
        "Write code to reverse a linked list",
        "Implement binary search algorithm in Python"
    ]
    col1, col2 = st.columns([3, 1])
    with col1:
        use_sample = st.selectbox("Select prompt or write custom:", ["Custom"] + sample_prompts)
    with col2:
        # Vertical spacing so the column lines up with the selectbox.
        st.write("")
        st.write("")
    if use_sample == "Custom":
        user_instruction = st.text_area(
            "Enter your instruction:",
            height=100,
            placeholder="e.g., Write a Python function to sort a dictionary by values"
        )
    else:
        user_instruction = use_sample
        st.info(f"💡 Prompt: {user_instruction}")
    # Generate button: run both models side-by-side on the same prompt.
    if st.button("🚀 Generate Responses", type="primary", use_container_width=True):
        if user_instruction.strip():
            col_base, col_finetuned = st.columns(2)
            with col_base:
                st.markdown('<div class="model-box base-model">', unsafe_allow_html=True)
                st.subheader("🔴 Base Model (Untrained)")
                with st.spinner("Generating..."):
                    start_time = time.time()
                    base_response = generate_response(
                        base_model, tokenizer, user_instruction, device,
                        temperature, max_length, top_p
                    )
                    base_time = time.time() - start_time
                st.code(base_response, language="python")
                st.caption(f"⏱️ Generation time: {base_time:.3f}s")
                st.markdown('</div>', unsafe_allow_html=True)
            with col_finetuned:
                st.markdown('<div class="model-box finetuned-model">', unsafe_allow_html=True)
                st.subheader("🟢 Fine-tuned Model (+ LoRA)")
                with st.spinner("Generating..."):
                    start_time = time.time()
                    finetuned_response = generate_response(
                        finetuned_model, tokenizer, user_instruction, device,
                        temperature, max_length, top_p
                    )
                    finetuned_time = time.time() - start_time
                st.code(finetuned_response, language="python")
                st.caption(f"⏱️ Generation time: {finetuned_time:.3f}s")
                st.markdown('</div>', unsafe_allow_html=True)
            # Performance Analysis
            st.divider()
            st.subheader("📊 Performance Analysis")
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Base Response", f"{len(base_response.split())} words")
            with col2:
                st.metric("Fine-tuned Response", f"{len(finetuned_response.split())} words")
            with col3:
                # BUG FIX: guard against ZeroDivisionError when the base
                # generation completes in (rounded) zero time.
                if base_time > 0:
                    speed_diff = ((base_time - finetuned_time) / base_time) * 100
                    st.metric("Speed Difference", f"{speed_diff:+.1f}%")
                else:
                    st.metric("Speed Difference", "n/a")
            with col4:
                st.metric("Device", device.upper())
            st.success("✅ Notice: Base model produces gibberish, fine-tuned generates actual Python code!")
        else:
            st.warning("⚠️ Please enter an instruction!")
# =============================================================================
# PAGE 2: THEORY & CONCEPTS
# =============================================================================
# Static educational content rendered as four tabs of markdown; no model
# interaction happens on this page.
elif page == "📊 Theory & Concepts":
    st.header("📚 Theory & Key Concepts")
    tab1, tab2, tab3, tab4 = st.tabs([
        "🎓 Pre-training vs Fine-tuning",
        "🔧 LoRA & PEFT",
        "⚡ Training vs Inference",
        "📏 Trade-offs"
    ])
    # Tab 1: pre-training vs fine-tuning side-by-side comparison.
    with tab1:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("Pre-training vs Fine-tuning")
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("### 🏗️ Pre-training")
            st.markdown("""
- **Task**: Learn general language understanding
- **Data**: Massive unlabeled text (billions of tokens)
- **Cost**: Extremely expensive ($$$$$)
- **Time**: Weeks to months
- **Example**: GPT, BERT, LLaMA training
- **Goal**: General purpose model
""")
        with col2:
            st.markdown("### 🎯 Fine-tuning")
            st.markdown("""
- **Task**: Adapt to specific domain/task
- **Data**: Smaller labeled dataset (thousands)
- **Cost**: Much cheaper ($$)
- **Time**: Hours to days
- **Example**: Code generation, Q&A, summarization
- **Goal**: Specialized model
""")
        st.divider()
        st.markdown("### 📊 Our Project: Transfer Learning")
        st.info("""
**We started with**: Pre-trained `distilgpt2` (general language model)
**We fine-tuned on**: Python code instructions (5000 samples)
**Result**: Model now generates Python code instead of general text!
This is **Transfer Learning** - leveraging pre-trained knowledge for new tasks.
""")
        st.markdown('</div>', unsafe_allow_html=True)
    # Tab 2: LoRA / PEFT explanation (full fine-tuning vs adapter training).
    with tab2:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("LoRA: Low-Rank Adaptation")
        col1, col2 = st.columns([1, 1])
        with col1:
            st.markdown("### 🔴 Full Fine-tuning (Expensive)")
            st.markdown("""
```
Total Parameters: 82M
Trainable: 82M (100%)
Memory: High
Time: Long
GPU: Required (expensive)
Checkpoint: 320 MB
```
**Problems**:
- ❌ Expensive GPUs needed
- ❌ Long training time
- ❌ Large model checkpoints
- ❌ Risk of catastrophic forgetting
""")
        with col2:
            st.markdown("### 🟢 LoRA Fine-tuning (Efficient)")
            st.markdown("""
```
Total Parameters: 82M
Trainable: 295K (0.36%)
Memory: Low
Time: Fast
GPU: Optional (Colab free tier OK)
Checkpoint: 3 MB
```
**Advantages**:
- ✅ Train on free GPUs
- ✅ Fast training (~30 min)
- ✅ Tiny adapter files
- ✅ Preserve base model knowledge
""")
        st.divider()
        st.markdown("### 🧮 How LoRA Works")
        st.markdown("""
Instead of updating all weights `W`, LoRA adds small adapter matrices:
```
W_new = W_frozen + ΔW
where ΔW = B × A (low-rank decomposition)
```
**Our Configuration**:
- `r = 16` (rank - controls adapter capacity)
- `alpha = 32` (scaling factor)
- Target modules: Attention layers only
- Result: 99.6% fewer trainable parameters!
""")
        st.markdown('</div>', unsafe_allow_html=True)
    # Tab 3: training vs inference phases.
    with tab3:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("Training vs Inference")
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("### 🏋️ Training Phase")
            st.markdown("""
**What happens**:
- Forward pass through model
- Calculate loss (prediction error)
- Backward propagation (gradients)
- Update weights (only LoRA adapters)
**Requirements**:
- GPU highly recommended
- More memory needed
- Longer time
- Batch processing
**Our Training**:
- Dataset: 5000 Python code examples
- Time: ~30 minutes (Colab T4 GPU)
- Memory: ~8 GB VRAM
- Output: 3 MB adapter file
""")
        with col2:
            st.markdown("### 🚀 Inference Phase")
            st.markdown("""
**What happens**:
- Load base model + adapters
- Forward pass only (no backprop)
- Generate predictions
- No weight updates
**Requirements**:
- CPU works (slower)
- GPU faster (optional)
- Less memory
- Real-time response
**Our Deployment**:
- Works on: CPU or GPU
- Load time: ~10-30 seconds
- Inference: ~1-3 seconds per response
- Memory: ~2 GB RAM
""")
        st.markdown('</div>', unsafe_allow_html=True)
    # Tab 4: trade-offs and quantization overview.
    with tab4:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.subheader("Trade-offs & Optimization")
        st.markdown("### ⚖️ Key Trade-offs")
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("#### 📏 Model Size vs Accuracy")
            st.markdown("""
**Larger models**:
- ✅ Better accuracy
- ✅ More capacity
- ❌ Slower inference
- ❌ More memory
**Smaller models**:
- ✅ Faster inference
- ✅ Less memory
- ❌ Lower accuracy
- ❌ Less capacity
""")
        with col2:
            st.markdown("#### ⚡ Speed vs Quality")
            st.markdown("""
**Higher quality**:
- More parameters
- Longer sequences
- Lower temperature
- ❌ Slower
**Higher speed**:
- Fewer parameters
- Shorter sequences
- Quantization
- ❌ Potentially lower quality
""")
        st.divider()
        st.markdown("### 🔢 Quantization")
        st.markdown("""
**What**: Reduce precision of model weights (32-bit → 8-bit)
**Benefits**:
- 75% less memory usage
- Faster inference on some hardware
- Enables larger models on limited hardware
**Cost**:
- Slight accuracy loss (~1-2%)
- Requires calibration
**Try it**: Enable "8-bit quantization" in the sidebar on Demo page!
""")
        st.markdown('</div>', unsafe_allow_html=True)
# =============================================================================
# PAGE 3: TECHNICAL DETAILS
# =============================================================================
# Static cards describing the architecture, LoRA config, dataset,
# hyperparameters, and tooling; no model interaction.
elif page == "⚙️ Technical Details":
    st.header("⚙️ Technical Implementation")
    col1, col2 = st.columns(2)
    with col1:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 📦 Model Architecture")
        st.markdown("""
**Base Model**: distilgpt2
- Type: Causal Language Model
- Parameters: 82M
- Layers: 6 transformer blocks
- Hidden size: 768
- Attention heads: 12
- Vocabulary: 50,257 tokens
""")
        st.markdown('</div>', unsafe_allow_html=True)
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 🔧 LoRA Configuration")
        st.markdown("""
```python
LoraConfig(
r=16, # Rank
lora_alpha=32, # Scaling
target_modules=["c_attn"], # Attention only
lora_dropout=0.05,
task_type="CAUSAL_LM"
)
```
**Trainable Parameters**: 294,912 (0.36%)
**Adapter Size**: ~3 MB
""")
        st.markdown('</div>', unsafe_allow_html=True)
    with col2:
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 📊 Dataset")
        st.markdown("""
**Name**: Python Code Instructions (18k Alpaca)
**Source**: `iamtarun/python_code_instructions_18k_alpaca`
**Used**: 5000 samples
- Training: 4500 samples
- Validation: 500 samples
**Format**:
```
Instruction: Write Python code for X
Code: def function()...
```
""")
        st.markdown('</div>', unsafe_allow_html=True)
        st.markdown('<div class="metric-card">', unsafe_allow_html=True)
        st.markdown("### 🏋️ Training Hyperparameters")
        st.markdown("""
```python
Epochs: 4
Batch size: 2 (per device)
Gradient accumulation: 4
Learning rate: 3e-4
Max sequence length: 512
Optimizer: AdamW
Scheduler: Linear warmup
```
**Training Time**: ~30 minutes (T4 GPU)
**Final Loss**: ~2.5
""")
        st.markdown('</div>', unsafe_allow_html=True)
    st.divider()
    st.markdown("### 🛠️ Tools & Libraries Used")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.markdown("""
**Training**:
- 🤗 Transformers
- 🎯 PEFT (LoRA)
- 🚀 Accelerate
- 📊 Datasets
- 🔥 PyTorch
""")
    with col2:
        st.markdown("""
**Deployment**:
- 🌐 Streamlit
- 🤗 Hugging Face Hub
- ⚡ bitsandbytes (quantization)
- 💾 safetensors
""")
    with col3:
        st.markdown("""
**Infrastructure**:
- 📓 Google Colab (training)
- 💻 Local deployment
- ☁️ Hugging Face Spaces (optional)
- 🔒 Git LFS (model versioning)
""")
# =============================================================================
# PAGE 4: DEPLOYMENT INFO
# =============================================================================
# Fallback branch: reached when page == "🚀 Deployment Info" (the only
# remaining radio option). Static deployment guidance in three tabs.
else:  # Deployment Info
    st.header("🚀 Deployment Options")
    tab1, tab2, tab3 = st.tabs(["💻 Local", "☁️ Cloud", "📊 Comparison"])
    # Tab 1: running the app locally.
    with tab1:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.markdown("### 💻 Local Deployment (Current)")
        st.markdown("""
**Advantages**:
- ✅ Full control
- ✅ No API costs
- ✅ Data privacy
- ✅ Works offline
- ✅ Fast iteration
**Requirements**:
- Python 3.8+
- 2-4 GB RAM
- Optional: NVIDIA GPU
**Setup**:
```bash
pip install streamlit transformers peft torch
streamlit run app.py
```
**Best for**: Development, testing, demos
""")
        st.markdown('</div>', unsafe_allow_html=True)
    # Tab 2: cloud hosting options.
    with tab2:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.markdown("### ☁️ Cloud Deployment")
        st.markdown("#### 🤗 Hugging Face Spaces (Recommended)")
        st.markdown("""
**Features**:
- ✅ Free tier available
- ✅ Auto-deploys from Git
- ✅ Public URL
- ✅ No server management
- ✅ Built-in CI/CD
**Setup**:
1. Create account on huggingface.co
2. Create new Space (Streamlit)
3. Upload: app.py, requirements.txt, models/
4. Auto-deploys!
**URL**: `https://huggingface.co/spaces/YOUR_USERNAME/lora-demo`
""")
        st.divider()
        st.markdown("#### Other Options")
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("""
**Streamlit Cloud**:
- Free for public apps
- GitHub integration
- Easy deployment
- Resource limits
""")
        with col2:
            st.markdown("""
**AWS/GCP/Azure**:
- Full control
- Scalable
- More expensive
- Requires devops
""")
        st.markdown('</div>', unsafe_allow_html=True)
    # Tab 3: tabular comparison + CPU vs GPU guidance.
    with tab3:
        st.markdown('<div class="theory-box">', unsafe_allow_html=True)
        st.markdown("### 📊 Deployment Comparison")
        # Column-oriented dict rendered by st.table.
        comparison_data = {
            "Feature": ["Cost", "Setup Time", "Control", "Scalability", "Maintenance", "Best For"],
            "Local": ["Free", "5 mins", "Full", "Limited", "Manual", "Development"],
            "HF Spaces": ["Free", "10 mins", "Medium", "Auto", "Minimal", "Demos"],
            "Cloud (AWS)": ["$$$", "1-2 hours", "Full", "High", "Manual", "Production"]
        }
        st.table(comparison_data)
        st.divider()
        st.markdown("### 🎯 CPU vs GPU Inference")
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("""
**CPU Inference**:
- Speed: 2-5 seconds/response
- Cost: $0 (uses existing hardware)
- Memory: ~2 GB RAM
- Best for: Low-traffic apps, development
""")
        with col2:
            st.markdown("""
**GPU Inference**:
- Speed: 0.5-2 seconds/response
- Cost: $0.50-2/hour (cloud)
- Memory: ~4-8 GB VRAM
- Best for: High-traffic, real-time apps
""")
        st.info("💡 **Tip**: Start with CPU deployment, upgrade to GPU only if needed!")
        st.markdown('</div>', unsafe_allow_html=True)
# Footer -- rendered on every page, below whichever section was selected.
st.divider()
st.markdown("""
<div style="text-align: center; color: #666; padding: 1rem;">
<p><strong>🎓 Group 6: Model Adaptation, Efficient Fine-Tuning & Deployment of LLMs</strong></p>
<p>Built with Streamlit • Transformers • PEFT • PyTorch</p>
</div>
""", unsafe_allow_html=True)