# NOTE(review): the three lines below were a pasted-in commit header and were
# not valid Python; preserved here as comments so the file can be imported.
# lil-sumedhk
# Fix for ZeroGPU: add @spaces.GPU decorator
# 82a52cd
import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from datasets import load_dataset
# Model configuration: base checkpoint on the HF Hub and the LoRA adapter
# trained on GSM8K that gets merged on top of it.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
ADAPTER_PATH = "sumedh/tinyllama-lora-math-adapter-v3"
# Load tokenizer globally — it is small and CPU-only, so it is safe to load
# eagerly at import time (unlike the models, which are deferred; see below).
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Load test dataset for examples — only the first 50 GSM8K test rows are
# needed, matching the 0-49 example slider in the UI.
print("Loading test dataset...")
test_data = load_dataset("openai/gsm8k", "main", split="test[:50]")
# Models will be loaded lazily on first GPU call: on ZeroGPU Spaces the GPU
# is only attached inside @spaces.GPU functions, so model weights are loaded
# there (via load_models) rather than at import time.
base_model = None
tuned_model = None
def load_models():
    """Lazily load both models, caching them in module globals.

    The base model is loaded as-is; the fine-tuned model is built by
    loading a second copy of the base checkpoint, applying the LoRA
    adapter, and merging the adapter weights into the backbone.

    Returns:
        tuple: ``(base_model, tuned_model)``, both in eval mode.
    """
    global base_model, tuned_model

    if base_model is None:
        print("Loading base model...")
        loaded = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        base_model = loaded.eval()

    if tuned_model is None:
        print("Loading fine-tuned model...")
        backbone = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        # Attach the LoRA adapter, then fold its weights into the backbone
        # so inference runs on a plain (adapter-free) model.
        adapted = PeftModel.from_pretrained(backbone, ADAPTER_PATH)
        tuned_model = adapted.merge_and_unload().eval()
        print("Models loaded!")

    return base_model, tuned_model
@spaces.GPU
def generate_responses(question):
    """Generate answers to *question* from both models — runs on GPU.

    The @spaces.GPU decorator attaches a ZeroGPU device for the duration
    of this call; models are loaded lazily on the first invocation.

    Args:
        question: Math word problem text entered by the user.

    Returns:
        tuple[str, str]: ``(base_response, tuned_response)``. If *question*
        is blank, returns a prompt-the-user message and an empty string.
    """
    if not question.strip():
        return "Please enter a question.", ""

    # Load models if not already loaded.
    base, tuned = load_models()
    prompt = f"### Instruction:\n{question}\n### Response:\n"

    # Evaluated left-to-right: base model first, then the fine-tuned one,
    # matching the original execution order.
    return _generate(base, prompt), _generate(tuned, prompt)


def _generate(model, prompt):
    """Decode *prompt* on *model* and return the text after '### Response:'.

    Shared by the base and fine-tuned paths (previously duplicated inline).
    Uses greedy decoding with a mild repetition penalty.
    """
    token_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    with torch.no_grad():
        output = model.generate(
            token_ids,
            max_new_tokens=256,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
        )
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    # Keep only the model's answer, dropping the echoed instruction prompt.
    if "### Response:" in text:
        text = text.split("### Response:")[-1].strip()
    return text
def load_example(idx):
    """Fetch example *idx* from the GSM8K test slice.

    Args:
        idx: Index into the loaded test split (accepts float from the slider).

    Returns:
        tuple[str, str]: ``(question, answer)``, or two empty strings when
        *idx* is out of range.
    """
    position = int(idx)
    if not (0 <= position < len(test_data)):
        return "", ""
    row = test_data[position]
    return row["question"], row["answer"]
def run_comparison(question, reference):
    """Run both models on *question* and pass *reference* through.

    Args:
        question: Math word problem text.
        reference: Ground-truth answer to display alongside the outputs.

    Returns:
        tuple[str, str, str]: base response, tuned response, reference.
        A blank *question* yields a prompt-the-user message instead.
    """
    if not question.strip():
        return "Please enter a question.", "", ""
    base_answer, tuned_answer = generate_responses(question)
    return base_answer, tuned_answer, reference
# Create Gradio interface: two input boxes (question + optional reference),
# an example loader driven by a slider over the 50 cached GSM8K rows, and a
# three-column output comparing base vs fine-tuned responses to the answer.
with gr.Blocks(title="TinyLlama Math Fine-tuning Demo") as demo:
    gr.Markdown("""
    # 🧮 TinyLlama Math Fine-tuning Demo
    Compare the performance of **base TinyLlama** vs **fine-tuned TinyLlama** on math word problems.
    - **Base Model**: TinyLlama-1.1B-Chat-v1.0 (no math training)
    - **Fine-tuned Model**: LoRA adapter trained on GSM8K dataset (7,473 examples)
    *Note: First run may take ~30s to load models.*
    """)
    with gr.Row():
        # Left column: free-text question and (optional) reference answer.
        with gr.Column(scale=2):
            question_input = gr.Textbox(
                label="Math Question",
                placeholder="Enter a math word problem...",
                lines=4
            )
            reference_input = gr.Textbox(
                label="Reference Answer (optional)",
                placeholder="The correct answer will appear here when loading examples",
                lines=4
            )
        # Right column: slider + button to pull a canned GSM8K example.
        with gr.Column(scale=1):
            gr.Markdown("### Load Example")
            # Range 0-49 matches the `test[:50]` dataset slice loaded above.
            example_slider = gr.Slider(
                minimum=0,
                maximum=49,
                step=1,
                value=0,
                label="Example Index (0-49)"
            )
            load_btn = gr.Button("Load Example", variant="secondary")
    compare_btn = gr.Button("Compare Models", variant="primary", size="lg")
    gr.Markdown("---")
    # Output row: base response | fine-tuned response | reference answer.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🔴 Base Model Response")
            base_output = gr.Textbox(label="", lines=10, show_label=False)
        with gr.Column():
            gr.Markdown("### 🟢 Fine-tuned Model Response")
            tuned_output = gr.Textbox(label="", lines=10, show_label=False)
        with gr.Column():
            gr.Markdown("### ✅ Correct Answer")
            reference_output = gr.Textbox(label="", lines=10, show_label=False)
    # Event handlers
    load_btn.click(
        fn=load_example,
        inputs=[example_slider],
        outputs=[question_input, reference_input]
    )
    compare_btn.click(
        fn=run_comparison,
        inputs=[question_input, reference_input],
        outputs=[base_output, tuned_output, reference_output]
    )
    gr.Markdown("""
    ---
    ### About
    This demo showcases the effect of fine-tuning a small language model (TinyLlama 1.1B) on math word problems.
    - **Dataset**: [GSM8K](https://huggingface.co/datasets/openai/gsm8k) - Grade School Math 8K
    - **Method**: LoRA (Low-Rank Adaptation)
    - **Training**: 5 epochs on 7,473 examples
    """)

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()