# NOTE(review): the three lines below were a pasted-in commit header and were
# not valid Python; preserved here as comments so the file can be imported.
# lil-sumedhk
# Fix for ZeroGPU: add @spaces.GPU decorator
# 82a52cd
import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from datasets import load_dataset
# Model configuration: base checkpoint on the HF Hub and the LoRA adapter
# trained on GSM8K that gets merged on top of it.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
ADAPTER_PATH = "sumedh/tinyllama-lora-math-adapter-v3"
# Load tokenizer globally — it is small and CPU-only, so it is safe to load
# eagerly at import time (unlike the models, which are deferred; see below).
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Load test dataset for examples — only the first 50 GSM8K test rows are
# needed, matching the 0-49 example slider in the UI.
print("Loading test dataset...")
test_data = load_dataset("openai/gsm8k", "main", split="test[:50]")
# Models will be loaded lazily on first GPU call: on ZeroGPU Spaces the GPU
# is only attached inside @spaces.GPU functions, so model weights are loaded
# there (via load_models) rather than at import time.
base_model = None
tuned_model = None
def load_models():
    """Lazily load both models, caching them in module globals.

    The base model is loaded as-is; the fine-tuned model is built by
    loading a second copy of the base checkpoint, applying the LoRA
    adapter, and merging the adapter weights into the backbone.

    Returns:
        tuple: ``(base_model, tuned_model)``, both in eval mode.
    """
    global base_model, tuned_model

    if base_model is None:
        print("Loading base model...")
        loaded = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        base_model = loaded.eval()

    if tuned_model is None:
        print("Loading fine-tuned model...")
        backbone = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        # Attach the LoRA adapter, then fold its weights into the backbone
        # so inference runs on a plain (adapter-free) model.
        adapted = PeftModel.from_pretrained(backbone, ADAPTER_PATH)
        tuned_model = adapted.merge_and_unload().eval()
        print("Models loaded!")

    return base_model, tuned_model
@spaces.GPU
def generate_responses(question):
    """Generate answers to *question* from both models — runs on GPU.

    The @spaces.GPU decorator attaches a ZeroGPU device for the duration
    of this call; models are loaded lazily on the first invocation.

    Args:
        question: Math word problem text entered by the user.

    Returns:
        tuple[str, str]: ``(base_response, tuned_response)``. If *question*
        is blank, returns a prompt-the-user message and an empty string.
    """
    if not question.strip():
        return "Please enter a question.", ""

    # Load models if not already loaded.
    base, tuned = load_models()
    prompt = f"### Instruction:\n{question}\n### Response:\n"

    # Evaluated left-to-right: base model first, then the fine-tuned one,
    # matching the original execution order.
    return _generate(base, prompt), _generate(tuned, prompt)


def _generate(model, prompt):
    """Decode *prompt* on *model* and return the text after '### Response:'.

    Shared by the base and fine-tuned paths (previously duplicated inline).
    Uses greedy decoding with a mild repetition penalty.
    """
    token_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    with torch.no_grad():
        output = model.generate(
            token_ids,
            max_new_tokens=256,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
        )
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    # Keep only the model's answer, dropping the echoed instruction prompt.
    if "### Response:" in text:
        text = text.split("### Response:")[-1].strip()
    return text
def load_example(idx):
    """Fetch example *idx* from the GSM8K test slice.

    Args:
        idx: Index into the loaded test split (accepts float from the slider).

    Returns:
        tuple[str, str]: ``(question, answer)``, or two empty strings when
        *idx* is out of range.
    """
    position = int(idx)
    if not (0 <= position < len(test_data)):
        return "", ""
    row = test_data[position]
    return row["question"], row["answer"]
def run_comparison(question, reference):
    """Run both models on *question* and pass *reference* through.

    Args:
        question: Math word problem text.
        reference: Ground-truth answer to display alongside the outputs.

    Returns:
        tuple[str, str, str]: base response, tuned response, reference.
        A blank *question* yields a prompt-the-user message instead.
    """
    if not question.strip():
        return "Please enter a question.", "", ""
    base_answer, tuned_answer = generate_responses(question)
    return base_answer, tuned_answer, reference
# Create Gradio interface: two input boxes (question + optional reference),
# an example loader driven by a slider over the 50 cached GSM8K rows, and a
# three-column output comparing base vs fine-tuned responses to the answer.
with gr.Blocks(title="TinyLlama Math Fine-tuning Demo") as demo:
    gr.Markdown("""
    # 🧮 TinyLlama Math Fine-tuning Demo
    Compare the performance of **base TinyLlama** vs **fine-tuned TinyLlama** on math word problems.
    - **Base Model**: TinyLlama-1.1B-Chat-v1.0 (no math training)
    - **Fine-tuned Model**: LoRA adapter trained on GSM8K dataset (7,473 examples)
    *Note: First run may take ~30s to load models.*
    """)
    with gr.Row():
        # Left column: free-text question and (optional) reference answer.
        with gr.Column(scale=2):
            question_input = gr.Textbox(
                label="Math Question",
                placeholder="Enter a math word problem...",
                lines=4
            )
            reference_input = gr.Textbox(
                label="Reference Answer (optional)",
                placeholder="The correct answer will appear here when loading examples",
                lines=4
            )
        # Right column: slider + button to pull a canned GSM8K example.
        with gr.Column(scale=1):
            gr.Markdown("### Load Example")
            # Range 0-49 matches the `test[:50]` dataset slice loaded above.
            example_slider = gr.Slider(
                minimum=0,
                maximum=49,
                step=1,
                value=0,
                label="Example Index (0-49)"
            )
            load_btn = gr.Button("Load Example", variant="secondary")
    compare_btn = gr.Button("Compare Models", variant="primary", size="lg")
    gr.Markdown("---")
    # Output row: base response | fine-tuned response | reference answer.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🔴 Base Model Response")
            base_output = gr.Textbox(label="", lines=10, show_label=False)
        with gr.Column():
            gr.Markdown("### 🟢 Fine-tuned Model Response")
            tuned_output = gr.Textbox(label="", lines=10, show_label=False)
        with gr.Column():
            gr.Markdown("### ✅ Correct Answer")
            reference_output = gr.Textbox(label="", lines=10, show_label=False)
    # Event handlers
    load_btn.click(
        fn=load_example,
        inputs=[example_slider],
        outputs=[question_input, reference_input]
    )
    compare_btn.click(
        fn=run_comparison,
        inputs=[question_input, reference_input],
        outputs=[base_output, tuned_output, reference_output]
    )
    gr.Markdown("""
    ---
    ### About
    This demo showcases the effect of fine-tuning a small language model (TinyLlama 1.1B) on math word problems.
    - **Dataset**: [GSM8K](https://huggingface.co/datasets/openai/gsm8k) - Grade School Math 8K
    - **Method**: LoRA (Low-Rank Adaptation)
    - **Training**: 5 epochs on 7,473 examples
    """)

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()