# app.py — Hugging Face Space entry point for the Root Math TinyLlama chat demo.
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
# --- Configuration (Verified) ---
# Base checkpoint the LoRA adapters were trained on.
BASE_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Ensure this is correct for your model repository (Hub repo holding the adapters).
ADAPTER_MODEL_ID = "Vivek16/Root_Math-TinyLlama-CPU"
# Define the instruction template components.
# These follow the TinyLlama-Chat prompt convention: each turn is opened with a
# role tag (<|user|> / <|assistant|>) and terminated with the </s> end-of-turn token.
SYSTEM_INSTRUCTION = "Solve the following math problem:"
USER_TEMPLATE = "<|user|>\n{}</s>"
ASSISTANT_TEMPLATE = "<|assistant|>\n{}</s>"
# --- Model Loading Function ---
def load_model():
    """Load the base TinyLlama checkpoint, apply the LoRA adapters, and merge.

    Returns:
        tuple: ``(tokenizer, model)`` — the tokenizer and the merged,
        eval-mode model, both resident on CPU.
    """
    print("Loading base model...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
    # bfloat16 halves the memory footprint versus float32; everything stays on CPU.
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=torch.bfloat16,
        device_map="cpu",
    )

    print("Loading and merging PEFT adapters...")
    # Folding the adapters into the base weights removes the PEFT wrapper
    # overhead at inference time.
    merged = PeftModel.from_pretrained(base, ADAPTER_MODEL_ID).merge_and_unload()
    merged.eval()

    # generate() needs a pad token id; fall back to EOS when none is defined.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Model loaded and merged successfully!")
    return tokenizer, merged
# Load the model once at import time (outside the prediction function) so every
# chat request reuses the same weights instead of reloading per call.
tokenizer, model = load_model()
# --- Prediction Function for gr.ChatInterface ---
def generate_response(message, history):
    """Generate an assistant reply for the Gradio chat interface.

    Args:
        message: The latest user message (str).
        history: Prior turns. Gradio may pass either a list of
            ``(user_msg, assistant_msg)`` tuples (classic format) or, in
            newer "messages" mode, a list of ``{"role": ..., "content": ...}``
            dicts; both are accepted.

    Returns:
        str: The model's reply text, or an error string when the output
        cannot be parsed.
    """
    # 1. Build the full prompt: system instruction, history, current message.
    full_prompt = _build_prompt(message, history)
    print(f"--- Full Prompt ---\n{full_prompt}")

    # 2. Tokenize the input (attention_mask comes along via **inputs).
    inputs = tokenizer(full_prompt, return_tensors="pt")

    # 3. Generate the response (on CPU); no_grad avoids building autograd state.
    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            pad_token_id=tokenizer.eos_token_id
        )

    # 4. Decode with special tokens kept so the </s> turn delimiter survives
    #    for parsing in step 5.
    generated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=False)

    # 5. Extract only the model's new response.
    return _extract_response(generated_text)


def _build_prompt(message, history):
    """Assemble the TinyLlama chat prompt: system turn, history, new message."""
    parts = [f"<|system|>\n{SYSTEM_INSTRUCTION}</s>\n"]
    for turn in history or []:
        if isinstance(turn, dict):
            # Gradio "messages" format: one dict per turn with role/content.
            template = USER_TEMPLATE if turn.get("role") == "user" else ASSISTANT_TEMPLATE
            parts.append(template.format(turn.get("content", "")) + "\n")
        else:
            # Classic tuple format: (user_msg, assistant_msg).
            user_msg, assistant_msg = turn
            parts.append(USER_TEMPLATE.format(user_msg) + "\n")
            parts.append(ASSISTANT_TEMPLATE.format(assistant_msg) + "\n")
    parts.append(USER_TEMPLATE.format(message) + "\n")
    # Open the assistant turn so generation continues from here.
    parts.append("<|assistant|>\n")
    return "".join(parts)


def _extract_response(generated_text):
    """Return the text of the final assistant turn, stripped of the trailing </s>."""
    marker = '<|assistant|>'
    response_start = generated_text.rfind(marker)
    if response_start == -1:
        return "Error: Could not parse model output."
    raw_response = generated_text[response_start + len(marker):].strip()
    return raw_response.split('</s>')[0].strip()
# --- Gradio Chat Interface ---
# Page title and subtitle shown above the chat window.
title = "Root Math TinyLlama 1.1B - Gemini-Like Chat Demo"
description = "A conversational interface for the CPU-friendly TinyLlama model fine-tuned for math problems. Ask follow-up questions!"
# NOTE(review): passing button labels via submit_btn/clear_btn/undo_btn matches
# the Gradio 4.x ChatInterface signature; later Gradio releases changed/removed
# some of these kwargs — confirm the pinned gradio version in requirements.txt.
gr.ChatInterface(
    fn=generate_response,
    chatbot=gr.Chatbot(height=500), # Makes the chat history window taller
    textbox=gr.Textbox(placeholder="Enter your math problem or follow-up question...", scale=7),
    title=title,
    description=description,
    submit_btn="Ask Model",
    clear_btn="Start New Chat",
    undo_btn="Undo Last Message",
    theme="soft"
).queue().launch()  # .queue() serializes requests — important for CPU inference