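"""Gradio demo for chatting with microsoft/UserLM-8b on Hugging Face Spaces.

The model is loaded lazily on the first request (ZeroGPU-friendly) and
completed replies are replayed with a typewriter effect in the chatbot UI.
"""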
import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
from typing import List, Tuple
# Model configuration
MODEL_PATH = "microsoft/UserLM-8b"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
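# UserLM-8b is trained to play the *user* side of a conversation, so its
# outputs read like user turns rather than assistant answers
# (https://huggingface.co/microsoft/UserLM-8b).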
# Global variables for model and tokenizer
model = None
tokenizer = None
def load_model():
"""Load the model and tokenizer."""
global model, tokenizer
print(f"Loading model {MODEL_PATH}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
trust_remote_code=True,
torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
low_cpu_mem_usage=True
).to(DEVICE)
print(f"Model loaded successfully on {DEVICE}")
return model, tokenizer
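# On Hugging Face Spaces, @spaces.GPU requests a ZeroGPU worker for up to
# `duration` seconds per call; loading the model lazily inside the handler
# keeps app startup fast and reuses the cached weights on later calls.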
@spaces.GPU(duration=120)
def generate_response(
message: str,
chat_history: List[Tuple[str, str]],
system_prompt: str,
temperature: float,
top_p: float,
max_new_tokens: int,
) -> str:
"""Generate a response from the model."""
global model, tokenizer
# Load model if not already loaded
if model is None or tokenizer is None:
model, tokenizer = load_model()
# Build conversation history
messages = []
# Add system prompt if provided
if system_prompt.strip():
messages.append({"role": "system", "content": system_prompt})
# Add chat history
for user_msg, assistant_msg in chat_history:
messages.append({"role": "user", "content": user_msg})
if assistant_msg:
messages.append({"role": "assistant", "content": assistant_msg})
# Add current message
messages.append({"role": "user", "content": message})
# Tokenize input
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(DEVICE)
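    # apply_chat_template returns a (1, seq_len) tensor of token ids; the
    # decode step below slices off this prompt so only new tokens are returned.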
# Define special tokens
end_token = "<|eot_id|>"
end_token_id = tokenizer.encode(end_token, add_special_tokens=False)
end_conv_token = "<|endconversation|>"
end_conv_token_id = tokenizer.encode(end_conv_token, add_special_tokens=False)
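    # "<|eot_id|>" ends a single turn (Llama-3-style chat template), while
    # "<|endconversation|>" appears to be UserLM-8b's signal that the simulated
    # user is done talking; we stop on the former and ban the latter below so
    # the conversation can continue.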
# Generate response
with torch.no_grad():
outputs = model.generate(
input_ids=inputs,
do_sample=True,
top_p=top_p,
temperature=temperature,
max_new_tokens=max_new_tokens,
eos_token_id=end_token_id,
pad_token_id=tokenizer.eos_token_id,
bad_words_ids=[[token_id] for token_id in end_conv_token_id]
)
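    # bad_words_ids=[[id], ...] forbids each listed token id anywhere in the
    # output; since "<|endconversation|>" should encode to a single special
    # token, this normally bans exactly one id.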
# Decode response
response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
return response
def respond(
message: str,
chat_history: List[Tuple[str, str]],
system_prompt: str,
temperature: float,
top_p: float,
max_new_tokens: int,
):
"""Stream response to the chatbot."""
# Generate complete response
bot_message = generate_response(
message,
chat_history,
system_prompt,
temperature,
top_p,
max_new_tokens
)
# Add to chat history
chat_history.append((message, bot_message))
    # Generation is already complete; replay the reply character by character
    # so the UI feels like it is streaming.
partial_message = ""
for char in bot_message:
partial_message += char
time.sleep(0.01) # Small delay for streaming effect
yield chat_history[:-1] + [(message, partial_message)]
yield chat_history
# Create the Gradio interface
with gr.Blocks(title="UserLM-8b Chat", theme=gr.themes.Soft()) as demo:
gr.Markdown(
"""
# πŸ€– UserLM-8b Chat Interface
    Chat with Microsoft's UserLM-8b. Unlike a typical assistant, this model simulates the *user* side of a conversation, so its replies read like messages a user would send.
[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
"""
)
with gr.Row():
with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                height=500,
                show_copy_button=True,
                # bubble_full_width was deprecated (and removed in Gradio 5),
                # and avatar_images expects image paths/URLs rather than emoji,
                # so both are omitted here.
                render_markdown=True,
            )
with gr.Row():
msg = gr.Textbox(
label="Message",
placeholder="Type your message here and press Enter...",
lines=2,
scale=4,
autofocus=True,
)
submit_btn = gr.Button("Send", variant="primary", scale=1)
with gr.Row():
clear_btn = gr.ClearButton(
[chatbot, msg],
value="πŸ—‘οΈ Clear Chat"
)
retry_btn = gr.Button("πŸ”„ Retry Last")
undo_btn = gr.Button("↩️ Undo Last")
with gr.Column(scale=1):
gr.Markdown("### βš™οΈ Settings")
system_prompt = gr.Textbox(
label="System Prompt",
placeholder="Set the behavior of the model...",
value="You are a user who wants to implement a special type of sequence. The sequence sums up the two previous numbers in the sequence and adds 1 to the result. The first two numbers in the sequence are 1 and 1.",
lines=4,
)
temperature = gr.Slider(
minimum=0.1,
maximum=2.0,
value=1.0,
step=0.1,
label="Temperature",
info="Higher values make output more random"
)
top_p = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.8,
step=0.05,
label="Top-p (nucleus sampling)",
info="Lower values focus on more likely tokens"
)
max_new_tokens = gr.Slider(
minimum=10,
maximum=512,
value=100,
step=10,
label="Max New Tokens",
info="Maximum number of tokens to generate"
)
gr.Markdown(
"""
### πŸ“Š Model Info
- **Model**: microsoft/UserLM-8b
- **Parameters**: 8 billion
- **Device**: """ + DEVICE.upper() + """
- **Precision**: FP16 (CUDA) / FP32 (CPU)
"""
)
    # Conversation history lives in the chatbot component itself; the event
    # handlers below read and write it directly, so no separate gr.State is needed.
# Event handlers
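    # user_submit clears the textbox and appends (message, None); the None
    # placeholder marks a turn that is still waiting for a reply.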
def user_submit(message, history):
return "", history + [(message, None)]
    def bot_respond(history, system, temp, top_p, max_tokens):
        # Only respond to a trailing (message, None) placeholder from user_submit.
        # Note: this is a generator, so we must yield (not return) the history.
        if not history or history[-1][1] is not None:
            yield history
            return
        message = history[-1][0]
        history_without_last = history[:-1]
        for new_history in respond(message, history_without_last, system, temp, top_p, max_tokens):
            yield new_history
    def retry_last(history, system, temp, top_p, max_tokens):
        if not history:
            yield history
            return
        # Drop the last exchange and regenerate a reply to the same message.
        last_user_msg = history[-1][0]
        history = history[:-1]
        for new_history in respond(last_user_msg, history, system, temp, top_p, max_tokens):
            yield new_history
def undo_last(history):
if history:
return history[:-1]
return history
# Connect events
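    # Each submit first echoes the user message (fast, unqueued), then .then()
    # chains the generator that streams the bot reply into the chatbot.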
msg.submit(
user_submit,
[msg, chatbot],
[msg, chatbot],
queue=False
).then(
bot_respond,
[chatbot, system_prompt, temperature, top_p, max_new_tokens],
chatbot
)
submit_btn.click(
user_submit,
[msg, chatbot],
[msg, chatbot],
queue=False
).then(
bot_respond,
[chatbot, system_prompt, temperature, top_p, max_new_tokens],
chatbot
)
retry_btn.click(
retry_last,
[chatbot, system_prompt, temperature, top_p, max_new_tokens],
chatbot
)
undo_btn.click(
undo_last,
chatbot,
chatbot
)
    # Show a notice on page load; the model weights themselves are loaded
    # lazily inside generate_response on the first request (on ZeroGPU the
    # GPU is only attached within @spaces.GPU calls).
    demo.load(
        fn=lambda: gr.Info("Model loads on the first message and may take a moment."),
        inputs=None,
        outputs=None
    )
# Examples
gr.Examples(
examples=[
["Can you help me understand how this sequence works?"],
["What would be the next 5 numbers in the sequence?"],
["Let's implement this sequence in Python together."],
["Can you explain the pattern: 1, 1, 3, 5, 9, 15...?"],
],
inputs=msg,
label="Example Messages",
)
if __name__ == "__main__":
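    # 0.0.0.0:7860 is the host/port Hugging Face Spaces expects.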
demo.launch(
share=False,
show_error=True,
server_name="0.0.0.0",
server_port=7860,
)