import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread

model_id = "LiquidAI/LFM2-700M"

print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.float32,  # full precision; float32 is the safe choice on CPU
    device_map="cpu"
)
print("Model loaded!")


def chat(message, history):
    """Gradio chat interface with streaming."""
    messages = []
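
    # Rebuild the conversation: Gradio may pass history either as
    # openai-style {"role": ..., "content": ...} dicts or as (user, bot) pairs.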
    if history:
        for entry in history:
            if isinstance(entry, dict):
                messages.append(entry)
            elif isinstance(entry, (list, tuple)) and len(entry) >= 2:
                messages.append({"role": "user", "content": entry[0]})
                if entry[1]:
                    messages.append({"role": "assistant", "content": entry[1]})

    messages.append({"role": "user", "content": message})

    # Tokenize the running conversation with the model's chat template.
    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True
    )
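
    # TextIteratorStreamer yields decoded text chunks as tokens are
    # generated; skip_prompt keeps the input prompt out of the stream.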
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_special_tokens=True,
        skip_prompt=True
    )

    generation_kwargs = {
        "inputs": inputs,
        "max_new_tokens": 512,
        "temperature": 0.7,  # moderate randomness
        "top_p": 0.9,        # nucleus sampling
        "do_sample": True,
        "streamer": streamer
    }
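
    # generate() blocks until finished, so run it on a background
    # thread and consume tokens from the streamer on this one.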
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
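
    # Accumulate and yield the partial reply so Gradio renders it live.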
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
    thread.join()


demo = gr.ChatInterface(
    fn=chat,
    type="messages",  # dict-style history (newer Gradio releases); chat() also accepts tuple pairs
    title="LFM2-700M Chatbot (Streaming)",
    description="Chat with Liquid AI's LFM2-700M - balanced speed and quality",
    examples=["Hello!", "Explain AI", "Write a Python function"]
)

if __name__ == "__main__":
    demo.launch()