# Tvara / app.py
# Source: Hugging Face Space upload by unknownfriend00007 ("Update app.py", commit 935c8c9).
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
# Load model and tokenizer once at import time so every request reuses them.
model_id = "LiquidAI/LFM2-700M" # Balanced speed and quality
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
model_id,
# float32 on CPU: LFM2's native dtype may not be supported by CPU kernels,
# and float32 avoids silent precision issues. NOTE(review): `dtype=` requires
# a recent transformers release (older versions use `torch_dtype=`) — confirm.
dtype=torch.float32,
device_map="cpu"
)
print("Model loaded!")
def chat(message, history):
    """Stream an assistant reply for `message`, given prior chat `history`.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list | None
        Prior turns. Entries may be role/content dicts (Gradio
        "messages" format) or (user, assistant) pairs (legacy
        "tuples" format); both are normalized below.

    Yields
    ------
    str
        The accumulated assistant text so far (Gradio streaming contract:
        each yield replaces the displayed reply).
    """
    messages = []
    # Normalize both Gradio history formats into role/content dicts.
    for entry in history or []:
        if isinstance(entry, dict):
            messages.append(entry)
        elif isinstance(entry, (list, tuple)) and len(entry) >= 2:
            messages.append({"role": "user", "content": entry[0]})
            if entry[1]:
                messages.append({"role": "assistant", "content": entry[1]})
    messages.append({"role": "user", "content": message})

    # Tokenize via the model's chat template. Move tensors to the model's
    # device so this keeps working if the model is ever placed off-CPU.
    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True
    ).to(model.device)

    # A timeout is essential: without one, if generate() raises inside the
    # worker thread, iterating the streamer would block the UI forever.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_special_tokens=True,
        skip_prompt=True,
        timeout=120.0
    )
    generation_kwargs = {
        "inputs": inputs,
        "max_new_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.9,
        "do_sample": True,
        # Explicit pad token silences the per-call "Setting pad_token_id"
        # warning from generate().
        "pad_token_id": tokenizer.eos_token_id,
        "streamer": streamer
    }

    # generate() blocks, so run it in a worker thread and consume decoded
    # text from the streamer as it arrives.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    try:
        partial_text = ""
        for new_text in streamer:
            partial_text += new_text
            yield partial_text
    finally:
        # Join so each request cleans up its worker instead of leaking it.
        thread.join()
# Create Gradio interface: ChatInterface auto-detects that `chat` is a
# generator and streams each yielded string into the reply bubble.
demo = gr.ChatInterface(
fn=chat,
title="LFM2-700M Chatbot (Streaming)",
description="Chat with Liquid AI's LFM2-700M - balanced speed and quality",
examples=["Hello!", "Explain AI", "Write a Python function"]
)
# Launch the local web server only when run as a script (Spaces also calls this).
if __name__ == "__main__":
demo.launch()