import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread

model_id = "LiquidAI/LFM2-700M"

print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.float32,  # full precision; float32 is the safe choice on CPU
    device_map="cpu"
)
print("Model loaded!")


def chat(message, history):
    """Gradio chat interface with streaming."""
    messages = []
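
    # Rebuild the conversation: Gradio may pass history either as
    # openai-style {"role": ..., "content": ...} dicts or as (user, bot) pairs.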
    if history:
        for entry in history:
            if isinstance(entry, dict):
                messages.append(entry)
            elif isinstance(entry, (list, tuple)) and len(entry) >= 2:
                messages.append({"role": "user", "content": entry[0]})
                if entry[1]:
                    messages.append({"role": "assistant", "content": entry[1]})

    messages.append({"role": "user", "content": message})

    # Tokenize the running conversation with the model's chat template.
    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True
    )
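
    # TextIteratorStreamer yields decoded text chunks as tokens are
    # generated; skip_prompt keeps the input prompt out of the stream.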
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_special_tokens=True,
        skip_prompt=True
    )

    generation_kwargs = {
        "inputs": inputs,
        "max_new_tokens": 512,
        "temperature": 0.7,  # moderate randomness
        "top_p": 0.9,        # nucleus sampling
        "do_sample": True,
        "streamer": streamer
    }
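
    # generate() blocks until finished, so run it on a background
    # thread and consume tokens from the streamer on this one.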
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
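
    # Accumulate and yield the partial reply so Gradio renders it live.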
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
    thread.join()


demo = gr.ChatInterface(
    fn=chat,
    type="messages",  # dict-style history (newer Gradio releases); chat() also accepts tuple pairs
    title="LFM2-700M Chatbot (Streaming)",
    description="Chat with Liquid AI's LFM2-700M - balanced speed and quality",
    examples=["Hello!", "Explain AI", "Write a Python function"]
)

if __name__ == "__main__":
    demo.launch()