import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread

# Load model
model_id = "LiquidAI/LFM2-700M"  # Balanced speed and quality

print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.float32,
    device_map="cpu"
)
print("Model loaded!")


def chat(message, history):
    """Gradio chat interface with streaming."""
    messages = []

    # Build message history; handle both Gradio history formats:
    # {"role": ..., "content": ...} dicts and (user, assistant) pairs
    if history:
        for entry in history:
            if isinstance(entry, dict):
                messages.append(entry)
            elif isinstance(entry, (list, tuple)) and len(entry) >= 2:
                messages.append({"role": "user", "content": entry[0]})
                if entry[1]:
                    messages.append({"role": "assistant", "content": entry[1]})

    messages.append({"role": "user", "content": message})

    # Tokenize the conversation using the model's chat template
    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True
    )

    # Set up a streamer that yields decoded text as tokens are generated
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_special_tokens=True,
        skip_prompt=True
    )

    generation_kwargs = {
        "inputs": inputs,
        "max_new_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.9,
        "do_sample": True,
        "streamer": streamer
    }

    # Run generation in a background thread so the main thread can
    # consume the stream while tokens are still being produced
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream tokens back to the UI as they're generated
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    thread.join()


# Create Gradio interface
demo = gr.ChatInterface(
    fn=chat,
    title="LFM2-700M Chatbot (Streaming)",
    description="Chat with Liquid AI's LFM2-700M - balanced speed and quality",
    examples=["Hello!", "Explain AI", "Write a Python function"]
)

if __name__ == "__main__":
    demo.launch()
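
# Usage sketch (assumptions, not part of the original script): install the
# dependencies with `pip install gradio transformers torch`, save this file
# as e.g. app.py (hypothetical name), and run `python app.py`. Gradio serves
# the chat UI at http://127.0.0.1:7860 by default; passing share=True to
# demo.launch() creates a temporary public URL if you need remote access.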