"""Gradio chat UI that streams responses from LiquidAI LFM2.5-1.2B-Instruct."""

import threading

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

MODEL_ID = "LiquidAI/LFM2.5-1.2B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", dtype=torch.bfloat16
)


def chat(message, history):
    """Yield progressively longer partial replies for *message* given *history*.

    Args:
        message: The latest user utterance (str).
        history: Prior turns, either in Gradio "messages" format (a list of
            ``{"role": ..., "content": ...}`` dicts) or the legacy tuple format
            ``[(user_msg, assistant_msg), ...]``; both are normalised below.

    Yields:
        str: The assistant reply accumulated so far, one chunk at a time,
        suitable for ``gr.ChatInterface`` streaming.
    """
    # Normalise history into the role/content message list expected by the
    # tokenizer's chat template, accepting both Gradio history formats.
    messages = []
    for item in history:
        if isinstance(item, dict):
            messages.append(item)
        else:
            user_msg, assistant_msg = item
            messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    encoded = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt", tokenize=True
    )
    # apply_chat_template returns a bare tensor unless return_dict=True is
    # passed; handle both shapes defensively.
    input_ids = (
        encoded.input_ids if hasattr(encoded, "input_ids") else encoded
    ).to(model.device)

    # The timeout keeps the consumer loop below from blocking forever if
    # generate() raises inside the worker thread before emitting any text
    # (the thread would otherwise swallow the exception silently).
    streamer = TextIteratorStreamer(
        tokenizer, timeout=120.0, skip_prompt=True, skip_special_tokens=True
    )
    worker = threading.Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=input_ids,
            do_sample=True,
            temperature=0.1,
            top_k=50,
            repetition_penalty=1.05,
            max_new_tokens=512,
            streamer=streamer,
        ),
        # Don't keep the process alive for an abandoned generation.
        daemon=True,
    )
    worker.start()

    partial = ""
    for token in streamer:
        partial += token
        yield partial
    # Reap the worker once the stream is exhausted instead of leaking it.
    worker.join()


demo = gr.ChatInterface(
    fn=chat,
    title="LFM2.5 Chat",
    description="Chat avec le modèle LiquidAI LFM2.5-1.2B-Instruct",
)

if __name__ == "__main__":
    demo.launch()