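"""Gradio chat playground for LiquidAI LFM2 models with streamed responses."""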
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread

# Available model options
MODEL_NAMES = {
    "LFM2-350M": "LiquidAI/LFM2-350M",
    "LFM2-700M": "LiquidAI/LFM2-700M",
    "LFM2-1.2B": "LiquidAI/LFM2-1.2B",
    "LFM2-2.6B": "LiquidAI/LFM2-2.6B",
    "LFM2-8B-A1B": "LiquidAI/LFM2-8B-A1B",
}

# Cache for loaded models
model_cache = {}

def load_model(model_key):
    """Load and cache the selected model."""
    if model_key in model_cache:
        return model_cache[model_key]
    
    model_name = MODEL_NAMES[model_key]
    print(f"Loading {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
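    # fp16 on GPU roughly halves memory use; fall back to fp32 on CPU.
    # device_map="auto" lets accelerate place the weights on the available device(s).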
    model = AutoModelForCausalLM.from_pretrained(
        model_name, 
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto"
    )
    model_cache[model_key] = (tokenizer, model)
    return tokenizer, model

def chat_with_model(history, model_choice):
    """Stream a reply for the latest user turn and yield the updated chat history."""
    tokenizer, model = load_model(model_choice)

    # The last history entry is the new user turn with an empty assistant slot
    message = history[-1][0]

    # Build the chat history as a plain-text prompt from the completed turns
    prompt = ""
    for user_msg, bot_msg in history[:-1]:
        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    # Streaming setup: the streamer yields decoded text as generate() produces tokens
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
    )

    # Run generation in a background thread so tokens can be consumed as they arrive
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1] = (message, partial_text)
        yield history
    thread.join()

def create_demo():
    with gr.Blocks(title="LiquidAI Chat Interface") as demo:
        gr.Markdown("## 💧 LiquidAI Model Chat Playground")

        with gr.Row():
            model_choice = gr.Dropdown(
                label="Select Model",
                choices=list(MODEL_NAMES.keys()),
                value="LFM2-1.2B"
            )

        chatbot = gr.Chatbot(label="Chat with the model", height=450)
        msg = gr.Textbox(label="Your message", placeholder="Type a message and hit Enter")

        clear = gr.Button("Clear Chat")

        def user_submit(user_message, chat_history):
            # Append the new user turn with an empty assistant slot and clear the textbox
            return "", chat_history + [(user_message, "")]

        msg.submit(
            user_submit,
            [msg, chatbot],
            [msg, chatbot],
            queue=False
        ).then(
            chat_with_model,
            [chatbot, model_choice],
            chatbot
        )

        clear.click(lambda: None, None, chatbot, queue=False)

    return demo

if __name__ == "__main__":
    demo = create_demo()
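    # Enable the request queue (needed for streaming generator handlers); cap pending events at 32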
    demo.queue(max_size=32)
    demo.launch(server_name="0.0.0.0", server_port=7860)