# Source: Hugging Face Space file "app.py" (commit 9a3021b, verified) by kate-line.
# app.py — corrected and consolidated version
import inspect
import threading
from threading import Thread
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# ====== إعدادات النموذج ======
MODEL_ID = "LiquidAI/LFM2.5-1.2B-Thinking"
DEFAULT_SYSTEM_PROMPT = """You are LFM2.5, an advanced reasoning model developed by LiquidAI. You excel at breaking down complex problems, thinking step-by-step, and providing clear, well-reasoned answers. Always think through problems systematically before providing your final answer."""
# ====== متغيرات عالمية ======
model = None
tokenizer = None
is_model_loaded = False
def load_model():
"""Load the model and tokenizer (مرّة واحدة)."""
global model, tokenizer, is_model_loaded
if is_model_loaded:
return True
try:
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
print("Loading model...")
if torch.cuda.is_available():
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
torch_dtype=torch.float16,
device_map="auto",
trust_remote_code=True,
)
else:
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
torch_dtype=torch.float32,
device_map="cpu",
trust_remote_code=True,
)
is_model_loaded = True
print("Model loaded successfully!")
return True
except Exception as e:
print(f"Error loading model: {e}")
return False
# ====== Format conversion between Gradio and the internal history ======
def gradio_history_to_internal(gr_history):
"""
Gradio Chatbot state is typically a list of (user, assistant) tuples.
We convert to a list of dicts: {"role": "user"|"assistant", "content": str}
"""
if not gr_history:
return []
# If already in internal dict format, return as-is
if isinstance(gr_history, list) and len(gr_history) > 0 and isinstance(gr_history[0], dict):
return gr_history
internal = []
for pair in gr_history:
if not pair:
continue
# pair may be a tuple/list of length 2 or a single string
if isinstance(pair, (list, tuple)) and len(pair) >= 2:
user_txt, assistant_txt = pair[0], pair[1]
if user_txt is not None and user_txt != "":
internal.append({"role": "user", "content": str(user_txt)})
if assistant_txt is not None and assistant_txt != "":
internal.append({"role": "assistant", "content": str(assistant_txt)})
else:
# fallback: treat item as a user message
internal.append({"role": "user", "content": str(pair)})
return internal
def internal_history_to_gradio(internal_history):
"""
Convert internal list of dicts to Gradio Chatbot format:
list of (user, assistant) tuples. We group sequential pairs.
"""
pairs = []
user_buf = None
assistant_buf = None
for msg in internal_history:
role = msg.get("role")
content = msg.get("content", "")
if role == "user":
# If previous user buffered without assistant, flush it as (user, "")
if user_buf is not None and assistant_buf is None:
pairs.append((user_buf, ""))
user_buf = content
assistant_buf = None
elif role == "assistant":
assistant_buf = content
if user_buf is None:
# assistant message without explicit user -> push as ("", assistant)
pairs.append(("", assistant_buf))
user_buf = None
assistant_buf = None
else:
pairs.append((user_buf, assistant_buf))
user_buf = None
assistant_buf = None
# flush any leftover user
if user_buf is not None and assistant_buf is None:
pairs.append((user_buf, ""))
return pairs
# ====== Message formatting for the model ======
def format_chat_history(history, system_prompt):
"""
history: list of dicts {"role":..., "content":...}
Returns list of messages formatted for apply_chat_template or manual fallback.
"""
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
for msg in history:
if msg.get("role") and "content" in msg:
messages.append({"role": msg["role"], "content": msg["content"]})
return messages
def apply_chat_template(messages):
"""
Use tokenizer.apply_chat_template when available; otherwise fallback to simple markers.
"""
try:
# Some tokenizers expose apply_chat_template
# tokenize=False because we will tokenize later
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
return prompt
except Exception:
# manual fallback
prompt = ""
for msg in messages:
if msg["role"] == "system":
prompt += f"<|system|>\n{msg['content']}\n"
elif msg["role"] == "user":
prompt += f"<|user|>\n{msg['content']}\n"
elif msg["role"] == "assistant":
prompt += f"<|assistant|>\n{msg['content']}\n"
prompt += "<|assistant|>\n"
return prompt
# ====== Response generation (supports streaming) ======
def generate_response(message, history, system_prompt, temperature, max_tokens, top_p):
"""
Generator that yields (partial_text, internal_history) while streaming.
"""
global model, tokenizer, is_model_loaded
# ensure model loaded
if not is_model_loaded:
if not load_model():
yield "❌ Error: Failed to load model. Please check the logs.", history
return
# Append user message into internal history
history = list(history) # copy
history.append({"role": "user", "content": message})
# Format messages for the model
messages_for_model = format_chat_history(history, system_prompt)
prompt = apply_chat_template(messages_for_model)
# Tokenize
inputs = tokenizer(prompt, return_tensors="pt")
if torch.cuda.is_available():
inputs = {k: v.cuda() for k, v in inputs.items()}
# Try streaming via TextIteratorStreamer; if it fails, fallback to non-streaming generation
try:
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=20.0)
generation_kwargs = {
**inputs,
"streamer": streamer,
"max_new_tokens": int(max_tokens),
"temperature": float(temperature),
"top_p": float(top_p),
"do_sample": float(temperature) > 0.0,
"pad_token_id": tokenizer.eos_token_id,
}
# start generation in a thread
gen_thread = Thread(target=model.generate, kwargs=generation_kwargs)
gen_thread.start()
response = ""
for new_text in streamer:
response += new_text
# update last assistant entry in history
# ensure we don't duplicate user entry — we know last entry is user, append/update assistant
if len(history) == 0 or history[-1].get("role") != "assistant":
history.append({"role": "assistant", "content": response})
else:
history[-1]["content"] = response
yield response, history
gen_thread.join()
except Exception as e:
# Fallback: synchronous non-streaming generation (less interactive)
try:
outputs = model.generate(
**inputs,
max_new_tokens=int(max_tokens),
temperature=float(temperature),
top_p=float(top_p),
do_sample=float(temperature) > 0.0,
pad_token_id=tokenizer.eos_token_id,
)
decoded = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
# update history
history.append({"role": "assistant", "content": decoded})
yield decoded, history
except Exception as e2:
err = f"❌ Generation error: {e} | fallback error: {e2}"
history.append({"role": "assistant", "content": err})
yield err, history
# ====== Chat wrapper with error handling and format conversion ======
def chat_with_model(message, gr_chat_history, system_prompt, temperature, max_tokens, top_p):
"""
This function is connected to Gradio. It receives:
- message (str)
- gr_chat_history (Gradio Chatbot state)
It should return:
- cleared msg_input (""), updated gr_chat_history (list of tuples)
We implement streaming by yielding successive (msg_input, gr_chat_history) pairs.
"""
# If empty message, do nothing
if not message or not str(message).strip():
# return unchanged history and empty input
yield "", gr_chat_history
return
# Convert gradio history format to internal
internal_history = gradio_history_to_internal(gr_chat_history)
try:
# stream generator
for response_text, updated_internal in generate_response(
message, internal_history, system_prompt, temperature, max_tokens, top_p
):
# convert to Gradio format for display
gr_history_for_component = internal_history_to_gradio(updated_internal)
# clear input box on each yield (keeps behavior consistent)
yield "", gr_history_for_component
except Exception as e:
error_msg = f"❌ Error: {str(e)}"
internal_history.append({"role": "assistant", "content": error_msg})
yield "", internal_history_to_gradio(internal_history)
def clear_conversation():
return [], ""
def get_model_info():
return f""" ### 🧠 LFM2.5-1.2B-Thinking
**Model:** {MODEL_ID}
**Description:** An advanced reasoning model optimized for step-by-step thinking and complex problem-solving.
**Parameters:** ~1.2 Billion
**Capabilities:** - Logical reasoning - Mathematical problem solving - Code generation and analysis - Step-by-step thinking
**Tips:** Use the system prompt to guide the model's behavior and adjust temperature for creativity vs. precision.
"""
# ====== Gradio UI ======
with gr.Blocks(title="LFM2.5-1.2B-Thinking Trial", fill_height=True) as demo:
gr.Markdown(
"""
# 🧠 LFM2.5-1.2B-Thinking
### Advanced Reasoning Model by LiquidAI
"""
)
with gr.Row():
with gr.Column(scale=3):
# Note: avoid using `show_copy_button` directly (it may not exist in installed Gradio).
# If you want a copy button in newer Gradio versions, you could use `buttons=["copy"]`.
chatbot = gr.Chatbot(label="Conversation", height=500, bubble_full_width=False, type="messages")
with gr.Row():
msg_input = gr.Textbox(
label="Your Message",
placeholder="Ask me anything... Press Enter to send, Shift+Enter for new line",
lines=2,
show_label=False,
container=False,
)
send_btn = gr.Button("🚀 Send", variant="primary")
with gr.Row():
clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")
retry_btn = gr.Button("🔄 Retry Last", variant="secondary")
with gr.Column(scale=1):
with gr.Accordion("⚙️ Settings", open=False):
system_prompt = gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=4)
temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
max_tokens = gr.Slider(minimum=64, maximum=2048, value=512, step=64, label="Max Tokens")
top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top P")
with gr.Accordion("ℹ️ Model Info", open=False):
model_info = gr.Markdown(get_model_info())
gr.Markdown("### 💡 Example Prompts")
examples = gr.Examples(
examples=[
"Explain quantum entanglement in simple terms.",
"Solve this math problem: If a train travels at 60 mph for 2.5 hours, how far does it go?",
"Write a Python function to check if a number is prime.",
"What are the steps to debug a React application?",
"Explain the difference between supervised and unsupervised learning.",
],
inputs=msg_input,
label="Click to try:",
)
# Events
# msg_input.submit and send_btn.click both call chat_with_model.
msg_input.submit(
fn=chat_with_model,
inputs=[msg_input, chatbot, system_prompt, temperature, max_tokens, top_p],
outputs=[msg_input, chatbot],
api_visibility="public",
)
send_btn.click(
fn=chat_with_model,
inputs=[msg_input, chatbot, system_prompt, temperature, max_tokens, top_p],
outputs=[msg_input, chatbot],
api_visibility="public",
)
clear_btn.click(fn=clear_conversation, inputs=None, outputs=[chatbot, msg_input], api_visibility="private")
# Optional: retry last — naive implementation: re-send last user message
def retry_last(gr_chat_history, system_prompt, temperature, max_tokens, top_p):
internal = gradio_history_to_internal(gr_chat_history)
# find last user message
last_user = None
for msg in reversed(internal):
if msg.get("role") == "user" and msg.get("content", "").strip():
last_user = msg["content"]
break
if last_user is None:
return "", gr_chat_history
# call chat_with_model generator directly (non-streaming here for retry convenience)
for response_text, updated_internal in generate_response(last_user, internal[:-1], system_prompt, temperature, max_tokens, top_p):
# continue streaming until finished
pass
return "", internal_history_to_gradio(updated_internal)
retry_btn.click(
fn=retry_last,
inputs=[chatbot, system_prompt, temperature, max_tokens, top_p],
outputs=[msg_input, chatbot],
api_visibility="private",
)
# load placeholder (avoid heavy work on import; model will lazy-load on first request)
demo.load(fn=lambda: None)
# Launch
if __name__ == "__main__":
# You can pin a Gradio version in your environment instead of changing the code.
# The app below avoids `show_copy_button` to be compatible with multiple Gradio releases.
demo.launch(
theme=gr.themes.Soft(
primary_hue="blue",
secondary_hue="indigo",
neutral_hue="slate",
font=gr.themes.GoogleFont("Inter"),
text_size="md",
spacing_size="md",
radius_size="md",
).set(
button_primary_background_fill="*primary_600",
button_primary_background_fill_hover="*primary_700",
block_title_text_weight="600",
),
footer_links=[
{"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
{"label": "LiquidAI", "url": "https://huggingface.co/LiquidAI"},
{"label": "Model Card", "url": "https://huggingface.co/LiquidAI/LFM2.5-1.2B-Thinking"},
],
server_name="0.0.0.0",
server_port=7860,
show_error=True,
)