File size: 15,973 Bytes
9a3021b
 
 
 
 
7235193
 
 
 
9a3021b
7235193
9a3021b
7235193
9a3021b
7235193
 
 
 
9a3021b
7235193
9a3021b
7235193
 
 
 
9a3021b
7235193
9a3021b
7235193
 
 
 
 
9a3021b
7235193
 
 
 
 
 
9a3021b
7235193
 
 
 
 
 
 
 
9a3021b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7235193
9a3021b
 
 
 
7235193
 
 
 
9a3021b
 
7235193
 
9a3021b
7235193
9a3021b
 
 
7235193
9a3021b
 
 
7235193
9a3021b
 
7235193
 
 
 
 
 
 
 
 
 
 
9a3021b
 
7235193
9a3021b
 
 
7235193
9a3021b
7235193
 
 
 
9a3021b
 
 
7235193
9a3021b
 
 
 
 
 
7235193
 
 
9a3021b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7235193
9a3021b
 
 
 
7235193
9a3021b
 
 
7235193
9a3021b
 
 
 
7235193
 
9a3021b
 
 
7235193
 
 
 
9a3021b
7235193
9a3021b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7235193
 
 
9a3021b
 
 
 
7235193
 
 
 
 
 
9a3021b
7235193
9a3021b
 
7235193
 
 
9a3021b
7235193
 
9a3021b
 
 
 
 
7235193
 
9a3021b
7235193
 
 
 
 
 
 
 
 
 
9a3021b
7235193
9a3021b
 
 
7235193
 
 
 
9a3021b
7235193
 
 
 
 
9a3021b
7235193
9a3021b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7235193
 
9a3021b
 
 
 
7235193
9a3021b
 
7235193
 
 
 
 
 
 
 
9a3021b
7235193
 
 
 
 
 
 
 
9a3021b
7235193
 
 
9a3021b
7235193
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
# app.py — corrected and consolidated version
import inspect
import threading
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# ====== Model configuration ======
MODEL_ID = "LiquidAI/LFM2.5-1.2B-Thinking"
DEFAULT_SYSTEM_PROMPT = """You are LFM2.5, an advanced reasoning model developed by LiquidAI. You excel at breaking down complex problems, thinking step-by-step, and providing clear, well-reasoned answers. Always think through problems systematically before providing your final answer."""

# ====== Global state ======
model = None
tokenizer = None
is_model_loaded = False


def load_model():
    """Lazily load the tokenizer and model into module globals.

    Returns:
        bool: True on success (or if already loaded), False on failure.

    Safe to call repeatedly; the heavy work runs only once thanks to the
    ``is_model_loaded`` flag.
    """
    global model, tokenizer, is_model_loaded
    if is_model_loaded:
        return True
    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
        print("Loading model...")
        # Choose precision/placement once instead of duplicating the whole
        # from_pretrained call: fp16 on GPU for speed/memory, fp32 on CPU
        # for numeric safety.
        use_gpu = torch.cuda.is_available()
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16 if use_gpu else torch.float32,
            device_map="auto" if use_gpu else "cpu",
            trust_remote_code=True,
        )
        is_model_loaded = True
        print("Model loaded successfully!")
        return True
    except Exception as e:
        # Broad catch is deliberate: any load failure (network, OOM, bad
        # checkpoint) should degrade to a UI error message, not crash the app.
        print(f"Error loading model: {e}")
        return False


# ====== Format conversion between Gradio and internal history ======
def gradio_history_to_internal(gr_history):
    """Normalize a Gradio Chatbot history into role/content dicts.

    Accepts either the legacy tuple-pair format ``[(user, assistant), ...]``
    or a history that is already ``[{"role": ..., "content": ...}, ...]``
    (returned unchanged). Empty entries are skipped.
    """
    if not gr_history:
        return []
    # Already in the internal dict format — pass through untouched.
    if isinstance(gr_history, list) and len(gr_history) > 0 and isinstance(gr_history[0], dict):
        return gr_history
    converted = []
    for entry in gr_history:
        if not entry:
            continue
        if isinstance(entry, (list, tuple)) and len(entry) >= 2:
            # One pair expands to up to two messages, skipping blank slots.
            for role, text in (("user", entry[0]), ("assistant", entry[1])):
                if text is not None and text != "":
                    converted.append({"role": role, "content": str(text)})
        else:
            # Anything unrecognized is treated as a bare user message.
            converted.append({"role": "user", "content": str(entry)})
    return converted


def internal_history_to_gradio(internal_history):
    """Convert internal role/content dicts to the Gradio Chatbot payload.

    The Chatbot component in this app is created with ``type="messages"``,
    which expects openai-style ``{"role", "content"}`` dicts — NOT the legacy
    ``(user, assistant)`` tuple pairs the previous implementation produced
    (messages-mode chatbots reject tuple data). We therefore pass the dicts
    through, keeping only well-formed user/assistant entries.

    Args:
        internal_history: list of ``{"role": ..., "content": ...}`` dicts.

    Returns:
        list[dict]: messages-format history safe to hand to the Chatbot.
    """
    messages = []
    for msg in internal_history or []:
        role = msg.get("role")
        # Drop anything that is not a displayable chat turn (e.g. system).
        if role in ("user", "assistant"):
            messages.append({"role": role, "content": msg.get("content", "")})
    return messages


# ====== Message formatting for the model ======
def format_chat_history(history, system_prompt):
    """Build the model-facing message list: optional system turn + history.

    Args:
        history: iterable of ``{"role": ..., "content": ...}`` dicts; entries
            missing a truthy role or a "content" key are dropped.
        system_prompt: prepended as a system message when non-empty.

    Returns:
        list[dict]: messages ready for apply_chat_template.
    """
    messages = [{"role": "system", "content": system_prompt}] if system_prompt else []
    messages.extend(
        {"role": msg["role"], "content": msg["content"]}
        for msg in history
        if msg.get("role") and "content" in msg
    )
    return messages


def apply_chat_template(messages):
    """Render a message list into a single prompt string.

    Prefers the tokenizer's built-in chat template; if that is unavailable or
    fails for any reason (including the tokenizer not being loaded yet), falls
    back to simple ``<|role|>`` markers terminated by an assistant header so
    generation starts in the assistant turn.
    """
    try:
        # tokenize=False: the caller tokenizes separately before generation.
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        # Manual fallback: one "<|role|>\ncontent\n" segment per known role.
        markers = {"system": "<|system|>", "user": "<|user|>", "assistant": "<|assistant|>"}
        parts = []
        for msg in messages:
            tag = markers.get(msg["role"])
            if tag is not None:
                parts.append(f"{tag}\n{msg['content']}\n")
        parts.append("<|assistant|>\n")
        return "".join(parts)


# ====== Response generation (supports streaming) ======
def generate_response(message, history, system_prompt, temperature, max_tokens, top_p):
    """Stream a model reply for *message* appended to *history*.

    Generator yielding ``(partial_text, updated_history)`` tuples as tokens
    arrive; the final yield carries the complete reply. *history* is a list
    of ``{"role", "content"}`` dicts; the caller's list is copied, so it is
    never mutated in place. Lazily loads the model on first use; on failure
    it yields a single error string instead of raising.
    """
    global model, tokenizer, is_model_loaded
    # Lazy-load on first request so importing this module stays cheap.
    if not is_model_loaded:
        if not load_model():
            yield "❌ Error: Failed to load model. Please check the logs.", history
            return

    # Work on a copy so the caller's history list is never mutated.
    history = list(history)  # copy
    history.append({"role": "user", "content": message})

    # Render system prompt + turns into a single prompt string.
    messages_for_model = format_chat_history(history, system_prompt)
    prompt = apply_chat_template(messages_for_model)

    # Tokenize, moving tensors to GPU only when one is available.
    inputs = tokenizer(prompt, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    # Preferred path: token-by-token streaming via TextIteratorStreamer.
    # Any failure falls through to a single blocking generate() call below.
    try:
        # timeout guards the consuming loop if the producer thread stalls.
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=20.0)
        generation_kwargs = {
            **inputs,
            "streamer": streamer,
            "max_new_tokens": int(max_tokens),
            "temperature": float(temperature),
            "top_p": float(top_p),
            "do_sample": float(temperature) > 0.0,  # greedy decode when temperature is 0
            "pad_token_id": tokenizer.eos_token_id,
        }
        # generate() blocks, so it runs on a worker thread while this
        # generator drains the streamer and yields partial text.
        gen_thread = Thread(target=model.generate, kwargs=generation_kwargs)
        gen_thread.start()

        response = ""
        for new_text in streamer:
            response += new_text
            # Mirror the partial reply into the history: create the assistant
            # entry on the first chunk, then overwrite its content in place
            # on subsequent chunks (the last entry is otherwise the user turn).
            if len(history) == 0 or history[-1].get("role") != "assistant":
                history.append({"role": "assistant", "content": response})
            else:
                history[-1]["content"] = response
            yield response, history
        gen_thread.join()
    except Exception as e:
        # Fallback: synchronous non-streaming generation (less interactive).
        try:
            outputs = model.generate(
                **inputs,
                max_new_tokens=int(max_tokens),
                temperature=float(temperature),
                top_p=float(top_p),
                do_sample=float(temperature) > 0.0,
                pad_token_id=tokenizer.eos_token_id,
            )
            # Slice off the prompt tokens so only the new reply is decoded.
            decoded = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
            # Record the full reply as a single assistant turn.
            history.append({"role": "assistant", "content": decoded})
            yield decoded, history
        except Exception as e2:
            # Surface both failures in the chat rather than crashing the UI.
            err = f"❌ Generation error: {e} | fallback error: {e2}"
            history.append({"role": "assistant", "content": err})
            yield err, history


# ====== Chat wrapper with error handling and format conversion ======
def chat_with_model(message, gr_chat_history, system_prompt, temperature, max_tokens, top_p):
    """Gradio event handler: stream a reply into the Chatbot.

    Receives the textbox value and the Chatbot state, converts the state to
    the internal dict format, streams partial replies from generate_response,
    and yields ``("", updated_history)`` pairs — the empty first element keeps
    the input box cleared while tokens arrive.
    """
    # Blank input: leave the conversation untouched and just clear the box.
    if not message or not str(message).strip():
        yield "", gr_chat_history
        return

    internal_history = gradio_history_to_internal(gr_chat_history)

    try:
        stream = generate_response(
            message, internal_history, system_prompt, temperature, max_tokens, top_p
        )
        for _partial, updated in stream:
            # Re-convert on every chunk so the component always sees its format.
            yield "", internal_history_to_gradio(updated)
    except Exception as exc:
        # Show the failure as an assistant turn instead of breaking the UI.
        internal_history.append({"role": "assistant", "content": f"❌ Error: {exc}"})
        yield "", internal_history_to_gradio(internal_history)


def clear_conversation():
    """Reset the UI state: empty chat history and a cleared input box."""
    return ([], "")


def get_model_info():
    """Return the markdown blurb shown in the "Model Info" accordion."""
    info_lines = (
        " ### 🧠 LFM2.5-1.2B-Thinking",
        f"**Model:** {MODEL_ID}",
        "**Description:** An advanced reasoning model optimized for step-by-step thinking and complex problem-solving.",
        "**Parameters:** ~1.2 Billion",
        "**Capabilities:** - Logical reasoning - Mathematical problem solving - Code generation and analysis - Step-by-step thinking",
        "**Tips:** Use the system prompt to guide the model's behavior and adjust temperature for creativity vs. precision.",
    )
    # Trailing newline matches the original triple-quoted literal.
    return "\n".join(info_lines) + "\n"


# ====== Gradio UI ======
# Build the whole UI inside one Blocks context; `demo` is launched at the
# bottom of the file when run as a script.
with gr.Blocks(title="LFM2.5-1.2B-Thinking Trial", fill_height=True) as demo:
    gr.Markdown(
        """
# 🧠 LFM2.5-1.2B-Thinking
### Advanced Reasoning Model by LiquidAI
"""
    )

    with gr.Row():
        with gr.Column(scale=3):
            # Note: avoid using `show_copy_button` directly (it may not exist in installed Gradio).
            # If you want a copy button in newer Gradio versions, you could use `buttons=["copy"]`.
            # NOTE(review): `bubble_full_width` was deprecated/removed in newer
            # Gradio releases — confirm the installed version still accepts it.
            # NOTE(review): `type="messages"` expects `{"role","content"}` dict
            # entries, while internal_history_to_gradio in this file returns
            # (user, assistant) tuple pairs — confirm this mismatch is intended.
            chatbot = gr.Chatbot(label="Conversation", height=500, bubble_full_width=False, type="messages")

            with gr.Row():
                # Multi-line input; Enter submits, Shift+Enter inserts a newline.
                msg_input = gr.Textbox(
                    label="Your Message",
                    placeholder="Ask me anything... Press Enter to send, Shift+Enter for new line",
                    lines=2,
                    show_label=False,
                    container=False,
                )
                send_btn = gr.Button("🚀 Send", variant="primary")

            with gr.Row():
                clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")
                retry_btn = gr.Button("🔄 Retry Last", variant="secondary")

        with gr.Column(scale=1):
            # Generation settings, collapsed by default.
            with gr.Accordion("⚙️ Settings", open=False):
                system_prompt = gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=4)
                temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
                max_tokens = gr.Slider(minimum=64, maximum=2048, value=512, step=64, label="Max Tokens")
                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top P")

            with gr.Accordion("ℹ️ Model Info", open=False):
                model_info = gr.Markdown(get_model_info())

            gr.Markdown("### 💡 Example Prompts")
            # Clicking an example only fills the textbox; it does not auto-submit.
            examples = gr.Examples(
                examples=[
                    "Explain quantum entanglement in simple terms.",
                    "Solve this math problem: If a train travels at 60 mph for 2.5 hours, how far does it go?",
                    "Write a Python function to check if a number is prime.",
                    "What are the steps to debug a React application?",
                    "Explain the difference between supervised and unsupervised learning.",
                ],
                inputs=msg_input,
                label="Click to try:",
            )

    # Events
    # msg_input.submit and send_btn.click both call chat_with_model.
    # NOTE(review): `api_visibility=` exists only in recent Gradio releases
    # (older versions use `api_name`/`show_api`) — confirm before pinning.
    msg_input.submit(
        fn=chat_with_model,
        inputs=[msg_input, chatbot, system_prompt, temperature, max_tokens, top_p],
        outputs=[msg_input, chatbot],
        api_visibility="public",
    )
    send_btn.click(
        fn=chat_with_model,
        inputs=[msg_input, chatbot, system_prompt, temperature, max_tokens, top_p],
        outputs=[msg_input, chatbot],
        api_visibility="public",
    )
    clear_btn.click(fn=clear_conversation, inputs=None, outputs=[chatbot, msg_input], api_visibility="private")

    # Retry last — re-run generation for the most recent user message.
    def retry_last(gr_chat_history, system_prompt, temperature, max_tokens, top_p):
        """Regenerate the answer to the last non-empty user message.

        Truncates the history to everything *before* that user turn —
        generate_response re-appends the user message itself, so keeping it
        (as the original ``internal[:-1]`` slice did when the last entry was
        an assistant reply) would duplicate the turn. Drains the streaming
        generator and returns only the final state. Returns the history
        unchanged when there is no user message to retry.
        """
        internal = gradio_history_to_internal(gr_chat_history)
        # Locate the last non-empty user message and its index.
        last_user = None
        last_user_idx = None
        for idx in range(len(internal) - 1, -1, -1):
            msg = internal[idx]
            if msg.get("role") == "user" and msg.get("content", "").strip():
                last_user = msg["content"]
                last_user_idx = idx
                break
        if last_user is None:
            return "", gr_chat_history
        prior = internal[:last_user_idx]
        # Guard against an empty generator: without this default,
        # `updated_internal` would be unbound after the loop (the original
        # code raised NameError in that case).
        updated_internal = prior
        for _partial, updated_internal in generate_response(
            last_user, prior, system_prompt, temperature, max_tokens, top_p
        ):
            pass  # drain the stream; retry shows only the final result
        return "", internal_history_to_gradio(updated_internal)

    # Wire the retry button; kept out of the public API surface.
    retry_btn.click(
        fn=retry_last,
        inputs=[chatbot, system_prompt, temperature, max_tokens, top_p],
        outputs=[msg_input, chatbot],
        api_visibility="private",
    )

    # load placeholder (avoid heavy work on import; model will lazy-load on first request)
    demo.load(fn=lambda: None)

# Launch
if __name__ == "__main__":
    # You can pin a Gradio version in your environment instead of changing the code.
    # The app below avoids `show_copy_button` to be compatible with multiple Gradio releases.
    # NOTE(review): `theme=` has historically been an argument of the
    # gr.Blocks()/gr.Interface() constructor, not of launch() — confirm the
    # installed Gradio accepts it here; otherwise move it to gr.Blocks(...).
    # NOTE(review): `footer_links=` likewise appears only in recent Gradio
    # releases — verify against the deployed version before shipping.
    demo.launch(
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="indigo",
            neutral_hue="slate",
            font=gr.themes.GoogleFont("Inter"),
            text_size="md",
            spacing_size="md",
            radius_size="md",
        ).set(
            button_primary_background_fill="*primary_600",
            button_primary_background_fill_hover="*primary_700",
            block_title_text_weight="600",
        ),
        footer_links=[
            {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
            {"label": "LiquidAI", "url": "https://huggingface.co/LiquidAI"},
            {"label": "Model Card", "url": "https://huggingface.co/LiquidAI/LFM2.5-1.2B-Thinking"},
        ],
        # Bind on all interfaces (required inside containers / HF Spaces).
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )