# app.py — corrected and integrated version

import inspect  # kept from the original import block (unused here, retained deliberately)
import threading
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# ====== Model settings ======
MODEL_ID = "LiquidAI/LFM2.5-1.2B-Thinking"

DEFAULT_SYSTEM_PROMPT = """You are LFM2.5, an advanced reasoning model developed by LiquidAI. You excel at breaking down complex problems, thinking step-by-step, and providing clear, well-reasoned answers. Always think through problems systematically before providing your final answer."""

# ====== Global state (populated lazily by load_model) ======
model = None
tokenizer = None
is_model_loaded = False


def load_model():
    """Load the model and tokenizer once (lazy and idempotent).

    Returns:
        bool: True if the model is (or already was) loaded, False on failure.
    """
    global model, tokenizer, is_model_loaded
    if is_model_loaded:
        return True
    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
        print("Loading model...")
        if torch.cuda.is_available():
            # fp16 + device_map="auto" spreads the model across available GPUs.
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float16,
                device_map="auto",
                trust_remote_code=True,
            )
        else:
            # CPU fallback: fp32 for numerical stability on CPU kernels.
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float32,
                device_map="cpu",
                trust_remote_code=True,
            )
        is_model_loaded = True
        print("Model loaded successfully!")
        return True
    except Exception as e:
        print(f"Error loading model: {e}")
        return False


# ====== Conversions between Gradio history formats and the internal format ======
def gradio_history_to_internal(gr_history):
    """Normalize a Gradio Chatbot history into the internal message format.

    Accepts either the legacy tuple format (list of (user, assistant) pairs)
    or the "messages" format (list of {"role", "content"} dicts, returned
    as-is). Empty/None entries are skipped.

    Returns:
        list[dict]: messages like {"role": "user"|"assistant", "content": str}.
    """
    if not gr_history:
        return []
    # Already in internal/messages dict format — pass through unchanged.
    if isinstance(gr_history, list) and len(gr_history) > 0 and isinstance(gr_history[0], dict):
        return gr_history
    internal = []
    for pair in gr_history:
        if not pair:
            continue
        # pair may be a tuple/list of length 2 or a single string
        if isinstance(pair, (list, tuple)) and len(pair) >= 2:
            user_txt, assistant_txt = pair[0], pair[1]
            if user_txt is not None and user_txt != "":
                internal.append({"role": "user", "content": str(user_txt)})
            if assistant_txt is not None and assistant_txt != "":
                internal.append({"role": "assistant", "content": str(assistant_txt)})
        else:
            # Fallback: treat a bare item as a user message.
            internal.append({"role": "user", "content": str(pair)})
    return internal


def internal_history_to_gradio(internal_history):
    """Convert internal message dicts to the legacy Gradio tuple format.

    Kept for backward compatibility with tuple-type Chatbot components.
    Sequential user/assistant messages are grouped into (user, assistant)
    pairs; unmatched messages are padded with "".

    NOTE: the Chatbot in this app uses type="messages", so the main chat
    path passes dict histories directly and does NOT go through this helper.
    """
    pairs = []
    user_buf = None
    assistant_buf = None
    for msg in internal_history:
        role = msg.get("role")
        content = msg.get("content", "")
        if role == "user":
            # A buffered user message without a reply flushes as (user, "").
            if user_buf is not None and assistant_buf is None:
                pairs.append((user_buf, ""))
            user_buf = content
            assistant_buf = None
        elif role == "assistant":
            assistant_buf = content
            if user_buf is None:
                # Assistant message without a preceding user turn.
                pairs.append(("", assistant_buf))
            else:
                pairs.append((user_buf, assistant_buf))
            user_buf = None
            assistant_buf = None
    # Flush any leftover user message.
    if user_buf is not None and assistant_buf is None:
        pairs.append((user_buf, ""))
    return pairs


# ====== Message formatting for the model ======
def format_chat_history(history, system_prompt):
    """Prepend the system prompt and sanitize history for the chat template.

    Args:
        history: list of {"role": ..., "content": ...} dicts.
        system_prompt: optional system message text ("" / None to omit).

    Returns:
        list[dict]: messages ready for apply_chat_template.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    for msg in history:
        if msg.get("role") and "content" in msg:
            messages.append({"role": msg["role"], "content": msg["content"]})
    return messages


def apply_chat_template(messages):
    """Render messages to a prompt string.

    Prefers tokenizer.apply_chat_template; falls back to simple role
    markers if the tokenizer does not provide a chat template.
    """
    try:
        # tokenize=False because tokenization happens later in generate_response.
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        return prompt
    except Exception:
        # Manual fallback with generic role markers.
        prompt = ""
        for msg in messages:
            if msg["role"] == "system":
                prompt += f"<|system|>\n{msg['content']}\n"
            elif msg["role"] == "user":
                prompt += f"<|user|>\n{msg['content']}\n"
            elif msg["role"] == "assistant":
                prompt += f"<|assistant|>\n{msg['content']}\n"
        prompt += "<|assistant|>\n"
        return prompt


# ====== Response generation (supports streaming) ======
def generate_response(message, history, system_prompt, temperature, max_tokens, top_p):
    """Generator that yields (partial_text, internal_history) while streaming.

    Appends the user message to a copy of `history`, streams tokens via
    TextIteratorStreamer when possible, and falls back to a synchronous
    model.generate call if streaming setup fails.
    """
    global model, tokenizer, is_model_loaded

    # Lazily load the model on first request.
    if not is_model_loaded:
        if not load_model():
            yield "❌ Error: Failed to load model. Please check the logs.", history
            return

    # Work on a copy so the caller's list is never mutated.
    history = list(history)
    history.append({"role": "user", "content": message})

    # Build the prompt for the model.
    messages_for_model = format_chat_history(history, system_prompt)
    prompt = apply_chat_template(messages_for_model)

    # Tokenize and move tensors to GPU if available.
    inputs = tokenizer(prompt, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    try:
        # Streaming path: generate in a worker thread, consume tokens here.
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=20.0)
        generation_kwargs = {
            **inputs,
            "streamer": streamer,
            "max_new_tokens": int(max_tokens),
            "temperature": float(temperature),
            "top_p": float(top_p),
            "do_sample": float(temperature) > 0.0,
            "pad_token_id": tokenizer.eos_token_id,
        }
        gen_thread = Thread(target=model.generate, kwargs=generation_kwargs)
        gen_thread.start()

        response = ""
        for new_text in streamer:
            response += new_text
            # The last entry is the user turn; append or update the assistant turn.
            if len(history) == 0 or history[-1].get("role") != "assistant":
                history.append({"role": "assistant", "content": response})
            else:
                history[-1]["content"] = response
            yield response, history
        gen_thread.join()
    except Exception as e:
        # Fallback: synchronous, non-streaming generation (less interactive).
        try:
            outputs = model.generate(
                **inputs,
                max_new_tokens=int(max_tokens),
                temperature=float(temperature),
                top_p=float(top_p),
                do_sample=float(temperature) > 0.0,
                pad_token_id=tokenizer.eos_token_id,
            )
            # Decode only the newly generated tokens (skip the prompt).
            decoded = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
            history.append({"role": "assistant", "content": decoded})
            yield decoded, history
        except Exception as e2:
            err = f"❌ Generation error: {e} | fallback error: {e2}"
            history.append({"role": "assistant", "content": err})
            yield err, history


# ====== Chat wrapper with error handling ======
def chat_with_model(message, gr_chat_history, system_prompt, temperature, max_tokens, top_p):
    """Gradio callback: streams ("", updated_history) pairs.

    The Chatbot component uses type="messages", whose history is a list of
    {"role", "content"} dicts — the same shape as the internal format, so
    the updated internal history is yielded directly. (Converting to tuple
    pairs here, as the original did, breaks rendering for a messages-type
    Chatbot.)
    """
    # Ignore empty messages: return unchanged history and keep the input empty.
    if not message or not str(message).strip():
        yield "", gr_chat_history
        return

    internal_history = gradio_history_to_internal(gr_chat_history)

    try:
        for _response_text, updated_internal in generate_response(
            message, internal_history, system_prompt, temperature, max_tokens, top_p
        ):
            # Yield a fresh list so Gradio detects the update; clear the input box.
            yield "", list(updated_internal)
    except Exception as e:
        error_msg = f"❌ Error: {str(e)}"
        internal_history.append({"role": "assistant", "content": error_msg})
        yield "", list(internal_history)


def clear_conversation():
    """Reset the Chatbot history and the input textbox."""
    return [], ""


def get_model_info():
    """Return the markdown shown in the 'Model Info' accordion."""
    return f"""
### 🧠 LFM2.5-1.2B-Thinking

**Model:** {MODEL_ID}

**Description:** An advanced reasoning model optimized for step-by-step thinking and complex problem-solving.

**Parameters:** ~1.2 Billion

**Capabilities:**
- Logical reasoning
- Mathematical problem solving
- Code generation and analysis
- Step-by-step thinking

**Tips:** Use the system prompt to guide the model's behavior and adjust temperature for creativity vs. precision.
"""


# ====== Gradio UI ======
# NOTE: `theme` is a gr.Blocks() constructor argument, not a launch() argument —
# the original passed it to demo.launch(), which raises TypeError.
with gr.Blocks(
    title="LFM2.5-1.2B-Thinking Trial",
    fill_height=True,
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        text_size="md",
        spacing_size="md",
        radius_size="md",
    ).set(
        button_primary_background_fill="*primary_600",
        button_primary_background_fill_hover="*primary_700",
        block_title_text_weight="600",
    ),
) as demo:
    gr.Markdown(
        """
        # 🧠 LFM2.5-1.2B-Thinking
        ### Advanced Reasoning Model by LiquidAI
        """
    )

    with gr.Row():
        with gr.Column(scale=3):
            # `bubble_full_width` and `show_copy_button` are avoided: they do not
            # exist across all supported Gradio releases.
            chatbot = gr.Chatbot(label="Conversation", height=500, type="messages")

            with gr.Row():
                msg_input = gr.Textbox(
                    label="Your Message",
                    placeholder="Ask me anything... Press Enter to send, Shift+Enter for new line",
                    lines=2,
                    show_label=False,
                    container=False,
                )
                send_btn = gr.Button("🚀 Send", variant="primary")

            with gr.Row():
                clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")
                retry_btn = gr.Button("🔄 Retry Last", variant="secondary")

        with gr.Column(scale=1):
            with gr.Accordion("⚙️ Settings", open=False):
                system_prompt = gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=4)
                temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
                max_tokens = gr.Slider(minimum=64, maximum=2048, value=512, step=64, label="Max Tokens")
                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top P")

            with gr.Accordion("ℹ️ Model Info", open=False):
                model_info = gr.Markdown(get_model_info())

    gr.Markdown("### 💡 Example Prompts")
    examples = gr.Examples(
        examples=[
            "Explain quantum entanglement in simple terms.",
            "Solve this math problem: If a train travels at 60 mph for 2.5 hours, how far does it go?",
            "Write a Python function to check if a number is prime.",
            "What are the steps to debug a React application?",
            "Explain the difference between supervised and unsupervised learning.",
        ],
        inputs=msg_input,
        label="Click to try:",
    )

    # Footer links (launch() has no `footer_links` parameter — render them here).
    gr.Markdown(
        "[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder) · "
        "[LiquidAI](https://huggingface.co/LiquidAI) · "
        "[Model Card](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Thinking)"
    )

    # Events — both Enter and the Send button trigger chat_with_model.
    # `api_visibility` is not an event-listener parameter in mainstream Gradio
    # releases and is omitted for compatibility.
    msg_input.submit(
        fn=chat_with_model,
        inputs=[msg_input, chatbot, system_prompt, temperature, max_tokens, top_p],
        outputs=[msg_input, chatbot],
    )
    send_btn.click(
        fn=chat_with_model,
        inputs=[msg_input, chatbot, system_prompt, temperature, max_tokens, top_p],
        outputs=[msg_input, chatbot],
    )
    clear_btn.click(fn=clear_conversation, inputs=None, outputs=[chatbot, msg_input])

    def retry_last(gr_chat_history, system_prompt, temperature, max_tokens, top_p):
        """Re-send the last user message (non-streaming for simplicity).

        Fixes from the original: truncate history AT the last user message
        (the original's `internal[:-1]` duplicated the user turn when the
        last entry was an assistant reply), pre-seed `updated_internal` so
        an empty generator cannot raise NameError, and return a messages-
        format history for the messages-type Chatbot.
        """
        internal = gradio_history_to_internal(gr_chat_history)
        last_user = None
        last_user_idx = None
        for i in range(len(internal) - 1, -1, -1):
            msg = internal[i]
            if msg.get("role") == "user" and msg.get("content", "").strip():
                last_user = msg["content"]
                last_user_idx = i
                break
        if last_user is None:
            return "", gr_chat_history
        updated_internal = internal
        # Drain the generator; generate_response re-appends the user message.
        for _response_text, updated_internal in generate_response(
            last_user, internal[:last_user_idx], system_prompt, temperature, max_tokens, top_p
        ):
            pass
        return "", list(updated_internal)

    retry_btn.click(
        fn=retry_last,
        inputs=[chatbot, system_prompt, temperature, max_tokens, top_p],
        outputs=[msg_input, chatbot],
    )

    # Load placeholder (avoid heavy work at import time; the model lazy-loads
    # on the first request).
    demo.load(fn=lambda: None)

# Launch
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )