import os

# Disable Gradio analytics BEFORE importing gradio — fixes the Gradio
# analytics crash bug on Colab/Spaces.
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"

import gc
import re
import threading

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from peft import PeftModel

# ==========================================
# 1. SMART PRE-LOAD MODELS (NO QUANTIZATION)
# ==========================================

# Survive notebook cell re-runs: keep already-loaded models across executions
# instead of reloading multi-GB weights every time.
if "loaded_engines" not in globals():
    loaded_engines = {}

MODELS_CONFIG = {
    "ReasonBorn-Instruct": {
        "base": "Qwen/Qwen2.5-3B-Instruct",
        "adapter": "Phase-Technologies/ReasonBorn-Qwen-3B",
    },
    "ReasonBorn-LoRA": {
        "base": "Qwen/Qwen2.5-3B",
        "adapter": "Phase-Technologies/rb-qwen3b-16ds-lora",
    },
}

if not loaded_engines:
    print("Initializing Xerv Systems... Pre-loading models for instant streaming.")
    # Force single-device mapping to prevent PEFT offload KeyError
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Targeting inference device: {target_device.upper()}")
    for key, cfg in MODELS_CONFIG.items():
        print(f"--- Loading {key} (Unquantized BF16) ---")
        # Tokenizer comes from the adapter repo so any added/special tokens match.
        tokenizer = AutoTokenizer.from_pretrained(cfg["adapter"])
        # Load the base model on a single device to avoid meta-tensor
        # offloading issues with accelerate's auto device maps.
        base_model = AutoModelForCausalLM.from_pretrained(
            cfg["base"],
            torch_dtype=torch.bfloat16,
            device_map={"": target_device},
            trust_remote_code=True,
        )
        # Attach the LoRA adapter for inference.
        model = PeftModel.from_pretrained(base_model, cfg["adapter"])
        model.eval()
        loaded_engines[key] = {"model": model, "tokenizer": tokenizer}
    print("✅ Both Reasoning Engines successfully loaded and ready.")
else:
    print("⚡ Models already detected in memory! Skipping load phase for instant boot.")
# ==========================================
# 2. BULLETPROOF LATEX & TAG PARSER
# ==========================================
# NOTE(review): the reasoning-tag literals below (<plan>, <think>, <step>,
# <verify>, <conclusion>) were reconstructed from the surrounding formatting
# strings — confirm they match the tags the ReasonBorn adapter actually emits.


def _normalize_latex(text):
    """Standardize LaTeX delimiters to $/$$ so Gradio's KaTeX renders them."""
    # Replace double-escaped forms first so the single-escaped pass below
    # does not leave stray backslashes behind.
    for src, dst in (
        (r'\\(', '$'), (r'\\)', '$'), (r'\\[', '$$'), (r'\\]', '$$'),
        (r'\(', '$'), (r'\)', '$'), (r'\[', '$$'), (r'\]', '$$'),
    ):
        text = text.replace(src, dst)
    return text


def _render_reasoning_tags(text):
    """Rewrite ReasonBorn control tags as markdown section headings."""
    text = text.replace("<plan>", "**🔹 PLAN:**\n").replace("</plan>", "\n")
    text = text.replace("<think>", "\n").replace("</think>", "\n")
    # <step> may carry an optional step number (e.g. <step3>).
    text = re.sub(
        r"<step(\d*)>",
        lambda m: f"**🔸 STEP {m.group(1)}:** " if m.group(1) else "**🔸 STEP:** ",
        text,
    )
    text = text.replace("</step>", "\n")
    text = text.replace("<verify>", "**✅ VERIFY:** ").replace("</verify>", "\n")
    return text


def format_output_with_latex_support(text):
    """
    Final-pass formatter for a completed generation.

    Normalizes LaTeX delimiters, renders the reasoning tags as markdown, and
    wraps the thinking trace in a collapsible HTML <details> block with the
    conclusion shown below it. Falls back to plain tag rendering when the
    model never emitted a <conclusion>.
    """
    text = _normalize_latex(text)

    # The closing tag may be missing if generation hit the token limit,
    # hence the `(?:</conclusion>|$)` alternative.
    conclusion_match = re.search(r"<conclusion>(.*?)(?:</conclusion>|$)", text, re.DOTALL)
    if conclusion_match:
        conclusion_text = conclusion_match.group(1).strip()
        thinking_text = _render_reasoning_tags(text[:conclusion_match.start()].strip())
        return (
            "<details>\n"
            "<summary>🧠 View Thinking Process</summary>\n\n"
            f"{thinking_text}\n\n"
            "</details>\n\n"
            f"**🎯 CONCLUSION:**\n\n{conclusion_text}"
        )
    # Fallback if generation stopped before a conclusion tag appeared.
    return _render_reasoning_tags(text)


# ==========================================
# 3. REAL-TIME STREAMING GENERATOR
# ==========================================
def process_chat_stream(user_message, history, model_choice):
    """
    Streaming chat handler.

    Uses Gradio's 'messages' history format natively:
    [{"role": "user", "content": "..."}, ...]

    Yields (textbox_value, chatbot_update, hero_update, suggestions_update)
    tuples so the UI streams tokens, clears the input box, and hides the
    hero/suggestion panels.
    """
    if not user_message.strip():
        yield "", gr.update(), gr.update(), gr.update()
        return

    # Initialize history if empty and append the new user/assistant turns.
    history = history or []
    history.append({"role": "user", "content": user_message})
    history.append({"role": "assistant", "content": ""})

    # Yield immediately to update the UI (hide hero/suggestions, show chatbot).
    yield "", gr.update(value=history, visible=True), gr.update(visible=False), gr.update(visible=False)

    try:
        engine = loaded_engines[model_choice]
        model = engine["model"]
        tokenizer = engine["tokenizer"]

        # Build the strict ReasonBorn system prompt (ChatML framing).
        # NOTE(review): tag names reconstructed — verify against adapter format.
        prompt = (
            "<|im_start|>system\nYou are ReasonBorn. "
            "Use <plan>, <step> with <verify> & <conclusion>, strictly.<|im_end|>\n"
        )

        # Replay prior turns (excluding the two entries appended above).
        for msg in history[:-2]:
            if msg["role"] == "user":
                prompt += f"<|im_start|>user\n{msg['content']}<|im_end|>\n"
            elif msg["role"] == "assistant":
                # Strip HTML UI elements so the model only sees plain text.
                clean_content = re.sub(r"<.*?>", "", msg["content"])
                prompt += f"<|im_start|>assistant\n{clean_content}<|im_end|>\n"

        # Append the current message and open the assistant turn.
        prompt += f"<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = dict(
            **inputs,
            max_new_tokens=1024,
            temperature=0.2,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.convert_tokens_to_ids("<|im_end|>"),
            streamer=streamer,
        )

        # Run generation in a background thread; consume tokens from `streamer`.
        thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        accumulated_text = ""
        for new_text in streamer:
            accumulated_text += new_text
            # Lightweight live formatting for visual feedback while streaming.
            live_text = _render_reasoning_tags(_normalize_latex(accumulated_text))
            live_text = live_text.replace("<conclusion>", "\n\n**🎯 CONCLUSION:**\n\n")
            live_text = live_text.replace("</conclusion>", "")
            # Update the latest assistant message in place.
            history[-1]["content"] = live_text + " ⏳"
            yield "", gr.update(value=history, visible=True), gr.update(visible=False), gr.update(visible=False)

        # Final formatting pass with the collapsible <details> wrapping.
        history[-1]["content"] = format_output_with_latex_support(accumulated_text)
        yield "", gr.update(value=history, visible=True), gr.update(visible=False), gr.update(visible=False)

        # Release cached GPU memory between requests.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    except Exception as e:
        # Surface the failure in the chat window rather than crashing the stream.
        history[-1]["content"] = f"**System Error:** {str(e)}"
        yield "", gr.update(value=history, visible=True), gr.update(visible=False), gr.update(visible=False)


# ==========================================
# 4. UI/UX: ADAPTIVE DARK/LIGHT MODE CSS
# ==========================================
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Google+Sans:wght@400;500;700&display=swap');
/* Global Typography & Layout */
.gradio-container { font-family: 'Google Sans', sans-serif !important; }
.main-wrap { max-width: 750px !important; margin: 0 auto !important; padding-bottom: 100px !important; }
/* Hero Section */
.xerv-title { font-size: 46px; font-weight: 700; letter-spacing: -1px; margin-top: 40px; margin-bottom: 8px;}
.greeting { font-size: 18px; margin-bottom: 4px; opacity: 0.7;}
.subtitle { font-size: 26px; font-weight: 500; margin-bottom: 30px;}
/* Chat Window Base */
#chat-window { height: 65vh !important; }
/* User Bubble - Always Blue */
.message.user { background: #2563eb !important; color: white !important; border-radius: 20px 20px 0 20px !important; padding: 14px 20px !important; font-size: 16px !important; }
.message.user * { color: white !important; }
/* Bot Bubble - Light Mode (Default) */
.message.bot { background: #ffffff !important; color: #0f172a !important; border: 1px solid #e2e8f0 !important; border-radius: 20px 20px 20px 0 !important; padding: 16px 20px !important; font-size: 16px !important; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05) !important; }
/* Bot Bubble - Dark Mode */
.dark .message.bot { background: #1e293b !important; color: #f8fafc !important; border-color: #334155 !important; }
/* Thinking Details Block - Light Mode */
#chat-window details { background-color: #f8fafc !important; border: 1px solid #e2e8f0 !important; border-radius: 12px !important; padding: 14px !important; margin-bottom: 16px !important; box-shadow: inset 0 2px 4px 0 rgb(0 0 0 / 0.02) !important; transition: all 0.2s ease !important; }
#chat-window summary { cursor: pointer !important; font-weight: 600 !important; font-size: 15px !important; user-select: none !important; outline: none !important; color: #334155 !important;}
/* Thinking Details Block - Dark Mode */
.dark #chat-window details { background-color: #0f172a !important; border-color: #1e293b !important; color: #cbd5e1 !important; }
.dark #chat-window summary { color: #94a3b8 !important; }
#chat-window details[open] summary { margin-bottom: 12px !important; padding-bottom: 12px !important; border-bottom: 1px solid rgba(128,128,128,0.2) !important; }
/* Input Row - Adaptive */
.input-row { align-items: center !important; border-radius: 30px !important; padding: 6px 14px !important; border: 1px solid #cbd5e1 !important; transition: all 0.2s; box-shadow: 0 4px 6px -1px rgba(0,0,0,0.05) !important; background: #f8fafc !important; }
.dark .input-row { background: #1e293b !important; border-color: #334155 !important; }
.input-row:focus-within { border-color: #3b82f6 !important; box-shadow: 0 4px 12px rgba(59, 130, 246, 0.15) !important; }
.input-row textarea { background: transparent !important; border: none !important; box-shadow: none !important; font-size: 16px !important; }
.input-row textarea:focus { outline: none !important; border: none !important; box-shadow: none !important; }
/* Buttons */
.send-button { background: #2563eb !important; color: white !important; border-radius: 50% !important; height: 42px !important; width: 42px !important; min-width: 42px !important; padding: 0 !important; border: none !important; display: flex; justify-content: center; align-items: center; }
.send-button:disabled { background: #94a3b8 !important; }
.dark .send-button:disabled { background: #334155 !important; color: #64748b !important; }
/* Suggestions - Adaptive */
.sugg-btn { background: #ffffff !important; border: 1px solid #e2e8f0 !important; border-radius: 16px !important; padding: 16px 20px !important; text-align: left !important; justify-content: flex-start !important; font-size: 16px !important; color: #1e293b !important; box-shadow: 0 1px 2px rgba(0,0,0,0.05) !important; margin-bottom: 12px !important; cursor: pointer !important; }
.dark .sugg-btn { background: #1e293b !important; border-color: #334155 !important; color: #f8fafc !important; }
.sugg-btn:hover { opacity: 0.8; }
/* LaTeX Fixes */
.katex-display { margin: 1em 0 !important; overflow-x: auto !important; overflow-y: hidden !important; padding: 8px 0 !important; }
.katex { font-size: 1.1em !important; }
footer, .label-wrap { display: none !important; }
"""
# ==========================================
# 5. UI LAYOUT, EVENT WIRING & LAUNCH
# ==========================================
# NOTE(review): css/theme are passed to gr.Blocks() here rather than to
# demo.launch() — Blocks.launch() does not accept `css`/`theme` kwargs; they
# are constructor arguments of gr.Blocks.
with gr.Blocks(css=CSS, theme=gr.themes.Default()) as demo:
    with gr.Column(elem_classes="main-wrap"):
        with gr.Column(elem_id="hero-section") as hero:
            # NOTE(review): hero markup reconstructed to match the CSS classes
            # (.xerv-title / .greeting / .subtitle) — confirm against original.
            gr.HTML(
                """
                <div class="xerv-title">Xerv</div>
                <div class="greeting">Hey there!</div>
                <div class="subtitle">Let's make something happen.</div>
                """
            )

        with gr.Column(elem_id="suggestions-section") as suggestions:
            btn1 = gr.Button(r"🔍 Prove that $\sqrt{2}$ is irrational", elem_classes="sugg-btn")
            btn2 = gr.Button(r"🧮 Solve $x^3 - 6x^2 + 11x - 6 = 0$", elem_classes="sugg-btn")
            btn3 = gr.Button(r"📊 Explain eigenvalues with a matrix example", elem_classes="sugg-btn")

        chatbot = gr.Chatbot(
            visible=False,
            elem_id="chat-window",
            show_label=False,
            avatar_images=(None, None),
            # sanitize_html=False is required so the <details> thinking block
            # renders; NOTE(review): this also renders any HTML the model emits.
            sanitize_html=False,
            # Note: Removed type="messages" to resolve the TypeError in Gradio 6.0
            latex_delimiters=[
                {"left": "$$", "right": "$$", "display": True},
                {"left": "$", "right": "$", "display": False},
            ],
        )

        with gr.Column():
            with gr.Row(elem_classes="input-row"):
                chat_input = gr.Textbox(
                    show_label=False,
                    placeholder="Ask Xerv to solve complex math...",
                    lines=1,
                    max_lines=4,
                    scale=8,
                )
                send_btn = gr.Button("🚀", elem_classes="send-button", scale=1)
            model_selector = gr.Radio(
                choices=list(MODELS_CONFIG.keys()),
                value="ReasonBorn-Instruct",
                label="Reasoning Engine",
                container=False,
            )

    # --- Wire up interactivity ---
    # All handlers share the same input/output signature.
    _stream_io = dict(
        inputs=[chat_input, chatbot, model_selector],
        outputs=[chat_input, chatbot, hero, suggestions],
    )
    chat_input.submit(process_chat_stream, **_stream_io)
    send_btn.click(process_chat_stream, **_stream_io)

    # Suggestion buttons: fill the textbox, then stream a response.
    # (Default-arg binding avoids the late-binding closure pitfall.)
    for _btn, _prompt in (
        (btn1, r"Prove that $\sqrt{2}$ is irrational using step-by-step logic"),
        (btn2, r"Solve $x^3 - 6x^2 + 11x - 6 = 0$ and verify roots"),
        (btn3, r"Explain eigenvalues in linear algebra with an example matrix"),
    ):
        _btn.click(fn=lambda p=_prompt: p, outputs=[chat_input]).then(
            fn=process_chat_stream, **_stream_io
        )

if __name__ == "__main__":
    # css/theme intentionally NOT passed here — see gr.Blocks() above.
    demo.launch(share=True, debug=True)