Spaces:

Xerv-AI
/

Qwen-ReasonBorn-Adapter

Runtime error

File size: 16,480 Bytes
import os
# Fixes the Gradio Analytics crash bug on Colab/Spaces
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"

import torch
import gc
import re
import threading
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from peft import PeftModel

# ==========================================
# 1. SMART PRE-LOAD MODELS (NO QUANTIZATION)
# ==========================================
if "loaded_engines" not in globals():
    global loaded_engines
    loaded_engines = {}

MODELS_CONFIG = {
    "ReasonBorn-Instruct": {
        "base": "Qwen/Qwen2.5-3B-Instruct",
        "adapter": "Phase-Technologies/ReasonBorn-Qwen-3B",
    },
    "ReasonBorn-LoRA": {
        "base": "Qwen/Qwen2.5-3B",
        "adapter": "Phase-Technologies/rb-qwen3b-16ds-lora",
    }
}

if not loaded_engines:
    print("Initializing Xerv Systems... Pre-loading models for instant streaming.")
    
    # Force single-device mapping to prevent PEFT offload KeyError
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Targeting inference device: {target_device.upper()}")
    
    for key, cfg in MODELS_CONFIG.items():
        print(f"--- Loading {key} (Unquantized BF16) ---")
        tokenizer = AutoTokenizer.from_pretrained(cfg["adapter"])
        
        # Load Base Model on a single device to avoid meta-tensor offloading issues
        base_model = AutoModelForCausalLM.from_pretrained(
            cfg["base"],
            torch_dtype=torch.bfloat16,
            device_map={"": target_device}, 
            trust_remote_code=True
        )
        
        # Merge adapter for inference
        model = PeftModel.from_pretrained(base_model, cfg["adapter"])
        model.eval()
        
        loaded_engines[key] = {"model": model, "tokenizer": tokenizer}
        
    print("✅ Both Reasoning Engines successfully loaded and ready.")
else:
    print("⚡ Models already detected in memory! Skipping load phase for instant boot.")

# ==========================================
# 2. BULLETPROOF LATEX & TAG PARSER
# ==========================================
def format_output_with_latex_support(text):
    # Standardize LaTeX delimiters for Gradio
    text = text.replace(r'\\(', '$').replace(r'\\)', '$')
    text = text.replace(r'\\[', '$$').replace(r'\\]', '$$')
    
    # Extract Conclusion
    conclusion_match = re.search(r"<conclusion>(.*?)(?:</conclusion>|$)", text, re.DOTALL)
    
    if conclusion_match:
        conclusion_text = conclusion_match.group(1).strip()
        thinking_text = text[:conclusion_match.start()].strip()
        
        # Format Thinking Process
        thinking_text = thinking_text.replace("<plan>", "**🔹 PLAN:**\n").replace("</plan>", "\n")
        thinking_text = thinking_text.replace("<reasoning>", "\n").replace("</reasoning>", "\n")
        
        # Handle dynamic <step> tags
        thinking_text = re.sub(r"<step(?:\s+index=\"(\d+)\")?>", 
                               lambda m: f"**🔸 STEP {m.group(1)}:** " if m.group(1) else "**🔸 STEP:** ", 
                               thinking_text)
        thinking_text = thinking_text.replace("</step>", "\n")
        thinking_text = thinking_text.replace("<verify>", "**✅ VERIFY:** ").replace("</verify>", "\n")
        
        # Wrap thinking in a collapsible HTML details block
        formatted = (
            f"<details>\n"
            f"<summary>🧠 View Thinking Process</summary>\n\n"
            f"{thinking_text}\n\n"
            f"</details>\n\n"
            f"**🎯 CONCLUSION:**\n\n{conclusion_text}"
        )
        return formatted
    else:
        # Fallback if generation stops before conclusion
        text = text.replace("<plan>", "**🔹 PLAN:**\n").replace("</plan>", "\n")
        text = text.replace("<reasoning>", "\n").replace("</reasoning>", "\n")
        text = re.sub(r"<step(?:\s+index=\"(\d+)\")?>", 
                      lambda m: f"**🔸 STEP {m.group(1)}:** " if m.group(1) else "**🔸 STEP:** ", 
                      text)
        text = text.replace("</step>", "\n")
        text = text.replace("<verify>", "**✅ VERIFY:** ").replace("</verify>", "\n")
        return text

# ==========================================
# 3. REAL-TIME STREAMING GENERATOR
# ==========================================
def process_chat_stream(user_message, history, model_choice):
    """
    Handles Gradio's 'messages' format natively: [{"role": "user", "content": "..."}, ...]
    """
    if not user_message.strip():
        yield "", gr.update(), gr.update(), gr.update()
        return
        
    # Initialize history if empty and append new user/assistant dicts
    history = history or []
    history.append({"role": "user", "content": user_message})
    history.append({"role": "assistant", "content": ""})
    
    # Yield immediately to update UI (hide hero/suggestions, show chatbot)
    yield "", gr.update(value=history, visible=True), gr.update(visible=False), gr.update(visible=False)

    try:
        engine = loaded_engines[model_choice]
        model = engine["model"]
        tokenizer = engine["tokenizer"]
        
        # Build strict ReasonBorn System Prompt
        prompt = "<|im_start|>system\nYou are ReasonBorn. Use <plan>, <reasoning> with <step> & <verify>, <conclusion> strictly.<|im_end|>\n"
        
        # Append prior conversation history (excluding the two entries we just appended)
        for msg in history[:-2]:
            role = msg["role"]
            content = msg["content"]
            
            if role == "user":
                prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
            elif role == "assistant":
                # Strip out HTML UI elements so the model only sees plain text history
                clean_content = re.sub(r"<.*?>", "", content) 
                prompt += f"<|im_start|>assistant\n{clean_content}<|im_end|>\n"
                
        # Append current message
        prompt += f"<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"
        
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        
        generation_kwargs = dict(
            **inputs,
            max_new_tokens=1024,
            temperature=0.2,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.convert_tokens_to_ids("<|im_end|>"),
            streamer=streamer
        )
        
        # Start generation in a separate thread
        thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()
        
        accumulated_text = ""
        
        # Stream chunks back to UI
        for new_text in streamer:
            accumulated_text += new_text
            
            # Real-time formatting for visual feedback
            live_text = accumulated_text.replace(r'\\(', '$').replace(r'\\)', '$').replace(r'\\[', '$$').replace(r'\\]', '$$')
            live_text = live_text.replace("<plan>", "**🔹 PLAN:**\n").replace("</plan>", "\n")
            live_text = live_text.replace("<reasoning>", "\n").replace("</reasoning>", "\n")
            live_text = re.sub(r"<step(?:\s+index=\"(\d+)\")?>", 
                               lambda m: f"**🔸 STEP {m.group(1)}:** " if m.group(1) else "**🔸 STEP:** ", 
                               live_text)
            live_text = live_text.replace("</step>", "\n")
            live_text = live_text.replace("<verify>", "**✅ VERIFY:** ").replace("</verify>", "\n")
            live_text = live_text.replace("<conclusion>", "\n\n**🎯 CONCLUSION:**\n\n").replace("</conclusion>", "")
            
            # Update the latest bot message in history dictionaries
            history[-1]["content"] = live_text + " ⏳"
            yield "", gr.update(value=history, visible=True), gr.update(visible=False), gr.update(visible=False)
            
        # Final formatting pass with HTML block wrapping
        final_formatted = format_output_with_latex_support(accumulated_text)
        history[-1]["content"] = final_formatted
        
        yield "", gr.update(value=history, visible=True), gr.update(visible=False), gr.update(visible=False)
        
        # Cleanup memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
        
    except Exception as e:
        history[-1]["content"] = f"**System Error:** {str(e)}"
        yield "", gr.update(value=history, visible=True), gr.update(visible=False), gr.update(visible=False)


# ==========================================
# 4. UI/UX: ADAPTIVE DARK/LIGHT MODE CSS
# ==========================================
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Google+Sans:wght@400;500;700&display=swap');

/* Global Typography & Layout */
.gradio-container { font-family: 'Google Sans', sans-serif !important; }
.main-wrap { max-width: 750px !important; margin: 0 auto !important; padding-bottom: 100px !important; }

/* Hero Section */
.xerv-title { font-size: 46px; font-weight: 700; letter-spacing: -1px; margin-top: 40px; margin-bottom: 8px;}
.greeting { font-size: 18px; margin-bottom: 4px; opacity: 0.7;}
.subtitle { font-size: 26px; font-weight: 500; margin-bottom: 30px;}

/* Chat Window Base */
#chat-window { height: 65vh !important; }

/* User Bubble - Always Blue */
.message.user { background: #2563eb !important; color: white !important; border-radius: 20px 20px 0 20px !important; padding: 14px 20px !important; font-size: 16px !important; }
.message.user * { color: white !important; }

/* Bot Bubble - Light Mode (Default) */
.message.bot { background: #ffffff !important; color: #0f172a !important; border: 1px solid #e2e8f0 !important; border-radius: 20px 20px 20px 0 !important; padding: 16px 20px !important; font-size: 16px !important; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05) !important; }

/* Bot Bubble - Dark Mode */
.dark .message.bot { background: #1e293b !important; color: #f8fafc !important; border-color: #334155 !important; }

/* Thinking Details Block - Light Mode */
#chat-window details { background-color: #f8fafc !important; border: 1px solid #e2e8f0 !important; border-radius: 12px !important; padding: 14px !important; margin-bottom: 16px !important; box-shadow: inset 0 2px 4px 0 rgb(0 0 0 / 0.02) !important; transition: all 0.2s ease !important; }
#chat-window summary { cursor: pointer !important; font-weight: 600 !important; font-size: 15px !important; user-select: none !important; outline: none !important; color: #334155 !important;}

/* Thinking Details Block - Dark Mode */
.dark #chat-window details { background-color: #0f172a !important; border-color: #1e293b !important; color: #cbd5e1 !important; }
.dark #chat-window summary { color: #94a3b8 !important; }

#chat-window details[open] summary { margin-bottom: 12px !important; padding-bottom: 12px !important; border-bottom: 1px solid rgba(128,128,128,0.2) !important; }

/* Input Row - Adaptive */
.input-row { align-items: center !important; border-radius: 30px !important; padding: 6px 14px !important; border: 1px solid #cbd5e1 !important; transition: all 0.2s; box-shadow: 0 4px 6px -1px rgba(0,0,0,0.05) !important; background: #f8fafc !important; }
.dark .input-row { background: #1e293b !important; border-color: #334155 !important; }
.input-row:focus-within { border-color: #3b82f6 !important; box-shadow: 0 4px 12px rgba(59, 130, 246, 0.15) !important; }
.input-row textarea { background: transparent !important; border: none !important; box-shadow: none !important; font-size: 16px !important; }
.input-row textarea:focus { outline: none !important; border: none !important; box-shadow: none !important; }

/* Buttons */
.send-button { background: #2563eb !important; color: white !important; border-radius: 50% !important; height: 42px !important; width: 42px !important; min-width: 42px !important; padding: 0 !important; border: none !important; display: flex; justify-content: center; align-items: center; }
.send-button:disabled { background: #94a3b8 !important; }
.dark .send-button:disabled { background: #334155 !important; color: #64748b !important; }

/* Suggestions - Adaptive */
.sugg-btn { background: #ffffff !important; border: 1px solid #e2e8f0 !important; border-radius: 16px !important; padding: 16px 20px !important; text-align: left !important; justify-content: flex-start !important; font-size: 16px !important; color: #1e293b !important; box-shadow: 0 1px 2px rgba(0,0,0,0.05) !important; margin-bottom: 12px !important; cursor: pointer !important; }
.dark .sugg-btn { background: #1e293b !important; border-color: #334155 !important; color: #f8fafc !important; }
.sugg-btn:hover { opacity: 0.8; }

/* LaTeX Fixes */
.katex-display { margin: 1em 0 !important; overflow-x: auto !important; overflow-y: hidden !important; padding: 8px 0 !important; }
.katex { font-size: 1.1em !important; }
footer, .label-wrap { display: none !important; }
"""

with gr.Blocks() as demo:
    with gr.Column(elem_classes="main-wrap"):
        with gr.Column(elem_id="hero-section") as hero:
            gr.HTML("""
            <div class="xerv-title">Xerv</div>
            <div class="greeting">Hey there!</div>
            <div class="subtitle">Let's make something happen.</div>
            """)
            
        with gr.Column(elem_id="suggestions-section") as suggestions:
            btn1 = gr.Button(r"🔍 Prove that $\sqrt{2}$ is irrational", elem_classes="sugg-btn")
            btn2 = gr.Button(r"🧮 Solve $x^3 - 6x^2 + 11x - 6 = 0$", elem_classes="sugg-btn")
            btn3 = gr.Button(r"📊 Explain eigenvalues with a matrix example", elem_classes="sugg-btn")
            
        chatbot = gr.Chatbot(
            visible=False,
            elem_id="chat-window",
            show_label=False,
            avatar_images=(None, None),
            sanitize_html=False, 
            # Note: Removed type="messages" to resolve the TypeError in Gradio 6.0
            latex_delimiters=[
                {"left": "$$", "right": "$$", "display": True},
                {"left": "$", "right": "$", "display": False}
            ]
        )
        
        with gr.Column():
            with gr.Row(elem_classes="input-row"):
                chat_input = gr.Textbox(
                    show_label=False,
                    placeholder="Ask Xerv to solve complex math...",
                    lines=1,
                    max_lines=4,
                    scale=8
                )
                send_btn = gr.Button("🚀", elem_classes="send-button", scale=1)
                
            model_selector = gr.Radio(
                choices=list(MODELS_CONFIG.keys()),
                value="ReasonBorn-Instruct",
                label="Reasoning Engine",
                container=False
            )

    # --- Wire up Interactivity ---
    chat_input.submit(
        process_chat_stream,
        inputs=[chat_input, chatbot, model_selector],
        outputs=[chat_input, chatbot, hero, suggestions]
    )
    
    send_btn.click(
        process_chat_stream,
        inputs=[chat_input, chatbot, model_selector],
        outputs=[chat_input, chatbot, hero, suggestions]
    )

    btn1.click(
        fn=lambda: r"Prove that $\sqrt{2}$ is irrational using step-by-step logic", 
        outputs=[chat_input]
    ).then(
        fn=process_chat_stream,
        inputs=[chat_input, chatbot, model_selector],
        outputs=[chat_input, chatbot, hero, suggestions]
    )
    
    btn2.click(
        fn=lambda: r"Solve $x^3 - 6x^2 + 11x - 6 = 0$ and verify roots", 
        outputs=[chat_input]
    ).then(
        fn=process_chat_stream,
        inputs=[chat_input, chatbot, model_selector],
        outputs=[chat_input, chatbot, hero, suggestions]
    )
    
    btn3.click(
        fn=lambda: r"Explain eigenvalues in linear algebra with an example matrix", 
        outputs=[chat_input]
    ).then(
        fn=process_chat_stream,
        inputs=[chat_input, chatbot, model_selector],
        outputs=[chat_input, chatbot, hero, suggestions]
    )

if __name__ == "__main__":
    # Removed the manual light mode javascript. Added adaptive CSS directly to launch parameters.
    demo.launch(
        share=True,
        debug=True,
        css=CSS,
        theme=gr.themes.Default()
    )