# Source: Phase-Technologies, commit "Create app.py" (5899137, verified)
import os
# Fixes the Gradio Analytics crash bug on Colab/Spaces
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
import torch
import gc
import re
import threading
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from peft import PeftModel
# ==========================================
# 1. SMART PRE-LOAD MODELS (NO QUANTIZATION)
# ==========================================
# Cache of loaded engines, keyed by engine name. The guard keeps an existing
# cache alive when this file is executed again inside an interpreter that
# already defined it, so models held in memory are not discarded.
# (The former `global` statement was dropped: at module scope it is a no-op.)
if "loaded_engines" not in globals():
    loaded_engines = {}
# Selectable reasoning engines: display name -> repository ids of the base
# checkpoint ("base") and of the PEFT adapter applied on top of it ("adapter").
MODELS_CONFIG = {
    "ReasonBorn-Instruct": dict(
        base="Qwen/Qwen2.5-3B-Instruct",
        adapter="Phase-Technologies/ReasonBorn-Qwen-3B",
    ),
    "ReasonBorn-LoRA": dict(
        base="Qwen/Qwen2.5-3B",
        adapter="Phase-Technologies/rb-qwen3b-16ds-lora",
    ),
}
# Load every configured engine that is not cached yet. Checking per engine
# (instead of "is the cache empty?") means a run that failed half-way is
# completed on the next execution rather than silently leaving engines missing.
_missing_engines = [name for name in MODELS_CONFIG if name not in loaded_engines]

if _missing_engines:
    print("Initializing Xerv Systems... Pre-loading models for instant streaming.")

    # Force single-device mapping to prevent PEFT offload KeyError
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Targeting inference device: {target_device.upper()}")

    for key in _missing_engines:
        cfg = MODELS_CONFIG[key]
        print(f"--- Loading {key} (Unquantized BF16) ---")

        # The tokenizer is read from the adapter repository, not the base one.
        tokenizer = AutoTokenizer.from_pretrained(cfg["adapter"])

        # Load Base Model on a single device to avoid meta-tensor offloading issues
        base_model = AutoModelForCausalLM.from_pretrained(
            cfg["base"],
            torch_dtype=torch.bfloat16,
            device_map={"": target_device},
            trust_remote_code=True,
        )

        # Wrap the base model with the adapter weights. The adapter is applied
        # at runtime; it is NOT merged into the base weights here.
        model = PeftModel.from_pretrained(base_model, cfg["adapter"])
        model.eval()

        loaded_engines[key] = {"model": model, "tokenizer": tokenizer}

    print("✅ Both Reasoning Engines successfully loaded and ready.")
else:
    print("⚡ Models already detected in memory! Skipping load phase for instant boot.")
# ==========================================
# 2. BULLETPROOF LATEX & TAG PARSER
# ==========================================
# Opening <step> tag, with an optional index="N" attribute captured in group 1.
_STEP_OPEN_TAG = re.compile(r"<step(?:\s+index=\"(\d+)\")?>")


def _step_label(match):
    """Return the Markdown heading that replaces one opening <step> tag."""
    index = match.group(1)
    return f"**🔸 STEP {index}:** " if index else "**🔸 STEP:** "


def _normalize_latex_delimiters(text):
    r"""Rewrite \( \) as $ and \[ \] as $$ so the chatbot's delimiters match.

    One OR two leading backslashes are accepted: the single-backslash form is
    what LaTeX (and the model) actually emits, the doubled form is what the
    previous implementation matched and is kept for compatibility.
    """
    text = re.sub(r"\\{1,2}[()]", "$", text)
    return re.sub(r"\\{1,2}[\[\]]", "$$", text)


def _render_reasoning_tags(text):
    """Replace <plan>/<reasoning>/<step>/<verify> tags with Markdown labels."""
    text = text.replace("<plan>", "**🔹 PLAN:**\n").replace("</plan>", "\n")
    text = text.replace("<reasoning>", "\n").replace("</reasoning>", "\n")
    text = _STEP_OPEN_TAG.sub(_step_label, text)
    text = text.replace("</step>", "\n")
    return text.replace("<verify>", "**✅ VERIFY:** ").replace("</verify>", "\n")


def format_output_with_latex_support(text):
    """Turn raw model output into the Markdown/HTML shown in the chat window.

    LaTeX delimiters are normalised to ``$`` / ``$$`` and the reasoning tags
    are replaced by Markdown labels. When a ``<conclusion>`` tag is present
    (closed or not), everything before it is wrapped in a collapsible
    ``<details>`` block and the conclusion is shown below it; text after
    ``</conclusion>`` is dropped. Without a conclusion the tag-rewritten text
    is returned as is.

    Args:
        text: Raw generated text; ``None`` is treated as an empty string.

    Returns:
        The formatted string.
    """
    text = _normalize_latex_delimiters(text or "")

    # `$` alternative lets an unterminated conclusion (generation cut off) match.
    conclusion_match = re.search(
        r"<conclusion>(.*?)(?:</conclusion>|$)", text, re.DOTALL
    )
    if conclusion_match is None:
        # Fallback if generation stops before conclusion
        return _render_reasoning_tags(text)

    conclusion_text = conclusion_match.group(1).strip()
    thinking_text = _render_reasoning_tags(text[:conclusion_match.start()].strip())

    # Blank lines around the body are required for Markdown inside <details>.
    return (
        f"<details>\n"
        f"<summary>🧠 View Thinking Process</summary>\n\n"
        f"{thinking_text}\n\n"
        f"</details>\n\n"
        f"**🎯 CONCLUSION:**\n\n{conclusion_text}"
    )
# ==========================================
# 3. REAL-TIME STREAMING GENERATOR
# ==========================================
# Opening <step> tag, with an optional index="N" attribute captured in group 1.
_LIVE_STEP_TAG = re.compile(r"<step(?:\s+index=\"(\d+)\")?>")
# UI-only chrome added around the thinking section of finished answers.
_UI_SUMMARY_BLOCK = re.compile(r"<summary>.*?</summary>", re.DOTALL)
_UI_DETAILS_TAG = re.compile(r"</?details>")


def _message_text(content):
    """Return a chat message's content as plain text.

    Accepts a plain string, or a list of content blocks (dicts carrying a
    "text" entry, or strings). NOTE(review): the list form is handled
    defensively for Gradio versions that hand content back as blocks — confirm
    against the installed version.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        return "".join(
            str(part.get("text", "")) if isinstance(part, dict) else str(part)
            for part in content
        )
    return "" if content is None else str(content)


def _strip_ui_markup(content):
    """Remove the <details>/<summary> chrome from a stored assistant message.

    Only these known UI elements are removed; a generic "<...>" pattern would
    also delete maths such as "a < b and c > d".
    """
    return _UI_DETAILS_TAG.sub("", _UI_SUMMARY_BLOCK.sub("", content))


def _render_live_preview(raw_text):
    """Format partially generated text for display while streaming."""
    # One or two leading backslashes: the single form is real LaTeX output,
    # the doubled form is what earlier versions of this code matched.
    text = re.sub(r"\\{1,2}[()]", "$", raw_text)
    text = re.sub(r"\\{1,2}[\[\]]", "$$", text)
    text = text.replace("<plan>", "**🔹 PLAN:**\n").replace("</plan>", "\n")
    text = text.replace("<reasoning>", "\n").replace("</reasoning>", "\n")
    text = _LIVE_STEP_TAG.sub(
        lambda m: f"**🔸 STEP {m.group(1)}:** " if m.group(1) else "**🔸 STEP:** ",
        text,
    )
    text = text.replace("</step>", "\n")
    text = text.replace("<verify>", "**✅ VERIFY:** ").replace("</verify>", "\n")
    return text.replace("<conclusion>", "\n\n**🎯 CONCLUSION:**\n\n").replace(
        "</conclusion>", ""
    )


def process_chat_stream(user_message, history, model_choice):
    """Stream a model reply into the chat.

    Handles Gradio's 'messages' format natively:
    [{"role": "user", "content": "..."}, ...]

    Args:
        user_message: Text typed by the user; blank/None input is ignored.
        history: Previous chat messages (may be None or empty). It is copied,
            not mutated.
        model_choice: Key into ``loaded_engines`` selecting the engine.

    Yields:
        4-tuples ``(textbox value, chatbot update, hero update, suggestions
        update)``: the textbox is cleared, the chatbot shows the running
        history, and the hero/suggestion sections are hidden.
    """
    if not user_message or not user_message.strip():
        yield "", gr.update(), gr.update(), gr.update()
        return

    # Work on a copy so the caller's list is never modified in place.
    history = list(history or [])
    history.append({"role": "user", "content": user_message})
    history.append({"role": "assistant", "content": ""})

    def _ui_state():
        return (
            "",
            gr.update(value=history, visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
        )

    # Yield immediately to update UI (hide hero/suggestions, show chatbot)
    yield _ui_state()

    try:
        engine = loaded_engines[model_choice]
        model = engine["model"]
        tokenizer = engine["tokenizer"]

        # Build strict ReasonBorn System Prompt
        prompt_parts = [
            "<|im_start|>system\nYou are ReasonBorn. Use <plan>, <reasoning> with <step> & <verify>, <conclusion> strictly.<|im_end|>\n"
        ]

        # Prior conversation, excluding the two entries appended above.
        for msg in history[:-2]:
            role = msg["role"]
            content = _message_text(msg["content"])
            if role == "user":
                prompt_parts.append(f"<|im_start|>user\n{content}<|im_end|>\n")
            elif role == "assistant":
                # Strip UI-only HTML so the model sees plain text history.
                clean_content = _strip_ui_markup(content)
                prompt_parts.append(
                    f"<|im_start|>assistant\n{clean_content}<|im_end|>\n"
                )

        # Current message, followed by the open assistant turn.
        prompt_parts.append(
            f"<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"
        )
        prompt = "".join(prompt_parts)

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        streamer = TextIteratorStreamer(
            tokenizer, skip_prompt=True, skip_special_tokens=True
        )

        generation_kwargs = dict(
            **inputs,
            max_new_tokens=1024,
            temperature=0.2,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.convert_tokens_to_ids("<|im_end|>"),
            streamer=streamer,
        )

        generation_errors = []

        def _generate():
            # If generate() fails, the streamer never receives its end signal
            # and the consuming loop below would block forever; close the
            # stream explicitly and hand the error to the main thread.
            try:
                model.generate(**generation_kwargs)
            except Exception as exc:
                generation_errors.append(exc)
                streamer.end()

        # Run generation in a separate thread so tokens can be consumed here.
        thread = threading.Thread(target=_generate)
        thread.start()

        accumulated_text = ""
        for new_text in streamer:
            accumulated_text += new_text
            # Update the latest bot message with a live preview + spinner.
            history[-1]["content"] = _render_live_preview(accumulated_text) + " ⏳"
            yield _ui_state()

        thread.join()
        if generation_errors:
            raise generation_errors[0]

        # Final formatting pass with HTML block wrapping
        history[-1]["content"] = format_output_with_latex_support(accumulated_text)
        yield _ui_state()

    except Exception as e:
        # UI boundary: surface the failure in the chat instead of crashing.
        history[-1]["content"] = f"**System Error:** {str(e)}"
        yield _ui_state()
    finally:
        # Release memory on success, on error and when the client disconnects.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
# ==========================================
# 4. UI/UX: ADAPTIVE DARK/LIGHT MODE CSS
# ==========================================
# Custom stylesheet for the UI, handed to Gradio via `demo.launch(css=CSS)`.
# Selectors without a `.dark` prefix define the light appearance; the
# `.dark`-prefixed rules override them for dark mode.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Google+Sans:wght@400;500;700&display=swap');
/* Global Typography & Layout */
.gradio-container { font-family: 'Google Sans', sans-serif !important; }
.main-wrap { max-width: 750px !important; margin: 0 auto !important; padding-bottom: 100px !important; }
/* Hero Section */
.xerv-title { font-size: 46px; font-weight: 700; letter-spacing: -1px; margin-top: 40px; margin-bottom: 8px;}
.greeting { font-size: 18px; margin-bottom: 4px; opacity: 0.7;}
.subtitle { font-size: 26px; font-weight: 500; margin-bottom: 30px;}
/* Chat Window Base */
#chat-window { height: 65vh !important; }
/* User Bubble - Always Blue */
.message.user { background: #2563eb !important; color: white !important; border-radius: 20px 20px 0 20px !important; padding: 14px 20px !important; font-size: 16px !important; }
.message.user * { color: white !important; }
/* Bot Bubble - Light Mode (Default) */
.message.bot { background: #ffffff !important; color: #0f172a !important; border: 1px solid #e2e8f0 !important; border-radius: 20px 20px 20px 0 !important; padding: 16px 20px !important; font-size: 16px !important; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05) !important; }
/* Bot Bubble - Dark Mode */
.dark .message.bot { background: #1e293b !important; color: #f8fafc !important; border-color: #334155 !important; }
/* Thinking Details Block - Light Mode */
#chat-window details { background-color: #f8fafc !important; border: 1px solid #e2e8f0 !important; border-radius: 12px !important; padding: 14px !important; margin-bottom: 16px !important; box-shadow: inset 0 2px 4px 0 rgb(0 0 0 / 0.02) !important; transition: all 0.2s ease !important; }
#chat-window summary { cursor: pointer !important; font-weight: 600 !important; font-size: 15px !important; user-select: none !important; outline: none !important; color: #334155 !important;}
/* Thinking Details Block - Dark Mode */
.dark #chat-window details { background-color: #0f172a !important; border-color: #1e293b !important; color: #cbd5e1 !important; }
.dark #chat-window summary { color: #94a3b8 !important; }
#chat-window details[open] summary { margin-bottom: 12px !important; padding-bottom: 12px !important; border-bottom: 1px solid rgba(128,128,128,0.2) !important; }
/* Input Row - Adaptive */
.input-row { align-items: center !important; border-radius: 30px !important; padding: 6px 14px !important; border: 1px solid #cbd5e1 !important; transition: all 0.2s; box-shadow: 0 4px 6px -1px rgba(0,0,0,0.05) !important; background: #f8fafc !important; }
.dark .input-row { background: #1e293b !important; border-color: #334155 !important; }
.input-row:focus-within { border-color: #3b82f6 !important; box-shadow: 0 4px 12px rgba(59, 130, 246, 0.15) !important; }
.input-row textarea { background: transparent !important; border: none !important; box-shadow: none !important; font-size: 16px !important; }
.input-row textarea:focus { outline: none !important; border: none !important; box-shadow: none !important; }
/* Buttons */
.send-button { background: #2563eb !important; color: white !important; border-radius: 50% !important; height: 42px !important; width: 42px !important; min-width: 42px !important; padding: 0 !important; border: none !important; display: flex; justify-content: center; align-items: center; }
.send-button:disabled { background: #94a3b8 !important; }
.dark .send-button:disabled { background: #334155 !important; color: #64748b !important; }
/* Suggestions - Adaptive */
.sugg-btn { background: #ffffff !important; border: 1px solid #e2e8f0 !important; border-radius: 16px !important; padding: 16px 20px !important; text-align: left !important; justify-content: flex-start !important; font-size: 16px !important; color: #1e293b !important; box-shadow: 0 1px 2px rgba(0,0,0,0.05) !important; margin-bottom: 12px !important; cursor: pointer !important; }
.dark .sugg-btn { background: #1e293b !important; border-color: #334155 !important; color: #f8fafc !important; }
.sugg-btn:hover { opacity: 0.8; }
/* LaTeX Fixes */
.katex-display { margin: 1em 0 !important; overflow-x: auto !important; overflow-y: hidden !important; padding: 8px 0 !important; }
.katex { font-size: 1.1em !important; }
footer, .label-wrap { display: none !important; }
"""
# Suggestion cards: (button label, prompt sent to the model when clicked).
_SUGGESTIONS = (
    (
        r"🔍 Prove that $\sqrt{2}$ is irrational",
        r"Prove that $\sqrt{2}$ is irrational using step-by-step logic",
    ),
    (
        r"🧮 Solve $x^3 - 6x^2 + 11x - 6 = 0$",
        r"Solve $x^3 - 6x^2 + 11x - 6 = 0$ and verify roots",
    ),
    (
        r"📊 Explain eigenvalues with a matrix example",
        r"Explain eigenvalues in linear algebra with an example matrix",
    ),
)


def _prompt_filler(prompt):
    """Return a zero-argument callback that yields *prompt* for the textbox.

    A factory is used (rather than a lambda inside the loop) so each button
    keeps its own prompt instead of sharing the loop variable.
    """

    def fill():
        return prompt

    return fill


with gr.Blocks() as demo:
    with gr.Column(elem_classes="main-wrap"):
        # Landing header; hidden by process_chat_stream once a chat starts.
        with gr.Column(elem_id="hero-section") as hero:
            gr.HTML("""
            <div class="xerv-title">Xerv</div>
            <div class="greeting">Hey there!</div>
            <div class="subtitle">Let's make something happen.</div>
            """)

        # Starter prompts; hidden together with the hero section.
        with gr.Column(elem_id="suggestions-section") as suggestions:
            _suggestion_buttons = [
                gr.Button(label, elem_classes="sugg-btn") for label, _ in _SUGGESTIONS
            ]
            # Keep the original module-level names available.
            btn1, btn2, btn3 = _suggestion_buttons

        chatbot = gr.Chatbot(
            visible=False,
            elem_id="chat-window",
            show_label=False,
            avatar_images=(None, None),
            # SECURITY NOTE: sanitisation is disabled so the <details> block
            # renders; model output is therefore inserted as raw HTML.
            sanitize_html=False,
            # Note: Removed type="messages" to resolve the TypeError in Gradio 6.0
            latex_delimiters=[
                {"left": "$$", "right": "$$", "display": True},
                {"left": "$", "right": "$", "display": False},
            ],
        )

        with gr.Column():
            with gr.Row(elem_classes="input-row"):
                chat_input = gr.Textbox(
                    show_label=False,
                    placeholder="Ask Xerv to solve complex math...",
                    lines=1,
                    max_lines=4,
                    scale=8,
                )
                send_btn = gr.Button("🚀", elem_classes="send-button", scale=1)

            model_selector = gr.Radio(
                choices=list(MODELS_CONFIG.keys()),
                value="ReasonBorn-Instruct",
                label="Reasoning Engine",
                container=False,
            )

    # --- Wire up Interactivity ---
    # Every trigger runs the same streaming handler with the same I/O lists,
    # so the wiring is described once and reused.
    _stream_event = dict(
        fn=process_chat_stream,
        inputs=[chat_input, chatbot, model_selector],
        outputs=[chat_input, chatbot, hero, suggestions],
    )

    chat_input.submit(**_stream_event)
    send_btn.click(**_stream_event)

    # A suggestion first fills the textbox with its prompt, then submits it.
    for _button, (_label, _prompt) in zip(_suggestion_buttons, _SUGGESTIONS):
        _button.click(fn=_prompt_filler(_prompt), outputs=[chat_input]).then(
            **_stream_event
        )
if __name__ == "__main__":
    # The adaptive stylesheet and the theme are supplied as launch parameters;
    # no manual light-mode JavaScript is injected.
    launch_options = {
        "share": True,
        "debug": True,
        "css": CSS,
        "theme": gr.themes.Default(),
    }
    demo.launch(**launch_options)