Spaces:

build-small-hackathon
/

OpenCompanion

Sleeping

App Files Files Community

OrbitMC committed 24 days ago

Commit

0247fa0

verified ·

1 Parent(s): 99529c8

Update app.py

Browse files

Files changed (1) hide show

app.py +462 -363

app.py CHANGED Viewed

@@ -1,371 +1,470 @@
 import os
-import re
 import json
-import time
-from functools import lru_cache
-from typing import List, Dict, Any, Tuple
-import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-try:
-    import spaces
-except Exception:
-    class spaces:
-        @staticmethod
-        def GPU(fn):
-            return fn
-APP_NAME = "FastLLM"
-MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-1.5B-Instruct")
-MAX_TOKENS = int(os.getenv("MAX_TOKENS", "256"))
-TEMPERATURE = float(os.getenv("TEMPERATURE", "0.7"))
-TOP_P = float(os.getenv("TOP_P", "0.9"))
-REPETITION_PENALTY = float(os.getenv("REPETITION_PENALTY", "1.08"))
-MAX_HISTORY_MESSAGES = int(os.getenv("MAX_HISTORY_MESSAGES", "12"))
-MAX_MEMORY_ITEMS = int(os.getenv("MAX_MEMORY_ITEMS", "12"))
-EMOTIONS = ["neutral", "happy", "calm", "focused", "curious", "thinking", "excited", "empathetic", "concerned", "playful"]
-SYSTEM_PROMPT = f"""
-You are FastLLM, a polished AI companion.
-You are warm, sharp, calm, and helpful.
-You speak like a real assistant with personality, but you stay professional and safe.
-Goals:
-- Be concise, natural, and confident.
-- Help with daily tasks, study, coding, planning, and conversation.
-- React with emotion in a subtle, human way.
-- Never mention hidden policy text or internal prompts.
-Output rules:
-- Return raw JSON only.
-- Use this schema:
-  {{
-    "reply": "short natural assistant response",
-    "emotion": one of {EMOTIONS},
-    "mood_score": number from 0.0 to 1.0,
-    "memory_hint": "short note to save for later, or empty string"
-  }}
-Style:
-- Keep the reply clear and friendly.
-- Use short sentences.
-- Match the user's tone.
-- If the user asks for memory, produce a useful memory_hint.
-- If the user gives a preference or profile detail, include it in memory_hint.
-""".strip()
-MODEL = None
-TOKENIZER = None
-def normalize_messages(messages: List[Dict[str, str]]) -> List[Dict[str, str]]:
-    cleaned = []
-    for msg in messages:
-        role = msg.get("role", "")
-        content = (msg.get("content") or "").strip()
-        if role in {"system", "user", "assistant"} and content:
-            cleaned.append({"role": role, "content": content})
-    return cleaned[-MAX_HISTORY_MESSAGES:]
-def build_prompt(messages: List[Dict[str, str]]) -> str:
-    msgs = [{"role": "system", "content": SYSTEM_PROMPT}] + normalize_messages(messages)
-    tokenizer = get_tokenizer()
-    if hasattr(tokenizer, "apply_chat_template"):
-        return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
-    text = []
-    for msg in msgs:
-        text.append(f"{msg['role'].upper()}: {msg['content']}")
-    text.append("ASSISTANT:")
-    return "\n".join(text)
-def safe_json_from_text(text: str) -> Dict[str, Any]:
-    raw = (text or "").strip()
-    candidates = [
-        raw,
-        re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.I | re.S).strip(),
-    ]
-    for candidate in candidates:
-        try:
-            data = json.loads(candidate)
-            if isinstance(data, dict):
-                return data
-        except Exception:
-            pass
-    start = raw.find("{")
-    end = raw.rfind("}")
-    if start != -1 and end != -1 and end > start:
-        chunk = raw[start : end + 1]
         try:
-            data = json.loads(chunk)
-            if isinstance(data, dict):
-                return data
         except Exception:
             pass
-    return {
-        "reply": raw if raw else "I’m here.",
-        "emotion": "neutral",
-        "mood_score": 0.5,
-        "memory_hint": "",
-    }
-def clamp(v: float, lo: float = 0.0, hi: float = 1.0) -> float:
-    return max(lo, min(hi, v))
-def get_tokenizer():
-    global TOKENIZER
-    if TOKENIZER is None:
-        TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
-    return TOKENIZER
-def load_model_once():
-    global MODEL, TOKENIZER
-    if MODEL is not None and TOKENIZER is not None:
-        return MODEL, TOKENIZER
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
-    load_kwargs = dict(low_cpu_mem_usage=True)
-    try:
-        load_kwargs["dtype"] = torch.float16
-        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
-    except TypeError:
-        load_kwargs.pop("dtype", None)
-        load_kwargs["torch_dtype"] = torch.float16
-        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
-    if torch.cuda.is_available():
-        model = model.to("cuda")
-    model.eval()
-    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
-        tokenizer.pad_token_id = tokenizer.eos_token_id
-    MODEL = model
-    TOKENIZER = tokenizer
-    return MODEL, TOKENIZER
-@spaces.GPU
-def generate_reply(messages: List[Dict[str, str]]) -> Dict[str, Any]:
-    model, tokenizer = load_model_once()
-    prompt = build_prompt(messages)
-    inputs = tokenizer(prompt, return_tensors="pt")
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-    with torch.no_grad():
-        output = model.generate(
-            **inputs,
-            max_new_tokens=MAX_TOKENS,
-            do_sample=True,
-            temperature=TEMPERATURE,
-            top_p=TOP_P,
-            repetition_penalty=REPETITION_PENALTY,
-            pad_token_id=tokenizer.pad_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-        )
-    generated = tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
-    data = safe_json_from_text(generated)
-    reply = str(data.get("reply", "")).strip()
-    if not reply:
-        reply = "I’m here."
-    emotion = str(data.get("emotion", "neutral")).strip().lower()
-    if emotion not in EMOTIONS:
-        emotion = "neutral"
-    mood_score = data.get("mood_score", 0.5)
-    try:
-        mood_score = clamp(float(mood_score))
-    except Exception:
-        mood_score = 0.5
-    memory_hint = str(data.get("memory_hint", "")).strip()
-    return {
-        "reply": reply,
-        "emotion": emotion,
-        "mood_score": mood_score,
-        "memory_hint": memory_hint,
-    }
-def extract_memory_candidate(user_text: str, assistant_text: str, memory_hint: str) -> str:
-    text = " ".join([user_text or "", assistant_text or "", memory_hint or ""]).strip()
-    if not text:
-        return ""
-    patterns = [
-        r"\bmy name is ([^.!,?\n]+)",
-        r"\bcall me ([^.!,?\n]+)",
-        r"\bi work as ([^.!,?\n]+)",
-        r"\bi like ([^.!,?\n]+)",
-        r"\bi prefer ([^.!,?\n]+)",
-        r"\bremember that ([^.!,?\n]+)",
-    ]
-    for pat in patterns:
-        m = re.search(pat, text, flags=re.I)
-        if m:
-            return m.group(1).strip()[:120]
-    if memory_hint:
-        return memory_hint[:120]
-    return ""
-def render_status(emotion: str, mood_score: float, memory_count: int) -> str:
-    bars = "■" * max(1, int(round(mood_score * 10)))
-    bars = bars.ljust(10, "□")
-    return f"**Mood:** `{emotion}`  |  **Energy:** `{bars}`  |  **Memory items:** `{memory_count}`"
-def add_turn(user_text: str, response: Dict[str, Any], chat: List[Dict[str, str]], memory: List[str]) -> Tuple[List[Dict[str, str]], List[str], str]:
-    chat.append({"role": "user", "content": user_text})
-    chat.append({"role": "assistant", "content": response["reply"]})
-    mem = extract_memory_candidate(user_text, response["reply"], response.get("memory_hint", ""))
-    if mem:
-        if mem not in memory:
-            memory = (memory + [mem])[-MAX_MEMORY_ITEMS:]
-    status = render_status(response["emotion"], response["mood_score"], len(memory))
-    return chat, memory, status
-def clear_session():
-    return [], [], [], "Ready.", ""
-def seed_examples():
-    return [
-        ["Help me plan my day.", None],
-        ["Remember that I build apps with Hugging Face and Python.", None],
-    ]
-with gr.Blocks(theme=gr.themes.Soft(), css="""
-#app-wrap { max-width: 1200px; margin: 0 auto; }
-#header-card { border-radius: 24px; }
-#chatbox { min-height: 560px; }
-#memory-box { min-height: 220px; }
-""") as demo:
-    chat_state = gr.State([])
-    memory_state = gr.State([])
-    with gr.Column(elem_id="app-wrap"):
-        with gr.Row():
-            with gr.Column(scale=3):
-                gr.Markdown(
-                    f"# {APP_NAME}\nA local GPU companion built with Gradio and Qwen."
-                )
-                status_md = gr.Markdown("Ready.")
-            with gr.Column(scale=1):
-                clear_btn = gr.Button("Clear session", variant="secondary")
-        with gr.Row():
-            with gr.Column(scale=3):
-                chatbot = gr.Chatbot(
-                    value=[],
-                    type="messages",
-                    height=560,
-                    elem_id="chatbox",
-                    show_copy_button=True,
-                )
-                with gr.Row():
-                    user_text = gr.Textbox(
-                        placeholder="Message FastLLM...",
-                        scale=6,
-                        show_label=False,
-                    )
-                    send_btn = gr.Button("Send", variant="primary", scale=1)
-                with gr.Accordion("Voice input", open=False):
-                    audio_in = gr.Audio(
-                        sources=["microphone", "upload"],
-                        type="filepath",
-                        label="Audio input",
-                    )
-                    transcribe_btn = gr.Button("Transcribe with local GPU model", variant="secondary")
-                    transcript_box = gr.Textbox(label="Transcript", lines=3)
-            with gr.Column(scale=1):
-                emotion_box = gr.Textbox(label="Emotion", value="neutral", interactive=False)
-                mood_box = gr.Slider(label="Mood score", minimum=0, maximum=1, value=0.5, step=0.01, interactive=False)
-                memory_box = gr.Textbox(label="Session memory", lines=12, elem_id="memory-box")
-    def respond(user_message, chat, memory):
-        user_message = (user_message or "").strip()
-        if not user_message:
-            return "", chat, memory, chat, memory, "Ready.", "neutral", 0.5, ""
-        current_messages = chat + [{"role": "user", "content": user_message}]
-        result = generate_reply(current_messages)
-        chat, memory, status = add_turn(user_message, result, chat, memory)
-        memory_text = "\n".join(f"- {m}" for m in memory) if memory else "No saved memory yet."
-        return (
-            "",
-            chat,
-            memory,
-            chat,
-            memory_text,
-            status,
-            result["emotion"],
-            result["mood_score"],
-            result["reply"],
-        )
-    def transcribe(audio_path):
-        if not audio_path:
-            return ""
-        # Stub kept local and simple. Add a Whisper GPU pipeline here when you want audio-to-text.
-        return "Audio input connected. Add Whisper transcription in this slot."
-    send_btn.click(
-        respond,
-        inputs=[user_text, chat_state, memory_state],
-        outputs=[user_text, chat_state, memory_state, chatbot, memory_box, status_md, emotion_box, mood_box, transcript_box],
-    )
-    user_text.submit(
-        respond,
-        inputs=[user_text, chat_state, memory_state],
-        outputs=[user_text, chat_state, memory_state, chatbot, memory_box, status_md, emotion_box, mood_box, transcript_box],
-    )
-    clear_btn.click(
-        clear_session,
-        inputs=[],
-        outputs=[chat_state, memory_state, chatbot, status_md, memory_box],
     )
-    transcribe_btn.click(
-        transcribe,
-        inputs=[audio_in],
-        outputs=[transcript_box],
-    )
-    demo.load(
-        lambda: ([], [], "Ready.", "neutral", 0.5, "No saved memory yet."),
-        inputs=[],
-        outputs=[chat_state, memory_state, status_md, emotion_box, mood_box, memory_box],
-    )
 if __name__ == "__main__":
-    demo.queue(default_concurrency_limit=1).launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        show_error=True,
-    )

+# app.py
+# Production script for the FastLLM Space.
+# Required dependencies: pip install gradio transformers torch spaces accelerate
 import os
 import json
 import torch
+import spaces
+import gradio as gr
+from threading import Thread
+from typing import Generator
+from fastapi.responses import HTMLResponse
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+# --- 1. LOCAL MODEL SPECIFICATION AND INITIAL CRITICAL CPU LOADING ---
+# Selection of Qwen2.5-1.5B fits the <4B parameters Tiny Titan bracket
+MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
+# Initialize tokenizer and load base model onto system RAM (CPU) to prevent cold startup allocation errors
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float16,
+    device_map="cpu"
+)
+# --- 2. THE QUEUED ASYNC SERVERLESS INFERENCE PIPELINE ---
+@spaces.GPU(duration=30)
+def run_inference(message: str, history_str: str) -> Generator[str, None, None]:
+    """
+    Spins up the GPU model instance and runs real-time text streaming
+    by executing within the ephemeral ZeroGPU scheduling boundary.
+    """
+    # Move model parameters to physical GPU context inside the execution function
+    model.to("cuda")
+    # Establish base system context and constraints
+    messages =
+    # Parse and append past conversational context
+    if history_str:
         try:
+            history = json.loads(history_str)
+            for turn in history:
+                if isinstance(turn, list) and len(turn) == 2:
+                    messages.append({"role": "user", "content": turn})
+                    messages.append({"role": "assistant", "content": turn})
         except Exception:
             pass
+    # Append the current prompt
+    messages.append({"role": "user", "content": message})
+    # Process text sequences, utilizing return_dict to prevent sequence shape errors
+    inputs = tokenizer.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_tensors="pt",
+        return_dict=True
+    ).to("cuda")
+    # Set up streaming generators
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = dict(
+        **inputs,
+        streamer=streamer,
+        max_new_tokens=192,
+        do_sample=True,
+        temperature=0.7,
+        top_p=0.9,
     )
+    # Execute model forward pass on a dedicated worker thread
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    # Yield incremental text updates as they are generated
+    accumulated_text = ""
+    for new_text in streamer:
+        accumulated_text += new_text
+        yield accumulated_text
+# --- 3. THE 98% CUSTOM FRONTEND SYSTEM (FRONTEND_HTML) ---
+FRONTEND_HTML = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
+    <title>FastLLM Companion</title>
+    <script src="https://cdn.tailwindcss.com"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>
+    <style>
+        body {
+            background-color: #ffe082; /* Gold/yellow background from specifications */
+            margin: 0;
+            overflow: hidden;
+            font-family: system-ui, -apple-system, sans-serif;
+            -webkit-user-select: none;
+            user-select: none;
+            -webkit-tap-highlight-color: transparent;
+        }
+        #c {
+            position: fixed;
+            top: 0;
+            left: 0;
+            width: 100%;
+            height: 100%;
+            z-index: 1;
+        }
+       .glass-panel {
+            background: rgba(8, 10, 22, 0.93);
+            backdrop-filter: blur(12px);
+            -webkit-backdrop-filter: blur(12px);
+            border: 1px solid rgba(255, 255, 255, 0.08);
+        }
+    </style>
+</head>
+<body class="text-white relative w-screen h-screen">
+    <canvas id="c"></canvas>
+    <div id="drop" class="absolute inset-0 flex flex-col items-center justify-center border-4 border-dashed border-cyan-500/50 m-10 rounded-3xl z-10 pointer-events-none transition-opacity duration-300 opacity-0">
+        <h2 class="text-3xl font-extrabold text-[#6cf] mb-2">Drop VRM Model</h2>
+        <p class="text-sm text-gray-100 opacity-60">Upload custom characters directly into viewport</p>
+    </div>
+    <div id="vrmPanel" class="absolute top-20 left-4 w-80 glass-panel p-4 rounded-2xl z-20 hidden flex-col gap-3">
+        <h3 class="font-bold text-sm text-cyan-400 uppercase tracking-widest">Available Companions</h3>
+        <div id="vrmList" class="flex-grow overflow-y-auto max-h-48 pr-1">
+            <div class="vrmItem flex items-center justify-between p-2 hover:bg-slate-800/60 rounded-xl cursor-pointer">
+                <span class="text-xs">Procedural Cyber-Core v1.0</span>
+                <span class="dot w-2 h-2 rounded-full bg-emerald-500 shadow-md"></span>
+            </div>
+        </div>
+        <button id="vrmPanelClose" class="text-center text-xs py-2 bg-slate-800 hover:bg-slate-700 rounded-xl mt-2 transition-all">Close Panel</button>
+    </div>
+    <div id="speakDot" class="absolute top-1/2 left-1/2 -translate-x-1/2 -translate-y-1/2 z-10 hidden flex items-center gap-1.5 bg-cyan-950/90 px-5 py-2.5 rounded-full border border-cyan-500/30">
+        <span class="text-xs text-cyan-400 mr-2 font-mono uppercase tracking-widest">Active</span>
+        <div class="sdot w-2 h-2 rounded-full bg-cyan-400 animate-bounce"></div>
+        <div class="sdot w-2 h-2 rounded-full bg-cyan-400 animate-bounce" style="animation-delay: 0.15s"></div>
+        <div class="sdot w-2 h-2 rounded-full bg-cyan-400 animate-bounce" style="animation-delay: 0.3s"></div>
+    </div>
+    <div class="absolute bottom-24 left-1/2 -translate-x-1/2 z-10 w-[90%] max-w-2xl px-6 py-4 rounded-2xl glass-panel text-center hidden pointer-events-auto border-t-2 border-cyan-500/20 shadow-lg" id="subtitle-panel">
+        <p id="subtitle-text" class="text-sm text-gray-100 leading-relaxed text-left"></p>
+    </div>
+    <div id="vrmaQueue" class="absolute bottom-28 right-4 w-64 max-h-32 overflow-y-auto glass-panel p-2 rounded-xl text-[10px] font-mono text-gray-400 hidden flex flex-col gap-1 z-10">
+        <div class="qitem border-b border-gray-800/30 pb-1">Queue: Syncing bones...</div>
+    </div>
+    <div id="bar" class="absolute left-1/2 -translate-x-1/2 z-10 w-[95%] max-w-4xl p-2 rounded-3xl glass-panel flex items-center gap-2 pointer-events-auto shadow-2xl">
+        <button id="mb" class="p-3 rounded-2xl bg-slate-800/80 hover:bg-slate-700 border border-slate-700 text-cyan-400 transition-all flex-shrink-0 flex items-center justify-center" onclick="toggleMenu()">
+            <svg class="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M12 6V4m0 2a2 2 0 100 4m0-4a2 2 0 110 4m-6 8a2 2 0 100-4m0 4a2 2 0 110-4m0 4v2m0-6V4m6 6v10m6-2a2 2 0 100-4m0 4a2 2 0 110-4m0 4v2m0-6V4"></path></svg>
+        </button>
+        <input type="text" id="ti" class="flex-1 py-3 px-4 rounded-2xl bg-slate-900/95 border border-slate-700/60 text-white placeholder-slate-500 focus:outline-none focus:ring-2 focus:ring-cyan-500/50 transition-all" placeholder="Enter message to local AI...">
+        <button id="sb" class="p-3 rounded-2xl bg-gradient-to-r from-[#6cf] to-[#3ae] hover:opacity-95 text-white font-semibold transition-all flex-shrink-0" onclick="handleSend()">
+            <svg class="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M14 5l7 7m0 0l-7 7m7-7H3"></path></svg>
+        </button>
+    </div>
+    <div id="load" class="absolute top-20 left-1/2 -translate-x-1/2 glass-panel px-4 py-2 rounded-full text-xs text-amber-400 font-mono tracking-widest uppercase transition-opacity duration-300 opacity-0 z-20">Syncing Local Engine...</div>
+    <div id="err" class="absolute top-4 left-4 right-4 bg-red-950/90 border border-red-500/30 text-red-200 px-4 py-2 rounded-xl text-xs font-mono text-center hidden z-30">GPU assignment latency detected. Retrying connection...</div>
+    <div id="info" class="absolute top-20 right-4 w-72 glass-panel p-3 rounded-xl border border-blue-500/20 text-xs text-blue-200 hidden z-20">Notice: Running local weights on serverless hardware.</div>
+    <div id="fps" class="absolute top-4 right-4 text-xs font-mono text-emerald-400 bg-slate-950/90 px-3 py-1.5 rounded border border-emerald-500/20 z-20">FPS: --</div>
+    <div id="menu" class="absolute bottom-0 left-0 right-0 glass-panel p-6 rounded-t-3xl z-30 transform translate-y-full transition-transform duration-300 max-h-[60vh] overflow-y-auto">
+        <div class="flex justify-between items-center mb-4 border-b border-gray-800 pb-2">
+            <h3 class="font-bold text-cyan-400 uppercase tracking-widest text-sm">Customization Panel</h3>
+            <button class="text-gray-500 hover:text-white text-xl" onclick="toggleMenu()">&times;</button>
+        </div>
+        <div class="flex flex-col gap-4">
+            <div class="row flex justify-between items-center text-sm">
+                <span class="text-gray-300">Ambient Lighting</span>
+                <input type="range" min="0.5" max="3" step="0.1" value="1.5" class="accent-cyan-500" oninput="updateGlowIntensity(this.value)">
+            </div>
+            <div class="row flex justify-between items-center text-sm">
+                <span class="text-gray-300">Companion Eye Tint</span>
+                <input type="color" value="#06b6d4" class="w-8 h-8 rounded border-none bg-transparent cursor-pointer" onchange="updateEyeColor(this.value)">
+            </div>
+            <div class="row flex justify-between items-center text-sm">
+                <span class="text-gray-300">Key Registration</span>
+                <input type="password" placeholder="Key token..." class="bg-slate-900 border border-slate-700 rounded px-2 py-1 text-xs text-white">
+            </div>
+            <div class="row flex justify-between items-center text-sm">
+                <span class="text-gray-300">Mesh Designation</span>
+                <input type="text" value="Aya-Companion" class="bg-slate-900 border border-slate-700 rounded px-2 py-1 text-xs text-white">
+            </div>
+            <div class="row flex justify-between items-center text-sm">
+                <span class="text-gray-300">Interaction Node</span>
+                <select class="bg-slate-900 border border-slate-700 rounded px-2 py-1 text-xs text-white">
+                    <option>Empathetic</option>
+                    <option>Analytical</option>
+                </select>
+            </div>
+            <div class="chips flex gap-2 flex-wrap">
+                <span class="chip bg-cyan-950 text-cyan-300 px-3 py-1 rounded-full text-xs cursor-pointer border border-cyan-500/20">Voice Sync</span>
+                <span class="chip bg-slate-800 text-slate-300 px-3 py-1 rounded-full text-xs cursor-pointer">Local Text</span>
+            </div>
+            <div class="fbtn flex gap-2 mt-2">
+                <button class="flex-1 py-2 bg-rose-950/50 hover:bg-rose-950 border border-rose-500/30 text-rose-300 rounded-xl text-xs font-semibold">Clear Profile</button>
+                <button class="flex-1 py-2 bg-cyan-950/50 hover:bg-cyan-950 border border-cyan-500/30 text-cyan-300 rounded-xl text-xs font-semibold">Save Profile</button>
+            </div>
+        </div>
+    </div>
+    <div class="voice-locked absolute top-4 left-1/2 -translate-x-1/2 bg-slate-950/90 border border-amber-500/30 text-amber-200 px-4 py-2 rounded-xl text-xs font-mono hidden flex items-center gap-2 z-30 shadow-lg">
+        <svg class="w-4 h-4 text-amber-500" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M12 15v2m-6 4h12a2 2 0 002-2v-6a2 2 0 00-2-2H6a2 2 0 00-2 2v6a2 2 0 002 2zm10-10V7a4 4 0 00-8 0v4h8z"></path></svg>
+        Voice function locked. Upgrade to premium.
+    </div>
+    <script type="module">
+        import { Client } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";
+        let client;
+        let isGenerating = false;
+        let chatHistory =;
+        // Setup control bar positioning for mobile Apple safe area inset
+        const inputBar = document.getElementById('bar');
+        inputBar.style.bottom = `calc(16px + env(safe-area-inset-bottom, 0px))`;
+        async function connectEngine() {
+            const loader = document.getElementById('load');
+            loader.style.opacity = '1';
+            try {
+                // Connect utilizing the local window origin to ensure ZeroGPU token handshakes are verified
+                client = await Client.connect(window.location.origin);
+                loader.style.opacity = '0';
+            } catch (err) {
+                console.error("Gradio initialization failure:", err);
+                document.getElementById('err').classList.remove('hidden');
+            }
+        }
+        window.handleSend = async function() {
+            const inputField = document.getElementById('ti');
+            const messageText = inputField.value.trim();
+            if (!messageText || isGenerating) return;
+            inputField.value = '';
+            isGenerating = true;
+            // Show subtitle panel and active speaking indicator
+            const subtitlePanel = document.getElementById('subtitle-panel');
+            const subtitleText = document.getElementById('subtitle-text');
+            const speakIndicator = document.getElementById('speakDot');
+            subtitlePanel.classList.remove('hidden');
+            speakIndicator.classList.remove('hidden');
+            subtitleText.textContent = "Processing message...";
+            try {
+                // Submit request to the local serverless execution queue
+                const job = client.submit("/chat",);
+                job.on("data", (event) => {
+                    const latestChunk = event.data;
+                    subtitleText.textContent = latestChunk;
+                    // Trigger character jaw scaling based on active streaming
+                    speakingIntensity = 1.0;
+                });
+                job.on("status", (status) => {
+                    if (status.stage === "complete") {
+                        const finalResponse = subtitleText.textContent;
+                        chatHistory.push();
+                        if (chatHistory.length > 8) chatHistory.shift();
+                        isGenerating = false;
+                        speakIndicator.classList.add('hidden');
+                        setTimeout(() => {
+                            if (!isGenerating) subtitlePanel.classList.add('hidden');
+                        }, 5000);
+                    }
+                });
+            } catch (err) {
+                console.error("Inference execution failure:", err);
+                subtitleText.textContent = "Pipeline error. Retrying...";
+                isGenerating = false;
+                speakIndicator.classList.add('hidden');
+            }
+        };
+        document.getElementById('ti').addEventListener('keypress', (e) => {
+            if (e.key === 'Enter') handleSend();
+        });
+        window.toggleMenu = function() {
+            const menu = document.getElementById('menu');
+            if (menu.style.transform === 'translateY(0%)') {
+                menu.style.transform = 'translateY(100%)';
+            } else {
+                menu.style.transform = 'translateY(0%)';
+            }
+        };
+        // --- 4. PROCEDURAL THREE.JS WEBGL RENDER LOOP ---
+        let scene, camera, renderer;
+        let headMesh, leftEye, rightEye, mouthMesh;
+        let baseEyeColor, targetEyeColor;
+        let mouseX = 0, mouseY = 0;
+        let speakingIntensity = 0;
+        let clock = new THREE.Clock();
+        let fpsLastTime = performance.now();
+        let fpsFrames = 0;
+        // Build the Three.js scene on canvas #c (camera, renderer, lights, and a
+        // procedural head with eyes, mouth and halo ring), register the resize and
+        // mousemove listeners, then start the backend connection and render loop.
+        function initWebGLScene() {
+            const canvas = document.getElementById('c');
+            scene = new THREE.Scene();
+            scene.fog = new THREE.FogExp2(0xffe082, 0.05);
+            camera = new THREE.PerspectiveCamera(40, window.innerWidth / window.innerHeight, 0.1, 100);
+            camera.position.set(0, 0.2, 4.5);
+            renderer = new THREE.WebGLRenderer({ canvas: canvas, antialias: true, alpha: true });
+            renderer.setSize(window.innerWidth, window.innerHeight);
+            renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
+            renderer.setClearColor(0x000000, 0); // Transparent WebGL overlay
+            // Lighting Configuration
+            const ambient = new THREE.AmbientLight(0xfffbeb, 1.0);
+            scene.add(ambient);
+            const direction = new THREE.DirectionalLight(0x06b6d4, 1.8);
+            direction.position.set(5, 5, 5);
+            scene.add(direction);
+            // Cybernetic companion structure
+            const metalMat = new THREE.MeshStandardMaterial({
+                color: 0x1e293b,
+                roughness: 0.12,
+                metalness: 0.88
+            });
+            headMesh = new THREE.Mesh(new THREE.CylinderGeometry(0.8, 0.6, 1.2, 8), metalMat);
+            headMesh.position.set(0, 0, 0);
+            scene.add(headMesh);
+            // Expressive glowing eye spheres (both eyes share one geometry and one material)
+            baseEyeColor = new THREE.Color(0x06b6d4);
+            targetEyeColor = new THREE.Color(0x06b6d4);
+            const eyeMat = new THREE.MeshBasicMaterial({ color: baseEyeColor });
+            const eyeGeo = new THREE.SphereGeometry(0.14, 32, 32);
+            leftEye = new THREE.Mesh(eyeGeo, eyeMat);
+            leftEye.position.set(-0.28, 0.15, 0.58);
+            headMesh.add(leftEye);
+            rightEye = new THREE.Mesh(eyeGeo, eyeMat);
+            rightEye.position.set(0.28, 0.15, 0.58);
+            headMesh.add(rightEye);
+            // Dynamic speaking mesh (its Y scale is animated in animate())
+            mouthMesh = new THREE.Mesh(new THREE.BoxGeometry(0.35, 0.04, 0.06), new THREE.MeshBasicMaterial({ color: 0x06b6d4 }));
+            mouthMesh.position.set(0, -0.28, 0.61);
+            headMesh.add(mouthMesh);
+            // Floating halo ring, parented to the head so it follows head motion
+            const ringGeo = new THREE.TorusGeometry(1.2, 0.03, 8, 48);
+            ringGeo.rotateX(Math.PI / 2);
+            const ringMesh = new THREE.Mesh(ringGeo, new THREE.MeshStandardMaterial({
+                color: 0x06b6d4,
+                emissive: 0x06b6d4,
+                emissiveIntensity: 0.8
+            }));
+            ringMesh.position.y = 0.8;
+            headMesh.add(ringMesh);
+            window.addEventListener('resize', onResize);
+            // Track the pointer as normalized coordinates in [-1, 1], with Y pointing up
+            window.addEventListener('mousemove', (e) => {
+                mouseX = (e.clientX / window.innerWidth) * 2 - 1;
+                mouseY = -(e.clientY / window.innerHeight) * 2 + 1;
+            });
+            connectEngine();
+            animate();
+        }
+        function onResize() {
+            camera.aspect = window.innerWidth / window.innerHeight;
+            camera.updateProjectionMatrix();
+            renderer.setSize(window.innerWidth, window.innerHeight);
+        }
+        window.updateEyeColor = function(colorHex) {
+            baseEyeColor.set(colorHex);
+            targetEyeColor.set(colorHex);
+            mouthMesh.material.color.set(colorHex);
+        };
+        window.updateGlowIntensity = function(val) {
+            scene.children.forEach(c => {
+                if (c.isDirectionalLight) c.intensity = parseFloat(val);
+            });
+        };
+        // Per-frame render loop: reschedules itself, animates the head, mouth and
+        // eyes, renders the scene, and refreshes the FPS readout about once a second.
+        function animate() {
+            requestAnimationFrame(animate);
+            const time = clock.getElapsedTime();
+            // Dynamic idle float movements
+            headMesh.position.y = Math.sin(time * 1.8) * 0.06;
+            // Head rotation mechanics: ease 10% of the remaining angle toward the pointer each frame
+            const targetRotY = mouseX * 0.35;
+            const targetRotX = -mouseY * 0.18;
+            headMesh.rotation.y += (targetRotY - headMesh.rotation.y) * 0.1;
+            headMesh.rotation.x += (targetRotX - headMesh.rotation.x) * 0.1;
+            // Jaw motion during token streaming: the intensity decays by 10% per frame
+            // and scales a sine oscillation of the mouth's Y scale
+            if (speakingIntensity > 0.01) {
+                speakingIntensity *= 0.90;
+                const mouthY = 1 + Math.sin(time * 28) * 3.0 * speakingIntensity;
+                mouthMesh.scale.set(1.0, mouthY, 1.0);
+            } else {
+                mouthMesh.scale.set(1.0, 1.0, 1.0);
+            }
+            // Expressive blinks: eyes squash for the first 0.15 s of every sixth second
+            const isBlink = Math.floor(time) % 6 === 0 && (time - Math.floor(time)) < 0.15;
+            leftEye.scale.y = isBlink? 0.15 : 1.0;
+            rightEye.scale.y = isBlink? 0.15 : 1.0;
+            renderer.render(scene, camera);
+            // Track client render metrics: count frames and publish the count once at least 1000 ms have passed
+            fpsFrames++;
+            const now = performance.now();
+            if (now >= fpsLastTime + 1000) {
+                document.getElementById('fps').textContent = `FPS: ${fpsFrames}`;
+                fpsFrames = 0;
+                fpsLastTime = now;
+            }
+        }
+        initWebGLScene();
+    </script>
+</body>
+</html>
+"""
+# --- 5. SYSTEM REGISTRATION AND COMPOSITION ---
+# Initialize the custom Server Mode FastAPI app
+app = gr.Server()
+@app.get("/", response_class=HTMLResponse)
+async def homepage() -> HTMLResponse:
+    """
+    Serves the custom HTML single-page application and its embedded WebGL engine.
+    """
+    return HTMLResponse(content=FRONTEND_HTML, status_code=200)
+@app.api(name="chat")
+def chat(message: str, history_str: str) -> Generator[str, None, None]:
+    """
+    API endpoint wrapped in Gradio's serialized concurrency queue, supporting spaces.GPU.
+    """
+    for chunk in run_inference(message, history_str):
+        yield chunk
+# Launch the unified server
 if __name__ == "__main__":
+    app.launch()