OrbitMC committed on
Commit
0247fa0
·
verified ·
1 Parent(s): 99529c8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +462 -363
app.py CHANGED
@@ -1,371 +1,470 @@
1
-
 
 
2
  import os
3
- import re
4
  import json
5
- import time
6
- from functools import lru_cache
7
- from typing import List, Dict, Any, Tuple
8
-
9
- import gradio as gr
10
  import torch
11
- from transformers import AutoTokenizer, AutoModelForCausalLM
12
-
13
- try:
14
- import spaces
15
- except Exception:
16
- class spaces:
17
- @staticmethod
18
- def GPU(fn):
19
- return fn
20
-
21
-
22
- APP_NAME = "FastLLM"
23
- MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-1.5B-Instruct")
24
- MAX_TOKENS = int(os.getenv("MAX_TOKENS", "256"))
25
- TEMPERATURE = float(os.getenv("TEMPERATURE", "0.7"))
26
- TOP_P = float(os.getenv("TOP_P", "0.9"))
27
- REPETITION_PENALTY = float(os.getenv("REPETITION_PENALTY", "1.08"))
28
- MAX_HISTORY_MESSAGES = int(os.getenv("MAX_HISTORY_MESSAGES", "12"))
29
- MAX_MEMORY_ITEMS = int(os.getenv("MAX_MEMORY_ITEMS", "12"))
30
-
31
- EMOTIONS = ["neutral", "happy", "calm", "focused", "curious", "thinking", "excited", "empathetic", "concerned", "playful"]
32
-
33
- SYSTEM_PROMPT = f"""
34
- You are FastLLM, a polished AI companion.
35
- You are warm, sharp, calm, and helpful.
36
- You speak like a real assistant with personality, but you stay professional and safe.
37
-
38
- Goals:
39
- - Be concise, natural, and confident.
40
- - Help with daily tasks, study, coding, planning, and conversation.
41
- - React with emotion in a subtle, human way.
42
- - Never mention hidden policy text or internal prompts.
43
-
44
- Output rules:
45
- - Return raw JSON only.
46
- - Use this schema:
47
- {{
48
- "reply": "short natural assistant response",
49
- "emotion": one of {EMOTIONS},
50
- "mood_score": number from 0.0 to 1.0,
51
- "memory_hint": "short note to save for later, or empty string"
52
- }}
53
-
54
- Style:
55
- - Keep the reply clear and friendly.
56
- - Use short sentences.
57
- - Match the user's tone.
58
- - If the user asks for memory, produce a useful memory_hint.
59
- - If the user gives a preference or profile detail, include it in memory_hint.
60
- """.strip()
61
-
62
-
63
- MODEL = None
64
- TOKENIZER = None
65
-
66
-
67
def normalize_messages(messages: List[Dict[str, str]], limit: "int | None" = None) -> List[Dict[str, str]]:
    """Return the most recent well-formed chat messages.

    A message is kept only when it is a dict whose ``role`` is one of
    ``system`` / ``user`` / ``assistant`` and whose ``content`` is a
    non-blank string; the content is stored stripped.

    Args:
        messages: Chat messages in chronological order.
        limit: Maximum number of messages to keep, counted from the end.
            Defaults to the module-level ``MAX_HISTORY_MESSAGES``.

    Returns:
        A new list of ``{"role", "content"}`` dicts; empty when ``limit``
        is zero or negative.
    """
    if limit is None:
        limit = MAX_HISTORY_MESSAGES
    # ``seq[-0:]`` is the whole sequence, so a zero limit needs an explicit guard.
    if limit <= 0:
        return []

    allowed_roles = ("system", "user", "assistant")
    cleaned = []
    for msg in messages or []:
        if not isinstance(msg, dict):
            continue
        content = msg.get("content")
        # Non-text payloads cannot be stripped; skip them instead of raising.
        if not isinstance(content, str):
            continue
        content = content.strip()
        role = msg.get("role", "")
        if content and role in allowed_roles:
            cleaned.append({"role": role, "content": content})
    return cleaned[-limit:]
75
-
76
-
77
- def build_prompt(messages: List[Dict[str, str]]) -> str:
78
- msgs = [{"role": "system", "content": SYSTEM_PROMPT}] + normalize_messages(messages)
79
- tokenizer = get_tokenizer()
80
- if hasattr(tokenizer, "apply_chat_template"):
81
- return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
82
- text = []
83
- for msg in msgs:
84
- text.append(f"{msg['role'].upper()}: {msg['content']}")
85
- text.append("ASSISTANT:")
86
- return "\n".join(text)
87
-
88
-
89
def _parse_json_object(candidate: str, start: int = 0):
    """Decode one JSON value beginning at ``start``; return it only if it is a dict."""
    try:
        value, _ = json.JSONDecoder().raw_decode(candidate, start)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers absurdly nested input.
        return None
    return value if isinstance(value, dict) else None


def safe_json_from_text(text: str) -> Dict[str, Any]:
    """Extract a JSON object from model output, falling back to a neutral reply.

    Attempts, in order:
      1. the stripped text as-is,
      2. the text with a leading/trailing Markdown code fence removed,
      3. the JSON value that starts at the first ``{`` in the text.

    Step 3 decodes only as far as the object extends, so prose (even prose
    containing braces) after the object does not break parsing.

    Args:
        text: Raw generated text; ``None`` or empty is tolerated.

    Returns:
        The decoded dict, or a fallback dict whose ``reply`` is the raw text
        (or a short placeholder when the text is empty).
    """
    raw = (text or "").strip()
    unfenced = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.I | re.S).strip()

    for candidate in (raw, unfenced):
        # raw_decode ignores trailing data, so require the whole candidate to be one value.
        try:
            data = json.loads(candidate)
        except (ValueError, RecursionError):
            continue
        if isinstance(data, dict):
            return data

    start = raw.find("{")
    if start != -1:
        data = _parse_json_object(raw, start)
        if data is not None:
            return data

    return {
        "reply": raw if raw else "I’m here.",
        "emotion": "neutral",
        "mood_score": 0.5,
        "memory_hint": "",
    }
120
-
121
-
122
def clamp(v: float, lo: float = 0.0, hi: float = 1.0) -> float:
    """Limit ``v`` to the closed interval ``[lo, hi]``.

    Args:
        v: Value to limit.
        lo: Lower bound (default 0.0).
        hi: Upper bound (default 1.0).

    Returns:
        ``lo`` if ``v`` is below it, ``hi`` if ``v`` is above it, else ``v``.

    Raises:
        ValueError: If ``v`` is NaN. ``min``/``max`` comparisons with NaN are
            always false, so NaN would otherwise silently come back as ``hi``.
    """
    if v != v:  # NaN is the only value that is not equal to itself
        raise ValueError("cannot clamp NaN")
    return max(lo, min(hi, v))
124
-
125
-
126
- def get_tokenizer():
127
- global TOKENIZER
128
- if TOKENIZER is None:
129
- TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
130
- return TOKENIZER
131
-
132
-
133
- def load_model_once():
134
- global MODEL, TOKENIZER
135
- if MODEL is not None and TOKENIZER is not None:
136
- return MODEL, TOKENIZER
137
-
138
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
139
-
140
- load_kwargs = dict(low_cpu_mem_usage=True)
141
- try:
142
- load_kwargs["dtype"] = torch.float16
143
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
144
- except TypeError:
145
- load_kwargs.pop("dtype", None)
146
- load_kwargs["torch_dtype"] = torch.float16
147
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
148
-
149
- if torch.cuda.is_available():
150
- model = model.to("cuda")
151
-
152
- model.eval()
153
- if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
154
- tokenizer.pad_token_id = tokenizer.eos_token_id
155
-
156
- MODEL = model
157
- TOKENIZER = tokenizer
158
- return MODEL, TOKENIZER
159
-
160
-
161
- @spaces.GPU
162
- def generate_reply(messages: List[Dict[str, str]]) -> Dict[str, Any]:
163
- model, tokenizer = load_model_once()
164
- prompt = build_prompt(messages)
165
- inputs = tokenizer(prompt, return_tensors="pt")
166
-
167
- device = "cuda" if torch.cuda.is_available() else "cpu"
168
- inputs = {k: v.to(device) for k, v in inputs.items()}
169
-
170
- with torch.no_grad():
171
- output = model.generate(
172
- **inputs,
173
- max_new_tokens=MAX_TOKENS,
174
- do_sample=True,
175
- temperature=TEMPERATURE,
176
- top_p=TOP_P,
177
- repetition_penalty=REPETITION_PENALTY,
178
- pad_token_id=tokenizer.pad_token_id,
179
- eos_token_id=tokenizer.eos_token_id,
180
- )
181
-
182
- generated = tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
183
- data = safe_json_from_text(generated)
184
-
185
- reply = str(data.get("reply", "")).strip()
186
- if not reply:
187
- reply = "I’m here."
188
-
189
- emotion = str(data.get("emotion", "neutral")).strip().lower()
190
- if emotion not in EMOTIONS:
191
- emotion = "neutral"
192
-
193
- mood_score = data.get("mood_score", 0.5)
194
- try:
195
- mood_score = clamp(float(mood_score))
196
- except Exception:
197
- mood_score = 0.5
198
-
199
- memory_hint = str(data.get("memory_hint", "")).strip()
200
-
201
- return {
202
- "reply": reply,
203
- "emotion": emotion,
204
- "mood_score": mood_score,
205
- "memory_hint": memory_hint,
206
- }
207
-
208
-
209
# Phrases that introduce a fact worth remembering; compiled once, tried in priority order.
_MEMORY_PATTERNS = tuple(
    re.compile(pattern, flags=re.I)
    for pattern in (
        r"\bmy name is ([^.!,?\n]+)",
        r"\bcall me ([^.!,?\n]+)",
        r"\bi work as ([^.!,?\n]+)",
        r"\bi like ([^.!,?\n]+)",
        r"\bi prefer ([^.!,?\n]+)",
        r"\bremember that ([^.!,?\n]+)",
    )
)


def extract_memory_candidate(user_text: str, assistant_text: str, memory_hint: str) -> str:
    """Pick a short note to store in session memory for this turn.

    The user text, assistant text and memory hint are joined with spaces and
    scanned for the trigger phrases in ``_MEMORY_PATTERNS``. The first
    non-blank capture wins; otherwise the stripped ``memory_hint`` is used.

    Args:
        user_text: What the user wrote this turn.
        assistant_text: The assistant's reply for this turn.
        memory_hint: Hint supplied alongside the reply; may be empty.

    Returns:
        A note of at most 120 characters, or ``""`` when nothing qualifies.
    """
    text = " ".join([user_text or "", assistant_text or "", memory_hint or ""]).strip()
    if not text:
        return ""

    for pattern in _MEMORY_PATTERNS:
        for match in pattern.finditer(text):
            captured = match.group(1).strip()
            # A whitespace-only capture carries no information; keep looking
            # so it cannot mask a later match or the memory_hint fallback.
            if captured:
                return captured[:120]

    hint = (memory_hint or "").strip()
    return hint[:120]
228
-
229
-
230
def render_status(emotion: str, mood_score: float, memory_count: int) -> str:
    """Format the one-line Markdown status shown above the chat.

    Args:
        emotion: Emotion label to display.
        mood_score: Mood value, nominally in ``[0.0, 1.0]``; drawn as a
            10-cell bar with at least one filled cell.
        memory_count: Number of stored memory items.

    Returns:
        A Markdown string with the mood, the energy bar and the memory count.
    """
    width = 10
    # Cap at ``width`` so an out-of-range score cannot overflow the bar;
    # the lower bound of one filled cell matches the previous behaviour.
    filled = min(width, max(1, int(round(mood_score * width))))
    bars = "■" * filled + "□" * (width - filled)
    return f"**Mood:** `{emotion}` | **Energy:** `{bars}` | **Memory items:** `{memory_count}`"
234
-
235
-
236
- def add_turn(user_text: str, response: Dict[str, Any], chat: List[Dict[str, str]], memory: List[str]) -> Tuple[List[Dict[str, str]], List[str], str]:
237
- chat.append({"role": "user", "content": user_text})
238
- chat.append({"role": "assistant", "content": response["reply"]})
239
-
240
- mem = extract_memory_candidate(user_text, response["reply"], response.get("memory_hint", ""))
241
- if mem:
242
- if mem not in memory:
243
- memory = (memory + [mem])[-MAX_MEMORY_ITEMS:]
244
-
245
- status = render_status(response["emotion"], response["mood_score"], len(memory))
246
- return chat, memory, status
247
-
248
-
249
- def clear_session():
250
- return [], [], [], "Ready.", ""
251
-
252
-
253
- def seed_examples():
254
- return [
255
- ["Help me plan my day.", None],
256
- ["Remember that I build apps with Hugging Face and Python.", None],
257
- ]
258
-
259
-
260
- with gr.Blocks(theme=gr.themes.Soft(), css="""
261
- #app-wrap { max-width: 1200px; margin: 0 auto; }
262
- #header-card { border-radius: 24px; }
263
- #chatbox { min-height: 560px; }
264
- #memory-box { min-height: 220px; }
265
- """) as demo:
266
- chat_state = gr.State([])
267
- memory_state = gr.State([])
268
-
269
- with gr.Column(elem_id="app-wrap"):
270
- with gr.Row():
271
- with gr.Column(scale=3):
272
- gr.Markdown(
273
- f"# {APP_NAME}\nA local GPU companion built with Gradio and Qwen."
274
- )
275
- status_md = gr.Markdown("Ready.")
276
- with gr.Column(scale=1):
277
- clear_btn = gr.Button("Clear session", variant="secondary")
278
-
279
- with gr.Row():
280
- with gr.Column(scale=3):
281
- chatbot = gr.Chatbot(
282
- value=[],
283
- type="messages",
284
- height=560,
285
- elem_id="chatbox",
286
- show_copy_button=True,
287
- )
288
- with gr.Row():
289
- user_text = gr.Textbox(
290
- placeholder="Message FastLLM...",
291
- scale=6,
292
- show_label=False,
293
- )
294
- send_btn = gr.Button("Send", variant="primary", scale=1)
295
-
296
- with gr.Accordion("Voice input", open=False):
297
- audio_in = gr.Audio(
298
- sources=["microphone", "upload"],
299
- type="filepath",
300
- label="Audio input",
301
- )
302
- transcribe_btn = gr.Button("Transcribe with local GPU model", variant="secondary")
303
- transcript_box = gr.Textbox(label="Transcript", lines=3)
304
-
305
- with gr.Column(scale=1):
306
- emotion_box = gr.Textbox(label="Emotion", value="neutral", interactive=False)
307
- mood_box = gr.Slider(label="Mood score", minimum=0, maximum=1, value=0.5, step=0.01, interactive=False)
308
- memory_box = gr.Textbox(label="Session memory", lines=12, elem_id="memory-box")
309
-
310
- def respond(user_message, chat, memory):
311
- user_message = (user_message or "").strip()
312
- if not user_message:
313
- return "", chat, memory, chat, memory, "Ready.", "neutral", 0.5, ""
314
-
315
- current_messages = chat + [{"role": "user", "content": user_message}]
316
- result = generate_reply(current_messages)
317
- chat, memory, status = add_turn(user_message, result, chat, memory)
318
-
319
- memory_text = "\n".join(f"- {m}" for m in memory) if memory else "No saved memory yet."
320
- return (
321
- "",
322
- chat,
323
- memory,
324
- chat,
325
- memory_text,
326
- status,
327
- result["emotion"],
328
- result["mood_score"],
329
- result["reply"],
330
- )
331
-
332
- def transcribe(audio_path):
333
- if not audio_path:
334
- return ""
335
- # Stub kept local and simple. Add a Whisper GPU pipeline here when you want audio-to-text.
336
- return "Audio input connected. Add Whisper transcription in this slot."
337
-
338
- send_btn.click(
339
- respond,
340
- inputs=[user_text, chat_state, memory_state],
341
- outputs=[user_text, chat_state, memory_state, chatbot, memory_box, status_md, emotion_box, mood_box, transcript_box],
342
- )
343
- user_text.submit(
344
- respond,
345
- inputs=[user_text, chat_state, memory_state],
346
- outputs=[user_text, chat_state, memory_state, chatbot, memory_box, status_md, emotion_box, mood_box, transcript_box],
347
- )
348
- clear_btn.click(
349
- clear_session,
350
- inputs=[],
351
- outputs=[chat_state, memory_state, chatbot, status_md, memory_box],
352
  )
353
- transcribe_btn.click(
354
- transcribe,
355
- inputs=[audio_in],
356
- outputs=[transcript_box],
357
- )
358
-
359
- demo.load(
360
- lambda: ([], [], "Ready.", "neutral", 0.5, "No saved memory yet."),
361
- inputs=[],
362
- outputs=[chat_state, memory_state, status_md, emotion_box, mood_box, memory_box],
363
- )
364
-
365
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  if __name__ == "__main__":
367
- demo.queue(default_concurrency_limit=1).launch(
368
- server_name="0.0.0.0",
369
- server_port=7860,
370
- show_error=True,
371
- )
 
1
+ # app.py
2
+ # Production script for the FastLLM Space.
3
+ # Required dependencies: pip install gradio transformers torch spaces accelerate
4
  import os
 
5
  import json
 
 
 
 
 
6
  import torch
7
+ import spaces
8
+ import gradio as gr
9
+ from threading import Thread
10
+ from typing import Generator
11
+ from fastapi.responses import HTMLResponse
12
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
13
+
14
+ # --- 1. LOCAL MODEL SPECIFICATION AND INITIAL CRITICAL CPU LOADING ---
15
+ # Selection of Qwen2.5-1.5B fits the <4B parameters Tiny Titan bracket
16
+ MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
17
+
18
+ # Initialize tokenizer and load base model onto system RAM (CPU) to prevent cold startup allocation errors
19
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
20
+ model = AutoModelForCausalLM.from_pretrained(
21
+ MODEL_ID,
22
+ torch_dtype=torch.float16,
23
+ device_map="cpu"
24
+ )
25
+
26
+ # --- 2. THE QUEUED ASYNC SERVERLESS INFERENCE PIPELINE ---
27
@spaces.GPU(duration=30)
def run_inference(message: str, history_str: str) -> Generator[str, None, None]:
    """Stream a reply to ``message`` from the module-level model.

    The module-level ``model`` is moved to CUDA inside this function, the
    conversation is rendered with the tokenizer's chat template, and
    ``model.generate`` runs on a worker thread while this generator drains a
    ``TextIteratorStreamer``.

    Args:
        message: The current user prompt.
        history_str: JSON text encoding a list of ``[user, assistant]`` pairs.
            Empty, malformed or differently shaped input is ignored.

    Yields:
        The reply accumulated so far, once per streamed chunk.
    """
    # Move model parameters to the GPU inside the decorated function.
    model.to("cuda")

    # NOTE(review): the original right-hand side of this assignment was lost;
    # this system prompt is a reconstruction — confirm the intended wording.
    messages = [
        {
            "role": "system",
            "content": "You are FastLLM, a warm, concise, and helpful AI companion.",
        }
    ]

    # Parse and append past conversational context (best-effort).
    if history_str:
        try:
            history = json.loads(history_str)
        except (ValueError, TypeError):
            # Malformed history should not block answering the current prompt.
            history = []
        if isinstance(history, list):
            for turn in history:
                if isinstance(turn, list) and len(turn) == 2:
                    user_turn, assistant_turn = turn
                    messages.append({"role": "user", "content": user_turn})
                    messages.append({"role": "assistant", "content": assistant_turn})

    # Append the current prompt.
    messages.append({"role": "user", "content": message})

    # return_dict=True yields a mapping that can be splatted into generate().
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to("cuda")

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=192,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

    # generate() blocks until done, so it runs on a worker thread while this
    # generator consumes the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    accumulated_text = ""
    for new_text in streamer:
        accumulated_text += new_text
        yield accumulated_text

    # The streamer is exhausted once generation ends; reap the worker thread.
    thread.join()
82
+
83
+ # --- 3. THE 98% CUSTOM FRONTEND SYSTEM (FRONTEND_HTML) ---
84
+ FRONTEND_HTML = """
85
+ <!DOCTYPE html>
86
+ <html lang="en">
87
+ <head>
88
+ <meta charset="UTF-8">
89
+ <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
90
+ <title>FastLLM Companion</title>
91
+ <script src="https://cdn.tailwindcss.com"></script>
92
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>
93
+ <style>
94
+ body {
95
+ background-color: #ffe082; /* Gold/yellow background from specifications */
96
+ margin: 0;
97
+ overflow: hidden;
98
+ font-family: system-ui, -apple-system, sans-serif;
99
+ -webkit-user-select: none;
100
+ user-select: none;
101
+ -webkit-tap-highlight-color: transparent;
102
+ }
103
+ #c {
104
+ position: fixed;
105
+ top: 0;
106
+ left: 0;
107
+ width: 100%;
108
+ height: 100%;
109
+ z-index: 1;
110
+ }
111
+ .glass-panel {
112
+ background: rgba(8, 10, 22, 0.93);
113
+ backdrop-filter: blur(12px);
114
+ -webkit-backdrop-filter: blur(12px);
115
+ border: 1px solid rgba(255, 255, 255, 0.08);
116
+ }
117
+ </style>
118
+ </head>
119
+ <body class="text-white relative w-screen h-screen">
120
+
121
+ <canvas id="c"></canvas>
122
+
123
+ <div id="drop" class="absolute inset-0 flex flex-col items-center justify-center border-4 border-dashed border-cyan-500/50 m-10 rounded-3xl z-10 pointer-events-none transition-opacity duration-300 opacity-0">
124
+ <h2 class="text-3xl font-extrabold text-[#6cf] mb-2">Drop VRM Model</h2>
125
+ <p class="text-sm text-gray-100 opacity-60">Upload custom characters directly into viewport</p>
126
+ </div>
127
+
128
+ <div id="vrmPanel" class="absolute top-20 left-4 w-80 glass-panel p-4 rounded-2xl z-20 hidden flex-col gap-3">
129
+ <h3 class="font-bold text-sm text-cyan-400 uppercase tracking-widest">Available Companions</h3>
130
+ <div id="vrmList" class="flex-grow overflow-y-auto max-h-48 pr-1">
131
+ <div class="vrmItem flex items-center justify-between p-2 hover:bg-slate-800/60 rounded-xl cursor-pointer">
132
+ <span class="text-xs">Procedural Cyber-Core v1.0</span>
133
+ <span class="dot w-2 h-2 rounded-full bg-emerald-500 shadow-md"></span>
134
+ </div>
135
+ </div>
136
+ <button id="vrmPanelClose" class="text-center text-xs py-2 bg-slate-800 hover:bg-slate-700 rounded-xl mt-2 transition-all">Close Panel</button>
137
+ </div>
138
+
139
+ <div id="speakDot" class="absolute top-1/2 left-1/2 -translate-x-1/2 -translate-y-1/2 z-10 hidden flex items-center gap-1.5 bg-cyan-950/90 px-5 py-2.5 rounded-full border border-cyan-500/30">
140
+ <span class="text-xs text-cyan-400 mr-2 font-mono uppercase tracking-widest">Active</span>
141
+ <div class="sdot w-2 h-2 rounded-full bg-cyan-400 animate-bounce"></div>
142
+ <div class="sdot w-2 h-2 rounded-full bg-cyan-400 animate-bounce" style="animation-delay: 0.15s"></div>
143
+ <div class="sdot w-2 h-2 rounded-full bg-cyan-400 animate-bounce" style="animation-delay: 0.3s"></div>
144
+ </div>
145
+
146
+ <div class="absolute bottom-24 left-1/2 -translate-x-1/2 z-10 w-[90%] max-w-2xl px-6 py-4 rounded-2xl glass-panel text-center hidden pointer-events-auto border-t-2 border-cyan-500/20 shadow-lg" id="subtitle-panel">
147
+ <p id="subtitle-text" class="text-sm text-gray-100 leading-relaxed text-left"></p>
148
+ </div>
149
+
150
+ <div id="vrmaQueue" class="absolute bottom-28 right-4 w-64 max-h-32 overflow-y-auto glass-panel p-2 rounded-xl text-[10px] font-mono text-gray-400 hidden flex flex-col gap-1 z-10">
151
+ <div class="qitem border-b border-gray-800/30 pb-1">Queue: Syncing bones...</div>
152
+ </div>
153
+
154
+ <div id="bar" class="absolute left-1/2 -translate-x-1/2 z-10 w-[95%] max-w-4xl p-2 rounded-3xl glass-panel flex items-center gap-2 pointer-events-auto shadow-2xl">
155
+ <button id="mb" class="p-3 rounded-2xl bg-slate-800/80 hover:bg-slate-700 border border-slate-700 text-cyan-400 transition-all flex-shrink-0 flex items-center justify-center" onclick="toggleMenu()">
156
+ <svg class="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M12 6V4m0 2a2 2 0 100 4m0-4a2 2 0 110 4m-6 8a2 2 0 100-4m0 4a2 2 0 110-4m0 4v2m0-6V4m6 6v10m6-2a2 2 0 100-4m0 4a2 2 0 110-4m0 4v2m0-6V4"></path></svg>
157
+ </button>
158
+
159
+ <input type="text" id="ti" class="flex-1 py-3 px-4 rounded-2xl bg-slate-900/95 border border-slate-700/60 text-white placeholder-slate-500 focus:outline-none focus:ring-2 focus:ring-cyan-500/50 transition-all" placeholder="Enter message to local AI...">
160
+
161
+ <button id="sb" class="p-3 rounded-2xl bg-gradient-to-r from-[#6cf] to-[#3ae] hover:opacity-95 text-white font-semibold transition-all flex-shrink-0" onclick="handleSend()">
162
+ <svg class="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M14 5l7 7m0 0l-7 7m7-7H3"></path></svg>
163
+ </button>
164
+ </div>
165
+
166
+ <div id="load" class="absolute top-20 left-1/2 -translate-x-1/2 glass-panel px-4 py-2 rounded-full text-xs text-amber-400 font-mono tracking-widest uppercase transition-opacity duration-300 opacity-0 z-20">Syncing Local Engine...</div>
167
+ <div id="err" class="absolute top-4 left-4 right-4 bg-red-950/90 border border-red-500/30 text-red-200 px-4 py-2 rounded-xl text-xs font-mono text-center hidden z-30">GPU assignment latency detected. Retrying connection...</div>
168
+ <div id="info" class="absolute top-20 right-4 w-72 glass-panel p-3 rounded-xl border border-blue-500/20 text-xs text-blue-200 hidden z-20">Notice: Running local weights on serverless hardware.</div>
169
+ <div id="fps" class="absolute top-4 right-4 text-xs font-mono text-emerald-400 bg-slate-950/90 px-3 py-1.5 rounded border border-emerald-500/20 z-20">FPS: --</div>
170
+
171
+ <div id="menu" class="absolute bottom-0 left-0 right-0 glass-panel p-6 rounded-t-3xl z-30 transform translate-y-full transition-transform duration-300 max-h-[60vh] overflow-y-auto">
172
+ <div class="flex justify-between items-center mb-4 border-b border-gray-800 pb-2">
173
+ <h3 class="font-bold text-cyan-400 uppercase tracking-widest text-sm">Customization Panel</h3>
174
+ <button class="text-gray-500 hover:text-white text-xl" onclick="toggleMenu()">&times;</button>
175
+ </div>
176
+ <div class="flex flex-col gap-4">
177
+ <div class="row flex justify-between items-center text-sm">
178
+ <span class="text-gray-300">Ambient Lighting</span>
179
+ <input type="range" min="0.5" max="3" step="0.1" value="1.5" class="accent-cyan-500" oninput="updateGlowIntensity(this.value)">
180
+ </div>
181
+ <div class="row flex justify-between items-center text-sm">
182
+ <span class="text-gray-300">Companion Eye Tint</span>
183
+ <input type="color" value="#06b6d4" class="w-8 h-8 rounded border-none bg-transparent cursor-pointer" onchange="updateEyeColor(this.value)">
184
+ </div>
185
+ <div class="row flex justify-between items-center text-sm">
186
+ <span class="text-gray-300">Key Registration</span>
187
+ <input type="password" placeholder="Key token..." class="bg-slate-900 border border-slate-700 rounded px-2 py-1 text-xs text-white">
188
+ </div>
189
+ <div class="row flex justify-between items-center text-sm">
190
+ <span class="text-gray-300">Mesh Designation</span>
191
+ <input type="text" value="Aya-Companion" class="bg-slate-900 border border-slate-700 rounded px-2 py-1 text-xs text-white">
192
+ </div>
193
+ <div class="row flex justify-between items-center text-sm">
194
+ <span class="text-gray-300">Interaction Node</span>
195
+ <select class="bg-slate-900 border border-slate-700 rounded px-2 py-1 text-xs text-white">
196
+ <option>Empathetic</option>
197
+ <option>Analytical</option>
198
+ </select>
199
+ </div>
200
+ <div class="chips flex gap-2 flex-wrap">
201
+ <span class="chip bg-cyan-950 text-cyan-300 px-3 py-1 rounded-full text-xs cursor-pointer border border-cyan-500/20">Voice Sync</span>
202
+ <span class="chip bg-slate-800 text-slate-300 px-3 py-1 rounded-full text-xs cursor-pointer">Local Text</span>
203
+ </div>
204
+ <div class="fbtn flex gap-2 mt-2">
205
+ <button class="flex-1 py-2 bg-rose-950/50 hover:bg-rose-950 border border-rose-500/30 text-rose-300 rounded-xl text-xs font-semibold">Clear Profile</button>
206
+ <button class="flex-1 py-2 bg-cyan-950/50 hover:bg-cyan-950 border border-cyan-500/30 text-cyan-300 rounded-xl text-xs font-semibold">Save Profile</button>
207
+ </div>
208
+ </div>
209
+ </div>
210
+
211
+ <div class="voice-locked absolute top-4 left-1/2 -translate-x-1/2 bg-slate-950/90 border border-amber-500/30 text-amber-200 px-4 py-2 rounded-xl text-xs font-mono hidden flex items-center gap-2 z-30 shadow-lg">
212
+ <svg class="w-4 h-4 text-amber-500" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M12 15v2m-6 4h12a2 2 0 002-2v-6a2 2 0 00-2-2H6a2 2 0 00-2 2v6a2 2 0 002 2zm10-10V7a4 4 0 00-8 0v4h8z"></path></svg>
213
+ Voice function locked. Upgrade to premium.
214
+ </div>
215
+
216
+ <script type="module">
217
+ import { Client } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";
218
+
219
// Gradio client handle; assigned by connectEngine().
let client;
// True while a reply is streaming, so handleSend() ignores re-entrant sends.
let isGenerating = false;
// Past conversation turns; handleSend() appends to it and trims it to 8 entries.
let chatHistory = [];
222
+
223
+ // Setup control bar positioning for mobile Apple safe area inset
224
+ const inputBar = document.getElementById('bar');
225
+ inputBar.style.bottom = `calc(16px + env(safe-area-inset-bottom, 0px))`;
226
+
227
+ async function connectEngine() {
228
+ const loader = document.getElementById('load');
229
+ loader.style.opacity = '1';
230
+ try {
231
+ // Connect utilizing the local window origin to ensure ZeroGPU token handshakes are verified
232
+ client = await Client.connect(window.location.origin);
233
+ loader.style.opacity = '0';
234
+ } catch (err) {
235
+ console.error("Gradio initialization failure:", err);
236
+ document.getElementById('err').classList.remove('hidden');
237
+ }
238
+ }
239
+
240
// Sends the text in the input field to the "/chat" endpoint and streams the
// reply into the subtitle panel. Does nothing when the input is blank or a
// reply is already being generated.
window.handleSend = async function() {
    const inputField = document.getElementById('ti');
    const messageText = inputField.value.trim();
    if (!messageText || isGenerating) return;

    inputField.value = '';
    isGenerating = true;

    // Show subtitle panel and active speaking indicator
    const subtitlePanel = document.getElementById('subtitle-panel');
    const subtitleText = document.getElementById('subtitle-text');
    const speakIndicator = document.getElementById('speakDot');

    subtitlePanel.classList.remove('hidden');
    speakIndicator.classList.remove('hidden');
    subtitleText.textContent = "Processing message...";

    // Shared teardown so every exit path re-enables sending.
    const finish = () => {
        isGenerating = false;
        speakIndicator.classList.add('hidden');
    };

    try {
        // Positional payload matches chat(message, history_str); the history
        // travels as JSON text holding [user, assistant] pairs.
        // NOTE(review): this keeps the event-listener style of the original
        // code — confirm it against the @gradio/client version actually loaded.
        const job = client.submit("/chat", [messageText, JSON.stringify(chatHistory)]);

        job.on("data", (event) => {
            // The payload is expected to be an array of outputs; fall back to
            // the raw value if it is not.
            const latestChunk = Array.isArray(event.data) ? event.data[0] : event.data;
            subtitleText.textContent = latestChunk;

            // Trigger character jaw scaling based on active streaming
            speakingIntensity = 1.0;
        });

        job.on("status", (status) => {
            if (status.stage === "complete") {
                const finalResponse = subtitleText.textContent;
                chatHistory.push([messageText, finalResponse]);
                if (chatHistory.length > 8) chatHistory.shift();

                finish();

                setTimeout(() => {
                    if (!isGenerating) subtitlePanel.classList.add('hidden');
                }, 5000);
            } else if (status.stage === "error") {
                // Without this branch a failed job would leave isGenerating
                // stuck at true and block every later send.
                subtitleText.textContent = "Pipeline error. Retrying...";
                finish();
            }
        });
    } catch (err) {
        console.error("Inference execution failure:", err);
        subtitleText.textContent = "Pipeline error. Retrying...";
        finish();
    }
};
290
+
291
+ document.getElementById('ti').addEventListener('keypress', (e) => {
292
+ if (e.key === 'Enter') handleSend();
293
+ });
294
+
295
+ window.toggleMenu = function() {
296
+ const menu = document.getElementById('menu');
297
+ if (menu.style.transform === 'translateY(0%)') {
298
+ menu.style.transform = 'translateY(100%)';
299
+ } else {
300
+ menu.style.transform = 'translateY(0%)';
301
+ }
302
+ };
303
+
304
+ // --- 4. PROCEDURAL THREE.JS WEBGL RENDER LOOP ---
305
+ let scene, camera, renderer;
306
+ let headMesh, leftEye, rightEye, mouthMesh;
307
+ let baseEyeColor, targetEyeColor;
308
+ let mouseX = 0, mouseY = 0;
309
+ let speakingIntensity = 0;
310
+ let clock = new THREE.Clock();
311
+ let fpsLastTime = performance.now();
312
+ let fpsFrames = 0;
313
+
314
+ function initWebGLScene() {
315
+ const canvas = document.getElementById('c');
316
+ scene = new THREE.Scene();
317
+ scene.fog = new THREE.FogExp2(0xffe082, 0.05);
318
+
319
+ camera = new THREE.PerspectiveCamera(40, window.innerWidth / window.innerHeight, 0.1, 100);
320
+ camera.position.set(0, 0.2, 4.5);
321
+
322
+ renderer = new THREE.WebGLRenderer({ canvas: canvas, antialias: true, alpha: true });
323
+ renderer.setSize(window.innerWidth, window.innerHeight);
324
+ renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
325
+ renderer.setClearColor(0x000000, 0); // Transparent WebGL overlay
326
+
327
+ // Lighting Configuration
328
+ const ambient = new THREE.AmbientLight(0xfffbeb, 1.0);
329
+ scene.add(ambient);
330
+
331
+ const direction = new THREE.DirectionalLight(0x06b6d4, 1.8);
332
+ direction.position.set(5, 5, 5);
333
+ scene.add(direction);
334
+
335
+ // Cybernetic companion structure
336
+ const metalMat = new THREE.MeshStandardMaterial({
337
+ color: 0x1e293b,
338
+ roughness: 0.12,
339
+ metalness: 0.88
340
+ });
341
+
342
+ headMesh = new THREE.Mesh(new THREE.CylinderGeometry(0.8, 0.6, 1.2, 8), metalMat);
343
+ headMesh.position.set(0, 0, 0);
344
+ scene.add(headMesh);
345
+
346
+ // Expressive glowing eye spheres
347
+ baseEyeColor = new THREE.Color(0x06b6d4);
348
+ targetEyeColor = new THREE.Color(0x06b6d4);
349
+ const eyeMat = new THREE.MeshBasicMaterial({ color: baseEyeColor });
350
+ const eyeGeo = new THREE.SphereGeometry(0.14, 32, 32);
351
+
352
+ leftEye = new THREE.Mesh(eyeGeo, eyeMat);
353
+ leftEye.position.set(-0.28, 0.15, 0.58);
354
+ headMesh.add(leftEye);
355
+
356
+ rightEye = new THREE.Mesh(eyeGeo, eyeMat);
357
+ rightEye.position.set(0.28, 0.15, 0.58);
358
+ headMesh.add(rightEye);
359
+
360
+ // Dynamic speaking mesh
361
+ mouthMesh = new THREE.Mesh(new THREE.BoxGeometry(0.35, 0.04, 0.06), new THREE.MeshBasicMaterial({ color: 0x06b6d4 }));
362
+ mouthMesh.position.set(0, -0.28, 0.61);
363
+ headMesh.add(mouthMesh);
364
+
365
+ // Floating halo ring
366
+ const ringGeo = new THREE.TorusGeometry(1.2, 0.03, 8, 48);
367
+ ringGeo.rotateX(Math.PI / 2);
368
+ const ringMesh = new THREE.Mesh(ringGeo, new THREE.MeshStandardMaterial({
369
+ color: 0x06b6d4,
370
+ emissive: 0x06b6d4,
371
+ emissiveIntensity: 0.8
372
+ }));
373
+ ringMesh.position.y = 0.8;
374
+ headMesh.add(ringMesh);
375
+
376
+ window.addEventListener('resize', onResize);
377
+ window.addEventListener('mousemove', (e) => {
378
+ mouseX = (e.clientX / window.innerWidth) * 2 - 1;
379
+ mouseY = -(e.clientY / window.innerHeight) * 2 + 1;
380
+ });
381
+
382
+ connectEngine();
383
+ animate();
384
+ }
385
+
386
+ function onResize() {
387
+ camera.aspect = window.innerWidth / window.innerHeight;
388
+ camera.updateProjectionMatrix();
389
+ renderer.setSize(window.innerWidth, window.innerHeight);
390
+ }
391
+
392
+ window.updateEyeColor = function(colorHex) {
393
+ baseEyeColor.set(colorHex);
394
+ targetEyeColor.set(colorHex);
395
+ mouthMesh.material.color.set(colorHex);
396
+ };
397
+
398
+ window.updateGlowIntensity = function(val) {
399
+ scene.children.forEach(c => {
400
+ if (c.isDirectionalLight) c.intensity = parseFloat(val);
401
+ });
402
+ };
403
+
404
+ function animate() {
405
+ requestAnimationFrame(animate);
406
+ const time = clock.getElapsedTime();
407
+
408
+ // Dynamic idle float movements
409
+ headMesh.position.y = Math.sin(time * 1.8) * 0.06;
410
+
411
+ // Head rotation mechanics
412
+ const targetRotY = mouseX * 0.35;
413
+ const targetRotX = -mouseY * 0.18;
414
+ headMesh.rotation.y += (targetRotY - headMesh.rotation.y) * 0.1;
415
+ headMesh.rotation.x += (targetRotX - headMesh.rotation.x) * 0.1;
416
+
417
+ // Linear jaw scaling during token streaming
418
+ if (speakingIntensity > 0.01) {
419
+ speakingIntensity *= 0.90;
420
+ const mouthY = 1 + Math.sin(time * 28) * 3.0 * speakingIntensity;
421
+ mouthMesh.scale.set(1.0, mouthY, 1.0);
422
+ } else {
423
+ mouthMesh.scale.set(1.0, 1.0, 1.0);
424
+ }
425
+
426
+ // Expressive blinks
427
+ const isBlink = Math.floor(time) % 6 === 0 && (time - Math.floor(time)) < 0.15;
428
+ leftEye.scale.y = isBlink? 0.15 : 1.0;
429
+ rightEye.scale.y = isBlink? 0.15 : 1.0;
430
+
431
+ renderer.render(scene, camera);
432
+
433
+ // Track client render metrics
434
+ fpsFrames++;
435
+ const now = performance.now();
436
+ if (now >= fpsLastTime + 1000) {
437
+ document.getElementById('fps').textContent = `FPS: ${fpsFrames}`;
438
+ fpsFrames = 0;
439
+ fpsLastTime = now;
440
+ }
441
+ }
442
+
443
+ initWebGLScene();
444
+ </script>
445
+ </body>
446
+ </html>
447
+ """
448
+
449
+ # --- 5. SYSTEM REGISTRATION AND COMPOSITION ---
450
+ # Initialize the custom Server Mode FastAPI app
451
+ app = gr.Server()
452
+
453
+ @app.get("/", response_class=HTMLResponse)
454
+ async def homepage() -> HTMLResponse:
455
+ """
456
+ Serves the custom HTML single-page application and its embedded WebGL engine.
457
+ """
458
+ return HTMLResponse(content=FRONTEND_HTML, status_code=200)
459
+
460
+ @app.api(name="chat")
461
+ def chat(message: str, history_str: str) -> Generator[str, None, None]:
462
+ """
463
+ API endpoint wrapped in Gradio's serialized concurrency queue, supporting spaces.GPU.
464
+ """
465
+ for chunk in run_inference(message, history_str):
466
+ yield chunk
467
+
468
+ # Launch the unified server
469
  if __name__ == "__main__":
470
+ app.launch()