OrbitMC commited on
Commit
6ff3c42
Β·
verified Β·
1 Parent(s): 5edd9d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +396 -300
app.py CHANGED
@@ -1,320 +1,416 @@
1
  """
2
- J.A.R.V.I.S β€” FastAPI backend
3
- Model is loaded ONCE at startup and kept in RAM for instant responses.
 
 
 
4
  """
5
 
6
- import os
7
- import json
8
- import time
9
- import warnings
10
- import asyncio
11
- from pathlib import Path
12
- from contextlib import asynccontextmanager
13
-
14
- import uvicorn
15
- from fastapi import FastAPI, HTTPException
16
- from fastapi.staticfiles import StaticFiles
17
- from fastapi.responses import HTMLResponse, StreamingResponse, FileResponse
18
- from pydantic import BaseModel
19
-
20
- warnings.filterwarnings("ignore")
21
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
22
- os.environ["HF_HOME"] = "/app/cache"
23
- os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
24
- os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/app/cache"
25
-
26
- # ── Paths ──
27
- VECTOR_DIR = Path("/app/database/vector_store")
28
- LEARN_DIR = Path("/app/database/learning_data")
29
- CHATS_DIR = Path("/app/database/chats_data")
30
- CACHE_DIR = Path("/app/cache")
31
-
32
- for d in [VECTOR_DIR, LEARN_DIR, CHATS_DIR]:
33
- d.mkdir(parents=True, exist_ok=True)
34
-
35
- # ── Global model holders (loaded once, never reloaded) ──
36
- LLM = None
37
- RETRIEVER = None
38
- TTS = None
39
- TTS_OK = False
40
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  SYSTEM_PROMPT = (
42
- "You are J.A.R.V.I.S, a concise and intelligent AI assistant. "
43
- "Always reply in 1–2 short, direct sentences. "
44
- "Never use emojis, markdown, asterisks, or filler phrases. "
45
- "Be helpful, precise, and slightly formal."
46
  )
47
 
48
-
49
- # ══════════════════════════════════════════
50
- # STARTUP β€” load everything into RAM once
51
- # ══════════════════════════════════════════
52
- @asynccontextmanager
53
- async def lifespan(app: FastAPI):
54
- global LLM, RETRIEVER, TTS, TTS_OK
55
-
56
- print("=" * 55)
57
- print(" J.A.R.V.I.S β€” starting up")
58
- print("=" * 55)
59
-
60
- # 1. Vector store / embeddings
61
- print("[1/3] Loading embeddings & vector store...", flush=True)
62
- from langchain_huggingface import HuggingFaceEmbeddings
63
- from langchain_community.vectorstores import FAISS
64
- from langchain_text_splitters import RecursiveCharacterTextSplitter
65
- from langchain_core.documents import Document
66
-
67
- embeddings = HuggingFaceEmbeddings(
68
- model_name="sentence-transformers/all-MiniLM-L6-v2",
69
- model_kwargs={"device": "cpu"},
70
- cache_folder=str(CACHE_DIR),
71
- )
72
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
73
-
74
- def _load_docs():
75
- docs = []
76
- for f in LEARN_DIR.glob("*.txt"):
77
- try:
78
- docs.append(Document(page_content=f.read_text(errors="ignore"),
79
- metadata={"source": f.name}))
80
- except Exception:
81
- pass
82
- for f in CHATS_DIR.glob("*.json"):
83
- try:
84
- data = json.loads(f.read_text(errors="ignore"))
85
- content = "\n".join(
86
- f"{m['role']}: {m['content']}"
87
- for m in data.get("messages", [])
88
- if isinstance(m, dict) and "role" in m and "content" in m
89
- )
90
- if content.strip():
91
- docs.append(Document(page_content=content,
92
- metadata={"source": f.name}))
93
- except Exception:
94
- pass
95
- return docs
96
-
97
- index_file = VECTOR_DIR / "index.faiss"
98
- if index_file.exists():
99
- try:
100
- vs = FAISS.load_local(str(VECTOR_DIR), embeddings,
101
- allow_dangerous_deserialization=True)
102
- print(" Vector store loaded from disk.")
103
- except Exception:
104
- vs = None
105
-
106
- if not index_file.exists() or vs is None:
107
- docs = _load_docs() or [Document(page_content="No data yet.")]
108
- chunks = splitter.split_documents(docs)
109
- vs = FAISS.from_documents(chunks, embeddings)
110
- vs.save_local(str(VECTOR_DIR))
111
- print(" Vector store built and saved.")
112
-
113
- RETRIEVER = vs.as_retriever(search_kwargs={"k": 3})
114
- print(" βœ“ Vector store ready")
115
-
116
- # 2. LLM β€” loaded into RAM, stays there forever
117
- print("[2/3] Loading LLM into RAM (model pre-cached in image)...", flush=True)
118
- from huggingface_hub import hf_hub_download
119
- from llama_cpp import Llama
120
-
121
- model_path = hf_hub_download(
122
- repo_id="unsloth/Qwen3.5-0.8B-GGUF",
123
- filename="Qwen3.5-0.8B-UD-Q2_K_XL.gguf",
124
- cache_dir=str(CACHE_DIR),
125
- local_files_only=True, # ← never re-download; use baked image cache
126
- )
127
-
128
- LLM = Llama(
129
- model_path=model_path,
130
- n_ctx=2048,
131
- n_threads=os.cpu_count() or 4,
132
- n_batch=512, # larger batch = faster prompt processing
133
- use_mmap=True, # memory-map the file β€” fastest cold load on CPU
134
- use_mlock=True, # lock pages in RAM β€” prevents swap thrashing
135
- verbose=False,
136
- )
137
- print(" βœ“ LLM ready")
138
-
139
- # 3. TTS (optional)
140
- print("[3/3] Loading TTS...", flush=True)
141
- try:
142
- from kittentts import KittenTTS
143
- TTS = KittenTTS("KittenML/kitten-tts-nano-0.8-fp32")
144
- TTS_OK = True
145
- print(" βœ“ TTS ready (Kiki)")
146
- except Exception as e:
147
- print(f" ⚠ TTS unavailable: {e}")
148
-
149
- print("\n βœ“ ALL SYSTEMS ONLINE β€” serving on :7860\n")
150
- yield
151
-
152
- # Shutdown
153
- print("J.A.R.V.I.S: shutting down.")
154
-
155
-
156
- # ══════════════════════════════════════════
157
- # APP
158
- # ══════════════════════════════════════════
159
- app = FastAPI(title="J.A.R.V.I.S", lifespan=lifespan)
160
- app.mount("/static", StaticFiles(directory="static"), name="static")
161
 
162
 
163
- # ── Request / response schemas ──
164
- class ChatRequest(BaseModel):
165
- message: str
166
- history: list[list[str]] = [] # [[user, assistant], ...]
 
 
 
 
167
 
168
- class ChatResponse(BaseModel):
169
- reply: str
170
 
 
171
 
172
- # ── Routes ──
173
- @app.get("/", response_class=HTMLResponse)
174
- async def root():
175
- return FileResponse("static/index.html")
 
176
 
 
 
177
 
178
- @app.get("/health")
179
- async def health():
180
- return {"status": "ok", "llm": LLM is not None, "tts": TTS_OK}
181
 
182
-
183
- @app.post("/chat", response_model=ChatResponse)
184
- async def chat(req: ChatRequest):
185
- if LLM is None:
186
- raise HTTPException(503, "Model not ready yet")
187
-
188
- # Retrieve context
189
- context = ""
190
  try:
191
- docs = RETRIEVER.invoke(req.message)
192
- context = "\n".join(d.page_content for d in docs)
193
- except Exception:
194
- pass
195
-
196
- # Build messages
197
- system = SYSTEM_PROMPT
198
- if context.strip():
199
- system += f"\n\nBackground context (use only if relevant):\n{context}"
200
-
201
- messages = [{"role": "system", "content": system}]
202
- for turn in req.history[-4:]:
203
- if len(turn) == 2:
204
- messages.append({"role": "user", "content": turn[0]})
205
- messages.append({"role": "assistant", "content": turn[1]})
206
- messages.append({"role": "user", "content": req.message})
207
-
208
- # Generate (run in thread so we don't block the event loop)
209
- loop = asyncio.get_event_loop()
210
-
211
- def _generate():
212
- result = LLM.create_chat_completion(
213
- messages=messages,
214
- max_tokens=150,
215
- temperature=0.65,
216
- top_p=0.9,
217
  repeat_penalty=1.1,
218
- stream=False,
219
  )
220
- return result["choices"][0]["message"]["content"].strip()
221
-
222
- reply = await loop.run_in_executor(None, _generate)
223
- return ChatResponse(reply=reply)
224
-
225
-
226
- @app.post("/chat/stream")
227
- async def chat_stream(req: ChatRequest):
228
- """Server-Sent Events streaming endpoint."""
229
- if LLM is None:
230
- raise HTTPException(503, "Model not ready yet")
231
 
232
- context = ""
233
  try:
234
- docs = RETRIEVER.invoke(req.message)
235
- context = "\n".join(d.page_content for d in docs)
236
- except Exception:
237
- pass
238
-
239
- system = SYSTEM_PROMPT
240
- if context.strip():
241
- system += f"\n\nBackground context:\n{context}"
242
-
243
- messages = [{"role": "system", "content": system}]
244
- for turn in req.history[-4:]:
245
- if len(turn) == 2:
246
- messages.append({"role": "user", "content": turn[0]})
247
- messages.append({"role": "assistant", "content": turn[1]})
248
- messages.append({"role": "user", "content": req.message})
249
-
250
- async def event_gen():
251
- loop = asyncio.get_event_loop()
252
- queue = asyncio.Queue()
253
-
254
- def _stream():
255
- for chunk in LLM.create_chat_completion(
256
- messages=messages,
257
- max_tokens=150,
258
- temperature=0.65,
259
- top_p=0.9,
260
- repeat_penalty=1.1,
261
- stream=True,
262
- ):
263
- piece = chunk["choices"][0].get("delta", {}).get("content", "")
264
- if piece:
265
- asyncio.run_coroutine_threadsafe(queue.put(piece), loop)
266
- asyncio.run_coroutine_threadsafe(queue.put(None), loop) # sentinel
267
-
268
- loop.run_in_executor(None, _stream)
269
-
270
- while True:
271
- piece = await queue.get()
272
- if piece is None:
273
- yield "data: [DONE]\n\n"
274
- break
275
- yield f"data: {json.dumps(piece)}\n\n"
276
-
277
- return StreamingResponse(event_gen(), media_type="text/event-stream")
278
-
279
-
280
- @app.post("/tts")
281
- async def tts_endpoint(body: dict):
282
- """Return raw PCM audio bytes for the given text."""
283
- if not TTS_OK:
284
- raise HTTPException(503, "TTS not available")
285
- text = body.get("text", "").strip()
286
- if not text:
287
- raise HTTPException(400, "No text provided")
288
-
289
- loop = asyncio.get_event_loop()
290
-
291
- def _speak():
292
- return TTS.generate(text, voice="Kiki")
293
-
294
- audio_bytes = await loop.run_in_executor(None, _speak)
295
- return StreamingResponse(iter([bytes(audio_bytes)]),
296
- media_type="audio/wav")
297
-
298
-
299
- @app.post("/save")
300
- async def save_chat(body: dict):
301
- history = body.get("history", [])
302
- if not history:
303
- return {"saved": False}
304
- path = CHATS_DIR / f"session_{int(time.time())}.json"
305
- messages = []
306
- for turn in history:
307
- if len(turn) == 2:
308
- messages.append({"role": "user", "content": turn[0]})
309
- messages.append({"role": "assistant", "content": turn[1]})
310
- path.write_text(json.dumps({"messages": messages}, ensure_ascii=False, indent=2))
311
- return {"saved": True, "path": str(path)}
312
-
313
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  if __name__ == "__main__":
315
- uvicorn.run(
316
- "app:app",
317
- host="0.0.0.0",
318
- port=int(os.environ.get("PORT", 7860)),
319
- log_level="warning",
320
- )
 
 
 
 
 
1
  """
2
+ ChatGPT-style local AI chat with TTS
3
+ - LLM: gemma-3-270m-it-F16.gguf via llama-cpp-python
4
+ - TTS: Kokoro ONNX (af_kore = "kiki" voice)
5
+ - UI: Flask + embedded HTML (no Gradio)
6
+ - Target: HuggingFace Docker Space (free CPU)
7
  """
8
 
9
+ import os, io, base64, json, threading
10
+ import numpy as np
11
+ import soundfile as sf
12
+ from flask import Flask, request, jsonify, Response, stream_with_context
13
+
14
+ # ── Paths ────────────────────────────────────────────────────────────────────
15
+ MODEL_PATH = os.environ.get("MODEL_PATH", "/app/models/gemma-3-270m-it-F16.gguf")
16
+ ONNX_MODEL = os.environ.get("ONNX_MODEL", "/app/models/kokoro-v1.0.int8.onnx")
17
+ VOICES_BIN = os.environ.get("VOICES_BIN", "/app/models/voices-v1.0.bin")
18
+ TTS_VOICE = os.environ.get("TTS_VOICE", "af_kore") # closest to "kiki"
19
+ PORT = int(os.environ.get("PORT", 7860))
20
+
21
+ # ── Lazy-load LLM & TTS (init once, reuse) ───────────────────────────────────
22
+ _llm_lock = threading.Lock()
23
+ _llm = None
24
+
25
def get_llm():
    """Return the process-wide Llama instance, loading it on first use.

    Thread-safe lazy initialization: the unlocked fast path returns the
    cached model, and the lock plus re-check guarantee that concurrent
    first callers trigger exactly one load.
    """
    global _llm
    if _llm is not None:
        return _llm
    with _llm_lock:
        if _llm is None:
            from llama_cpp import Llama
            print(f"[LLM] Loading {MODEL_PATH} …")
            _llm = Llama(
                model_path=MODEL_PATH,
                n_ctx=2048,
                n_threads=os.cpu_count() or 4,
                verbose=False,
            )
            print("[LLM] Ready.")
    return _llm
40
+
41
+ _tts_lock = threading.Lock()
42
+ _tts = None
43
+
44
def get_tts():
    """Return the process-wide Kokoro TTS engine, loading it on first use.

    Mirrors get_llm(): an unlocked fast path, then double-checked
    locking so only one thread ever constructs the engine.
    """
    global _tts
    if _tts is not None:
        return _tts
    with _tts_lock:
        if _tts is None:
            from kokoro_onnx import Kokoro
            print(f"[TTS] Loading {ONNX_MODEL} …")
            _tts = Kokoro(ONNX_MODEL, VOICES_BIN)
            print("[TTS] Ready.")
    return _tts
54
+
55
+ # ── Flask app ─────────────────────────────────────────────────────────────────
56
+ app = Flask(__name__)
57
+
58
+ # ── Helpers ───────────────────────────────────────────────────────────────────
59
  SYSTEM_PROMPT = (
60
+ "You are a friendly, knowledgeable AI assistant. "
61
+ "Keep responses clear and concise."
 
 
62
  )
63
 
64
def build_messages(
    history: list[dict], user_msg: str, *, max_turns: int = 10
) -> list[dict]:
    """Assemble the chat-completion message list for the LLM.

    Args:
        history: Prior turns as ``{"role": ..., "content": ...}`` dicts.
            This arrives straight from client JSON, so malformed entries
            (non-dicts, or dicts missing either key) are skipped instead
            of raising ``KeyError`` as the previous implementation did.
        user_msg: The new user message, appended last.
        max_turns: Number of trailing history turns kept for context
            (defaults to the previously hard-coded 10).

    Returns:
        Messages beginning with the system prompt, ending with the user
        message.
    """
    msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
    for turn in history[-max_turns:]:
        # Validate untrusted client input rather than indexing blindly.
        if isinstance(turn, dict) and "role" in turn and "content" in turn:
            msgs.append({"role": turn["role"], "content": turn["content"]})
    msgs.append({"role": "user", "content": user_msg})
    return msgs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
 
72
def text_to_wav_b64(text: str) -> str:
    """Synthesize *text* with Kokoro and return it as base64-encoded WAV."""
    samples, sample_rate = get_tts().create(
        text, voice=TTS_VOICE, speed=1.0, lang="en-us"
    )
    wav_buf = io.BytesIO()
    sf.write(wav_buf, samples, sample_rate, format="WAV")
    return base64.b64encode(wav_buf.getvalue()).decode()
80
 
 
 
81
 
82
+ # ── API routes ────────────────────────────────────────────────────────────────
83
 
84
+ @app.route("/api/chat", methods=["POST"])
85
+ def chat():
86
+ data = request.get_json(force=True)
87
+ user_msg = data.get("message", "").strip()
88
+ history = data.get("history", [])
89
 
90
+ if not user_msg:
91
+ return jsonify({"error": "empty message"}), 400
92
 
93
+ llm = get_llm()
94
+ msgs = build_messages(history, user_msg)
 
95
 
 
 
 
 
 
 
 
 
96
  try:
97
+ resp = llm.create_chat_completion(
98
+ messages=msgs,
99
+ max_tokens=512,
100
+ temperature=0.7,
101
+ top_p=0.95,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  repeat_penalty=1.1,
 
103
  )
104
+ assistant_text = resp["choices"][0]["message"]["content"].strip()
105
+ except Exception as e:
106
+ return jsonify({"error": str(e)}), 500
 
 
 
 
 
 
 
 
107
 
108
+ # TTS
109
  try:
110
+ audio_b64 = text_to_wav_b64(assistant_text)
111
+ except Exception as e:
112
+ print(f"[TTS] Warning: {e}")
113
+ audio_b64 = None
114
+
115
+ return jsonify({
116
+ "text": assistant_text,
117
+ "audio": audio_b64, # base64 WAV or null
118
+ })
119
+
120
+
121
+ @app.route("/api/health")
122
+ def health():
123
+ return jsonify({"status": "ok", "voice": TTS_VOICE})
124
+
125
+
126
+ # ── Single-file HTML UI ──────────────────────────────────────────────────────
127
+ HTML = r"""<!DOCTYPE html>
128
+ <html lang="en">
129
+ <head>
130
+ <meta charset="UTF-8">
131
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
132
+ <title>Kitten Chat</title>
133
+ <link rel="preconnect" href="https://fonts.googleapis.com">
134
+ <link href="https://fonts.googleapis.com/css2?family=Sora:wght@300;400;500;600&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
135
+ <style>
136
+ :root {
137
+ --bg: #0d0f14;
138
+ --surface: #161923;
139
+ --border: #252a36;
140
+ --accent: #a78bfa;
141
+ --accent2: #f0abfc;
142
+ --text: #e8eaf0;
143
+ --muted: #6b7280;
144
+ --user-bg: #1e1b4b;
145
+ --ai-bg: #161923;
146
+ --radius: 14px;
147
+ --glow: 0 0 18px rgba(167,139,250,.25);
148
+ }
149
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0 }
150
+ html, body { height: 100%; background: var(--bg); color: var(--text);
151
+ font-family: 'Sora', sans-serif; overflow: hidden }
152
+
153
+ /* ── layout ── */
154
+ #app { display: flex; flex-direction: column; height: 100vh; max-width: 860px;
155
+ margin: 0 auto; padding: 0 16px }
156
+
157
+ /* ── header ── */
158
+ header { display: flex; align-items: center; gap: 12px;
159
+ padding: 18px 0 14px; border-bottom: 1px solid var(--border) }
160
+ .logo { width: 38px; height: 38px; border-radius: 50%;
161
+ background: linear-gradient(135deg,#a78bfa,#f0abfc);
162
+ display: flex; align-items: center; justify-content: center;
163
+ font-size: 18px; box-shadow: var(--glow) }
164
+ header h1 { font-size: 1.05rem; font-weight: 600; letter-spacing: .3px }
165
+ header span { font-size: .75rem; color: var(--muted); display: block;
166
+ font-weight: 300 }
167
+ .status { margin-left: auto; display: flex; align-items: center; gap: 6px;
168
+ font-size: .72rem; color: var(--muted) }
169
+ .dot { width: 7px; height: 7px; border-radius: 50%; background: #34d399 }
170
+
171
+ /* ── messages ── */
172
+ #messages { flex: 1; overflow-y: auto; padding: 20px 0;
173
+ display: flex; flex-direction: column; gap: 16px; scroll-behavior: smooth }
174
+ #messages::-webkit-scrollbar { width: 4px }
175
+ #messages::-webkit-scrollbar-track { background: transparent }
176
+ #messages::-webkit-scrollbar-thumb { background: var(--border); border-radius: 4px }
177
+
178
+ .msg { display: flex; gap: 10px; max-width: 82%; animation: fadeUp .25s ease }
179
+ .msg.user { align-self: flex-end; flex-direction: row-reverse }
180
+ .msg.ai { align-self: flex-start }
181
+
182
+ @keyframes fadeUp {
183
+ from { opacity: 0; transform: translateY(8px) }
184
+ to { opacity: 1; transform: translateY(0) }
185
+ }
186
+
187
+ .avatar { width: 32px; height: 32px; border-radius: 50%; flex-shrink: 0;
188
+ display: flex; align-items: center; justify-content: center; font-size: 14px }
189
+ .msg.user .avatar { background: var(--user-bg); border: 1px solid #4338ca }
190
+ .msg.ai .avatar { background: linear-gradient(135deg,#a78bfa22,#f0abfc22);
191
+ border: 1px solid var(--border) }
192
+
193
+ .bubble { padding: 11px 15px; border-radius: var(--radius); font-size: .88rem;
194
+ line-height: 1.6; word-break: break-word }
195
+ .msg.user .bubble { background: var(--user-bg);
196
+ border-bottom-right-radius: 4px }
197
+ .msg.ai .bubble { background: var(--ai-bg); border: 1px solid var(--border);
198
+ border-bottom-left-radius: 4px }
199
+
200
+ /* ── audio player ── */
201
+ .audio-row { margin-top: 8px }
202
+ audio { width: 100%; height: 28px; border-radius: 20px;
203
+ accent-color: var(--accent); outline: none }
204
+ audio::-webkit-media-controls-panel { background: #1e2030 }
205
+
206
+ /* ── typing indicator ── */
207
+ .typing { display: flex; gap: 5px; padding: 4px 2px }
208
+ .typing span { width: 7px; height: 7px; border-radius: 50%;
209
+ background: var(--accent); opacity: .4;
210
+ animation: blink 1.2s infinite }
211
+ .typing span:nth-child(2) { animation-delay: .2s }
212
+ .typing span:nth-child(3) { animation-delay: .4s }
213
+ @keyframes blink { 0%,80%,100% { opacity:.4 } 40% { opacity:1 } }
214
+
215
+ /* ── input area ── */
216
+ #input-bar { display: flex; gap: 10px; padding: 14px 0 20px;
217
+ border-top: 1px solid var(--border) }
218
+ #user-input { flex: 1; background: var(--surface); border: 1px solid var(--border);
219
+ color: var(--text); border-radius: var(--radius); padding: 11px 16px;
220
+ font-family: 'Sora', sans-serif; font-size: .88rem; resize: none;
221
+ outline: none; transition: border-color .2s, box-shadow .2s; min-height: 48px;
222
+ max-height: 140px }
223
+ #user-input:focus { border-color: var(--accent); box-shadow: var(--glow) }
224
+ #user-input::placeholder { color: var(--muted) }
225
+
226
+ #send-btn { width: 48px; height: 48px; border-radius: var(--radius);
227
+ background: linear-gradient(135deg,var(--accent),var(--accent2));
228
+ border: none; cursor: pointer; display: flex; align-items: center;
229
+ justify-content: center; transition: opacity .2s, transform .1s;
230
+ flex-shrink: 0 }
231
+ #send-btn:hover { opacity: .85 }
232
+ #send-btn:active { transform: scale(.93) }
233
+ #send-btn:disabled { opacity: .35; cursor: default }
234
+ #send-btn svg { width: 20px; height: 20px; fill: #fff }
235
+
236
+ /* ── footer note ── */
237
+ .footnote { text-align: center; font-size: .68rem; color: var(--muted);
238
+ padding-bottom: 6px; font-family: 'JetBrains Mono', monospace }
239
+
240
+ /* ── empty state ── */
241
+ .empty { flex: 1; display: flex; flex-direction: column; align-items: center;
242
+ justify-content: center; gap: 14px; opacity: .45; user-select: none }
243
+ .empty .big { font-size: 3.5rem }
244
+ .empty p { font-size: .82rem; color: var(--muted) }
245
+ </style>
246
+ </head>
247
+ <body>
248
+ <div id="app">
249
+ <header>
250
+ <div class="logo">🐱</div>
251
+ <div>
252
+ <h1>Kitten Chat</h1>
253
+ <span>Gemma 3 Β· Kokoro TTS Β· voice: kiki</span>
254
+ </div>
255
+ <div class="status"><div class="dot"></div>local</div>
256
+ </header>
257
+
258
+ <div id="messages">
259
+ <div class="empty" id="empty-state">
260
+ <div class="big">✨</div>
261
+ <p>Send a message to start chatting. Replies include voice audio.</p>
262
+ </div>
263
+ </div>
264
+
265
+ <div id="input-bar">
266
+ <textarea id="user-input" placeholder="Ask anything…" rows="1"></textarea>
267
+ <button id="send-btn" title="Send">
268
+ <svg viewBox="0 0 24 24"><path d="M2.01 21L23 12 2.01 3 2 10l15 2-15 2z"/></svg>
269
+ </button>
270
+ </div>
271
+ <div class="footnote">running locally Β· gemma-3-270m Β· kokoro kiki voice</div>
272
+ </div>
273
+
274
+ <script>
275
+ const messagesEl = document.getElementById('messages');
276
+ const inputEl = document.getElementById('user-input');
277
+ const sendBtn = document.getElementById('send-btn');
278
+ const emptyState = document.getElementById('empty-state');
279
+
280
+ let history = [];
281
+
282
+ /* auto-resize textarea */
283
+ inputEl.addEventListener('input', () => {
284
+ inputEl.style.height = 'auto';
285
+ inputEl.style.height = Math.min(inputEl.scrollHeight, 140) + 'px';
286
+ });
287
+
288
+ /* send on Enter (Shift+Enter = newline) */
289
+ inputEl.addEventListener('keydown', e => {
290
+ if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); send(); }
291
+ });
292
+ sendBtn.addEventListener('click', send);
293
+
294
+ function scrollBottom() {
295
+ messagesEl.scrollTop = messagesEl.scrollHeight;
296
+ }
297
+
298
+ function addMessage(role, text, audioB64) {
299
+ if (emptyState) emptyState.remove();
300
+
301
+ const wrap = document.createElement('div');
302
+ wrap.className = `msg ${role}`;
303
+
304
+ const avatar = document.createElement('div');
305
+ avatar.className = 'avatar';
306
+ avatar.textContent = role === 'user' ? 'πŸ§‘' : 'πŸ€–';
307
+
308
+ const inner = document.createElement('div');
309
+ const bubble = document.createElement('div');
310
+ bubble.className = 'bubble';
311
+ bubble.textContent = text;
312
+ inner.appendChild(bubble);
313
+
314
+ if (audioB64 && role === 'ai') {
315
+ const audioRow = document.createElement('div');
316
+ audioRow.className = 'audio-row';
317
+ const audioEl = document.createElement('audio');
318
+ audioEl.controls = true;
319
+ audioEl.autoplay = true;
320
+ audioEl.src = 'data:audio/wav;base64,' + audioB64;
321
+ audioRow.appendChild(audioEl);
322
+ inner.appendChild(audioRow);
323
+ }
324
+
325
+ wrap.appendChild(avatar);
326
+ wrap.appendChild(inner);
327
+ messagesEl.appendChild(wrap);
328
+ scrollBottom();
329
+ }
330
+
331
+ function addTyping() {
332
+ const wrap = document.createElement('div');
333
+ wrap.className = 'msg ai';
334
+ wrap.id = 'typing-indicator';
335
+
336
+ const avatar = document.createElement('div');
337
+ avatar.className = 'avatar';
338
+ avatar.textContent = 'πŸ€–';
339
+
340
+ const inner = document.createElement('div');
341
+ const bubble = document.createElement('div');
342
+ bubble.className = 'bubble';
343
+ const t = document.createElement('div');
344
+ t.className = 'typing';
345
+ t.innerHTML = '<span></span><span></span><span></span>';
346
+ bubble.appendChild(t);
347
+ inner.appendChild(bubble);
348
+
349
+ wrap.appendChild(avatar);
350
+ wrap.appendChild(inner);
351
+ messagesEl.appendChild(wrap);
352
+ scrollBottom();
353
+ }
354
+
355
+ function removeTyping() {
356
+ const el = document.getElementById('typing-indicator');
357
+ if (el) el.remove();
358
+ }
359
+
360
+ async function send() {
361
+ const text = inputEl.value.trim();
362
+ if (!text) return;
363
+
364
+ inputEl.value = '';
365
+ inputEl.style.height = 'auto';
366
+ sendBtn.disabled = true;
367
+
368
+ addMessage('user', text);
369
+ history.push({ role: 'user', content: text });
370
+ addTyping();
371
+
372
+ try {
373
+ const res = await fetch('/api/chat', {
374
+ method: 'POST',
375
+ headers: { 'Content-Type': 'application/json' },
376
+ body: JSON.stringify({ message: text, history: history.slice(0, -1) }),
377
+ });
378
+ const data = await res.json();
379
+ removeTyping();
380
+
381
+ if (data.error) {
382
+ addMessage('ai', '⚠️ ' + data.error, null);
383
+ } else {
384
+ addMessage('ai', data.text, data.audio);
385
+ history.push({ role: 'assistant', content: data.text });
386
+ }
387
+ } catch (err) {
388
+ removeTyping();
389
+ addMessage('ai', '⚠️ Connection error: ' + err.message, null);
390
+ } finally {
391
+ sendBtn.disabled = false;
392
+ inputEl.focus();
393
+ }
394
+ }
395
+ </script>
396
+ </body>
397
+ </html>"""
398
+
399
+
400
+ @app.route("/")
401
+ def index():
402
+ return Response(HTML, mimetype="text/html")
403
+
404
+
405
+ # ── Entry ────────────────────────────────────────────────────────────────────
406
if __name__ == "__main__":
    # Pre-warm both models in a daemon thread so the first chat request
    # doesn't pay the cold-load cost; warm-up failures are logged, not fatal.
    def _warm_models():
        try:
            get_llm()
            get_tts()
        except Exception as e:
            print(f"[WARM] {e}")

    threading.Thread(target=_warm_models, daemon=True).start()
    app.run(host="0.0.0.0", port=PORT, threaded=True)