OrbitMC commited on
Commit
6ff3c42
Β·
verified Β·
1 Parent(s): 5edd9d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +396 -300
app.py CHANGED
@@ -1,320 +1,416 @@
1
  """
2
- J.A.R.V.I.S β€” FastAPI backend
3
- Model is loaded ONCE at startup and kept in RAM for instant responses.
 
 
 
4
  """
5
 
6
- import os
7
- import json
8
- import time
9
- import warnings
10
- import asyncio
11
- from pathlib import Path
12
- from contextlib import asynccontextmanager
13
-
14
- import uvicorn
15
- from fastapi import FastAPI, HTTPException
16
- from fastapi.staticfiles import StaticFiles
17
- from fastapi.responses import HTMLResponse, StreamingResponse, FileResponse
18
- from pydantic import BaseModel
19
-
20
- warnings.filterwarnings("ignore")
21
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
22
- os.environ["HF_HOME"] = "/app/cache"
23
- os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
24
- os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/app/cache"
25
-
26
- # ── Paths ──
27
- VECTOR_DIR = Path("/app/database/vector_store")
28
- LEARN_DIR = Path("/app/database/learning_data")
29
- CHATS_DIR = Path("/app/database/chats_data")
30
- CACHE_DIR = Path("/app/cache")
31
-
32
- for d in [VECTOR_DIR, LEARN_DIR, CHATS_DIR]:
33
- d.mkdir(parents=True, exist_ok=True)
34
-
35
- # ── Global model holders (loaded once, never reloaded) ──
36
- LLM = None
37
- RETRIEVER = None
38
- TTS = None
39
- TTS_OK = False
40
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  SYSTEM_PROMPT = (
42
- "You are J.A.R.V.I.S, a concise and intelligent AI assistant. "
43
- "Always reply in 1–2 short, direct sentences. "
44
- "Never use emojis, markdown, asterisks, or filler phrases. "
45
- "Be helpful, precise, and slightly formal."
46
  )
47
 
48
-
49
- # ══════════════════════════════════════════
50
- # STARTUP β€” load everything into RAM once
51
- # ══════════════════════════════════════════
52
- @asynccontextmanager
53
- async def lifespan(app: FastAPI):
54
- global LLM, RETRIEVER, TTS, TTS_OK
55
-
56
- print("=" * 55)
57
- print(" J.A.R.V.I.S β€” starting up")
58
- print("=" * 55)
59
-
60
- # 1. Vector store / embeddings
61
- print("[1/3] Loading embeddings & vector store...", flush=True)
62
- from langchain_huggingface import HuggingFaceEmbeddings
63
- from langchain_community.vectorstores import FAISS
64
- from langchain_text_splitters import RecursiveCharacterTextSplitter
65
- from langchain_core.documents import Document
66
-
67
- embeddings = HuggingFaceEmbeddings(
68
- model_name="sentence-transformers/all-MiniLM-L6-v2",
69
- model_kwargs={"device": "cpu"},
70
- cache_folder=str(CACHE_DIR),
71
- )
72
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
73
-
74
- def _load_docs():
75
- docs = []
76
- for f in LEARN_DIR.glob("*.txt"):
77
- try:
78
- docs.append(Document(page_content=f.read_text(errors="ignore"),
79
- metadata={"source": f.name}))
80
- except Exception:
81
- pass
82
- for f in CHATS_DIR.glob("*.json"):
83
- try:
84
- data = json.loads(f.read_text(errors="ignore"))
85
- content = "\n".join(
86
- f"{m['role']}: {m['content']}"
87
- for m in data.get("messages", [])
88
- if isinstance(m, dict) and "role" in m and "content" in m
89
- )
90
- if content.strip():
91
- docs.append(Document(page_content=content,
92
- metadata={"source": f.name}))
93
- except Exception:
94
- pass
95
- return docs
96
-
97
- index_file = VECTOR_DIR / "index.faiss"
98
- if index_file.exists():
99
- try:
100
- vs = FAISS.load_local(str(VECTOR_DIR), embeddings,
101
- allow_dangerous_deserialization=True)
102
- print(" Vector store loaded from disk.")
103
- except Exception:
104
- vs = None
105
-
106
- if not index_file.exists() or vs is None:
107
- docs = _load_docs() or [Document(page_content="No data yet.")]
108
- chunks = splitter.split_documents(docs)
109
- vs = FAISS.from_documents(chunks, embeddings)
110
- vs.save_local(str(VECTOR_DIR))
111
- print(" Vector store built and saved.")
112
-
113
- RETRIEVER = vs.as_retriever(search_kwargs={"k": 3})
114
- print(" βœ“ Vector store ready")
115
-
116
- # 2. LLM β€” loaded into RAM, stays there forever
117
- print("[2/3] Loading LLM into RAM (model pre-cached in image)...", flush=True)
118
- from huggingface_hub import hf_hub_download
119
- from llama_cpp import Llama
120
-
121
- model_path = hf_hub_download(
122
- repo_id="unsloth/Qwen3.5-0.8B-GGUF",
123
- filename="Qwen3.5-0.8B-UD-Q2_K_XL.gguf",
124
- cache_dir=str(CACHE_DIR),
125
- local_files_only=True, # ← never re-download; use baked image cache
126
- )
127
-
128
- LLM = Llama(
129
- model_path=model_path,
130
- n_ctx=2048,
131
- n_threads=os.cpu_count() or 4,
132
- n_batch=512, # larger batch = faster prompt processing
133
- use_mmap=True, # memory-map the file β€” fastest cold load on CPU
134
- use_mlock=True, # lock pages in RAM β€” prevents swap thrashing
135
- verbose=False,
136
- )
137
- print(" βœ“ LLM ready")
138
-
139
- # 3. TTS (optional)
140
- print("[3/3] Loading TTS...", flush=True)
141
- try:
142
- from kittentts import KittenTTS
143
- TTS = KittenTTS("KittenML/kitten-tts-nano-0.8-fp32")
144
- TTS_OK = True
145
- print(" βœ“ TTS ready (Kiki)")
146
- except Exception as e:
147
- print(f" ⚠ TTS unavailable: {e}")
148
-
149
- print("\n βœ“ ALL SYSTEMS ONLINE β€” serving on :7860\n")
150
- yield
151
-
152
- # Shutdown
153
- print("J.A.R.V.I.S: shutting down.")
154
-
155
-
156
- # ══════════════════════════════════════════
157
- # APP
158
- # ══════════════════════════════════════════
159
- app = FastAPI(title="J.A.R.V.I.S", lifespan=lifespan)
160
- app.mount("/static", StaticFiles(directory="static"), name="static")
161
 
162
 
163
- # ── Request / response schemas ──
164
- class ChatRequest(BaseModel):
165
- message: str
166
- history: list[list[str]] = [] # [[user, assistant], ...]
 
 
 
 
167
 
168
- class ChatResponse(BaseModel):
169
- reply: str
170
 
 
171
 
172
- # ── Routes ──
173
- @app.get("/", response_class=HTMLResponse)
174
- async def root():
175
- return FileResponse("static/index.html")
 
176
 
 
 
177
 
178
- @app.get("/health")
179
- async def health():
180
- return {"status": "ok", "llm": LLM is not None, "tts": TTS_OK}
181
 
182
-
183
- @app.post("/chat", response_model=ChatResponse)
184
- async def chat(req: ChatRequest):
185
- if LLM is None:
186
- raise HTTPException(503, "Model not ready yet")
187
-
188
- # Retrieve context
189
- context = ""
190
  try:
191
- docs = RETRIEVER.invoke(req.message)
192
- context = "\n".join(d.page_content for d in docs)
193
- except Exception:
194
- pass
195
-
196
- # Build messages
197
- system = SYSTEM_PROMPT
198
- if context.strip():
199
- system += f"\n\nBackground context (use only if relevant):\n{context}"
200
-
201
- messages = [{"role": "system", "content": system}]
202
- for turn in req.history[-4:]:
203
- if len(turn) == 2:
204
- messages.append({"role": "user", "content": turn[0]})
205
- messages.append({"role": "assistant", "content": turn[1]})
206
- messages.append({"role": "user", "content": req.message})
207
-
208
- # Generate (run in thread so we don't block the event loop)
209
- loop = asyncio.get_event_loop()
210
-
211
- def _generate():
212
- result = LLM.create_chat_completion(
213
- messages=messages,
214
- max_tokens=150,
215
- temperature=0.65,
216
- top_p=0.9,
217
  repeat_penalty=1.1,
218
- stream=False,
219
  )
220
- return result["choices"][0]["message"]["content"].strip()
221
-
222
- reply = await loop.run_in_executor(None, _generate)
223
- return ChatResponse(reply=reply)
224
-
225
-
226
- @app.post("/chat/stream")
227
- async def chat_stream(req: ChatRequest):
228
- """Server-Sent Events streaming endpoint."""
229
- if LLM is None:
230
- raise HTTPException(503, "Model not ready yet")
231
 
232
- context = ""
233
  try:
234
- docs = RETRIEVER.invoke(req.message)
235
- context = "\n".join(d.page_content for d in docs)
236
- except Exception:
237
- pass
238
-
239
- system = SYSTEM_PROMPT
240
- if context.strip():
241
- system += f"\n\nBackground context:\n{context}"
242
-
243
- messages = [{"role": "system", "content": system}]
244
- for turn in req.history[-4:]:
245
- if len(turn) == 2:
246
- messages.append({"role": "user", "content": turn[0]})
247
- messages.append({"role": "assistant", "content": turn[1]})
248
- messages.append({"role": "user", "content": req.message})
249
-
250
- async def event_gen():
251
- loop = asyncio.get_event_loop()
252
- queue = asyncio.Queue()
253
-
254
- def _stream():
255
- for chunk in LLM.create_chat_completion(
256
- messages=messages,
257
- max_tokens=150,
258
- temperature=0.65,
259
- top_p=0.9,
260
- repeat_penalty=1.1,
261
- stream=True,
262
- ):
263
- piece = chunk["choices"][0].get("delta", {}).get("content", "")
264
- if piece:
265
- asyncio.run_coroutine_threadsafe(queue.put(piece), loop)
266
- asyncio.run_coroutine_threadsafe(queue.put(None), loop) # sentinel
267
-
268
- loop.run_in_executor(None, _stream)
269
-
270
- while True:
271
- piece = await queue.get()
272
- if piece is None:
273
- yield "data: [DONE]\n\n"
274
- break
275
- yield f"data: {json.dumps(piece)}\n\n"
276
-
277
- return StreamingResponse(event_gen(), media_type="text/event-stream")
278
-
279
-
280
- @app.post("/tts")
281
- async def tts_endpoint(body: dict):
282
- """Return raw PCM audio bytes for the given text."""
283
- if not TTS_OK:
284
- raise HTTPException(503, "TTS not available")
285
- text = body.get("text", "").strip()
286
- if not text:
287
- raise HTTPException(400, "No text provided")
288
-
289
- loop = asyncio.get_event_loop()
290
-
291
- def _speak():
292
- return TTS.generate(text, voice="Kiki")
293
-
294
- audio_bytes = await loop.run_in_executor(None, _speak)
295
- return StreamingResponse(iter([bytes(audio_bytes)]),
296
- media_type="audio/wav")
297
-
298
-
299
- @app.post("/save")
300
- async def save_chat(body: dict):
301
- history = body.get("history", [])
302
- if not history:
303
- return {"saved": False}
304
- path = CHATS_DIR / f"session_{int(time.time())}.json"
305
- messages = []
306
- for turn in history:
307
- if len(turn) == 2:
308
- messages.append({"role": "user", "content": turn[0]})
309
- messages.append({"role": "assistant", "content": turn[1]})
310
- path.write_text(json.dumps({"messages": messages}, ensure_ascii=False, indent=2))
311
- return {"saved": True, "path": str(path)}
312
-
313
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  if __name__ == "__main__":
315
- uvicorn.run(
316
- "app:app",
317
- host="0.0.0.0",
318
- port=int(os.environ.get("PORT", 7860)),
319
- log_level="warning",
320
- )
 
 
 
 
 
1
  """
2
+ ChatGPT-style local AI chat with TTS
3
+ - LLM: gemma-3-270m-it-F16.gguf via llama-cpp-python
4
+ - TTS: Kokoro ONNX (af_kore = "kiki" voice)
5
+ - UI: Flask + embedded HTML (no Gradio)
6
+ - Target: HuggingFace Docker Space (free CPU)
7
  """
8
 
9
+ import os, io, base64, json, threading
10
+ import numpy as np
11
+ import soundfile as sf
12
+ from flask import Flask, request, jsonify, Response, stream_with_context
13
+
14
+ # ── Paths ────────────────────────────────────────────────────────────────────
15
+ MODEL_PATH = os.environ.get("MODEL_PATH", "/app/models/gemma-3-270m-it-F16.gguf")
16
+ ONNX_MODEL = os.environ.get("ONNX_MODEL", "/app/models/kokoro-v1.0.int8.onnx")
17
+ VOICES_BIN = os.environ.get("VOICES_BIN", "/app/models/voices-v1.0.bin")
18
+ TTS_VOICE = os.environ.get("TTS_VOICE", "af_kore") # closest to "kiki"
19
+ PORT = int(os.environ.get("PORT", 7860))
20
+
21
+ # ── Lazy-load LLM & TTS (init once, reuse) ───────────────────────────────────
22
+ _llm_lock = threading.Lock()
23
+ _llm = None
24
+
25
def get_llm():
    """Return the process-wide Llama instance, loading it on first use.

    Thread-safe lazy initialization: the unlocked fast path returns the
    cached model, and the lock plus re-check guarantee that concurrent
    first callers trigger exactly one load.
    """
    global _llm
    if _llm is not None:
        return _llm
    with _llm_lock:
        if _llm is None:
            from llama_cpp import Llama
            print(f"[LLM] Loading {MODEL_PATH} …")
            _llm = Llama(
                model_path=MODEL_PATH,
                n_ctx=2048,
                n_threads=os.cpu_count() or 4,
                verbose=False,
            )
            print("[LLM] Ready.")
    return _llm
40
+
41
+ _tts_lock = threading.Lock()
42
+ _tts = None
43
+
44
def get_tts():
    """Return the process-wide Kokoro TTS engine, loading it on first use.

    Mirrors get_llm(): an unlocked fast path, then double-checked
    locking so only one thread ever constructs the engine.
    """
    global _tts
    if _tts is not None:
        return _tts
    with _tts_lock:
        if _tts is None:
            from kokoro_onnx import Kokoro
            print(f"[TTS] Loading {ONNX_MODEL} …")
            _tts = Kokoro(ONNX_MODEL, VOICES_BIN)
            print("[TTS] Ready.")
    return _tts
54
+
55
+ # ── Flask app ─────────────────────────────────────────────────────────────────
56
+ app = Flask(__name__)
57
+
58
+ # ── Helpers ───────────────────────────────────────────────────────────────────
59
  SYSTEM_PROMPT = (
60
+ "You are a friendly, knowledgeable AI assistant. "
61
+ "Keep responses clear and concise."
 
 
62
  )
63
 
64
def build_messages(
    history: list[dict], user_msg: str, *, max_turns: int = 10
) -> list[dict]:
    """Assemble the chat-completion message list for the LLM.

    Args:
        history: Prior turns as ``{"role": ..., "content": ...}`` dicts.
            This arrives straight from client JSON, so malformed entries
            (non-dicts, or dicts missing either key) are skipped instead
            of raising ``KeyError`` as the previous implementation did.
        user_msg: The new user message, appended last.
        max_turns: Number of trailing history turns kept for context
            (defaults to the previously hard-coded 10).

    Returns:
        Messages beginning with the system prompt, ending with the user
        message.
    """
    msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
    for turn in history[-max_turns:]:
        # Validate untrusted client input rather than indexing blindly.
        if isinstance(turn, dict) and "role" in turn and "content" in turn:
            msgs.append({"role": turn["role"], "content": turn["content"]})
    msgs.append({"role": "user", "content": user_msg})
    return msgs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
 
72
def text_to_wav_b64(text: str) -> str:
    """Synthesize *text* with Kokoro and return it as base64-encoded WAV."""
    samples, sample_rate = get_tts().create(
        text, voice=TTS_VOICE, speed=1.0, lang="en-us"
    )
    wav_buf = io.BytesIO()
    sf.write(wav_buf, samples, sample_rate, format="WAV")
    return base64.b64encode(wav_buf.getvalue()).decode()
80
 
 
 
81
 
82
+ # ── API routes ────────────────────────────────────────────────────────────────
83
 
84
+ @app.route("/api/chat", methods=["POST"])
85
+ def chat():
86
+ data = request.get_json(force=True)
87
+ user_msg = data.get("message", "").strip()
88
+ history = data.get("history", [])
89
 
90
+ if not user_msg:
91
+ return jsonify({"error": "empty message"}), 400
92
 
93
+ llm = get_llm()
94
+ msgs = build_messages(history, user_msg)
 
95
 
 
 
 
 
 
 
 
 
96
  try:
97
+ resp = llm.create_chat_completion(
98
+ messages=msgs,
99
+ max_tokens=512,
100
+ temperature=0.7,
101
+ top_p=0.95,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  repeat_penalty=1.1,
 
103
  )
104
+ assistant_text = resp["choices"][0]["message"]["content"].strip()
105
+ except Exception as e:
106
+ return jsonify({"error": str(e)}), 500
 
 
 
 
 
 
 
 
107
 
108
+ # TTS
109
  try:
110
+ audio_b64 = text_to_wav_b64(assistant_text)
111
+ except Exception as e:
112
+ print(f"[TTS] Warning: {e}")
113
+ audio_b64 = None
114
+
115
+ return jsonify({
116
+ "text": assistant_text,
117
+ "audio": audio_b64, # base64 WAV or null
118
+ })
119
+
120
+
121
+ @app.route("/api/health")
122
+ def health():
123
+ return jsonify({"status": "ok", "voice": TTS_VOICE})
124
+
125
+
126
+ # ── Single-file HTML UI ──────────────────────────────────────────────────────
127
+ HTML = r"""<!DOCTYPE html>
128
+ <html lang="en">
129
+ <head>
130
+ <meta charset="UTF-8">
131
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
132
+ <title>Kitten Chat</title>
133
+ <link rel="preconnect" href="https://fonts.googleapis.com">
134
+ <link href="https://fonts.googleapis.com/css2?family=Sora:wght@300;400;500;600&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
135
+ <style>
136
+ :root {
137
+ --bg: #0d0f14;
138
+ --surface: #161923;
139
+ --border: #252a36;
140
+ --accent: #a78bfa;
141
+ --accent2: #f0abfc;
142
+ --text: #e8eaf0;
143
+ --muted: #6b7280;
144
+ --user-bg: #1e1b4b;
145
+ --ai-bg: #161923;
146
+ --radius: 14px;
147
+ --glow: 0 0 18px rgba(167,139,250,.25);
148
+ }
149
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0 }
150
+ html, body { height: 100%; background: var(--bg); color: var(--text);
151
+ font-family: 'Sora', sans-serif; overflow: hidden }
152
+
153
+ /* ── layout ── */
154
+ #app { display: flex; flex-direction: column; height: 100vh; max-width: 860px;
155
+ margin: 0 auto; padding: 0 16px }
156
+
157
+ /* ── header ── */
158
+ header { display: flex; align-items: center; gap: 12px;
159
+ padding: 18px 0 14px; border-bottom: 1px solid var(--border) }
160
+ .logo { width: 38px; height: 38px; border-radius: 50%;
161
+ background: linear-gradient(135deg,#a78bfa,#f0abfc);
162
+ display: flex; align-items: center; justify-content: center;
163
+ font-size: 18px; box-shadow: var(--glow) }
164
+ header h1 { font-size: 1.05rem; font-weight: 600; letter-spacing: .3px }
165
+ header span { font-size: .75rem; color: var(--muted); display: block;
166
+ font-weight: 300 }
167
+ .status { margin-left: auto; display: flex; align-items: center; gap: 6px;
168
+ font-size: .72rem; color: var(--muted) }
169
+ .dot { width: 7px; height: 7px; border-radius: 50%; background: #34d399 }
170
+
171
+ /* ── messages ── */
172
+ #messages { flex: 1; overflow-y: auto; padding: 20px 0;
173
+ display: flex; flex-direction: column; gap: 16px; scroll-behavior: smooth }
174
+ #messages::-webkit-scrollbar { width: 4px }
175
+ #messages::-webkit-scrollbar-track { background: transparent }
176
+ #messages::-webkit-scrollbar-thumb { background: var(--border); border-radius: 4px }
177
+
178
+ .msg { display: flex; gap: 10px; max-width: 82%; animation: fadeUp .25s ease }
179
+ .msg.user { align-self: flex-end; flex-direction: row-reverse }
180
+ .msg.ai { align-self: flex-start }
181
+
182
+ @keyframes fadeUp {
183
+ from { opacity: 0; transform: translateY(8px) }
184
+ to { opacity: 1; transform: translateY(0) }
185
+ }
186
+
187
+ .avatar { width: 32px; height: 32px; border-radius: 50%; flex-shrink: 0;
188
+ display: flex; align-items: center; justify-content: center; font-size: 14px }
189
+ .msg.user .avatar { background: var(--user-bg); border: 1px solid #4338ca }
190
+ .msg.ai .avatar { background: linear-gradient(135deg,#a78bfa22,#f0abfc22);
191
+ border: 1px solid var(--border) }
192
+
193
+ .bubble { padding: 11px 15px; border-radius: var(--radius); font-size: .88rem;
194
+ line-height: 1.6; word-break: break-word }
195
+ .msg.user .bubble { background: var(--user-bg);
196
+ border-bottom-right-radius: 4px }
197
+ .msg.ai .bubble { background: var(--ai-bg); border: 1px solid var(--border);
198
+ border-bottom-left-radius: 4px }
199
+
200
+ /* ── audio player ── */
201
+ .audio-row { margin-top: 8px }
202
+ audio { width: 100%; height: 28px; border-radius: 20px;
203
+ accent-color: var(--accent); outline: none }
204
+ audio::-webkit-media-controls-panel { background: #1e2030 }
205
+
206
+ /* ── typing indicator ── */
207
+ .typing { display: flex; gap: 5px; padding: 4px 2px }
208
+ .typing span { width: 7px; height: 7px; border-radius: 50%;
209
+ background: var(--accent); opacity: .4;
210
+ animation: blink 1.2s infinite }
211
+ .typing span:nth-child(2) { animation-delay: .2s }
212
+ .typing span:nth-child(3) { animation-delay: .4s }
213
+ @keyframes blink { 0%,80%,100% { opacity:.4 } 40% { opacity:1 } }
214
+
215
+ /* ── input area ── */
216
+ #input-bar { display: flex; gap: 10px; padding: 14px 0 20px;
217
+ border-top: 1px solid var(--border) }
218
+ #user-input { flex: 1; background: var(--surface); border: 1px solid var(--border);
219
+ color: var(--text); border-radius: var(--radius); padding: 11px 16px;
220
+ font-family: 'Sora', sans-serif; font-size: .88rem; resize: none;
221
+ outline: none; transition: border-color .2s, box-shadow .2s; min-height: 48px;
222
+ max-height: 140px }
223
+ #user-input:focus { border-color: var(--accent); box-shadow: var(--glow) }
224
+ #user-input::placeholder { color: var(--muted) }
225
+
226
+ #send-btn { width: 48px; height: 48px; border-radius: var(--radius);
227
+ background: linear-gradient(135deg,var(--accent),var(--accent2));
228
+ border: none; cursor: pointer; display: flex; align-items: center;
229
+ justify-content: center; transition: opacity .2s, transform .1s;
230
+ flex-shrink: 0 }
231
+ #send-btn:hover { opacity: .85 }
232
+ #send-btn:active { transform: scale(.93) }
233
+ #send-btn:disabled { opacity: .35; cursor: default }
234
+ #send-btn svg { width: 20px; height: 20px; fill: #fff }
235
+
236
+ /* ── footer note ── */
237
+ .footnote { text-align: center; font-size: .68rem; color: var(--muted);
238
+ padding-bottom: 6px; font-family: 'JetBrains Mono', monospace }
239
+
240
+ /* ── empty state ── */
241
+ .empty { flex: 1; display: flex; flex-direction: column; align-items: center;
242
+ justify-content: center; gap: 14px; opacity: .45; user-select: none }
243
+ .empty .big { font-size: 3.5rem }
244
+ .empty p { font-size: .82rem; color: var(--muted) }
245
+ </style>
246
+ </head>
247
+ <body>
248
+ <div id="app">
249
+ <header>
250
+ <div class="logo">🐱</div>
251
+ <div>
252
+ <h1>Kitten Chat</h1>
253
+ <span>Gemma 3 Β· Kokoro TTS Β· voice: kiki</span>
254
+ </div>
255
+ <div class="status"><div class="dot"></div>local</div>
256
+ </header>
257
+
258
+ <div id="messages">
259
+ <div class="empty" id="empty-state">
260
+ <div class="big">✨</div>
261
+ <p>Send a message to start chatting. Replies include voice audio.</p>
262
+ </div>
263
+ </div>
264
+
265
+ <div id="input-bar">
266
+ <textarea id="user-input" placeholder="Ask anything…" rows="1"></textarea>
267
+ <button id="send-btn" title="Send">
268
+ <svg viewBox="0 0 24 24"><path d="M2.01 21L23 12 2.01 3 2 10l15 2-15 2z"/></svg>
269
+ </button>
270
+ </div>
271
+ <div class="footnote">running locally Β· gemma-3-270m Β· kokoro kiki voice</div>
272
+ </div>
273
+
274
+ <script>
275
+ const messagesEl = document.getElementById('messages');
276
+ const inputEl = document.getElementById('user-input');
277
+ const sendBtn = document.getElementById('send-btn');
278
+ const emptyState = document.getElementById('empty-state');
279
+
280
+ let history = [];
281
+
282
+ /* auto-resize textarea */
283
+ inputEl.addEventListener('input', () => {
284
+ inputEl.style.height = 'auto';
285
+ inputEl.style.height = Math.min(inputEl.scrollHeight, 140) + 'px';
286
+ });
287
+
288
+ /* send on Enter (Shift+Enter = newline) */
289
+ inputEl.addEventListener('keydown', e => {
290
+ if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); send(); }
291
+ });
292
+ sendBtn.addEventListener('click', send);
293
+
294
+ function scrollBottom() {
295
+ messagesEl.scrollTop = messagesEl.scrollHeight;
296
+ }
297
+
298
+ function addMessage(role, text, audioB64) {
299
+ if (emptyState) emptyState.remove();
300
+
301
+ const wrap = document.createElement('div');
302
+ wrap.className = `msg ${role}`;
303
+
304
+ const avatar = document.createElement('div');
305
+ avatar.className = 'avatar';
306
+ avatar.textContent = role === 'user' ? 'πŸ§‘' : 'πŸ€–';
307
+
308
+ const inner = document.createElement('div');
309
+ const bubble = document.createElement('div');
310
+ bubble.className = 'bubble';
311
+ bubble.textContent = text;
312
+ inner.appendChild(bubble);
313
+
314
+ if (audioB64 && role === 'ai') {
315
+ const audioRow = document.createElement('div');
316
+ audioRow.className = 'audio-row';
317
+ const audioEl = document.createElement('audio');
318
+ audioEl.controls = true;
319
+ audioEl.autoplay = true;
320
+ audioEl.src = 'data:audio/wav;base64,' + audioB64;
321
+ audioRow.appendChild(audioEl);
322
+ inner.appendChild(audioRow);
323
+ }
324
+
325
+ wrap.appendChild(avatar);
326
+ wrap.appendChild(inner);
327
+ messagesEl.appendChild(wrap);
328
+ scrollBottom();
329
+ }
330
+
331
+ function addTyping() {
332
+ const wrap = document.createElement('div');
333
+ wrap.className = 'msg ai';
334
+ wrap.id = 'typing-indicator';
335
+
336
+ const avatar = document.createElement('div');
337
+ avatar.className = 'avatar';
338
+ avatar.textContent = 'πŸ€–';
339
+
340
+ const inner = document.createElement('div');
341
+ const bubble = document.createElement('div');
342
+ bubble.className = 'bubble';
343
+ const t = document.createElement('div');
344
+ t.className = 'typing';
345
+ t.innerHTML = '<span></span><span></span><span></span>';
346
+ bubble.appendChild(t);
347
+ inner.appendChild(bubble);
348
+
349
+ wrap.appendChild(avatar);
350
+ wrap.appendChild(inner);
351
+ messagesEl.appendChild(wrap);
352
+ scrollBottom();
353
+ }
354
+
355
+ function removeTyping() {
356
+ const el = document.getElementById('typing-indicator');
357
+ if (el) el.remove();
358
+ }
359
+
360
+ async function send() {
361
+ const text = inputEl.value.trim();
362
+ if (!text) return;
363
+
364
+ inputEl.value = '';
365
+ inputEl.style.height = 'auto';
366
+ sendBtn.disabled = true;
367
+
368
+ addMessage('user', text);
369
+ history.push({ role: 'user', content: text });
370
+ addTyping();
371
+
372
+ try {
373
+ const res = await fetch('/api/chat', {
374
+ method: 'POST',
375
+ headers: { 'Content-Type': 'application/json' },
376
+ body: JSON.stringify({ message: text, history: history.slice(0, -1) }),
377
+ });
378
+ const data = await res.json();
379
+ removeTyping();
380
+
381
+ if (data.error) {
382
+ addMessage('ai', '⚠️ ' + data.error, null);
383
+ } else {
384
+ addMessage('ai', data.text, data.audio);
385
+ history.push({ role: 'assistant', content: data.text });
386
+ }
387
+ } catch (err) {
388
+ removeTyping();
389
+ addMessage('ai', '⚠️ Connection error: ' + err.message, null);
390
+ } finally {
391
+ sendBtn.disabled = false;
392
+ inputEl.focus();
393
+ }
394
+ }
395
+ </script>
396
+ </body>
397
+ </html>"""
398
+
399
+
400
+ @app.route("/")
401
+ def index():
402
+ return Response(HTML, mimetype="text/html")
403
+
404
+
405
+ # ── Entry ────────────────────────────────────────────────────────────────────
406
if __name__ == "__main__":
    # Pre-warm both models in a daemon thread so the first chat request
    # doesn't pay the cold-load cost; warm-up failures are logged, not fatal.
    def _warm_models():
        try:
            get_llm()
            get_tts()
        except Exception as e:
            print(f"[WARM] {e}")

    threading.Thread(target=_warm_models, daemon=True).start()
    app.run(host="0.0.0.0", port=PORT, threaded=True)