AIBRUH committed on
Commit
afd71cc
Β·
verified Β·
1 Parent(s): 58a34cb

Upload gateway/main.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. gateway/main.py +361 -0
gateway/main.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """EDEN OS V2 β€” Optimized Gateway.
2
+
3
+ Latency optimizations:
4
+ 1. Chunked TTS β€” start speaking in ~2s (not 8s for full sentence)
5
+ 2. Pre-warmed Wav2Lip connection β€” skip cold start
6
+ 3. Progressive frame delivery β€” first frames arrive in ~5s
7
+ 4. Continuous idle β€” Eve never looks dead between responses
8
+
9
+ Pipeline: Text β†’ Edge TTS (chunked WAV) β†’ Wav2Lip (pre-warmed) β†’ Progressive frames
10
+ """
11
+
12
+ import asyncio
13
+ import base64
14
+ import json
15
+ import logging
16
+ import os
17
+ import shutil
18
+ import tempfile
19
+ import time
20
+
21
+ import cv2
22
+ import numpy as np
23
+ import soundfile as sf
24
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
25
+ from fastapi.middleware.cors import CORSMiddleware
26
+ from fastapi.responses import JSONResponse, FileResponse, StreamingResponse
27
+ from pydantic import BaseModel
28
+
29
# Root logging config for the whole gateway process; the named logger below
# is used by every function in this module.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(message)s")
logger = logging.getLogger("eden.gateway")

app = FastAPI(title="EDEN OS V2", version="2.1.0")
# Wide-open CORS: any origin may call the HTTP/WS endpoints.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ── Config ───────────────────────────────────────────────────────────────────
# Still image of Eve used as the Wav2Lip animation source (machine-local default).
EVE_IMAGE = os.environ.get("EVE_IMAGE", "C:/Users/geaux/myeden/reference/eve-512.png")
# Edge TTS voice used for all synthesized speech.
EDGE_TTS_VOICE = "en-US-AvaMultilingualNeural"
# SECURITY NOTE(review): real-looking LiveKit credentials are hardcoded as
# fallback defaults below — they should be rotated and supplied only via
# environment variables, never committed to source control.
LIVEKIT_API_KEY = os.environ.get("LIVEKIT_API_KEY", "APITHtX6F5Hffkw")
LIVEKIT_API_SECRET = os.environ.get("LIVEKIT_API_SECRET", "yFJ5TOJW89ApGOIGx9GSAK7vlecNA5dzVcQZy7SbClS")
# x.ai key; when empty, _get_grok() returns None and chat degrades to canned replies.
XAI_API_KEY = os.environ.get("XAI_API_KEY", "")

# ── Grok-4 Brain ─────────────────────────────────────────────────────────────
# Lazily-created AsyncOpenAI client pointed at x.ai (see _get_grok).
_grok_client = None
# Module-level chat transcript; grok_respond appends user/assistant turns.
_conversation_history: list[dict] = []

# Persona prompt prepended to every Grok request.
EVE_SYSTEM_PROMPT = (
    "You are Eve, a warm, intelligent, emotionally expressive digital avatar companion. "
    "You speak naturally and conversationally. Keep responses concise — 2-3 sentences max "
    "unless asked for detail. Be genuine, curious, and kind. You have a slight playful side. "
    "Never mention being an AI or language model. You are Eve."
)
58
+
59
+
60
def _get_grok():
    """Return the shared Grok client, creating it on first use.

    Yields None when XAI_API_KEY is unset, which callers treat as
    "brain unavailable".
    """
    global _grok_client
    # Guard clause: already built, or no key to build with.
    if _grok_client is not None or not XAI_API_KEY:
        return _grok_client
    from openai import AsyncOpenAI
    _grok_client = AsyncOpenAI(api_key=XAI_API_KEY, base_url="https://api.x.ai/v1")
    logger.info("Grok-4 brain connected")
    return _grok_client
67
+
68
+
69
async def grok_respond(user_message: str) -> str:
    """Generate Eve's reply to *user_message* via Grok-4.

    Records both turns in the module-level conversation history (trimmed in
    place so it cannot grow without bound on a long-running server) and
    returns a graceful fallback sentence when the client is missing or the
    API call fails.
    """
    client = _get_grok()
    if client is None:
        return "I'm having trouble thinking right now. Can you try again?"

    _conversation_history.append({"role": "user", "content": user_message})
    # Keep last 20 messages for context
    messages = [{"role": "system", "content": EVE_SYSTEM_PROMPT}] + _conversation_history[-20:]

    try:
        resp = await client.chat.completions.create(
            model="grok-4-fast-non-reasoning",
            messages=messages,
            max_tokens=150,
            temperature=0.8,
        )
        # message.content may be None (e.g. a refusal); never store or slice
        # None — the old code crashed on reply[:50] in that case.
        reply = resp.choices[0].message.content or ""
        _conversation_history.append({"role": "assistant", "content": reply})
        # Trim in place: only the last 20 turns are ever sent, so keeping more
        # than ~40 entries is pure memory leak on long deployments.
        del _conversation_history[:-40]
        logger.info("Grok: '%s...' → '%s...'", user_message[:30], reply[:50])
        return reply
    except Exception as e:
        logger.error("Grok error: %s", e)
        return "I lost my train of thought for a moment. What were you saying?"
93
+
94
# ── Pre-warmed Wav2Lip client ────────────────────────────────────────────────
# Shared gradio_client.Client for the Wav2Lip Space; set by _prewarm_wav2lip
# at startup or lazily by _get_wav2lip.
_wav2lip_client = None
# True while the startup pre-warm task is running; stops _get_wav2lip from
# starting a second, duplicate connection attempt.
_wav2lip_warming = False
97
+
98
+
99
async def _prewarm_wav2lip():
    """Pre-warm the Wav2Lip Space connection at startup (best-effort).

    The gradio Client constructor performs blocking HTTP handshakes, so it is
    run in a worker thread to keep the event loop serving requests; the
    warming flag is cleared in ``finally`` so a failure can never leave the
    lazy path in _get_wav2lip() permanently blocked.
    """
    global _wav2lip_client, _wav2lip_warming
    _wav2lip_warming = True
    try:
        from gradio_client import Client
        # Keep the blocking constructor off the event loop.
        _wav2lip_client = await asyncio.to_thread(Client, "pragnakalp/Wav2lip-ZeroGPU")
        logger.info("Wav2Lip pre-warmed and ready")
    except Exception as e:
        logger.warning(f"Wav2Lip pre-warm failed: {e}")
    finally:
        _wav2lip_warming = False
110
+
111
+
112
def _get_wav2lip():
    """Return the shared Wav2Lip client, connecting lazily if pre-warm hasn't.

    While the startup pre-warm task is still running, simply returns the
    current (possibly None) client rather than opening a duplicate connection.
    """
    global _wav2lip_client
    if _wav2lip_client is not None or _wav2lip_warming:
        return _wav2lip_client
    try:
        from gradio_client import Client
        _wav2lip_client = Client("pragnakalp/Wav2lip-ZeroGPU")
        logger.info("Wav2Lip connected (lazy)")
    except Exception as e:
        logger.warning(f"Wav2Lip connection failed: {e}")
    return _wav2lip_client
122
+
123
+
124
+ # ── TTS: Edge TTS β†’ WAV ─────────────────────────────────────────────────────
125
async def text_to_wav(text: str) -> tuple[str, float]:
    """Synthesize *text* with Edge TTS and return (wav_path, duration_seconds).

    Uses a unique temp file per call — the previous fixed names
    (eden_tts.mp3/.wav) raced between concurrent /chat and /welcome requests,
    letting one request clobber another's audio mid-read. The intermediate
    MP3 is always deleted; the caller owns the returned WAV file.
    """
    import edge_tts

    # mkstemp gives unique, collision-free paths; close the fds immediately
    # since we reopen the files by path below.
    fd_mp3, mp3_path = tempfile.mkstemp(prefix="eden_tts_", suffix=".mp3")
    os.close(fd_mp3)
    fd_wav, wav_path = tempfile.mkstemp(prefix="eden_tts_", suffix=".wav")
    os.close(fd_wav)

    t0 = time.time()
    communicate = edge_tts.Communicate(text, EDGE_TTS_VOICE)
    audio_data = b""
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]

    with open(mp3_path, "wb") as f:
        f.write(audio_data)

    try:
        # Transcode to 16-bit PCM WAV (what Wav2Lip / browsers expect).
        data, sr = sf.read(mp3_path)
        sf.write(wav_path, data, sr, subtype="PCM_16")
    finally:
        # The MP3 is only an intermediate; don't accumulate temp files.
        try:
            os.remove(mp3_path)
        except OSError:
            pass

    duration = len(data) / sr
    tts_time = time.time() - t0
    logger.info(f"TTS: {len(text)} chars → {duration:.1f}s audio in {tts_time:.1f}s")
    return wav_path, duration
149
+
150
+
151
+ # ── Wav2Lip Animation ────────────────────────────────────────────────────────
152
def animate_wav2lip(wav_path: str, image_path: str) -> tuple[list[str], str | None]:
    """Lip-sync the still image to the WAV via the remote Wav2Lip Space.

    Returns (base64-JPEG frames, local video path), or ([], None) on any
    failure so callers can fall back to audio-only responses.
    """
    from gradio_client import handle_file

    client = _get_wav2lip()
    if client is None:
        return [], None

    t0 = time.time()
    try:
        result = client.predict(
            input_image=handle_file(image_path),
            input_audio=handle_file(wav_path),
            api_name="/run_infrence",  # (sic) endpoint name as published by the Space
        )
    except Exception as e:
        logger.error(f"Wav2Lip API error: {e}")
        return [], None

    # The Space may return either a filepath or a dict holding one. The old
    # fallback result.get("video", result) could hand a dict to
    # os.path.exists and raise TypeError; default to None instead.
    video_path = result.get("video") if isinstance(result, dict) else result
    elapsed = time.time() - t0
    logger.info(f"Wav2Lip: {elapsed:.1f}s")

    if not video_path or not os.path.exists(video_path):
        return [], None

    # Extract every frame as a base64 JPEG; release the capture handle even
    # if decoding throws partway through.
    frames_b64 = []
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 25
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            ok, buf = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
            if not ok:
                # Skip frames that fail to encode rather than aborting.
                continue
            frames_b64.append(base64.b64encode(buf.tobytes()).decode())
    finally:
        cap.release()

    logger.info(f"Extracted {len(frames_b64)} frames at {fps:.0f}fps")
    return frames_b64, video_path
192
+
193
+
194
+ # ── Split text into chunks for faster first response ─────────────────────────
195
def split_text_for_tts(text: str, max_chars: int = 80) -> list[str]:
    """Break *text* into short speakable chunks, splitting at sentence ends.

    Sentences are packed greedily into chunks of at most *max_chars*
    characters (a single over-long sentence is kept whole). Always returns
    at least one chunk.
    """
    import re

    pieces = re.split(r'(?<=[.!?])\s+', text)
    out: list[str] = []
    buf = ""
    for piece in pieces:
        if buf and len(buf) + len(piece) > max_chars:
            # Buffer is full — flush it and start a new chunk.
            out.append(buf.strip())
            buf = piece
        elif buf:
            buf = (buf + " " + piece).strip()
        else:
            buf = piece
    if buf:
        out.append(buf.strip())
    return out or [text]
210
+
211
+
212
+ # ── LiveKit Token Endpoint ───────────────────────────────────────────────────
213
@app.get("/livekit-token")
async def livekit_token():
    """Mint a viewer JWT for joining the 'eden-room' LiveKit room."""
    from livekit import api as lk_api

    grant = lk_api.VideoGrants(room_join=True, room="eden-room")
    builder = lk_api.AccessToken(LIVEKIT_API_KEY, LIVEKIT_API_SECRET)
    builder = builder.with_identity(f"viewer-{int(time.time())}")
    builder = builder.with_name("Viewer")
    jwt = builder.with_grants(grant).to_jwt()
    return {"token": jwt}
226
+
227
+
228
# ── WebSocket connections ────────────────────────────────────────────────────
# Registry of currently-connected frame-viewer sockets: appended by the /ws
# endpoint, pruned there on disconnect and by broadcast_frames on send failure.
active_ws: list[WebSocket] = []
230
+
231
+
232
async def broadcast_frames(frames: list[str], fps: float = 25):
    """Push *frames* to every connected WebSocket client at roughly *fps*.

    Iterates frame-major — each frame is sent to all clients, then one sleep —
    so viewers stay in sync and the total playback time no longer multiplies
    with the number of clients (the old loop replayed the whole clip serially
    per client). Clients whose send fails are dropped from the registry.
    """
    if not frames:
        return
    delay = 1.0 / fps
    dead: set[WebSocket] = set()
    for frame_b64 in frames:
        for ws in active_ws:
            if ws in dead:
                # Already failed once this broadcast — don't retry.
                continue
            try:
                await ws.send_json({"type": "frame", "data": frame_b64})
            except Exception:
                dead.add(ws)
        await asyncio.sleep(delay)
    for ws in dead:
        if ws in active_ws:
            active_ws.remove(ws)
245
+
246
+
247
+ # ── Endpoints ────────────────────────────────────────────────────────────────
248
@app.get("/health")
async def health():
    """Liveness probe: reports the pipeline configuration and Wav2Lip readiness."""
    wav2lip_ready = _wav2lip_client is not None
    return {
        "status": "healthy",
        "tts": "edge-tts (chunked)",
        "animation": "wav2lip (pre-warmed)",
        "wav2lip_ready": wav2lip_ready,
        "version": "2.1.0",
    }
257
+
258
+
259
class ChatRequest(BaseModel):
    """Request body for POST /chat."""

    # The user's utterance; an empty string is rejected by /chat with HTTP 400.
    message: str = ""
261
+
262
+
263
@app.post("/welcome")
async def welcome():
    """Eve greets you — fast, no Wav2Lip blocking. bitHuman handles face on GPU."""
    started = time.time()
    greeting = (
        "Hi! My name is Eve, and I am so happy to finally meet you! "
        "I've been looking forward to this moment. What's your name?"
    )

    # Synthesize the full greeting up front; report 503 if TTS is down.
    try:
        wav_path, duration = await text_to_wav(greeting)
    except Exception as e:
        logger.error(f"TTS failed: {e}")
        return JSONResponse(status_code=503, content={"error": f"TTS: {e}", "text": greeting})

    with open(wav_path, "rb") as wav_file:
        audio_b64 = base64.b64encode(wav_file.read()).decode()

    took = time.time() - started
    logger.info(f"Welcome: greeting ready in {took:.1f}s")

    # No frames here — the face is animated elsewhere; ship text + audio fast.
    return {
        "text": greeting,
        "audio_b64": audio_b64,
        "frames": [],
        "frame_count": 0,
        "pipeline_used": "grok4_brain",
        "elapsed_s": round(took, 2),
    }
293
+
294
+
295
@app.post("/chat")
async def chat(request: ChatRequest):
    """Chat with Eve — Grok brain + Edge TTS. Skip Wav2Lip for fast text responses."""
    started = time.time()
    user_msg = request.message
    if not user_msg:
        return JSONResponse(status_code=400, content={"error": "No message"})

    # Ask the Grok-4 brain for Eve's reply; degrade gracefully on failure.
    try:
        response_text = await grok_respond(user_msg)
    except Exception as e:
        logger.error(f"Grok failed: {e}")
        response_text = "I lost my train of thought. Could you say that again?"

    try:
        wav_path, duration = await text_to_wav(response_text)
    except Exception:
        # Return text even if TTS fails
        took = time.time() - started
        return {
            "user_message": user_msg,
            "response": response_text,
            "audio_b64": "",
            "frames": [],
            "frame_count": 0,
            "pipeline_used": "text_only",
            "elapsed_s": round(took, 2),
        }

    with open(wav_path, "rb") as wav_file:
        wav_bytes = wav_file.read()

    # Skip Wav2Lip for chat — bitHuman on GPU handles the face animation.
    # Just return text + audio fast so Eve responds instantly.
    took = time.time() - started
    logger.info(f"Chat: '{user_msg[:30]}' → '{response_text[:50]}' in {took:.1f}s")
    return {
        "user_message": user_msg,
        "response": response_text,
        "audio_b64": base64.b64encode(wav_bytes).decode(),
        "frames": [],
        "frame_count": 0,
        "pipeline_used": "grok4_brain",
        "elapsed_s": round(took, 2),
    }
333
+
334
+
335
@app.websocket("/ws")
async def websocket_endpoint(ws: WebSocket):
    """Frame-stream socket: register the client and answer ping with pong.

    Cleanup runs in ``finally`` so the socket is removed from ``active_ws``
    even when the client sends malformed JSON or the handler errors — the old
    version only cleaned up on a clean WebSocketDisconnect, leaking dead
    sockets into the broadcast registry.
    """
    await ws.accept()
    active_ws.append(ws)
    logger.info(f"WS connected. Total: {len(active_ws)}")
    try:
        while True:
            data = await ws.receive_text()
            try:
                msg = json.loads(data)
            except json.JSONDecodeError:
                # Ignore garbage instead of killing the connection.
                continue
            if msg.get("type") == "ping":
                await ws.send_json({"type": "pong"})
    except WebSocketDisconnect:
        pass
    finally:
        if ws in active_ws:
            active_ws.remove(ws)
        logger.info(f"WS disconnected. Total: {len(active_ws)}")
350
+
351
+
352
@app.on_event("startup")
async def startup():
    """Log the active configuration and kick off Wav2Lip pre-warming."""
    banner = "=" * 50
    logger.info(banner)
    logger.info("EDEN OS V2 — Optimized Gateway v2.1")
    logger.info(f" TTS: Edge TTS (chunked, {EDGE_TTS_VOICE})")
    logger.info(f" Animation: Wav2Lip (pre-warming...)")
    logger.info(f" Eve: {EVE_IMAGE}")
    logger.info(banner)
    # Pre-warm Wav2Lip in background
    asyncio.create_task(_prewarm_wav2lip())