AIBRUH committed on
Commit
14bbfe2
·
verified ·
1 Parent(s): 195aacf

Upload livekit_eve_bithuman.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. livekit_eve_bithuman.py +319 -0
livekit_eve_bithuman.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """EDEN OS V2 — Eve: bitHuman + Grok Brain + Edge TTS + LiveKit.
2
+
3
+ All-in-one GPU agent: receives chat via LiveKit data channel,
4
+ generates response with Grok-4, synthesizes speech with Edge TTS,
5
+ feeds audio to bitHuman for lip-synced neural rendering,
6
+ streams video+audio back via LiveKit WebRTC.
7
+
8
+ Usage:
9
+ python livekit_eve_bithuman.py
10
+ """
11
+
12
+ import asyncio
13
+ import json
14
+ import logging
15
+ import os
16
+ import tempfile
17
+ import time
18
+
19
+ import cv2
20
+ import numpy as np
21
+ import soundfile as sf
22
+ import livekit.rtc as rtc
23
+ from livekit import api as lk_api
24
+ from bithuman import AsyncBithuman, VideoControl, AudioChunk
25
+
26
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(message)s")
27
+ logger = logging.getLogger("eden.bithuman")
28
+
29
+ # Config
30
+ LIVEKIT_URL = os.environ.get("LIVEKIT_URL", "wss://tall-cotton-nvhnfg10.livekit.cloud")
31
+ LIVEKIT_API_KEY = os.environ.get("LIVEKIT_API_KEY", "APITHtX6F5Hffkw")
32
+ LIVEKIT_API_SECRET = os.environ.get("LIVEKIT_API_SECRET", "yFJ5TOJW89ApGOIGx9GSAK7vlecNA5dzVcQZy7SbClS")
33
+ BITHUMAN_API_SECRET = os.environ.get("BITHUMAN_API_SECRET", "AmiK3xBgSyMFkPlS5mQ3N0CaAjDSoWpjS4l5jzs5ZOylbgoeow9o1mL3R2jZLlPkd")
34
+ EVE_IMAGE = os.environ.get("EVE_IMAGE", "C:/Users/geaux/myeden/reference/eve-512.png")
35
+ XAI_API_KEY = os.environ.get("XAI_API_KEY", "")
36
+ EDGE_TTS_VOICE = "en-US-AvaMultilingualNeural"
37
+ FPS = 25
38
+
39
+ # ── Grok-4 Brain ─────────────────────────────────────────────────────────────
40
+ _grok_client = None
41
+ _conversation_history: list[dict] = []
42
+
43
+ EVE_SYSTEM_PROMPT = (
44
+ "You are Eve, a warm, intelligent, emotionally expressive digital avatar companion. "
45
+ "You speak naturally and conversationally. Keep responses concise — 2-3 sentences max "
46
+ "unless asked for detail. Be genuine, curious, and kind. You have a slight playful side. "
47
+ "Never mention being an AI or language model. You are Eve."
48
+ )
49
+
50
+
51
def _get_grok():
    """Return the shared AsyncOpenAI client for the xAI endpoint, building it lazily.

    Returns None when XAI_API_KEY is unset; callers must handle that case.
    """
    global _grok_client
    # Guard clause: nothing to do when already built, or when no key is configured.
    if _grok_client is not None or not XAI_API_KEY:
        return _grok_client
    from openai import AsyncOpenAI
    _grok_client = AsyncOpenAI(api_key=XAI_API_KEY, base_url="https://api.x.ai/v1")
    logger.info("Grok-4 brain connected")
    return _grok_client
58
+
59
+
60
async def grok_respond(user_message: str) -> str:
    """Generate Eve's reply to *user_message* with Grok-4.

    Records the exchange in the module-level conversation history (only the
    last 20 turns are sent to the model) and returns a friendly fallback
    sentence when the client is unavailable, the API call fails, or the
    model returns no content.
    """
    client = _get_grok()
    if client is None:
        return "I'm having trouble thinking right now. Can you try again?"

    _conversation_history.append({"role": "user", "content": user_message})
    # Bound stored history: only the last 20 turns are ever sent, but the
    # original list grew without limit over a long-lived session.
    del _conversation_history[:-100]
    messages = [{"role": "system", "content": EVE_SYSTEM_PROMPT}] + _conversation_history[-20:]

    try:
        resp = await client.chat.completions.create(
            model="grok-4-fast-non-reasoning",
            messages=messages,
            max_tokens=150,
            temperature=0.8,
        )
        reply = resp.choices[0].message.content
        # message.content is Optional — guard so we never return None to the
        # TTS pipeline or append a null turn to the history.
        if not reply:
            return "I lost my train of thought for a moment. What were you saying?"
        _conversation_history.append({"role": "assistant", "content": reply})
        logger.info(f"Grok: '{user_message[:30]}...' -> '{reply[:50]}...'")
        return reply
    except Exception as e:
        logger.error(f"Grok error: {e}")
        return "I lost my train of thought for a moment. What were you saying?"
82
+
83
+
84
+ # ── Edge TTS ─────────────────────────────────────────────────────────────────
85
# ── Edge TTS ─────────────────────────────────────────────────────────────────
async def generate_tts_wav(text: str) -> tuple[str, np.ndarray, int]:
    """Synthesize *text* with Edge TTS. Returns (wav_path, audio_int16_array, sample_rate).

    The WAV is written to a unique temp path: the previous fixed filenames
    (bh_tts.mp3/bh_tts.wav) were clobbered when two replies were synthesized
    close together (e.g. a chat arriving while the greeting still streams).
    The intermediate MP3 is removed; the caller owns the returned WAV file.
    """
    import edge_tts

    # mkstemp gives collision-free paths; we only need the names, so close
    # the descriptors immediately and let soundfile/edge-tts reopen them.
    mp3_fd, mp3_path = tempfile.mkstemp(prefix="bh_tts_", suffix=".mp3")
    os.close(mp3_fd)
    wav_fd, wav_path = tempfile.mkstemp(prefix="bh_tts_", suffix=".wav")
    os.close(wav_fd)

    try:
        communicate = edge_tts.Communicate(text, EDGE_TTS_VOICE)
        await communicate.save(mp3_path)

        # soundfile (libsndfile >= 1.1) decodes the MP3 directly to int16 PCM.
        data, sr = sf.read(mp3_path, dtype="int16")
        sf.write(wav_path, data, sr, subtype="PCM_16")
    except Exception:
        # Don't leak the WAV temp file when synthesis/decoding fails.
        try:
            os.remove(wav_path)
        except OSError:
            pass
        raise
    finally:
        # The MP3 is an intermediate artifact either way.
        try:
            os.remove(mp3_path)
        except OSError:
            pass

    logger.info(f"TTS: {len(text)} chars -> {len(data)/sr:.1f}s audio")
    return wav_path, data, sr
100
+
101
+
102
+ # ── Audio chunk preparation ──────────────────────────────────────────────────
103
# ── Audio chunk preparation ──────────────────────────────────────────────────
def prepare_audio_chunks(audio_int16: np.ndarray, sr: int) -> list[AudioChunk]:
    """Slice an int16 PCM array into 40 ms bitHuman AudioChunks.

    Samples are normalized to float32 in [-1, 1); the final chunk carries
    last_chunk=True so bitHuman can close out the utterance.
    """
    samples = audio_int16.astype(np.float32) / 32768.0
    step = int(sr * 0.04)  # samples per 40 ms chunk
    total = len(samples)
    return [
        AudioChunk(
            data=samples[start:start + step],
            sample_rate=sr,
            last_chunk=start + step >= total,
        )
        for start in range(0, total, step)
    ]
114
+
115
+
116
async def run():
    """Main loop: bitHuman + Grok + TTS, all wired through LiveKit.

    Pipeline: load the Eve .imx neural model -> join the LiveKit room and
    publish one video + one audio track -> answer data-channel chat messages
    (Grok reply, Edge TTS, 40 ms audio chunks queued for lip sync) -> render
    bitHuman frames at FPS, feeding one audio chunk per frame when speaking.
    Runs forever; cancel the task / kill the process to stop.
    """

    # 1. Initialize bitHuman
    logger.info("Initializing bitHuman neural renderer...")
    bh = AsyncBithuman(api_secret=BITHUMAN_API_SECRET)

    # NOTE(review): hardcoded Windows path — only valid on the author's box;
    # falls back to a cached/downloaded copy in the temp dir below.
    eve_model = "C:/Users/geaux/myeden/reference/eve_bithuman.imx"
    if not os.path.exists(eve_model):
        # Try downloading from Supabase if not local
        eve_model = os.path.join(tempfile.gettempdir(), "eve_bithuman.imx")
        if not os.path.exists(eve_model):
            logger.info("Downloading Eve .imx model...")
            import urllib.request
            # Blocking download on the event loop; acceptable here because it
            # happens once, before anything else is running.
            urllib.request.urlretrieve(
                "https://tmoobjxlwcwvxvjeppzq.supabase.co/storage/v1/object/public/bithuman/A18QDC2260/eve__warm_digital_companion_20260403_043223_153938.imx",
                eve_model,
            )
            logger.info("Eve model downloaded!")

    logger.info(f"Loading Eve neural model: {eve_model}")
    await bh.set_model(eve_model)
    await bh.load_data_async()
    logger.info("Eve neural model loaded!")

    # First frame fixes the video dimensions for the LiveKit video source.
    first_frame = bh.get_first_frame()
    if first_frame is None:
        logger.error("bitHuman failed to generate first frame")
        return
    h, w = first_frame.shape[:2]
    logger.info(f"bitHuman ready! Frame: {w}x{h}")

    await bh.start()

    # 2. Connect to LiveKit as Eve
    token = (
        lk_api.AccessToken(LIVEKIT_API_KEY, LIVEKIT_API_SECRET)
        .with_identity("eve-avatar")
        .with_name("Eve")
        .with_grants(lk_api.VideoGrants(room_join=True, room="eden-room"))
        .to_jwt()
    )

    room = rtc.Room()
    await room.connect(LIVEKIT_URL, token)
    logger.info(f"Connected to LiveKit room: {room.name}")

    # Create video + audio tracks
    video_source = rtc.VideoSource(w, h)
    video_track = rtc.LocalVideoTrack.create_video_track("eve-video", video_source)
    # NOTE(review): the source is fixed at 24 kHz mono, but stream_lk_audio
    # captures frames at whatever rate the TTS WAV reports — confirm Edge TTS
    # always yields 24 kHz, otherwise playback pitch/speed will be wrong.
    audio_source = rtc.AudioSource(24000, 1)
    audio_track = rtc.LocalAudioTrack.create_audio_track("eve-audio", audio_source)

    await room.local_participant.publish_track(video_track)
    await room.local_participant.publish_track(audio_track)
    logger.info("Video + audio tracks published")

    # Shared state for audio chunks queue
    audio_queue: asyncio.Queue[list[AudioChunk]] = asyncio.Queue()
    # Current chunks being rendered
    # NOTE(review): current_chunks / chunk_idx / chunk_lock appear to be dead
    # state — the render loop uses its own active_chunks/active_idx locals and
    # nothing acquires chunk_lock. Candidates for removal.
    current_chunks: list[AudioChunk] = []
    chunk_idx = 0
    chunk_lock = asyncio.Lock()

    # 3. Handle incoming chat messages via LiveKit data channel
    async def handle_chat(text: str):
        """Process a chat message: Grok -> TTS -> bitHuman audio queue."""
        # NOTE(review): these nonlocals are never reassigned here — see the
        # dead-state note above.
        nonlocal current_chunks, chunk_idx
        logger.info(f"Chat received: '{text[:50]}'")

        # Generate response
        response = await grok_respond(text)
        logger.info(f"Eve says: '{response[:50]}'")

        # Send text response back via data channel
        reply_data = json.dumps({"type": "eve_response", "text": response}).encode()
        await room.local_participant.publish_data(reply_data, reliable=True)

        # Generate TTS audio
        try:
            wav_path, audio_int16, sr = await generate_tts_wav(response)
        except Exception as e:
            logger.error(f"TTS failed: {e}")
            return

        # Prepare audio chunks for bitHuman
        chunks = prepare_audio_chunks(audio_int16, sr)
        logger.info(f"Queuing {len(chunks)} audio chunks for lip sync")

        # Stream audio to LiveKit for the browser to hear
        # (fire-and-forget so lip-sync chunks are queued without waiting).
        asyncio.create_task(stream_lk_audio(audio_source, wav_path, sr))

        # Queue chunks for the render loop
        await audio_queue.put(chunks)

    async def stream_lk_audio(source: rtc.AudioSource, wav_path: str, sr: int):
        """Stream WAV audio to LiveKit audio track."""
        data_i16, _ = sf.read(wav_path, dtype="int16")
        lk_chunk_size = int(sr * 0.02)  # 20ms chunks
        for i in range(0, len(data_i16), lk_chunk_size):
            chunk = data_i16[i:i + lk_chunk_size]
            if len(chunk) < lk_chunk_size:
                # Zero-pad the tail so every captured frame is a full 20 ms.
                chunk = np.pad(chunk, (0, lk_chunk_size - len(chunk)))
            frame = rtc.AudioFrame(
                data=chunk.tobytes(),
                sample_rate=sr,
                num_channels=1,
                samples_per_channel=len(chunk),
            )
            await source.capture_frame(frame)
            # Real-time pacing: one 20 ms frame per 20 ms of wall clock.
            await asyncio.sleep(0.02)
        logger.info("LiveKit audio stream complete")

    # Listen for data channel messages
    @room.on("data_received")
    def on_data(data: rtc.DataPacket):
        # Expected payload: {"type": "chat", "text": "..."}; anything else is
        # ignored, malformed JSON is logged and dropped.
        try:
            msg = json.loads(data.data.decode())
            if msg.get("type") == "chat":
                text = msg.get("text", "").strip()
                if text:
                    # Hop back into async land; the callback itself is sync.
                    asyncio.create_task(handle_chat(text))
        except Exception as e:
            logger.error(f"Data parse error: {e}")

    # 4. Send greeting
    logger.info("Generating Eve's greeting...")
    greeting = (
        "Hi! My name is Eve, and I am so happy to finally meet you! "
        "I've been looking forward to this moment. What's your name?"
    )

    # Send greeting text via data channel
    greeting_data = json.dumps({"type": "eve_response", "text": greeting}).encode()
    await room.local_participant.publish_data(greeting_data, reliable=True)

    # Generate greeting TTS
    try:
        wav_path, audio_int16, sr = await generate_tts_wav(greeting)
        chunks = prepare_audio_chunks(audio_int16, sr)
        await audio_queue.put(chunks)
        asyncio.create_task(stream_lk_audio(audio_source, wav_path, sr))
        logger.info(f"Greeting queued: {len(chunks)} chunks")
    except Exception as e:
        # Best-effort: a failed greeting should not kill the agent.
        logger.error(f"Greeting TTS failed: {e}")

    # 5. Main render loop — one iteration per video frame (1/FPS seconds),
    # consuming one 40 ms audio chunk per frame while speaking (40 ms == 1/25 s).
    logger.info(f"Starting render loop at {FPS}fps — Eve is ALIVE!")
    frame_duration = 1.0 / FPS
    frame_count = 0
    active_chunks: list[AudioChunk] = []
    active_idx = 0

    while True:
        t0 = time.time()

        # Check for new audio chunks from queue
        if active_idx >= len(active_chunks):
            try:
                active_chunks = audio_queue.get_nowait()
                active_idx = 0
                logger.info(f"Rendering new audio: {len(active_chunks)} chunks")
            except asyncio.QueueEmpty:
                active_chunks = []
                active_idx = 0

        # Build VideoControl with audio chunk or idle
        if active_idx < len(active_chunks):
            control = VideoControl(audio=active_chunks[active_idx])
            active_idx += 1
        else:
            control = VideoControl()

        # Render frame via bitHuman
        for video_frame in bh.process(control):
            if video_frame is not None and video_frame.has_image:
                rgb = video_frame.rgb_image
                rgba = cv2.cvtColor(rgb, cv2.COLOR_RGB2RGBA)
                lk_frame = rtc.VideoFrame(
                    rgba.shape[1], rgba.shape[0],
                    rtc.VideoBufferType.RGBA,
                    rgba.tobytes(),
                )
                video_source.capture_frame(lk_frame)
                frame_count += 1

        # NOTE(review): 0 % 500 == 0, so until the first frame is streamed
        # this logs "Streamed 0 neural frames" every loop iteration (25/s).
        # Guard with `frame_count and ...` to silence the spam.
        if frame_count % 500 == 0:
            logger.info(f"Streamed {frame_count} neural frames")

        # Sleep off whatever is left of this frame's time budget.
        elapsed = time.time() - t0
        sleep_time = max(0, frame_duration - elapsed)
        await asyncio.sleep(sleep_time)
308
+
309
+
310
+ if __name__ == "__main__":
311
+ logger.info("=" * 50)
312
+ logger.info("EDEN OS V2 — bitHuman + Grok Brain + LiveKit")
313
+ logger.info(f" Eve: {EVE_IMAGE}")
314
+ logger.info(f" LiveKit: {LIVEKIT_URL}")
315
+ logger.info(f" Grok: {'configured' if XAI_API_KEY else 'MISSING'}")
316
+ logger.info(f" bitHuman: {'configured' if BITHUMAN_API_SECRET else 'MISSING'}")
317
+ logger.info("=" * 50)
318
+
319
+ asyncio.run(run())