ASH-BAND / app.py
saadpie's picture
Create app.py
87461f5 verified
raw
history blame
4.8 kB
import os
import asyncio
from quart import Quart, websocket
from google import genai
from google.genai import types
# Quart application object; the websocket route is registered on it.
app = Quart(__name__)

# The client is built without explicit credentials, so the HF Space must have
# GEMINI_API_KEY set in its secrets/environment variables.
client = genai.Client()

# Live model used for every session. The official live model name is currently
# gemini-2.0-flash-exp; change it here if you have access to a newer live
# preview endpoint.
MODEL = "gemini-2.0-flash-exp"

# Voices a client may request. Each accepted name maps to itself, so the table
# doubles as an allow-list for the `voice` query parameter.
VOICE_MODES = {
    voice: voice
    for voice in (
        'Zephyr',  # Default / Balanced
        'Puck',    # Energetic / Bright
        'Charon',  # Deep / Calm
        'Kore',    # Soft / Warm
        'Fenrir',  # Formal / Sharp
    )
}
@app.websocket('/stream')
async def ws_stream():
    """
    WebSocket endpoint for the Termux client.

    Connect via: ws://<hf-space-url>/stream?voice=Zephyr

    Opens a Gemini Live session and relays audio in both directions until
    either side stops:

    * binary frames received from the client are forwarded to Gemini as
      16 kHz PCM audio; non-binary frames are ignored;
    * audio parts returned by Gemini are sent to the client as binary frames.

    The optional ``voice`` query parameter selects one of ``VOICE_MODES``;
    unknown or missing values fall back to ``"Zephyr"``.

    Failures while connecting or relaying are printed rather than raised.
    Cancellation of the handler itself is propagated after both relay tasks
    have been cancelled and awaited.
    """
    # Grab the requested voice from the URL parameter, default to Zephyr
    requested_voice = websocket.args.get("voice", "Zephyr")
    voice_name = VOICE_MODES.get(requested_voice, "Zephyr")

    # Mirroring your TS configuration
    config = types.LiveConnectConfig(
        # The SDK accepts the modality by name; "AUDIO" requests spoken output.
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name=voice_name
                )
            )
        ),
        tools=[{"google_search": {}}],
        system_instruction=types.Content(
            # Part(text=...) avoids the keyword-only signature of
            # Part.from_text in current SDK releases.
            parts=[types.Part(
                text=(
                    "You are ASH-BAND, a high-fidelity AI wearable companion. "
                    "Speak in a professional, concise, and helpful tone. "
                    "You have access to Google Search. Keep responses brief to minimize latency. "
                    "Your responses are spoken aloud."
                )
            )]
        )
    )

    print(f"Connecting to Gemini Live API with voice: {voice_name}...")
    try:
        async with client.aio.live.connect(model=MODEL, config=config) as session:
            print("Live session established.")

            # Task 1: Stream audio from Client (Termux) -> Gemini
            async def client_to_gemini():
                try:
                    while True:
                        # Receive audio chunks from the client
                        data = await websocket.receive()
                        if isinstance(data, bytes):
                            # The TS file was downsampling to 16000Hz PCM
                            # NOTE(review): session.send() appears to be
                            # deprecated in newer google-genai releases in
                            # favour of send_realtime_input() - confirm
                            # against the installed SDK version.
                            await session.send(
                                input={"data": data, "mime_type": "audio/pcm;rate=16000"}
                            )
                except asyncio.CancelledError:
                    # Let cancellation propagate so the task ends as cancelled.
                    raise
                except Exception as e:
                    print(f"Error reading from client: {e}")

            # Task 2: Stream audio from Gemini -> Client (Termux)
            async def gemini_to_client():
                try:
                    # session.receive() yields the messages of ONE model turn
                    # and then stops, so it must be re-entered for each turn
                    # to keep the conversation alive.
                    while True:
                        received_any = False
                        async for message in session.receive():
                            received_any = True
                            server_content = message.server_content
                            if server_content is None:
                                continue
                            # Handle Interruption
                            if server_content.interrupted:
                                print("AI Interrupted by user.")
                                # In a more complex setup, send a control message to client to clear audio queue
                            model_turn = server_content.model_turn
                            if model_turn is None:
                                continue
                            # parts may be absent on a turn; treat as empty.
                            for part in model_turn.parts or ():
                                # Output raw audio back to the client
                                if part.inline_data and part.inline_data.data:
                                    # Gemini returns 24kHz PCM audio
                                    await websocket.send(part.inline_data.data)
                        if not received_any:
                            # An empty turn means the stream has ended; stop
                            # instead of spinning on a dead session.
                            return
                except asyncio.CancelledError:
                    raise
                except Exception as e:
                    print(f"Error receiving from Gemini: {e}")

            # Run both streaming directions concurrently
            tasks = [
                asyncio.create_task(client_to_gemini()),
                asyncio.create_task(gemini_to_client()),
            ]
            try:
                # Wait until one of the connections drops
                await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
            finally:
                # Runs on normal completion AND when this handler is itself
                # cancelled (e.g. client disconnect), so no relay task is
                # left running. Cancelling a finished task is a no-op.
                for task in tasks:
                    task.cancel()
                # Await the tasks so they finish unwinding before the Live
                # session is closed by the surrounding `async with`.
                await asyncio.gather(*tasks, return_exceptions=True)
    except Exception as e:
        print(f"Connection failed: {e}")
# Script entry point: serve on all interfaces at port 7860, the standard
# port for Hugging Face Spaces.
if __name__ == "__main__":
    app.run(
        port=7860,
        host="0.0.0.0",
    )