Spaces:

afouda
/

Audio_test

Runtime error

App Files Files Community

afouda commited on Oct 5, 2025

Commit

4fada6b

verified ·

1 Parent(s): c71961b

Create app.py

Browse files

Files changed (1) hide show

app.py +513 -0

app.py ADDED Viewed

	@@ -0,0 +1,513 @@

+import asyncio
+import base64
+import os
+import time
+from io import BytesIO
+from google.genai import types
+from google.genai.types import (
+    LiveConnectConfig,
+    SpeechConfig,
+    VoiceConfig,
+    PrebuiltVoiceConfig,
+    Content,
+    Part,
+)
+import gradio as gr
+import numpy as np
+import websockets
+from dotenv import load_dotenv
+from fastrtc import (
+    AsyncAudioVideoStreamHandler,
+    Stream,
+    WebRTC,
+    get_cloudflare_turn_credentials_async,
+    wait_for_item,
+)
+from google import genai
+from gradio.utils import get_space
+from PIL import Image
+# ------------------------------------------
+import asyncio
+import base64
+import json
+import os
+import pathlib
+from typing import AsyncGenerator, Literal
+import gradio as gr
+import numpy as np
+from dotenv import load_dotenv
+from fastapi import FastAPI
+from fastapi.responses import HTMLResponse
+from fastrtc import (
+    AsyncStreamHandler,
+    Stream,
+    get_cloudflare_turn_credentials_async,
+    wait_for_item,
+)
+from google import genai
+from google.genai.types import (
+    LiveConnectConfig,
+    PrebuiltVoiceConfig,
+    SpeechConfig,
+    VoiceConfig,
+)
+from gradio.utils import get_space
+from pydantic import BaseModel
+# ------------------------------------------------
+from dotenv import load_dotenv
+load_dotenv()
+import os
+import io
+import asyncio
+from pydub import AudioSegment
+async def safe_get_ice_config_async():
+    """Return Cloudflare TURN credentials when available, otherwise return a STUN-only fallback.
+    This prevents the library from raising the HF_TOKEN / CLOUDFLARE_* error when those
+    environment variables are not set during local testing.
+    """
+    # If HuggingFace token or Cloudflare TURN env vars are present, try to use the helper
+    if os.getenv("HF_TOKEN") or (os.getenv("CLOUDFLARE_TURN_KEY_ID") and os.getenv("CLOUDFLARE_TURN_KEY_API_TOKEN")):
+        try:
+            return await get_cloudflare_turn_credentials_async()
+        except Exception as e:
+            print("Warning: failed to get Cloudflare TURN credentials, falling back to STUN-only. Error:", e)
+    # Fallback: return minimal STUN servers so WebRTC can still attempt peer connections locally
+    return {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
+# Gemini: google-genai
+from google import genai
+# ---------------------------------------------------
+# VAD imports from reference code
+import collections
+import webrtcvad
+import time
+# Weaviate imports
+import weaviate
+from weaviate.classes.init import Auth
+from contextlib import contextmanager
+# helper functions
+GEMINI_API_KEY="AIzaSyATK7Q1xqWLa7nw1Y40mvRrB8motyQl1oo"
+HF_TOKEN ="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"
+WEAVIATE_URL="18vysvlxqza0ux821ecbg.c0.us-west3.gcp.weaviate.cloud"
+WEAVIATE_API_KEY="b2d4dC9sV1Y0dkZjSnlkRV9EMU04V0FyRE9HSlBPQnhlbENzQ0dWQm9pbENyRUVuWXpWc3R3YmpjK1pBPV92MjAw"
+DEEPINFRA_API_KEY="285LUJulGIprqT6hcPhiXtcrphU04FG4"
+DEEPINFRA_BASE_URL="https://api.deepinfra.com/v1/openai"
+from openai import OpenAI
+openai = OpenAI(
+    api_key=DEEPINFRA_API_KEY,
+    base_url="https://api.deepinfra.com/v1/openai",
+)
+@contextmanager
+def weaviate_client():
+    """
+    Context manager that yields a Weaviate client and
+    guarantees client.close() on exit.
+    """
+    client = weaviate.connect_to_weaviate_cloud(
+        cluster_url=WEAVIATE_URL,
+        auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
+    )
+    try:
+        yield client
+    finally:
+        client.close()
+def encode_audio(data: np.ndarray) -> dict:
+    """Encode Audio data to send to the server"""
+    return {
+        "mime_type": "audio/pcm",
+        "data": base64.b64encode(data.tobytes()).decode("UTF-8"),
+    }
+def encode_audio2(data: np.ndarray) -> bytes:
+    """Encode Audio data to send to the server"""
+    return data.tobytes()
+import soundfile as sf
+def numpy_array_to_wav_bytes(audio_array, sample_rate=16000):
+    buffer = io.BytesIO()
+    sf.write(buffer, audio_array, sample_rate, format='WAV')
+    return buffer.getvalue()
+def numpy_array_to_wav_bytes(audio_array, sample_rate=16000):
+    """
+    Convert a NumPy audio array to WAV bytes.
+    Args:
+        audio_array (np.ndarray): Audio signal (1D or 2D).
+        sample_rate (int): Sample rate in Hz.
+    Returns:
+        bytes: WAV-formatted audio data.
+    """
+    buffer = io.BytesIO()
+    sf.write(buffer, audio_array, sample_rate, format='WAV')
+    buffer.seek(0)
+    return buffer.read()
+# webrtc handler class
+class GeminiHandler(AsyncStreamHandler):
+    """Handler for the Gemini API with chained latency calculation."""
+    def __init__(
+        self,
+        expected_layout: Literal["mono"] = "mono",
+        output_sample_rate: int = 24000,prompt_dict: dict = {"prompt":"PHQ-9"},
+    ) -> None:
+        super().__init__(
+            expected_layout,
+            output_sample_rate,
+            input_sample_rate=16000,
+        )
+        self.input_queue: asyncio.Queue = asyncio.Queue()
+        self.output_queue: asyncio.Queue = asyncio.Queue()
+        self.quit: asyncio.Event = asyncio.Event()
+        self.prompt_dict = prompt_dict
+        # self.model = "gemini-2.5-flash-preview-tts"
+        self.model = "gemini-2.0-flash-live-001"
+        self.t2t_model = "gemini-2.5-flash-lite"
+        self.s2t_model = "gemini-2.5-flash-lite"
+        # --- VAD Initialization ---
+        self.vad = webrtcvad.Vad(3)
+        self.VAD_RATE = 16000
+        self.VAD_FRAME_MS = 20
+        self.VAD_FRAME_SAMPLES = int(self.VAD_RATE * (self.VAD_FRAME_MS / 1000.0))
+        self.VAD_FRAME_BYTES = self.VAD_FRAME_SAMPLES * 2
+        padding_ms = 300
+        self.vad_padding_frames = padding_ms // self.VAD_FRAME_MS
+        self.vad_ring_buffer = collections.deque(maxlen=self.vad_padding_frames)
+        self.vad_ratio = 0.9
+        self.vad_triggered = False
+        self.wav_data = bytearray()
+        self.internal_buffer = bytearray()
+        self.end_of_speech_time: float | None = None
+        self.first_latency_calculated: bool = False
+    def copy(self) -> "GeminiHandler":
+        return GeminiHandler(
+            expected_layout="mono",
+            output_sample_rate=self.output_sample_rate,
+            prompt_dict=self.prompt_dict,
+        )
+    def s2t(self, audio) -> str:
+        response = self.s2t_client.models.generate_content(
+            model=self.s2t_model,
+            contents=[
+                types.Part.from_bytes(data=audio, mime_type='audio/wav'),
+                'Generate a transcript of the speech.'
+            ]
+        )
+        return response.text
+    def embed_texts(self, texts: list[str], batch_size: int = 50) -> list[list[float]]:
+        """Embed a list of texts using the configured OpenAI/DeepInfra client.
+        Returns a list of embedding vectors (or empty lists on failure for each item).
+        """
+        all_embeddings: list[list[float]] = []
+        for i in range(0, len(texts), batch_size):
+            batch = texts[i : i + batch_size]
+            try:
+                resp = openai.embeddings.create(
+                    model="Qwen/Qwen3-Embedding-8B",
+                    input=batch,
+                    encoding_format="float"
+                )
+                batch_embs = [item.embedding for item in resp.data]
+                all_embeddings.extend(batch_embs)
+            except Exception as e:
+                print(f"Embedding batch error (items {i}–{i+len(batch)-1}): {e}")
+                all_embeddings.extend([[] for _ in batch])
+        return all_embeddings
+    def s2t_and_embed(self, audio) -> list[float]:
+        """Convert speech to text, then embed the transcript."""
+        transcript = self.s2t(audio)           # Step 1: Speech → Text
+        if not transcript:
+            return []
+        embeddings = self.embed_texts([transcript]) # Step 2: Text → Embedding
+        return embeddings[0] if embeddings else []
+    def encode_query(self, query: str) -> list[float] | None:
+        """Generate a single embedding vector for a query string."""
+        embs = self.embed_texts([query], batch_size=1)
+        if embs and embs[0]:
+            print("Query embedding (first 5 dims):", embs[0][:5])
+            return embs[0]
+        print("Failed to generate query embedding.")
+        return None
+    def rag_autism(self, query: str, top_k: int = 3) -> dict:
+            """
+            Run a RAG retrieval on the 'UserDocument' collection in Weaviate using v4 syntax.
+            Returns up to `top_k` matching text chunks as {'answer': [texts...]}
+            """
+            qe = self.encode_query(query)
+            if not qe:
+                return {"answer": []}
+            try:
+                with weaviate_client() as client:
+                    books_collection = client.collections.get("UserDocument")
+                    response = books_collection.query.near_vector(
+                        near_vector=qe,
+                        limit=top_k,
+                        return_properties=["text"]
+                    )
+                    # Extract the text property from each object
+                    hits = [obj.properties.get("text") for obj in response.objects if "text" in obj.properties]
+                    # --- FIX: REMOVE REPEATED CONTEXT ---
+                    # Convert to a dictionary's keys to get unique items, then back to a list
+                    unique_hits = list(dict.fromkeys(hits))
+                    if not unique_hits:
+                        return {"answer": []}
+                    return {"answer": unique_hits}
+            except Exception as e:
+                print("RAG Error:", e)
+                return {"answer": []}
+    def t2t(self, text: str) -> str:
+        """
+        Sends text to the pre-initialized chat model and returns the text response.
+        """
+        try:
+            # Ensure the chat session exists before using it.
+            if not hasattr(self, 'chat'):
+                print("Error: Chat session (self.chat) is not initialized.")
+                return "I'm sorry, my chat function is not ready."
+            # Use the existing chat session to send the message.
+            print("--> Attempting to send prompt to t2t model...")
+            response = self.chat.send_message(text)
+            print("--> Successfully received response from t2t model.")
+            return response.text
+        except Exception as e:
+            print(f"t2t error: {e}")
+            return ""
+    async def start_up(self):
+        # Flag for if we are using text-to-text in the middle of the chain or not.
+        self.t2t_bool = False
+        self.sys_prompt = None
+        self.t2t_client = genai.Client(api_key=GEMINI_API_KEY)
+        self.s2t_client = genai.Client(api_key=GEMINI_API_KEY)
+        if self.sys_prompt is not None:
+            chat_config = types.GenerateContentConfig(system_instruction=self.sys_prompt)
+        else:
+            chat_config = types.GenerateContentConfig(system_instruction="You are a helpful assistant.")
+        self.chat = self.t2t_client.chats.create(model=self.t2t_model, config=chat_config)
+        self.t2s_client = genai.Client(api_key=GEMINI_API_KEY)
+        voice_name = "Puck"
+        if self.t2t_bool:
+            sys_instruction = f""" You are Wisal, an AI assistant developed by Compumacy AI , and a knowledgeable Autism .
+                                Your sole purpose is to provide helpful, respectful, and easy-to-understand answers about Autism Spectrum Disorder (ASD).
+                                Always be clear, non-judgmental, and supportive."""
+        else:
+            sys_instruction = self.sys_prompt
+        if sys_instruction is not None:
+            config = LiveConnectConfig(
+            response_modalities=["AUDIO"],
+            speech_config=SpeechConfig(
+                voice_config=VoiceConfig(
+                    prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=voice_name)
+                )
+            ),
+            system_instruction=Content(parts=[Part.from_text(text=sys_instruction)])
+            )
+        else:
+            config = LiveConnectConfig(
+                response_modalities=["AUDIO"],
+                speech_config=SpeechConfig(
+                    voice_config=VoiceConfig(
+                        prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=voice_name)
+                    )
+                ),
+            )
+        async with self.t2s_client.aio.live.connect(model=self.model, config=config) as session:
+            async for text_from_user in self.stream():
+                print("--------------------------------------------")
+                print(f"Received text from user and reading aloud: {text_from_user}")
+                print("--------------------------------------------")
+                if not text_from_user or not text_from_user.strip():
+                    continue
+                # 1) Run RAG retrieval on the user input to get contextual snippets
+                try:
+                    rag_res = self.rag_autism(text_from_user, top_k=3)
+                    context_snippets = rag_res.get("answer", []) if isinstance(rag_res, dict) else []
+                    # --- ADDED THIS BLOCK TO PRINT THE RAG CONTEXT ---
+                    if context_snippets:
+                        print("\n--- RAG CONTEXT RETRIEVED ---")
+                        for i, snippet in enumerate(context_snippets):
+                            print(f"Snippet {i+1}: {snippet}...")
+                        print("-----------------------------\n")
+                    #
+                except Exception as e:
+                    print("Error running RAG:", e)
+                    context_snippets = []
+                # 2) Build the prompt for t2t model including retrieved context
+                combined_context = "\n\n".join(context_snippets) if context_snippets else ""
+                if combined_context:
+                    prompt =(
+                    "Please answer the user's question based on the following context. "
+                    "Be helpful and concise.\n\n"
+                    f"--- CONTEXT ---\n{combined_context}\n\n"
+                    f"--- USER QUESTION ---\n{text_from_user}"
+                )
+                else:
+                    prompt = (
+                        "Answer the user's question from your own knowledge as a helpful assistant "
+                        "specializing in Autism Spectrum Disorder.\n\n"
+                        f"--- USER QUESTION ---\n{text_from_user}"
+                    )
+                    print(prompt)
+                # 3) Send prompt to chat (t2t) to obtain reply text
+                try:
+                    reply_text = self.t2t(prompt)
+                    print("\n--- FINAL AI RESPONSE ---")
+                    print(reply_text)
+                    print("-----------------------------")
+                except Exception as e:
+                    print("t2t generation error:", e)
+                    reply_text = ""
+                if not reply_text:
+                    print("No t2t reply generated, skipping t2s send.")
+                    continue
+                # 4) Send the reply_text to the live TTS session to speak it
+                try:
+                    text_to_speak = f"Read the following text aloud exactly as it is, without adding or changing anything: '{reply_text}'"
+                    print(f">>> MODIFIED TEXT SENT TO T2S API: '{text_to_speak}' <<<")
+                    await session.send_client_content(
+                        turns=types.Content(role='user', parts=[types.Part(text=text_to_speak)])
+                    )
+                    async for resp_chunk in session.receive():
+                        if getattr(resp_chunk, "data", None):
+                            array = np.frombuffer(resp_chunk.data, dtype=np.int16)
+                            self.output_queue.put_nowait((self.output_sample_rate, array))
+                except Exception as e:
+                    print("Error sending to live TTS session:", e)
+    async def stream(self) -> AsyncGenerator[bytes, None]:
+        while not self.quit.is_set():
+            try:
+                # Get the text message to be converted to speech
+                text_to_speak = await self.input_queue.get()
+                yield text_to_speak
+            except (asyncio.TimeoutError, TimeoutError):
+                pass
+    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
+        sr, array = frame
+        audio_bytes = array.tobytes()
+        self.internal_buffer.extend(audio_bytes)
+        while len(self.internal_buffer) >= self.VAD_FRAME_BYTES:
+            vad_frame = self.internal_buffer[:self.VAD_FRAME_BYTES]
+            self.internal_buffer = self.internal_buffer[self.VAD_FRAME_BYTES:]
+            is_speech = self.vad.is_speech(vad_frame, self.VAD_RATE)
+            if not self.vad_triggered:
+                self.vad_ring_buffer.append((vad_frame, is_speech))
+                num_voiced = len([f for f, speech in self.vad_ring_buffer if speech])
+                if num_voiced > self.vad_ratio * self.vad_ring_buffer.maxlen:
+                    print("Speech detected, starting to record...")
+                    self.vad_triggered = True
+                    for f, s in self.vad_ring_buffer:
+                        self.wav_data.extend(f)
+                    self.vad_ring_buffer.clear()
+            else:
+                self.wav_data.extend(vad_frame)
+                self.vad_ring_buffer.append((vad_frame, is_speech))
+                num_unvoiced = len([f for f, speech in self.vad_ring_buffer if not speech])
+                if num_unvoiced > self.vad_ratio * self.vad_ring_buffer.maxlen:
+                    print("End of speech detected.")
+                    self.end_of_speech_time = time.monotonic()
+                    self.vad_triggered = False
+                    full_utterance_np = np.frombuffer(self.wav_data, dtype=np.int16)
+                    audio_input_wav = numpy_array_to_wav_bytes(full_utterance_np, sr)
+                    text_input = self.s2t(audio_input_wav)
+                    # --- ADDED THIS BLOCK TO PRINT THE S2T TRANSCRIPT ---
+                    print("\n--- FULL S2T TRANSCRIPT ---")
+                    print(f"'{text_input}'")
+                    print("---------------------------\n")
+                    # ----------------------------------------------------
+                    if text_input and text_input.strip():
+                        if self.t2t_bool:
+                            text_message = self.t2t(text_input)
+                        else:
+                            text_message = text_input
+                        self.input_queue.put_nowait(text_message)
+                    else:
+                        print("STT returned empty transcript, skipping.")
+                    self.vad_ring_buffer.clear()
+                    self.wav_data = bytearray()
+    async def emit(self) -> tuple[int, np.ndarray] | None:
+        return await wait_for_item(self.output_queue)
+    def shutdown(self) -> None:
+        self.quit.set()
+with gr.Blocks() as demo:
+    gr.Markdown("# Gemini Chained Speech-to-Speech Demo")
+    with gr.Row() as row2:
+        with gr.Column():
+            webrtc2 = WebRTC(
+                label="Audio Chat",
+                modality="audio",
+                mode="send-receive",
+                elem_id="audio-source",
+                rtc_configuration=safe_get_ice_config_async,
+                icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
+                pulse_color="rgb(255, 255, 255)",
+                icon_button_color="rgb(255, 255, 255)",
+            )
+            webrtc2.stream(
+                GeminiHandler(),
+                inputs=[webrtc2],
+                outputs=[webrtc2],
+                time_limit=180 if get_space() else None,
+                concurrency_limit=2 if get_space() else None,
+            )
+if __name__ == "__main__":
+    demo.launch(server_port=9090, debug=True)