Spaces:
Runtime error
Runtime error
Aakash jammula committed on
Commit ·
88edcbb
1
Parent(s): 241379e
init
Browse files
app.py
CHANGED
|
@@ -1,14 +1,15 @@
|
|
| 1 |
import os
|
| 2 |
import torch
|
| 3 |
from kokoro import KPipeline
|
| 4 |
-
from langchain_core.messages import SystemMessage, HumanMessage
|
| 5 |
from langchain_core.tools import Tool
|
| 6 |
from langgraph.graph import MessagesState, StateGraph, START
|
| 7 |
from langgraph.prebuilt import tools_condition, ToolNode
|
| 8 |
from langgraph.checkpoint.memory import MemorySaver
|
| 9 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 10 |
from tavily import TavilyClient
|
| 11 |
-
import warnings
|
|
|
|
| 12 |
from fastapi import FastAPI, HTTPException
|
| 13 |
from fastapi.responses import StreamingResponse
|
| 14 |
from pydantic import BaseModel
|
|
@@ -33,61 +34,32 @@ assert GOOGLE_API_KEY and TAVILY_API_KEY, "Missing API keys in environment."
|
|
| 33 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 34 |
print(f"Using device: {device}")
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
tavily = TavilyClient(api_key=TAVILY_API_KEY)
|
| 41 |
-
search_tool = Tool.from_function(name="TavilySearch",
|
| 42 |
-
func=lambda q: tavily.search(q, max_results=3),
|
| 43 |
-
description="Fetch factual current information from the web. Input should be a search query.")
|
| 44 |
-
|
| 45 |
-
llm_tools = llm.bind_tools([search_tool])
|
| 46 |
-
|
| 47 |
-
print("Loading Kokoro TTS model...")
|
| 48 |
-
tts_pipeline = KPipeline(lang_code="a", device=device, repo_id="hexgrad/Kokoro-82M")
|
| 49 |
-
print("Kokoro TTS model loaded.")
|
| 50 |
|
|
|
|
|
|
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
if not msgs:
|
| 55 |
-
return state
|
| 56 |
-
sys = SystemMessage(
|
| 57 |
-
"You are Jarvis, a helpful and concise AI assistant. "
|
| 58 |
-
"Your responses should be brief, informative, and directly answer the user's query. "
|
| 59 |
-
"Aim for responses around 1 to 3 sentences. Maximum 60 tokens output. " # Increased slightly for tool summarization
|
| 60 |
-
"If you use tools, search web for any factual information needed, or any information which is time sensitive, like today's news and latest model releases. "
|
| 61 |
-
"After receiving tool results, you MUST summarize them or state you couldn't find the information based on the tool output."
|
| 62 |
-
)
|
| 63 |
-
resp = llm_tools.invoke([sys] + msgs[-5:])
|
| 64 |
-
state["messages"] = msgs + [resp]
|
| 65 |
-
return state
|
| 66 |
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
builder.add_edge(START, "assistant")
|
| 71 |
-
builder.add_conditional_edges("assistant", tools_condition)
|
| 72 |
-
builder.add_edge("tools", "assistant")
|
| 73 |
-
agent_graph = builder.compile(MemorySaver())
|
| 74 |
-
|
| 75 |
-
def query_agent(prompt: str, thread_id: str) -> str:
|
| 76 |
-
payload = {"messages": [HumanMessage(content=prompt)]}
|
| 77 |
-
cfg = {"configurable": {"thread_id": thread_id}}
|
| 78 |
-
out = agent_graph.invoke(payload, cfg)
|
| 79 |
-
return out["messages"][-1].content.strip()
|
| 80 |
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
duration_ms = 100
|
| 87 |
num_samples = int(KOKORO_SAMPLE_RATE * (duration_ms / 1000.0))
|
| 88 |
audio_data_int16 = np.zeros(num_samples, dtype=np.int16)
|
| 89 |
else:
|
| 90 |
-
print(f"Generating speech for: '{text}'")
|
| 91 |
audio_chunks = []
|
| 92 |
try:
|
| 93 |
for _, _, audio_segment in tts_pipeline(text, voice="af_heart", speed=1.3):
|
|
@@ -98,7 +70,7 @@ def generate_speech_audio(text: str) -> io.BytesIO:
|
|
| 98 |
print(f"Warning: TTS produced an empty or non-1D audio segment. Shape: {audio_segment.shape if hasattr(audio_segment, 'shape') else 'N/A'}")
|
| 99 |
else:
|
| 100 |
print(f"Warning: TTS produced None audio segment for part of text: '{text}'")
|
| 101 |
-
|
| 102 |
if not audio_chunks:
|
| 103 |
print(f"Warning: TTS produced no valid audio for text: '{text}'. Generating silence.")
|
| 104 |
duration_ms = 100
|
|
@@ -121,12 +93,129 @@ def generate_speech_audio(text: str) -> io.BytesIO:
|
|
| 121 |
wf.setsampwidth(2)
|
| 122 |
wf.setframerate(KOKORO_SAMPLE_RATE)
|
| 123 |
wf.writeframes(audio_data_int16.tobytes())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
wav_buffer.seek(0)
|
| 125 |
-
print(f"Generated WAV audio of size: {len(wav_buffer.getvalue())} bytes")
|
| 126 |
return wav_buffer
|
| 127 |
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
app.add_middleware(
|
| 132 |
CORSMiddleware,
|
|
@@ -146,7 +235,7 @@ async def process_command(payload: TextInput):
|
|
| 146 |
|
| 147 |
if not payload.text or payload.text.isspace():
|
| 148 |
print("Empty text received, responding with silent audio.")
|
| 149 |
-
silent_audio_wav = await asyncio.to_thread(
|
| 150 |
return StreamingResponse(silent_audio_wav, media_type="audio/wav")
|
| 151 |
|
| 152 |
try:
|
|
@@ -154,10 +243,10 @@ async def process_command(payload: TextInput):
|
|
| 154 |
print(f"LLM response: '{llm_response_text}'")
|
| 155 |
|
| 156 |
if not llm_response_text or llm_response_text.isspace():
|
| 157 |
-
print("LLM returned empty response, generating
|
| 158 |
llm_response_text = "I don't have a response for that."
|
| 159 |
|
| 160 |
-
audio_wav_buffer = await asyncio.to_thread(
|
| 161 |
return StreamingResponse(audio_wav_buffer, media_type="audio/wav")
|
| 162 |
|
| 163 |
except Exception as e:
|
|
@@ -165,9 +254,14 @@ async def process_command(payload: TextInput):
|
|
| 165 |
traceback.print_exc()
|
| 166 |
error_message_text = "Sorry, I encountered an error."
|
| 167 |
try:
|
| 168 |
-
error_audio_buffer = await asyncio.to_thread(
|
| 169 |
return StreamingResponse(error_audio_buffer, media_type="audio/wav", status_code=500)
|
| 170 |
except Exception as audio_err:
|
| 171 |
print(f"Critical error generating error audio: {audio_err}")
|
|
|
|
| 172 |
raise HTTPException(status_code=500, detail="Internal server error during audio generation")
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import torch
|
| 3 |
from kokoro import KPipeline
|
| 4 |
+
from langchain_core.messages import SystemMessage, HumanMessage
|
| 5 |
from langchain_core.tools import Tool
|
| 6 |
from langgraph.graph import MessagesState, StateGraph, START
|
| 7 |
from langgraph.prebuilt import tools_condition, ToolNode
|
| 8 |
from langgraph.checkpoint.memory import MemorySaver
|
| 9 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 10 |
from tavily import TavilyClient
|
| 11 |
+
import warnings
|
| 12 |
+
import logging
|
| 13 |
from fastapi import FastAPI, HTTPException
|
| 14 |
from fastapi.responses import StreamingResponse
|
| 15 |
from pydantic import BaseModel
|
|
|
|
| 34 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 35 |
print(f"Using device: {device}")
|
| 36 |
|
| 37 |
+
# In-memory TTS cache: maps input text -> encoded WAV bytes, so frequently
# repeated phrases (and silence) skip the synthesis round-trip entirely.
TTS_CACHE = {}
# Output sample rate (Hz) used when writing WAV headers for Kokoro audio.
KOKORO_SAMPLE_RATE = 24000
tts_pipeline: KPipeline  # To be initialized later
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
+
def generate_speech_audio_optimized(text: str) -> io.BytesIO:
|
| 42 |
+
global TTS_CACHE, KOKORO_SAMPLE_RATE, tts_pipeline
|
| 43 |
|
| 44 |
+
cache_key_to_check = text
|
| 45 |
+
is_silent_request = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
+
if not text or text.isspace():
|
| 48 |
+
cache_key_to_check = "SILENT_AUDIO_OPTIMIZED"
|
| 49 |
+
is_silent_request = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
+
if cache_key_to_check in TTS_CACHE:
|
| 52 |
+
cached_buffer = io.BytesIO(TTS_CACHE[cache_key_to_check])
|
| 53 |
+
cached_buffer.seek(0)
|
| 54 |
+
return cached_buffer
|
| 55 |
|
| 56 |
+
audio_data_int16: np.ndarray
|
| 57 |
+
|
| 58 |
+
if is_silent_request:
|
| 59 |
duration_ms = 100
|
| 60 |
num_samples = int(KOKORO_SAMPLE_RATE * (duration_ms / 1000.0))
|
| 61 |
audio_data_int16 = np.zeros(num_samples, dtype=np.int16)
|
| 62 |
else:
|
|
|
|
| 63 |
audio_chunks = []
|
| 64 |
try:
|
| 65 |
for _, _, audio_segment in tts_pipeline(text, voice="af_heart", speed=1.3):
|
|
|
|
| 70 |
print(f"Warning: TTS produced an empty or non-1D audio segment. Shape: {audio_segment.shape if hasattr(audio_segment, 'shape') else 'N/A'}")
|
| 71 |
else:
|
| 72 |
print(f"Warning: TTS produced None audio segment for part of text: '{text}'")
|
| 73 |
+
|
| 74 |
if not audio_chunks:
|
| 75 |
print(f"Warning: TTS produced no valid audio for text: '{text}'. Generating silence.")
|
| 76 |
duration_ms = 100
|
|
|
|
| 93 |
wf.setsampwidth(2)
|
| 94 |
wf.setframerate(KOKORO_SAMPLE_RATE)
|
| 95 |
wf.writeframes(audio_data_int16.tobytes())
|
| 96 |
+
|
| 97 |
+
wav_data_bytes = wav_buffer.getvalue()
|
| 98 |
+
|
| 99 |
+
texts_to_always_cache = [
|
| 100 |
+
"Sorry, I encountered an error.",
|
| 101 |
+
"I don't have a response for that."
|
| 102 |
+
]
|
| 103 |
+
current_cache_target_key = "SILENT_AUDIO_OPTIMIZED" if is_silent_request else text
|
| 104 |
+
|
| 105 |
+
if current_cache_target_key == "SILENT_AUDIO_OPTIMIZED" and current_cache_target_key not in TTS_CACHE:
|
| 106 |
+
TTS_CACHE[current_cache_target_key] = wav_data_bytes
|
| 107 |
+
elif current_cache_target_key in texts_to_always_cache and current_cache_target_key not in TTS_CACHE:
|
| 108 |
+
TTS_CACHE[current_cache_target_key] = wav_data_bytes
|
| 109 |
+
|
| 110 |
wav_buffer.seek(0)
|
|
|
|
| 111 |
return wav_buffer
|
| 112 |
|
| 113 |
+
def _silent_int16(duration_ms: int = 100) -> np.ndarray:
    """Return duration_ms of mono int16 silence at KOKORO_SAMPLE_RATE."""
    num_samples = int(KOKORO_SAMPLE_RATE * (duration_ms / 1000.0))
    return np.zeros(num_samples, dtype=np.int16)


def _int16_to_wav_bytes(audio_data_int16: np.ndarray) -> bytes:
    """Encode mono int16 PCM samples as in-memory WAV bytes (KOKORO_SAMPLE_RATE Hz)."""
    wav_buffer = io.BytesIO()
    with wave.open(wav_buffer, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 2 bytes per sample == int16
        wf.setframerate(KOKORO_SAMPLE_RATE)
        wf.writeframes(audio_data_int16.tobytes())
    return wav_buffer.getvalue()


def pre_populate_tts_cache_on_startup():
    """Warm TTS_CACHE with canned phrases and a silent clip at startup.

    Pre-renders the fixed error/fallback responses and a short silence so
    those paths are served from cache without a TTS call at request time.

    Side effects: mutates the module-level TTS_CACHE dict and prints
    progress. Synthesis failures are logged and skipped, never raised.
    """
    global TTS_CACHE

    # `callable` is the idiomatic spelling of hasattr(x, '__call__').
    if not callable(tts_pipeline):
        print("Error: TTS pipeline not available for pre-caching.")
        return

    print("Pre-populating TTS cache...")

    common_texts_to_pre_cache = [
        "Sorry, I encountered an error.",
        "I don't have a response for that."
    ]

    for text_to_cache in common_texts_to_pre_cache:
        if text_to_cache in TTS_CACHE:
            continue
        try:
            audio_chunks = []
            for _, _, audio_segment in tts_pipeline(text_to_cache, voice="af_heart", speed=1.3):
                # Keep only non-empty 1-D segments; Kokoro may yield None or
                # empty tensors for some text fragments.
                if audio_segment is not None and audio_segment.ndim == 1 and audio_segment.numel() > 0:
                    audio_chunks.append(audio_segment)

            if not audio_chunks:
                # Nothing usable came back; cache a short silence instead.
                audio_data_int16 = _silent_int16()
            else:
                full_audio_data_tensor = torch.cat(audio_chunks)
                # Scale float waveform in [-1, 1] to int16 PCM range.
                audio_data_int16 = (full_audio_data_tensor.cpu().numpy() * 32767).astype(np.int16)

            TTS_CACHE[text_to_cache] = _int16_to_wav_bytes(audio_data_int16)
        except Exception as e:
            # Pre-caching is best-effort: log and move on to the next phrase.
            print(f"Error pre-caching TTS for '{text_to_cache}': {e}")
            traceback.print_exc()

    if "SILENT_AUDIO_OPTIMIZED" not in TTS_CACHE:
        try:
            TTS_CACHE["SILENT_AUDIO_OPTIMIZED"] = _int16_to_wav_bytes(_silent_int16())
        except Exception as e:
            print(f"Error pre-caching silent audio: {e}")
            traceback.print_exc()

    print("TTS cache pre-population finished.")
|
| 170 |
+
|
| 171 |
+
# Gemini chat model for the agent; max_tokens keeps spoken replies short.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash",
                             temperature=0.6, max_tokens=60,
                             google_api_key=GOOGLE_API_KEY)

# Tavily web-search client wrapped as a LangChain tool the LLM can call.
tavily = TavilyClient(api_key=TAVILY_API_KEY)
search_tool = Tool.from_function(name="TavilySearch",
                                 func=lambda q: tavily.search(q, max_results=3),
                                 description="Fetch factual current information from the web. Input should be a search query.")

# Model handle with the search tool bound so responses may emit tool calls.
llm_tools = llm.bind_tools([search_tool])
|
| 181 |
+
|
| 182 |
+
def assistant_node(state: MessagesState) -> MessagesState:
    """LLM step of the agent graph.

    Prepends the Jarvis system prompt to the recent conversation, invokes
    the tool-bound model, and appends its reply to the state's messages.
    """
    history = state.get("messages", [])
    if not history:
        # Nothing to respond to; hand the state back untouched.
        return state

    system_msg = SystemMessage(
        "You are Jarvis, a helpful and concise AI assistant. "
        "Your responses should be brief, informative, and directly answer the user's query. "
        "Aim for responses around 1 to 3 sentences. Maximum 60 tokens output. "
        "If you use tools, search web for any factual information needed, or any information which is time sensitive, like today's news and latest model releases. "
        "After receiving tool results, you MUST summarize them or state you couldn't find the information based on the tool output."
    )

    # Only the last five messages go to the model, keeping the prompt small.
    reply = llm_tools.invoke([system_msg] + history[-5:])
    state["messages"] = history + [reply]
    return state
|
| 197 |
+
|
| 198 |
+
# Assemble the LangGraph agent: the assistant node may request tool calls,
# tools_condition routes to the tool node when it does (otherwise ends),
# and tool results loop back to the assistant for summarization.
builder = StateGraph(MessagesState)
builder.add_node("assistant", assistant_node)
builder.add_node("tools", ToolNode([search_tool]))
builder.add_edge(START, "assistant")
builder.add_conditional_edges("assistant", tools_condition)
builder.add_edge("tools", "assistant")
# MemorySaver checkpointer keeps per-thread_id conversation history.
agent_graph = builder.compile(checkpointer=MemorySaver())
|
| 205 |
+
|
| 206 |
+
def query_agent(prompt: str, thread_id: str) -> str:
    """Run one conversational turn through the agent graph.

    thread_id selects the checkpointer thread, so consecutive calls with
    the same id continue one conversation. Returns the final reply text,
    stripped of surrounding whitespace.
    """
    run_config = {"configurable": {"thread_id": thread_id}}
    initial_state = {"messages": [HumanMessage(content=prompt)]}
    result = agent_graph.invoke(initial_state, run_config)
    final_message = result["messages"][-1]
    return final_message.content.strip()
|
| 211 |
+
|
| 212 |
+
# Load the Kokoro TTS model eagerly at import time so the first request
# does not pay the model-load cost.
print("Loading Kokoro TTS model...")
tts_pipeline = KPipeline(lang_code="a", device=device, repo_id="hexgrad/Kokoro-82M")
print("Kokoro TTS model loaded.")

# Warm the TTS cache with canned phrases now that the pipeline exists.
pre_populate_tts_cache_on_startup()

app = FastAPI()
|
| 219 |
|
| 220 |
app.add_middleware(
|
| 221 |
CORSMiddleware,
|
|
|
|
| 235 |
|
| 236 |
if not payload.text or payload.text.isspace():
|
| 237 |
print("Empty text received, responding with silent audio.")
|
| 238 |
+
silent_audio_wav = await asyncio.to_thread(generate_speech_audio_optimized, "")
|
| 239 |
return StreamingResponse(silent_audio_wav, media_type="audio/wav")
|
| 240 |
|
| 241 |
try:
|
|
|
|
| 243 |
print(f"LLM response: '{llm_response_text}'")
|
| 244 |
|
| 245 |
if not llm_response_text or llm_response_text.isspace():
|
| 246 |
+
print("LLM returned empty response, generating canned audio.")
|
| 247 |
llm_response_text = "I don't have a response for that."
|
| 248 |
|
| 249 |
+
audio_wav_buffer = await asyncio.to_thread(generate_speech_audio_optimized, llm_response_text)
|
| 250 |
return StreamingResponse(audio_wav_buffer, media_type="audio/wav")
|
| 251 |
|
| 252 |
except Exception as e:
|
|
|
|
| 254 |
traceback.print_exc()
|
| 255 |
error_message_text = "Sorry, I encountered an error."
|
| 256 |
try:
|
| 257 |
+
error_audio_buffer = await asyncio.to_thread(generate_speech_audio_optimized, error_message_text)
|
| 258 |
return StreamingResponse(error_audio_buffer, media_type="audio/wav", status_code=500)
|
| 259 |
except Exception as audio_err:
|
| 260 |
print(f"Critical error generating error audio: {audio_err}")
|
| 261 |
+
traceback.print_exc()
|
| 262 |
raise HTTPException(status_code=500, detail="Internal server error during audio generation")
|
| 263 |
|
| 264 |
+
# Local development entry point; a hosted deployment would normally start
# the ASGI app via its own server instead of this guard.
if __name__ == "__main__":
    import uvicorn
    print("Starting FastAPI server with Uvicorn...")
    uvicorn.run(app, host="0.0.0.0", port=8000)
|