Spaces:

BACKENDAPI2024
/

radarbackend11262024v11

Runtime error

App Files Files Community

Pijush2023 committed on Oct 25, 2024

Commit

b8d3256

verified ·

1 Parent(s): 3595ee8

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -95

app.py CHANGED Viewed

@@ -8,18 +8,42 @@ from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 from langchain_community.vectorstores import Neo4jVector
 from langchain_community.graphs import Neo4jGraph
 from langchain_core.prompts import ChatPromptTemplate
 import time
 import os
-from dataclasses import dataclass
-# Define AppState to store audio state information
 @dataclass
 class AppState:
     stream: np.ndarray | None = None
     sampling_rate: int = 0
     pause_detected: bool = False
-    started_talking: bool = False
 # Neo4j setup
 graph = Neo4jGraph(
@@ -58,79 +82,10 @@ pipe_asr = pipeline(
     return_timestamps=True
 )
-# Adjusted function to determine if a pause occurred
-def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
-    """Take in the stream, determine if a pause happened."""
-    temp_audio = audio
-    dur_vad = len(temp_audio) / sampling_rate  # Simulating VAD duration for this example
-    duration = len(audio) / sampling_rate
-    # Log the duration and VAD result for debugging
-    print(f"Duration after VAD: {dur_vad:.3f} s, Total Duration: {duration:.3f} s")
-    # Check if speech has started
-    if dur_vad > 0.5 and not state.started_talking:
-        print("Started talking")
-        state.started_talking = True
-        return False
-    # If the difference between total duration and VAD duration is significant, consider it a pause
-    # Adjust the threshold for pause detection (e.g., 0.5 seconds)
-    pause_threshold = 0.5  # This value can be adjusted to be more sensitive
-    if (duration - dur_vad) > pause_threshold and state.started_talking:
-        print("Pause detected")
-        return True
-    return False
-# Function to process audio input, detect pauses, and handle state
-def process_audio(audio: tuple, state: AppState):
-    # Ensure audio input is not None and has valid data
-    if audio is None or audio[1] is None:
-        print("Audio input is None or empty.")
-        return None, state
-    if state.stream is None:
-        state.stream = audio[1]
-        state.sampling_rate = audio[0]
-    else:
-        state.stream = np.concatenate((state.stream, audio[1]))
-    # Check for a pause in speech
-    pause_detected = determine_pause(state.stream, state.sampling_rate, state)
-    state.pause_detected = pause_detected
-    if state.pause_detected and state.started_talking:
-        # Transcribe the audio when a pause is detected
-        _, transcription, _ = transcribe_function(state.stream, (state.sampling_rate, state.stream))
-        print(f"Transcription: {transcription}")
-        # Check if transcription is empty
-        if not transcription:
-            print("No transcription available.")
-            return None, state
-        # Retrieve hybrid response using Neo4j and other methods
-        response_text = retriever(transcription)
-        print(f"Response: {response_text}")
-        # Check if the response is empty before proceeding
-        if not response_text:
-            print("No response generated.")
-            return None, state
-        # Generate audio from the response text
-        audio_path = generate_audio_elevenlabs(response_text)
-        # Reset state for the next input
-        state.stream = None
-        state.started_talking = False
-        state.pause_detected = False
-        return audio_path, state
-    return None, state
 # Function to process audio input and transcribe it
@@ -141,6 +96,7 @@ def transcribe_function(stream, new_chunk):
         print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
         return stream, "", None
     if y is None or len(y) == 0:
         return stream, "", None
@@ -149,27 +105,51 @@ def transcribe_function(stream, new_chunk):
     if max_abs_y > 0:
         y = y / max_abs_y
     if stream is not None and len(stream) > 0:
         stream = np.concatenate([stream, y])
     else:
         stream = y
     result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
     full_text = result.get("text", "")
     return stream, full_text, full_text
 # Function to generate a full-text search query for Neo4j
 def generate_full_text_query(input: str) -> str:
     """Build a fuzzy, AND-joined Lucene query string from free text.

     Characters that are operators in Lucene query syntax are replaced with
     spaces before the text is split, so punctuation in a question (for
     example a trailing "?" or a quoted phrase) cannot change the meaning of
     the query or make it unparsable.

     Args:
         input: Free text to search for, e.g. a transcribed question.

     Returns:
         Each remaining word suffixed with "~2" (fuzzy match, edit distance
         up to 2) and joined with " AND ", or "" when no searchable word is
         left.
     """
     # Lucene query-syntax characters; each one is turned into a separator.
     lucene_specials = '+-&|!(){}[]^"~*?:\\/'
     cleaned = input.translate(
         str.maketrans(lucene_specials, " " * len(lucene_specials))
     )
     # str.split() with no argument never yields empty strings.
     words = cleaned.split()
     return " AND ".join(f"{word}~2" for word in words)
 # Function to generate audio with Eleven Labs TTS
 def generate_audio_elevenlabs(text):
     XI_API_KEY = os.environ['ELEVENLABS_API']
@@ -196,14 +176,14 @@ def generate_audio_elevenlabs(text):
                 if chunk:
                     f.write(chunk)
             audio_path = f.name
-        return audio_path
     else:
         print(f"Error generating audio: {response.text}")
         return None
 # Define the template for generating responses based on context
 template = """I am a guide for Birmingham, Alabama. I can provide recommendations and insights about the city, including events and activities.
-Ask your question directly, and I'll provide a precise and quick, short and crisp response in a conversational and straightforward way without any Greet.
 Context:
 {context}
@@ -219,12 +199,14 @@ def generate_response_with_prompt(context, question):
         context=context,
         question=question
     )
     llm = ChatOpenAI(temperature=0, api_key=os.environ['OPENAI_API_KEY'])
     response = llm(formatted_prompt)
     return response.content.strip()
 # Define the function to generate a hybrid response using Neo4j and other retrieval methods
 def retriever(question: str):
     structured_query = f"""
     CALL db.index.fulltext.queryNodes('entity', $query, {{limit: 2}})
     YIELD node, score
@@ -235,28 +217,58 @@ def retriever(question: str):
     structured_data = graph.query(structured_query, {"query": generate_full_text_query(question)})
     structured_response = "\n".join([f"{record['entity']}: {record['context']}" for record in structured_data])
     unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
     unstructured_response = "\n".join(unstructured_data)
     combined_context = f"Structured data:\n{structured_response}\n\nUnstructured data:\n{unstructured_response}"
     final_response = generate_response_with_prompt(combined_context, question)
     return final_response
-# Create Gradio interface for audio input and output
-interface = gr.Interface(
-    fn=lambda audio, state: process_audio(audio, state),
-    inputs=[
-        gr.Audio(sources="microphone", type="numpy", streaming=True),
-        gr.State(AppState())
-    ],
-    outputs=[
-        gr.Audio(type="filepath", autoplay=True, interactive=False),
-        gr.State()
-    ],
-    live=True,
-    description="Ask questions via audio and receive audio responses.",
-    allow_flagging="never"
-)
-# Launch the Gradio app
-interface.launch()

 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 from langchain_community.vectorstores import Neo4jVector
 from langchain_community.graphs import Neo4jGraph
+from langchain_experimental.graph_transformers import LLMGraphTransformer
 from langchain_core.prompts import ChatPromptTemplate
 import time
 import os
 @dataclass
 class AppState:  # State object wrapped in gr.State for the Blocks UI
     stream: np.ndarray | None = None
     sampling_rate: int = 0
     pause_detected: bool = False
+    started_talking: bool =  False  # set by determine_pause once run_vad reports more than 0.5 s
+    stopped: bool = False  # set to True by the "Stop Conversation" button; checked by start_recording_user
+    conversation: list = field(default_factory=list)  # NOTE(review): not read or written in the visible code
+def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
+    """Take in the stream, determine if a pause happened"""
+    temp_audio = audio
+    dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
+    duration = len(audio) / sampling_rate
+    if dur_vad > 0.5 and not state.started_talking:
+        print("started talking")
+        state.started_talking = True
+        return False
+    print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
+    return (duration - dur_vad) > 1
+def start_recording_user(state: AppState):
+    if not state.stopped:
+        return gr.Audio(recording=True)
 # Neo4j setup
 graph = Neo4jGraph(
     return_timestamps=True
 )
+# Wait 2 seconds, then return (None, "") as the cleared state and input text
+def auto_reset_state():  # NOTE(review): a threading.Thread target's return value is discarded, so starting this in a thread resets nothing
+    time.sleep(2)
+    return None, ""  # Reset the state and clear input text
 # Function to process audio input and transcribe it
         print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
         return stream, "", None
+    # Ensure y is not empty and is at least 1-dimensional
     if y is None or len(y) == 0:
         return stream, "", None
     if max_abs_y > 0:
         y = y / max_abs_y
+    # Ensure stream is also at least 1-dimensional before concatenation
     if stream is not None and len(stream) > 0:
         stream = np.concatenate([stream, y])
     else:
         stream = y
+    # Process the audio data for transcription
     result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
     full_text = result.get("text", "")
+    # Start a background thread that runs auto_reset_state (a 2-second sleep)
+    threading.Thread(target=auto_reset_state).start()
     return stream, full_text, full_text
+# Function to generate a full-text search query for Neo4j
+#def generate_full_text_query(input: str) -> str:
+    #full_text_query = ""
+    #words = [el for el in input.split() if el]
+    #for word in words[:-1]:
+        #full_text_query += f" {word}~2 AND"
+    #full_text_query += f" {words[-1]}~2"
+    #return full_text_query.strip()
 # Function to generate a full-text search query for Neo4j
 def generate_full_text_query(input: str) -> str:
+    # Split the input on whitespace (str.split() already drops empty strings)
     words = [el for el in input.split() if el]
+    # Nothing to search for when the input contains no words
     if not words:
         return ""  # Return an empty string or a default query if desired
+    # Join the words with AND; "~2" is Lucene's fuzzy operator (edit distance up to 2), not a proximity search
     full_text_query = ""
     for word in words[:-1]:
         full_text_query += f" {word}~2 AND"
     full_text_query += f" {words[-1]}~2"
     return full_text_query.strip()
 # Function to generate audio with Eleven Labs TTS
 def generate_audio_elevenlabs(text):
     XI_API_KEY = os.environ['ELEVENLABS_API']
                 if chunk:
                     f.write(chunk)
             audio_path = f.name
+        return audio_path  # Return audio path for automatic playback
     else:
         print(f"Error generating audio: {response.text}")
         return None
 # Define the template for generating responses based on context
 template = """I am a guide for Birmingham, Alabama. I can provide recommendations and insights about the city, including events and activities.
+Ask your question directly, and I'll provide a precise and quick,short and crisp response in a conversational and straight-foreward way without any Greet.
 Context:
 {context}
         context=context,
         question=question
     )
+    # Use the ChatOpenAI instance to generate a response directly from the formatted prompt
     llm = ChatOpenAI(temperature=0, api_key=os.environ['OPENAI_API_KEY'])
     response = llm(formatted_prompt)
     return response.content.strip()
 # Define the function to generate a hybrid response using Neo4j and other retrieval methods
 def retriever(question: str):
+    # Structured data retrieval from Neo4j
     structured_query = f"""
     CALL db.index.fulltext.queryNodes('entity', $query, {{limit: 2}})
     YIELD node, score
     structured_data = graph.query(structured_query, {"query": generate_full_text_query(question)})
     structured_response = "\n".join([f"{record['entity']}: {record['context']}" for record in structured_data])
+    # Unstructured data retrieval from vector store
     unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
     unstructured_response = "\n".join(unstructured_data)
+    # Combine structured and unstructured responses
     combined_context = f"Structured data:\n{structured_response}\n\nUnstructured data:\n{unstructured_response}"
+    # Generate the final response using the prompt template
     final_response = generate_response_with_prompt(combined_context, question)
     return final_response
+# Function to handle the entire audio query and response process
+def process_audio_query(audio_input):
+    stream = None
+    _, transcription, _ = transcribe_function(stream, audio_input)
+    print(f"Transcription: {transcription}")
+    # Retrieve hybrid response using Neo4j and other methods
+    response_text = retriever(transcription)
+    print(f"Response: {response_text}")
+    # Generate audio from the response text
+    audio_path = generate_audio_elevenlabs(response_text)
+    return audio_path
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column():
+            input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
+        with gr.Column():
+            chatbot = gr.Chatbot(label="Conversation", type="messages")
+            output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
+    state = gr.State(value=AppState())
+    stream = input_audio.stream(
+        process_audio_query,
+        [input_audio, state],
+        [output_audio, state],
+        stream_every=0.50,
+        time_limit=30,
+    )
+    restart = output_audio.stop(
+        start_recording_user,
+        [state],
+        [input_audio]
+    )
+    cancel = gr.Button("Stop Conversation", variant="stop")
+    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
+                 [state, input_audio], cancels=[stream, restart])
+    demo.launch()