SreekarB committed on
Commit
e394327
·
verified ·
1 Parent(s): 7eac230

Upload 7 files

Browse files
Files changed (7) hide show
  1. app.py +39 -50
  2. audio_utils.py +8 -16
  3. conversation_logic.py +5 -21
  4. prompts.py +10 -0
  5. requirements.txt +1 -6
  6. save_state.py +16 -0
  7. session_manager.py +24 -107
app.py CHANGED
@@ -1,54 +1,43 @@
1
  import gradio as gr
2
- from nova_sonic.session_manager import NovaSession
3
- from nlp.conversation_logic import handle_conversation_turn, get_initial_greeting
4
- from nova_sonic.audio_utils import encode_audio, decode_audio, play_audio
5
- import os, base64, json
6
-
7
- # Initialize the session with Nova
8
- session = NovaSession()
9
-
10
- conversation_log = []
11
-
12
- # Gradio Interface - Speech to Text
13
- def interact(audio_input):
14
- base64_audio = encode_audio(audio_input)
15
- nova_response = session.send_audio(base64_audio)
16
-
17
- decoded_audio = decode_audio(nova_response['audio'])
18
- user_text = nova_response['user_transcript']
19
- system_text = nova_response['system_transcript']
20
-
21
- feedback = handle_conversation_turn(user_text, system_text)
22
 
23
- conversation_log.append({
24
- "user_text": user_text,
25
- "system_text": system_text,
26
- "feedback": feedback
27
- })
28
-
29
- return decoded_audio
30
-
31
- def save_logs():
32
- with open("logs/session_log.json", "w") as f:
33
- json.dump(conversation_log, f)
34
-
35
- # Initial greeting from Nova
36
- def initiate_conversation():
37
- initial_message = get_initial_greeting() # Fetch random initial question
38
- nova_audio = session.generate_audio(initial_message) # Nova's response
39
- play_audio(nova_audio) # Play Nova's initial message
40
- return nova_audio # Return audio for Gradio to handle
41
-
42
- # Gradio Interface setup
43
- app = gr.Interface(
44
- fn=interact,
45
- inputs=gr.Audio(source="microphone", type="filepath"),
46
- outputs=gr.Audio(type="numpy"),
47
- live=True
48
  )
49
 
50
- # Launch and save logs after each session
51
- app.launch(after_live=save_logs)
52
-
53
- # Call Nova to initiate the conversation with a simple question
54
- initiate_conversation()
 
1
  import gradio as gr
2
+ from session_manager import NovaSession
3
+ from conversation_logic import generate_response
4
+ from save_state import save_conversation, load_conversation
5
+ from prompts import get_opening_prompt
6
+ import audio_utils
7
+
8
# Module-level Nova API session shared by every Gradio request.
nova = NovaSession()
# NOTE(review): `state` is never written anywhere in this module, so the
# `session_id in state` check in start_conversation is always False —
# confirm whether this dict is vestigial or meant to cache sessions.
state = {}
10
+
11
def start_conversation(user_audio, session_id=None):
    """Run one conversational turn with Nova.

    Transcribes *user_audio*, appends it to the session history, generates
    Nova's reply, persists the updated history, and synthesizes the reply
    as audio.

    Parameters:
        user_audio: filepath of the recorded user clip (Gradio "filepath").
        session_id: existing session id to resume, or None/"" to start fresh.

    Returns:
        Tuple of (response_audio_path, session_id) for the Gradio outputs.
    """
    # Bug fix: the original tested `session_id in state`, but `state` is
    # never populated, so resuming an existing session was impossible.
    # Resume from the history persisted on disk instead (load_conversation
    # returns [] when no file exists, which falls through to a new session).
    history = load_conversation(session_id) if session_id else []
    if not history:
        session_id = nova.start_session()
        # Seed a brand-new session with a randomized opening prompt.
        history = [{"role": "system", "content": get_opening_prompt()}]
        save_conversation(session_id, history)

    user_text = nova.audio_to_text(user_audio)
    history.append({"role": "user", "content": user_text})

    nova_response = generate_response(history)
    history.append({"role": "assistant", "content": nova_response})

    save_conversation(session_id, history)

    response_audio = nova.text_to_audio(nova_response)
    return response_audio, session_id
32
+
33
# Gradio UI: microphone audio + optional session id in; Nova's spoken
# reply + the (possibly new) session id out. Echoing the session id back
# lets the user paste it in to resume later.
# NOTE(review): `gr.Audio(source=...)` was renamed `sources=` in Gradio 4.x —
# confirm the pinned gradio version still accepts this signature.
iface = gr.Interface(
    fn=start_conversation,
    inputs=[gr.Audio(source="microphone", type="filepath"), gr.Textbox(label="Session ID (leave blank to start new)")],
    outputs=[gr.Audio(type="filepath", label="Nova Response"), gr.Textbox(label="Session ID")],
    live=True,
    title="NovaSpeech Therapy",
    description="Real-time natural conversation speech therapy assistant"
)
41
 
42
# Launch the Gradio app only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()
 
 
 
audio_utils.py CHANGED
@@ -1,19 +1,11 @@
1
  import base64
2
- import soundfile as sf
3
- import numpy as np
4
 
5
- def encode_audio(filepath):
6
- data, samplerate = sf.read(filepath)
7
- byte_data = (data * 32767).astype(np.int16).tobytes()
8
- return base64.b64encode(byte_data).decode('utf-8')
9
 
10
- def decode_audio(base64_audio):
11
- byte_data = base64.b64decode(base64_audio)
12
- audio = np.frombuffer(byte_data, dtype=np.int16)
13
- return audio, 24000
14
-
15
- def play_audio(audio_data):
16
- """ A simple method to play audio (you can enhance with a Gradio component) """
17
- # Using Gradio or any audio library to play the audio
18
- # This is a placeholder, adjust according to your environment or library
19
- return audio_data
 
1
  import base64
 
 
2
 
3
def encode_audio_to_base64(audio_path):
    """Return the contents of the file at *audio_path* as a base64 string."""
    with open(audio_path, "rb") as audio_file:
        raw_bytes = audio_file.read()
    return base64.b64encode(raw_bytes).decode()
 
6
 
7
def decode_audio_from_base64(encoded_audio, output_path):
    """Decode *encoded_audio* (base64 text) and write the bytes to *output_path*.

    Returns *output_path* so callers can chain the result directly.
    """
    raw_bytes = base64.b64decode(encoded_audio)
    with open(output_path, "wb") as out_file:
        out_file.write(raw_bytes)
    return output_path
 
 
 
 
 
conversation_logic.py CHANGED
@@ -1,21 +1,5 @@
1
- import random
2
-
3
- # Simple set of questions for initiating conversation
4
- initial_questions = [
5
- "Hi there! It's good to see you. What's your favorite color?",
6
- "Hello! If you could have any pet, what would it be?",
7
- "Hey! What's your favorite food?",
8
- "Hi! Do you like to play outside or inside more?",
9
- "Hello there! Can you tell me your favorite animal?"
10
- ]
11
-
12
- def get_initial_greeting():
13
- return random.choice(initial_questions)
14
-
15
- def handle_conversation_turn(user_text, system_text):
16
- feedback = {}
17
- if "goed" in user_text:
18
- feedback["correction"] = "You should say 'went' instead of 'goed'."
19
- feedback["error_type"] = "past tense verb"
20
- # Additional feedback logic can be added here
21
- return feedback
 
1
def generate_response(history):
    """Produce Nova's next reply from the conversation *history*.

    Parameters:
        history: list of {"role": ..., "content": ...} turn dicts; the last
            entry is expected to be the user's latest utterance.

    Returns:
        A canned reply string: a greeting response when the latest turn
        contains "hello" (case-insensitive), otherwise a generic follow-up.
    """
    # Robustness fix: an empty history (e.g. a fresh session before any
    # user turn) previously raised IndexError on history[-1]; return the
    # generic follow-up instead of crashing.
    if not history:
        return "That's interesting! Tell me more."
    last_user_input = history[-1].get("content", "")
    if "hello" in last_user_input.lower():
        return "Hi there! How are you feeling today?"
    return "That's interesting! Tell me more."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
prompts.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
import random

# Candidate conversation starters; one is picked uniformly per new session.
_OPENINGS = (
    "Hey! I'm Nova. I'm excited to talk with you today! What's something fun you did recently?",
    "Hello friend! I'm Nova. Let's chat! What's your favorite animal?",
    "Hi there! Nova here. If you could visit any place in the world, where would you go?",
    "Hey! I'm Nova. I can't wait to learn more about you! What's your favorite color?",
)


def get_opening_prompt():
    """Return a randomly chosen opening line for a brand-new conversation."""
    return random.choice(_OPENINGS)
requirements.txt CHANGED
@@ -1,7 +1,2 @@
1
  gradio
2
- requests
3
- soundfile
4
- numpy
5
- transformers
6
- torch
7
- pyaudio
 
1
  gradio
2
+ requests
 
 
 
 
 
save_state.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os

# Directory where per-session conversation histories are persisted.
SAVE_DIR = "logs/"


def _session_path(session_id):
    """Build the JSON path for *session_id*.

    Fix: the original used f"{SAVE_DIR}/{session_id}.json", which produced a
    sloppy double slash ("logs//<id>.json") because SAVE_DIR already ends in
    "/"; os.path.join normalizes this.
    """
    return os.path.join(SAVE_DIR, f"{session_id}.json")


def save_conversation(session_id, history):
    """Write *history* (a list of turn dicts) to logs/<session_id>.json."""
    os.makedirs(SAVE_DIR, exist_ok=True)
    with open(_session_path(session_id), "w") as f:
        json.dump(history, f)


def load_conversation(session_id):
    """Return the saved history for *session_id*, or [] when none exists."""
    try:
        with open(_session_path(session_id), "r") as f:
            return json.load(f)
    except FileNotFoundError:
        return []
session_manager.py CHANGED
@@ -1,115 +1,32 @@
1
  import requests
2
- import uuid
3
- import json
4
 
5
  class NovaSession:
6
  def __init__(self):
7
- self.session_id = str(uuid.uuid4())
8
- self.prompt_id = "126680f5-5859-4d15-ae70-488de4146484"
9
- self.session_url = "https://api.novasonic.aws/session"
10
- self.headers = {"Content-Type": "application/json"}
11
- self.init_session()
12
 
13
- def init_session(self):
14
- payload = {
15
- "event": {
16
- "sessionStart": {
17
- "inferenceConfiguration": {
18
- "maxTokens": 1024,
19
- "topP": 0.9,
20
- "temperature": 0.7
21
- }
22
- },
23
- "promptStart": {
24
- "promptName": self.prompt_id,
25
- "textOutputConfiguration": {"mediaType": "text/plain"},
26
- "audioOutputConfiguration": {
27
- "mediaType": "audio/lpcm",
28
- "sampleRateHertz": 24000,
29
- "sampleSizeBits": 16,
30
- "channelCount": 1,
31
- "voiceId": "matthew",
32
- "encoding": "base64",
33
- "audioType": "SPEECH"
34
- },
35
- "toolUseOutputConfiguration": {
36
- "mediaType": "application/json"
37
- },
38
- "toolConfiguration": {
39
- "tools": [
40
- {
41
- "toolSpec": {
42
- "name": "getDateTool",
43
- "description": "get information about the current date",
44
- "inputSchema": {
45
- "json": '{"type":"object","properties":{},"required":[]}'
46
- }
47
- }
48
- }
49
- ]
50
- }
51
- }
52
  }
53
- }
54
- requests.post(self.session_url, headers=self.headers, json=payload)
55
 
56
- def send_audio(self, base64_audio):
57
- content_id = str(uuid.uuid4())
58
- payload = {
59
- "event": {
60
- "contentStart": {
61
- "promptName": self.prompt_id,
62
- "contentName": content_id,
63
- "type": "AUDIO",
64
- "interactive": True,
65
- "role": "USER",
66
- "audioInputConfiguration": {
67
- "mediaType": "audio/lpcm",
68
- "sampleRateHertz": 16000,
69
- "sampleSizeBits": 16,
70
- "channelCount": 1,
71
- "audioType": "SPEECH",
72
- "encoding": "base64"
73
- }
74
- },
75
- "audioInput": {
76
- "promptName": self.prompt_id,
77
- "contentName": content_id,
78
- "content": base64_audio
79
- },
80
- "contentEnd": {
81
- "promptName": self.prompt_id,
82
- "contentName": content_id
83
- }
84
- }
85
- }
86
- response = requests.post(self.session_url, headers=self.headers, json=payload)
87
- return response.json()
88
 
89
- def generate_audio(self, message):
90
- # Send a text message to Nova to get an audio response
91
- payload = {
92
- "event": {
93
- "contentStart": {
94
- "promptName": self.prompt_id,
95
- "contentName": str(uuid.uuid4()),
96
- "type": "TEXT",
97
- "role": "SYSTEM",
98
- "interactive": True,
99
- "textInputConfiguration": {
100
- "mediaType": "text/plain"
101
- }
102
- },
103
- "textInput": {
104
- "promptName": self.prompt_id,
105
- "contentName": str(uuid.uuid4()),
106
- "content": message
107
- },
108
- "contentEnd": {
109
- "promptName": self.prompt_id,
110
- "contentName": str(uuid.uuid4())
111
- }
112
- }
113
- }
114
- response = requests.post(self.session_url, headers=self.headers, json=payload)
115
- return response.json()['audio']
 
1
import base64
import tempfile

import requests


class NovaSession:
    """Thin HTTP client for the Nova Sonic speech service endpoints."""

    # NOTE(review): endpoints are hard-coded; confirm against deployment config.
    def __init__(self):
        self.session_url = "https://api.novasonic.com/start_session"
        self.audio_to_text_url = "https://api.novasonic.com/audio_to_text"
        self.text_to_audio_url = "https://api.novasonic.com/text_to_audio"

    def start_session(self):
        """Open a new Nova session and return its id.

        Falls back to "new_session" when the service response lacks a
        session_id key.
        """
        # Fix: the original requests.post calls had no timeout, so an
        # unresponsive service would hang the Gradio UI forever.
        response = requests.post(
            self.session_url,
            json={
                "inferenceConfiguration": {
                    "maxTokens": 1024,
                    "topP": 0.9,
                    "temperature": 0.7,
                }
            },
            timeout=30,
        )
        return response.json().get("session_id", "new_session")

    def audio_to_text(self, audio_path):
        """Transcribe the audio file at *audio_path* via the Nova STT endpoint.

        Returns the transcript text, or "" when the response has no "text".
        """
        with open(audio_path, "rb") as f:
            encoded_audio = base64.b64encode(f.read()).decode()
        response = requests.post(
            self.audio_to_text_url, json={"audio": encoded_audio}, timeout=60
        )
        return response.json().get("text", "")

    def text_to_audio(self, text):
        """Synthesize *text* to speech and return the path of the written file.

        Fix: the original always wrote to the fixed path
        /tmp/nova_response.wav, so concurrent sessions clobbered each
        other's audio; a unique temp file per call avoids the race.
        """
        response = requests.post(
            self.text_to_audio_url, json={"text": text}, timeout=60
        )
        audio_content = base64.b64decode(response.json().get("audio", ""))
        with tempfile.NamedTemporaryFile(
            prefix="nova_response_", suffix=".wav", delete=False
        ) as out:
            out.write(audio_content)
        return out.name