SreekarB committed on
Commit
e394327
·
verified ·
1 Parent(s): 7eac230

Upload 7 files

Browse files
Files changed (7) hide show
  1. app.py +39 -50
  2. audio_utils.py +8 -16
  3. conversation_logic.py +5 -21
  4. prompts.py +10 -0
  5. requirements.txt +1 -6
  6. save_state.py +16 -0
  7. session_manager.py +24 -107
app.py CHANGED
@@ -1,54 +1,43 @@
1
  import gradio as gr
2
- from nova_sonic.session_manager import NovaSession
3
- from nlp.conversation_logic import handle_conversation_turn, get_initial_greeting
4
- from nova_sonic.audio_utils import encode_audio, decode_audio, play_audio
5
- import os, base64, json
6
-
7
- # Initialize the session with Nova
8
- session = NovaSession()
9
-
10
- conversation_log = []
11
-
12
- # Gradio Interface - Speech to Text
13
- def interact(audio_input):
14
- base64_audio = encode_audio(audio_input)
15
- nova_response = session.send_audio(base64_audio)
16
-
17
- decoded_audio = decode_audio(nova_response['audio'])
18
- user_text = nova_response['user_transcript']
19
- system_text = nova_response['system_transcript']
20
-
21
- feedback = handle_conversation_turn(user_text, system_text)
22
 
23
- conversation_log.append({
24
- "user_text": user_text,
25
- "system_text": system_text,
26
- "feedback": feedback
27
- })
28
-
29
- return decoded_audio
30
-
31
- def save_logs():
32
- with open("logs/session_log.json", "w") as f:
33
- json.dump(conversation_log, f)
34
-
35
- # Initial greeting from Nova
36
- def initiate_conversation():
37
- initial_message = get_initial_greeting() # Fetch random initial question
38
- nova_audio = session.generate_audio(initial_message) # Nova's response
39
- play_audio(nova_audio) # Play Nova's initial message
40
- return nova_audio # Return audio for Gradio to handle
41
-
42
- # Gradio Interface setup
43
- app = gr.Interface(
44
- fn=interact,
45
- inputs=gr.Audio(source="microphone", type="filepath"),
46
- outputs=gr.Audio(type="numpy"),
47
- live=True
48
  )
49
 
50
- # Launch and save logs after each session
51
- app.launch(after_live=save_logs)
52
-
53
- # Call Nova to initiate the conversation with a simple question
54
- initiate_conversation()
 
1
  import gradio as gr
2
+ from session_manager import NovaSession
3
+ from conversation_logic import generate_response
4
+ from save_state import save_conversation, load_conversation
5
+ from prompts import get_opening_prompt
6
+ import audio_utils
7
+
8
# Module-level Nova API session shared by every Gradio request.
nova = NovaSession()
# NOTE(review): `state` is never written anywhere in this module, so the
# `session_id in state` check in start_conversation is always False —
# confirm whether this dict is vestigial or meant to cache sessions.
state = {}
10
+
11
def start_conversation(user_audio, session_id=None):
    """Run one conversational turn with Nova.

    Transcribes *user_audio*, appends it to the session history, generates
    Nova's reply, persists the updated history, and synthesizes the reply
    as audio.

    Parameters:
        user_audio: filepath of the recorded user clip (Gradio "filepath").
        session_id: existing session id to resume, or None/"" to start fresh.

    Returns:
        Tuple of (response_audio_path, session_id) for the Gradio outputs.
    """
    # Bug fix: the original tested `session_id in state`, but `state` is
    # never populated, so resuming an existing session was impossible.
    # Resume from the history persisted on disk instead (load_conversation
    # returns [] when no file exists, which falls through to a new session).
    history = load_conversation(session_id) if session_id else []
    if not history:
        session_id = nova.start_session()
        # Seed a brand-new session with a randomized opening prompt.
        history = [{"role": "system", "content": get_opening_prompt()}]
        save_conversation(session_id, history)

    user_text = nova.audio_to_text(user_audio)
    history.append({"role": "user", "content": user_text})

    nova_response = generate_response(history)
    history.append({"role": "assistant", "content": nova_response})

    save_conversation(session_id, history)

    response_audio = nova.text_to_audio(nova_response)
    return response_audio, session_id
32
+
33
# Gradio UI: microphone audio + optional session id in; Nova's spoken
# reply + the (possibly new) session id out. Echoing the session id back
# lets the user paste it in to resume later.
# NOTE(review): `gr.Audio(source=...)` was renamed `sources=` in Gradio 4.x —
# confirm the pinned gradio version still accepts this signature.
iface = gr.Interface(
    fn=start_conversation,
    inputs=[gr.Audio(source="microphone", type="filepath"), gr.Textbox(label="Session ID (leave blank to start new)")],
    outputs=[gr.Audio(type="filepath", label="Nova Response"), gr.Textbox(label="Session ID")],
    live=True,
    title="NovaSpeech Therapy",
    description="Real-time natural conversation speech therapy assistant"
)
41
 
42
# Launch the Gradio app only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()
 
 
 
audio_utils.py CHANGED
@@ -1,19 +1,11 @@
1
  import base64
2
- import soundfile as sf
3
- import numpy as np
4
 
5
- def encode_audio(filepath):
6
- data, samplerate = sf.read(filepath)
7
- byte_data = (data * 32767).astype(np.int16).tobytes()
8
- return base64.b64encode(byte_data).decode('utf-8')
9
 
10
- def decode_audio(base64_audio):
11
- byte_data = base64.b64decode(base64_audio)
12
- audio = np.frombuffer(byte_data, dtype=np.int16)
13
- return audio, 24000
14
-
15
- def play_audio(audio_data):
16
- """ A simple method to play audio (you can enhance with a Gradio component) """
17
- # Using Gradio or any audio library to play the audio
18
- # This is a placeholder, adjust according to your environment or library
19
- return audio_data
 
1
  import base64
 
 
2
 
3
def encode_audio_to_base64(audio_path):
    """Return the contents of the file at *audio_path* as a base64 string."""
    with open(audio_path, "rb") as audio_file:
        raw_bytes = audio_file.read()
    return base64.b64encode(raw_bytes).decode()
 
6
 
7
def decode_audio_from_base64(encoded_audio, output_path):
    """Decode *encoded_audio* (base64 text) and write the bytes to *output_path*.

    Returns *output_path* so callers can chain the result directly.
    """
    raw_bytes = base64.b64decode(encoded_audio)
    with open(output_path, "wb") as out_file:
        out_file.write(raw_bytes)
    return output_path
 
 
 
 
 
conversation_logic.py CHANGED
@@ -1,21 +1,5 @@
1
- import random
2
-
3
- # Simple set of questions for initiating conversation
4
- initial_questions = [
5
- "Hi there! It's good to see you. What's your favorite color?",
6
- "Hello! If you could have any pet, what would it be?",
7
- "Hey! What's your favorite food?",
8
- "Hi! Do you like to play outside or inside more?",
9
- "Hello there! Can you tell me your favorite animal?"
10
- ]
11
-
12
- def get_initial_greeting():
13
- return random.choice(initial_questions)
14
-
15
- def handle_conversation_turn(user_text, system_text):
16
- feedback = {}
17
- if "goed" in user_text:
18
- feedback["correction"] = "You should say 'went' instead of 'goed'."
19
- feedback["error_type"] = "past tense verb"
20
- # Additional feedback logic can be added here
21
- return feedback
 
1
def generate_response(history):
    """Produce Nova's next reply from the conversation *history*.

    Parameters:
        history: list of {"role": ..., "content": ...} turn dicts; the last
            entry is expected to be the user's latest utterance.

    Returns:
        A canned reply string: a greeting response when the latest turn
        contains "hello" (case-insensitive), otherwise a generic follow-up.
    """
    # Robustness fix: an empty history (e.g. a fresh session before any
    # user turn) previously raised IndexError on history[-1]; return the
    # generic follow-up instead of crashing.
    if not history:
        return "That's interesting! Tell me more."
    last_user_input = history[-1].get("content", "")
    if "hello" in last_user_input.lower():
        return "Hi there! How are you feeling today?"
    return "That's interesting! Tell me more."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
prompts.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
import random

# Candidate conversation starters; one is picked uniformly per new session.
_OPENINGS = (
    "Hey! I'm Nova. I'm excited to talk with you today! What's something fun you did recently?",
    "Hello friend! I'm Nova. Let's chat! What's your favorite animal?",
    "Hi there! Nova here. If you could visit any place in the world, where would you go?",
    "Hey! I'm Nova. I can't wait to learn more about you! What's your favorite color?",
)


def get_opening_prompt():
    """Return a randomly chosen opening line for a brand-new conversation."""
    return random.choice(_OPENINGS)
requirements.txt CHANGED
@@ -1,7 +1,2 @@
1
  gradio
2
- requests
3
- soundfile
4
- numpy
5
- transformers
6
- torch
7
- pyaudio
 
1
  gradio
2
+ requests
 
 
 
 
 
save_state.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os

# Directory where per-session conversation histories are persisted.
SAVE_DIR = "logs/"


def _session_path(session_id):
    """Build the JSON path for *session_id*.

    Fix: the original used f"{SAVE_DIR}/{session_id}.json", which produced a
    sloppy double slash ("logs//<id>.json") because SAVE_DIR already ends in
    "/"; os.path.join normalizes this.
    """
    return os.path.join(SAVE_DIR, f"{session_id}.json")


def save_conversation(session_id, history):
    """Write *history* (a list of turn dicts) to logs/<session_id>.json."""
    os.makedirs(SAVE_DIR, exist_ok=True)
    with open(_session_path(session_id), "w") as f:
        json.dump(history, f)


def load_conversation(session_id):
    """Return the saved history for *session_id*, or [] when none exists."""
    try:
        with open(_session_path(session_id), "r") as f:
            return json.load(f)
    except FileNotFoundError:
        return []
session_manager.py CHANGED
@@ -1,115 +1,32 @@
1
  import requests
2
- import uuid
3
- import json
4
 
5
  class NovaSession:
6
  def __init__(self):
7
- self.session_id = str(uuid.uuid4())
8
- self.prompt_id = "126680f5-5859-4d15-ae70-488de4146484"
9
- self.session_url = "https://api.novasonic.aws/session"
10
- self.headers = {"Content-Type": "application/json"}
11
- self.init_session()
12
 
13
- def init_session(self):
14
- payload = {
15
- "event": {
16
- "sessionStart": {
17
- "inferenceConfiguration": {
18
- "maxTokens": 1024,
19
- "topP": 0.9,
20
- "temperature": 0.7
21
- }
22
- },
23
- "promptStart": {
24
- "promptName": self.prompt_id,
25
- "textOutputConfiguration": {"mediaType": "text/plain"},
26
- "audioOutputConfiguration": {
27
- "mediaType": "audio/lpcm",
28
- "sampleRateHertz": 24000,
29
- "sampleSizeBits": 16,
30
- "channelCount": 1,
31
- "voiceId": "matthew",
32
- "encoding": "base64",
33
- "audioType": "SPEECH"
34
- },
35
- "toolUseOutputConfiguration": {
36
- "mediaType": "application/json"
37
- },
38
- "toolConfiguration": {
39
- "tools": [
40
- {
41
- "toolSpec": {
42
- "name": "getDateTool",
43
- "description": "get information about the current date",
44
- "inputSchema": {
45
- "json": '{"type":"object","properties":{},"required":[]}'
46
- }
47
- }
48
- }
49
- ]
50
- }
51
- }
52
  }
53
- }
54
- requests.post(self.session_url, headers=self.headers, json=payload)
55
 
56
- def send_audio(self, base64_audio):
57
- content_id = str(uuid.uuid4())
58
- payload = {
59
- "event": {
60
- "contentStart": {
61
- "promptName": self.prompt_id,
62
- "contentName": content_id,
63
- "type": "AUDIO",
64
- "interactive": True,
65
- "role": "USER",
66
- "audioInputConfiguration": {
67
- "mediaType": "audio/lpcm",
68
- "sampleRateHertz": 16000,
69
- "sampleSizeBits": 16,
70
- "channelCount": 1,
71
- "audioType": "SPEECH",
72
- "encoding": "base64"
73
- }
74
- },
75
- "audioInput": {
76
- "promptName": self.prompt_id,
77
- "contentName": content_id,
78
- "content": base64_audio
79
- },
80
- "contentEnd": {
81
- "promptName": self.prompt_id,
82
- "contentName": content_id
83
- }
84
- }
85
- }
86
- response = requests.post(self.session_url, headers=self.headers, json=payload)
87
- return response.json()
88
 
89
- def generate_audio(self, message):
90
- # Send a text message to Nova to get an audio response
91
- payload = {
92
- "event": {
93
- "contentStart": {
94
- "promptName": self.prompt_id,
95
- "contentName": str(uuid.uuid4()),
96
- "type": "TEXT",
97
- "role": "SYSTEM",
98
- "interactive": True,
99
- "textInputConfiguration": {
100
- "mediaType": "text/plain"
101
- }
102
- },
103
- "textInput": {
104
- "promptName": self.prompt_id,
105
- "contentName": str(uuid.uuid4()),
106
- "content": message
107
- },
108
- "contentEnd": {
109
- "promptName": self.prompt_id,
110
- "contentName": str(uuid.uuid4())
111
- }
112
- }
113
- }
114
- response = requests.post(self.session_url, headers=self.headers, json=payload)
115
- return response.json()['audio']
 
1
import base64
import tempfile

import requests


class NovaSession:
    """Thin HTTP client for the Nova Sonic speech service endpoints."""

    # NOTE(review): endpoints are hard-coded; confirm against deployment config.
    def __init__(self):
        self.session_url = "https://api.novasonic.com/start_session"
        self.audio_to_text_url = "https://api.novasonic.com/audio_to_text"
        self.text_to_audio_url = "https://api.novasonic.com/text_to_audio"

    def start_session(self):
        """Open a new Nova session and return its id.

        Falls back to "new_session" when the service response lacks a
        session_id key.
        """
        # Fix: the original requests.post calls had no timeout, so an
        # unresponsive service would hang the Gradio UI forever.
        response = requests.post(
            self.session_url,
            json={
                "inferenceConfiguration": {
                    "maxTokens": 1024,
                    "topP": 0.9,
                    "temperature": 0.7,
                }
            },
            timeout=30,
        )
        return response.json().get("session_id", "new_session")

    def audio_to_text(self, audio_path):
        """Transcribe the audio file at *audio_path* via the Nova STT endpoint.

        Returns the transcript text, or "" when the response has no "text".
        """
        with open(audio_path, "rb") as f:
            encoded_audio = base64.b64encode(f.read()).decode()
        response = requests.post(
            self.audio_to_text_url, json={"audio": encoded_audio}, timeout=60
        )
        return response.json().get("text", "")

    def text_to_audio(self, text):
        """Synthesize *text* to speech and return the path of the written file.

        Fix: the original always wrote to the fixed path
        /tmp/nova_response.wav, so concurrent sessions clobbered each
        other's audio; a unique temp file per call avoids the race.
        """
        response = requests.post(
            self.text_to_audio_url, json={"text": text}, timeout=60
        )
        audio_content = base64.b64decode(response.json().get("audio", ""))
        with tempfile.NamedTemporaryFile(
            prefix="nova_response_", suffix=".wav", delete=False
        ) as out:
            out.write(audio_content)
        return out.name