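"""Streamlit app: record or upload audio, transcribe it with Whisper,
generate a reply with a Llama model served via SambaNova's
OpenAI-compatible API, and speak the reply back with gTTS."""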
import os
import whisper
from gtts import gTTS
from dotenv import load_dotenv
import openai
import streamlit as st
import tempfile

# Load environment variables
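# (only needed when the API key comes from a .env file; see the client setup below)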
load_dotenv()

# Initialize Whisper Model
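# st.cache_resource keeps the loaded model in memory across Streamlit reruns,
# so the weights are loaded only once per process.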
@st.cache_resource
def load_whisper_model():
    return whisper.load_model("small")

whisper_model = load_whisper_model()

# Streamlit UI
st.title("Conversational AI with Speech-to-Speech Response")
st.write("Record your voice or upload an audio file to start the process.")

# Sidebar Interaction Mode
interaction_mode = st.sidebar.selectbox(
    "Choose Interaction Mode:", ["Record Voice", "Upload Audio"]
)

# Record Voice Functionality with st.audio_input
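# Note: st.audio_input requires a recent Streamlit version (1.39 or later);
# it returns the recording as a file-like object containing WAV bytes.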
if interaction_mode == "Record Voice":
    st.write("Use the audio recorder below to record your voice:")
    
    # Record audio using st.audio_input
    audio_data = st.audio_input("Record your voice")
    
    if audio_data:
        st.info("Recording received. Processing...")
        
        # Save the audio data to a temporary file
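        # delete=False keeps the file on disk after the `with` block so Whisper
        # can read it; it is removed explicitly at the end of the run.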
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(audio_data.getvalue())  # Use .getvalue() to extract raw bytes
            temp_audio_path = temp_audio.name

        # Play back the saved audio
        st.audio(temp_audio_path, format="audio/wav")
        st.success("Audio saved and ready for transcription!")


# Upload Audio Functionality
elif interaction_mode == "Upload Audio":
    uploaded_file = st.file_uploader("Upload your audio file (MP3/WAV)", type=["mp3", "wav"])
    
    if uploaded_file is not None:
        st.info("File uploaded. Saving...")
        
        # Save the uploaded audio, preserving its extension so playback and
        # transcription see the correct container format
        suffix = os.path.splitext(uploaded_file.name)[1].lower() or ".mp3"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio:
            temp_audio.write(uploaded_file.read())  # Write uploaded audio content
            temp_audio_path = temp_audio.name

        # Play back the uploaded audio using its reported MIME type
        st.audio(temp_audio_path, format=uploaded_file.type)
        st.success("Audio uploaded and ready for transcription!")

# Transcribe and Process Audio
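# temp_audio_path only exists if one of the branches above saved a file, so
# check locals() to skip this section on a fresh run with no audio yet.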
if 'temp_audio_path' in locals() and temp_audio_path:
    st.write("Processing the audio file for transcription...")
    
    with st.spinner("Transcribing audio..."):
        result = whisper_model.transcribe(temp_audio_path)
        user_text = result["text"]
        st.write("Transcribed Text:", user_text)
        st.success("Transcription complete!")

    # Generate AI Response
    st.write("Generating a conversational response...")
    
    with st.spinner("Generating response..."):
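        # SambaNova exposes an OpenAI-compatible endpoint, so the standard
        # openai client works with a custom base_url. The low temperature and
        # top_p below keep the reply focused and repeatable.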
       
        client = openai.OpenAI(
            # Uncomment below to read the key from a .env file for localhost
            # or other deployments:
            # api_key=os.environ.get("SAMBANOVA_API_KEY"),

            # For Streamlit deployment, read the key from st.secrets:
            api_key=st.secrets["SAMBANOVA_API_KEY"],
            base_url="https://api.sambanova.ai/v1",
        )
        
        response = client.chat.completions.create(
            model='Meta-Llama-3.1-8B-Instruct',
            messages=[
                {"role": "system", "content": (
                    "You are a kind, empathetic, and intelligent assistant capable of meaningful conversations and emotional support. "
                    "Your primary goals are: "
                    "1. To engage in casual, friendly, and supportive conversations when the user seeks companionship or emotional relief. "
                    "2. To adapt your tone and responses to match the user's mood, providing warmth and encouragement if they seem distressed or seeking emotional support. "
                    "3. To answer questions accurately and provide explanations when asked, adjusting the depth and length of your answers based on the user's needs. "
                    "4. To maintain a positive and non-judgmental tone, offering helpful advice or lighthearted dialogue when appropriate. "
                    "5. To ensure the user feels heard, understood, and valued during every interaction. "
                    "If the user does not ask a question, keep the conversation engaging and meaningful by responding thoughtfully or with light humor where appropriate."
                )},
                {"role": "user", "content": user_text},
            ],
            temperature=0.1,
            top_p=0.1,
        )
        
        answer = response.choices[0].message.content
        st.write("Response:", answer)
        st.success("Response generated!")

    # Convert response text to speech using gTTS
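    # gTTS synthesizes speech via Google Translate's TTS endpoint, so this
    # step needs network access.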
    st.write("Converting the response to speech...")
    
    with st.spinner("Converting text to speech..."):
        tts = gTTS(text=answer, lang="en", slow=False)  # English is gTTS's default; set it explicitly
        response_audio_path = "final_response.mp3"
        tts.save(response_audio_path)
        st.success("Conversion complete!")

    # Play and download the response MP3
    st.audio(response_audio_path, format="audio/mp3")
    with open(response_audio_path, "rb") as audio_file:
        response_bytes = audio_file.read()  # read and close the file before it is deleted below
    st.download_button(
        label="Download the Response",
        data=response_bytes,
        file_name="final_response.mp3",
        mime="audio/mpeg",
    )

    # Clean up temporary files
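    # (both files were written with delete=False / a fixed name above)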
    os.remove(temp_audio_path)
    os.remove(response_audio_path)