# Voice-Bot: Hugging Face Space "Daaku-C5/Voice-Bot".
# Captured page metadata: Space status "Sleeping"; file size 10,267 bytes.
# Revision markers that preceded the code in the captured page (presumably
# per-line commit hashes from the web file viewer), listed once each:
#   d7dfe8c, 79a67ac, 72ad8d0, 965fe26, 8277f45, 3d391d9, 6e6e6ae, 30948ea, c3e2346

import streamlit as st
from openai import OpenAI
import io
import base64
import os
import tempfile
from audio_recorder_streamlit import audio_recorder

# Page configuration.
# NOTE(review): Streamlit has historically required set_page_config to run
# before any other st.* call — keep this ahead of the rest of the script.
st.set_page_config(
    page_title="Voice Bot",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Configuration
# NOTE(review): TEMP_AUDIO_FILE is not referenced anywhere else in this file as
# shown (transcribe_audio writes to a tempfile instead) — confirm before removing.
TEMP_AUDIO_FILE = "temp_audio.wav"

# Initialize OpenAI client
@st.cache_resource
def init_openai_client():
    """Build the OpenAI client, halting the app when no API key is available.

    The key is read from the ``OPENAI_API_KEY`` environment variable and, when
    that is unset or empty, from ``st.secrets``. If neither yields a key, setup
    instructions are rendered and ``st.stop()`` is called. Any other exception
    is reported with ``st.error`` before ``st.stop()`` is called.

    Returns:
        An ``OpenAI`` instance constructed with the resolved key.
    """
    try:
        # Environment variable wins (HF Spaces secrets appear as env vars).
        key = os.environ.get("OPENAI_API_KEY")

        if not key:
            # Fall back to Streamlit secrets (local development).
            try:
                key = st.secrets["OPENAI_API_KEY"]
            except (KeyError, FileNotFoundError):
                # No secrets file or no such entry; the check below reports it.
                pass

        if not key:
            st.error("⚠️ OpenAI API key not found!")
            st.markdown("""
            **For Hugging Face Spaces:**
            1. Go to your Space settings
            2. Click on "Repository secrets" 
            3. Add a new secret with name: `OPENAI_API_KEY`
            4. Restart your Space
            
            **For local development:**
            Create `.streamlit/secrets.toml` with:
            ```
            OPENAI_API_KEY = "your-key-here"
            ```
            """)
            st.stop()

        return OpenAI(api_key=key)

    except Exception as e:
        st.error(f"Error initializing OpenAI client: {e!s}")
        st.stop()

# Module-level OpenAI client; the transcription, chat and text-to-speech
# helpers below all read this global.
client = init_openai_client()

# Initialize session state variables
def init_session_state():
    """Seed ``st.session_state`` with a default for every key not yet present.

    Keys handled: ``conversation_history`` (empty list), ``context`` (result of
    ``load_context()``), ``processing`` (False) and ``last_audio_hash`` (None).
    Keys that already exist are left untouched.
    """
    state = st.session_state
    # Factories rather than values, so load_context() only runs when the
    # 'context' key is actually missing.
    default_factories = (
        ('conversation_history', list),
        ('context', lambda: load_context()),
        ('processing', lambda: False),
        ('last_audio_hash', lambda: None),
    )
    for key, make_default in default_factories:
        if key not in state:
            state[key] = make_default()

def load_context():
    """Return the persona context text used to build the system prompt.

    Reads ``context.txt`` from the directory containing this script and returns
    its contents with surrounding whitespace stripped. When the file does not
    exist, a built-in two-line default is returned. On any exception the error
    is shown with ``st.error`` and a shorter one-line default is returned.
    """
    builtin_default = (
        "I am Prakhar, an AI assistant. I can help you with general questions and conversations.\n"
        "I aim to be helpful, harmless, and honest in all my interactions."
    )
    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(script_dir, 'context.txt')

        # Guard clause: no file means the built-in default applies.
        if not os.path.exists(path):
            return builtin_default

        with open(path, "r", encoding='utf-8') as fh:
            return fh.read().strip()

    except Exception as e:
        st.error(f"Error loading context: {e!s}")
        return "I am Prakhar, an AI assistant."

def save_context(context_text):
    """Write ``context_text`` to ``context.txt`` next to this script.

    Args:
        context_text: Text to store; written as UTF-8, replacing any
            existing file contents.

    Returns:
        True when the write succeeded. On any exception the error is shown
        with ``st.error`` and False is returned.
    """
    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        target = os.path.join(script_dir, 'context.txt')
        with open(target, "w", encoding='utf-8') as out:
            out.write(context_text)
    except Exception as e:
        st.error(f"Error saving context: {e!s}")
        return False
    return True

def transcribe_audio(audio_bytes):
    """Transcribe recorded audio to text with the OpenAI Whisper API.

    The bytes are written to a temporary ``.wav`` file, which is then uploaded
    through the module-level ``client``. The temporary file is removed in a
    ``finally`` block, so it is cleaned up whether or not the request succeeds.

    Args:
        audio_bytes: Raw audio data to transcribe.

    Returns:
        The transcript with surrounding whitespace stripped, or None when
        ``audio_bytes`` is empty/None or when any step fails (in which case
        the error is also shown with ``st.error``).
    """
    # Nothing to transcribe: skip the file write and the API round trip.
    if not audio_bytes:
        return None

    tmp_file_path = None
    try:
        # delete=False because the file is reopened by name after this handle
        # is closed; removal is handled in the finally block below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            # Capture the path first so a failed write is still cleaned up.
            tmp_file_path = tmp_file.name
            tmp_file.write(audio_bytes)

        # Transcribe using OpenAI Whisper
        with open(tmp_file_path, "rb") as audio_file:
            transcript = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                language="en"  # You can remove this to auto-detect language
            )

        return transcript.text.strip()

    except Exception as e:
        st.error(f"Error transcribing audio: {str(e)}")
        return None

    finally:
        if tmp_file_path is not None:
            try:
                os.unlink(tmp_file_path)
            except OSError:
                # Best-effort cleanup: failing to delete the temp file must
                # not discard a transcript that was obtained successfully.
                pass

def get_ai_response(user_text, context):
    """Ask the chat model (``gpt-4``) for a reply to ``user_text``.

    Args:
        user_text: The user's message, sent as the single user turn.
        context: Persona text embedded in the system prompt.

    Returns:
        The model's reply with surrounding whitespace stripped. On any
        exception the error is shown with ``st.error`` and a fixed apology
        sentence is returned instead.
    """
    try:
        persona_prompt = f"""You are Prakhar. You should respond naturally and helpfully.

Context about you:
{context}

Instructions:
- Use the context above to inform your responses
- If asked about something not covered in the context, you can use your general knowledge
- If you're not sure about something specific to your context, say "I'm not sure about that based on what I know about myself"
- Keep responses conversational and natural
- Be helpful and engaging"""

        # Only the system prompt and the current utterance are sent;
        # earlier turns are not included.
        chat_messages = [
            {"role": "system", "content": persona_prompt},
            {"role": "user", "content": user_text},
        ]
        completion = client.chat.completions.create(
            model="gpt-4",
            messages=chat_messages,
            max_tokens=500,
            temperature=0.7,
        )
        reply = completion.choices[0].message.content
        return reply.strip()

    except Exception as e:
        st.error(f"Error getting AI response: {e!s}")
        return "I'm sorry, I encountered an error while processing your request."

def text_to_speech(text):
    """Synthesize ``text`` as speech with the OpenAI TTS endpoint.

    Args:
        text: The text to speak.

    Returns:
        The ``content`` of the speech response, or None when the request
        fails (the error is also shown with ``st.error``).
    """
    # Available voices: alloy, echo, fable, onyx, nova, shimmer
    tts_request = {
        "model": "tts-1",
        "voice": "onyx",
        "input": text,
        "speed": 1.0,
    }
    try:
        speech = client.audio.speech.create(**tts_request)
        return speech.content
    except Exception as e:
        st.error(f"Error generating speech: {e!s}")
        return None

def process_audio(audio_bytes):
    """Run a recording through transcription, chat reply and speech synthesis.

    Each stage runs inside its own ``st.spinner``. The chat stage uses the
    context stored in ``st.session_state.context``.

    Args:
        audio_bytes: Recorded audio; falsy input short-circuits.

    Returns:
        A ``(user_text, ai_response, speech_audio)`` tuple, or
        ``(None, None, None)`` when there is no audio or no transcript.
    """
    nothing = (None, None, None)

    if not audio_bytes:
        return nothing

    with st.spinner("🎯 Transcribing audio..."):
        transcript = transcribe_audio(audio_bytes)

    # An empty or failed transcription ends the pipeline early.
    if not transcript:
        return nothing

    with st.spinner("🤖 Generating response..."):
        reply = get_ai_response(transcript, st.session_state.context)

    with st.spinner("🔊 Converting to speech..."):
        reply_audio = text_to_speech(reply)

    return transcript, reply, reply_audio

def _process_new_recording(audio_bytes):
    """Run the voice pipeline once for a recording that has not been seen yet.

    Args:
        audio_bytes: Non-empty recording returned by the recorder widget.

    Returns:
        True when a new exchange was appended to the conversation history;
        False when the recording was already handled or produced no result.
    """
    import hashlib  # function-scope import: only needed on this path

    # Fingerprint used purely to tell recordings apart between script runs;
    # it is not used for anything security related.
    audio_hash = hashlib.md5(audio_bytes).hexdigest()
    if audio_hash == st.session_state.last_audio_hash:
        return False

    st.session_state.processing = True
    # Stored before processing so a recording that fails is not retried on
    # every subsequent run of the script.
    st.session_state.last_audio_hash = audio_hash
    try:
        user_text, ai_response, speech_audio = process_audio(audio_bytes)
    finally:
        # Always clear the flag, even when the pipeline raises or the run is
        # interrupted; a flag left True would make later recordings be ignored.
        st.session_state.processing = False

    if not (user_text and ai_response):
        return False

    st.session_state.conversation_history.append({
        "user": user_text,
        "ai": ai_response,
        "speech": speech_audio
    })
    return True


def _render_voice_input():
    """Render the recorder column and process a newly captured recording."""
    st.subheader("🎤 Voice Input")

    audio_bytes = audio_recorder(
        text="Click to record",
        recording_color="#e74c3c",
        neutral_color="#34495e",
        icon_name="microphone",
        icon_size="2x",
        pause_threshold=2.0,
        sample_rate=44100
    )

    if not audio_bytes:
        return

    # Let the user play back what was just recorded.
    st.audio(audio_bytes, format="audio/wav")

    if st.session_state.processing:
        return

    # Rerun only when a new exchange was stored, so the conversation column
    # is redrawn with it.
    if _process_new_recording(audio_bytes):
        st.rerun()


def _render_conversation():
    """Render the latest exchange and an expander with the earlier ones."""
    st.subheader("💬 Conversation")

    history = st.session_state.conversation_history
    if not history:
        st.info("👋 Start by recording your voice message above!")
        return

    latest = history[-1]

    st.markdown("**You said:**")
    st.info(latest["user"])

    st.markdown("**Prakhar replied:**")
    st.success(latest["ai"])

    # Speech may be None when synthesis failed; show text only in that case.
    if latest["speech"]:
        st.audio(latest["speech"], format="audio/mp3")

    earlier = history[:-1]
    if earlier:
        with st.expander("📜 Previous conversations"):
            # Newest first, numbered by 1-based position in the history.
            for number, conv in reversed(list(enumerate(earlier, start=1))):
                st.markdown(f"**Conversation {number}:**")
                st.markdown(f"👤 You: {conv['user']}")
                st.markdown(f"🤖 Prakhar: {conv['ai']}")
                if conv["speech"]:
                    st.audio(conv["speech"], format="audio/mp3")
                st.divider()


def _render_context_panel():
    """Render the collapsed context expander with the clear-conversation button."""
    with st.expander("ℹ️ Context", expanded=False):
        st.info(st.session_state.context)

        if st.button("🗑️ Clear Conversation"):
            st.session_state.conversation_history = []
            st.rerun()


def main():
    """Build the Voice Bot page: recorder, conversation view and context panel."""
    st.title("🎙️ Voice Bot")
    st.markdown("*Talk to Prakhar using your voice!*")

    # Session state must exist before any of the render helpers read it.
    init_session_state()

    col1, col2 = st.columns([1, 1], gap="large")

    with col1:
        _render_voice_input()

    with col2:
        _render_conversation()

    st.divider()
    _render_context_panel()

    # Status line shown only if the processing flag is still set at this point.
    if st.session_state.processing:
        st.info("🔄 Processing your request...")

# Entry point: build the page when this file is executed as the main script.
if __name__ == "__main__":
    main()