mozzicato commited on
Commit
1e5a24e
·
verified ·
1 Parent(s): 3dca110
Files changed (1) hide show
  1. voc6.py +0 -794
voc6.py CHANGED
@@ -7,10 +7,6 @@ Original file is located at
7
  https://colab.research.google.com/drive/17WecCovbP3TgYvHDyZ4Yckj77r2q5Nam
8
  """
9
 
10
- !pip install langchain langchain-google-genai langchain-core sentence-transformers faiss-cpu numpy gradio
11
- !pip install langchain-google-genai
12
- # Cell 1: Install packages
13
- !pip install spitch gradio pydub python-dotenv
14
 
15
  # Cell to add FIRST - Your Original WemaRAGSystem
16
  import json
@@ -1546,793 +1542,3 @@ iface = create_voice_gradio_interface(
1546
  )
1547
 
1548
  iface.launch(share=True, debug=True)
1549
-
1550
- # ============================================================================
1551
- # Wema Bank Voice-Enabled RAG Chatbot with Spitch Integration - CORRECTED
1552
- # ============================================================================
1553
-
1554
- import tempfile
1555
- import os
1556
- import atexit
1557
- import glob
1558
- import io
1559
- from typing import Optional
1560
- from spitch import Spitch
1561
- import gradio as gr
1562
- from google.colab import userdata
1563
-
1564
-
1565
- # ============================================================================
1566
- # STEP 1: Initialize Spitch Client
1567
- # ============================================================================
1568
-
1569
class SpitchVoiceHandler:
    """
    Handles all voice-related operations using Spitch API.
    Supports multilingual speech-to-text and text-to-speech.

    All methods are deliberately non-raising: failures are logged and a
    fallback value is returned (error string, original text, or None) so
    the surrounding Gradio handlers never crash on an API hiccup.
    """

    def __init__(self, api_key: str):
        """
        Initialize Spitch client.

        Args:
            api_key: Your Spitch API key
        """
        self.client = Spitch(api_key=api_key)

    def transcribe_audio(
        self,
        audio_file,
        source_language: str = "en",
        model: str = "mansa_v1"
    ) -> str:
        """
        Transcribe audio to text using Spitch.
        Supports multiple African and international languages.

        Args:
            audio_file: Audio file path (str) or file-like object
            source_language: Language code (e.g., 'en', 'yo', 'ig', 'ha')
            model: Spitch model to use (default: mansa_v1)

        Returns:
            Transcribed text, or an apology string starting with "Sorry,"
            on failure (callers check for "Sorry"/"Error" substrings).
        """
        try:
            print(f"🎤 Transcribing audio file: {audio_file}")

            # If audio_file is a path, open it ourselves; otherwise assume
            # it is already a file-like object (e.g. handed over by Gradio).
            if isinstance(audio_file, str):
                with open(audio_file, 'rb') as f:
                    response = self.client.speech.transcribe(
                        content=f,
                        language=source_language,
                        model=model
                    )
            else:
                # Assume it's already a file-like object (from Gradio)
                response = self.client.speech.transcribe(
                    content=audio_file,
                    language=source_language,
                    model=model
                )

            print(f"Response type: {type(response)}")

            # The Spitch response shape is not pinned down here, so probe
            # in order of preference: .text() method, .text attribute,
            # .json() payload, then plain str() as a last resort.
            # NOTE(review): order matters — a callable .text must be
            # checked before the plain-attribute case.
            if hasattr(response, 'text') and callable(response.text):
                # It's a method, not an attribute
                transcription_text = response.text()
            elif hasattr(response, 'text'):
                # It's an attribute
                transcription_text = response.text
            elif hasattr(response, 'json'):
                # Try to parse JSON response
                json_data = response.json()
                transcription_text = json_data.get('text', str(json_data))
            else:
                # Try to convert response to string
                transcription_text = str(response)

            print(f"✅ Transcription: {transcription_text}")
            return transcription_text

        except Exception as e:
            # Log full traceback but keep the UI alive by returning a
            # human-readable error string instead of raising.
            print(f"❌ Transcription error: {e}")
            import traceback
            traceback.print_exc()
            return f"Sorry, I couldn't understand the audio. Error: {str(e)}"

    def translate_to_english(self, text: str, source_lang: str = "auto") -> str:
        """
        Translate text to English using Spitch translation API.

        Args:
            text: Text to translate
            source_lang: Source language code or 'auto' for auto-detection

        Returns:
            Translated text in English; the original `text` is returned
            unchanged when it is already English or when translation fails.
        """
        try:
            # If already in English, return as is
            if source_lang == "en":
                return text

            print(f"🌍 Translating from {source_lang} to English...")
            print(f"📝 Original text: {text}")

            translation = self.client.text.translate(
                text=text,
                source=source_lang,
                target="en"
            )

            english_text = translation.text
            print(f"✅ Translated to English: {english_text}")

            return english_text

        except Exception as e:
            error_msg = f"Translation failed: {str(e)}"
            print(f"❌ {error_msg}")
            import traceback
            traceback.print_exc()
            # Return original if translation fails
            return text

    def synthesize_speech(
        self,
        text: str,
        target_language: str = "en",
        voice: str = "lina"
    ) -> bytes:
        """
        Convert text to speech using Spitch TTS.

        Args:
            text: Text to convert to speech
            target_language: Target language for speech
            voice: Voice to use (e.g., 'lina', 'ada', 'kofi')

        Returns:
            Audio bytes, or None when the response cannot be read or the
            API call fails (callers must handle None).
        """
        try:
            # Call Spitch TTS API
            response = self.client.speech.generate(
                text=text,
                language=target_language,
                voice=voice
            )

            # Spitch returns a BinaryAPIResponse here; .read() yields the
            # raw audio bytes — presumably MP3, since downstream code saves
            # it with a .mp3 suffix (TODO confirm against Spitch docs).
            if hasattr(response, 'read'):
                audio_bytes = response.read()
                print(f"✅ TTS generated {len(audio_bytes)} bytes of audio")
                return audio_bytes
            else:
                print(f"❌ Response type: {type(response)}")
                print(f"❌ Response attributes: {dir(response)}")
                return None

        except Exception as e:
            print(f"❌ TTS error: {e}")
            import traceback
            traceback.print_exc()
            return None
1725
-
1726
-
1727
- # ============================================================================
1728
- # STEP 2: Integrate Voice with Your LangChain RAG System
1729
- # ============================================================================
1730
-
1731
class WemaVoiceAssistant:
    """
    Complete voice-enabled assistant combining Spitch voice I/O
    with your existing Wema RAG system.
    """

    def __init__(
        self,
        rag_system,
        chain,
        spitch_api_key: str
    ):
        """
        Initialize the voice assistant.

        Args:
            rag_system: Your initialized WemaRAGSystem
            chain: Your LangChain RAG chain (already created)
            spitch_api_key: Spitch API key
        """
        # rag_system is stored for callers' convenience; only `chain`
        # is consulted by process_voice_query itself.
        self.rag_system = rag_system
        self.voice_handler = SpitchVoiceHandler(spitch_api_key)
        self.chain = chain

    def process_voice_query(
        self,
        audio_input,
        input_language: str = "en",
        output_language: str = "en",
        voice: str = "lina"
    ):
        """
        Complete voice interaction pipeline:
        1. Speech to text (any language)
        2. Translate to English if needed
        3. Query RAG system
        4. Generate response
        5. Translate response if needed
        6. Text to speech

        Args:
            audio_input: Audio file from user
            input_language: User's spoken language
            output_language: Desired response language
            voice: TTS voice to use

        Returns:
            tuple: (response_text, response_audio) where response_audio is
            raw audio bytes or None; on any failure the tuple is
            (error_message, None) — this method never raises.
        """
        try:
            # Step 1: Transcribe audio to text
            print(f"Transcribing audio in {input_language}...")
            transcribed_text = self.voice_handler.transcribe_audio(
                audio_input,
                source_language=input_language
            )
            print(f"Transcribed: {transcribed_text}")

            # Step 2: Translate to English if not already
            if input_language != "en":
                print("Translating to English...")
                english_query = self.voice_handler.translate_to_english(
                    transcribed_text,
                    source_lang=input_language
                )
            else:
                english_query = transcribed_text

            print(f"English query: {english_query}")

            # Step 3: Get response from RAG system (in English)
            # NOTE(review): assumes chain.invoke returns a plain string —
            # response_text[:100] would fail on a dict payload; confirm
            # against the chain construction elsewhere in this file.
            print("Querying RAG system...")
            response_text = self.chain.invoke({"query": english_query})
            print(f"RAG response: {response_text[:100]}...")

            # Step 4: Translate response if needed (calls the Spitch
            # client directly rather than translate_to_english, since the
            # direction here is en -> output_language)
            if output_language != "en":
                print(f"Translating response to {output_language}...")
                translation = self.voice_handler.client.text.translate(
                    text=response_text,
                    source="en",
                    target=output_language
                )
                final_text = translation.text
            else:
                final_text = response_text

            # Step 5: Generate speech
            print("Generating speech...")
            audio_response = self.voice_handler.synthesize_speech(
                final_text,
                target_language=output_language,
                voice=voice
            )

            return final_text, audio_response

        except Exception as e:
            error_msg = f"An error occurred: {str(e)}"
            print(error_msg)
            return error_msg, None
1832
-
1833
-
1834
- # ============================================================================
1835
- # STEP 3: Helper Functions for Audio File Management
1836
- # ============================================================================
1837
-
1838
def save_audio_to_temp_file(audio_bytes):
    """Persist raw audio bytes to a temporary .mp3 file.

    Returns the path of the newly created file, or None when no bytes
    are provided. The file is created with delete=False so it outlives
    this call (Gradio serves it later); cleanup_temp_audio_files()
    reclaims such files on interpreter exit.
    """
    if audio_bytes is None:
        return None

    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as handle:
        handle.write(audio_bytes)
        saved_path = handle.name

    return saved_path
1848
-
1849
-
1850
def cleanup_temp_audio_files():
    """Delete leftover temporary .mp3 files created by save_audio_to_temp_file.

    Scans the system temp directory for files matching ``tmp*.mp3``
    ("tmp" is the NamedTemporaryFile default prefix, ".mp3" the suffix
    we add) and removes each one. Cleanup is best-effort: files that are
    locked, in use, or already gone are skipped silently.
    """
    temp_dir = tempfile.gettempdir()
    for temp_file in glob.glob(os.path.join(temp_dir, "tmp*.mp3")):
        try:
            os.remove(temp_file)
        except OSError:
            # FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; only filesystem errors are
            # expected and safe to ignore here.
            pass


# Register cleanup function to run on exit
atexit.register(cleanup_temp_audio_files)
1862
-
1863
-
1864
- # ============================================================================
1865
- # STEP 4: Create Gradio Interface (With Text AND Voice Options)
1866
- # ============================================================================
1867
-
1868
def create_voice_gradio_interface(
    rag_system,
    chain,
    spitch_api_key: str
):
    """
    Create a Gradio interface with BOTH text and voice input/output capabilities.

    Args:
        rag_system: Your initialized WemaRAGSystem
        chain: Your LangChain RAG chain (already created)
        spitch_api_key: Spitch API key

    Returns:
        Gradio Blocks interface (call .launch() on it)
    """

    # Initialize voice assistant
    assistant = WemaVoiceAssistant(rag_system, chain, spitch_api_key)

    # Voice-language mapping taken from the Spitch documentation — only
    # voices listed for a language may be used with that language's TTS.
    LANGUAGE_CONFIG = {
        "English": {
            "code": "en",
            "voices": ["john", "lucy", "lina", "jude", "henry", "kani", "kingsley",
                       "favour", "comfort", "daniel", "remi"]
        },
        "Yoruba": {
            "code": "yo",
            "voices": ["sade", "funmi", "segun", "femi"]
        },
        "Igbo": {
            "code": "ig",
            "voices": ["obinna", "ngozi", "amara", "ebuka"]
        },
        "Hausa": {
            "code": "ha",
            "voices": ["hasan", "amina", "zainab", "aliyu"]
        }
    }

    # Extract just language names for dropdowns
    ALL_LANGUAGES = list(LANGUAGE_CONFIG.keys())

    # NOTE: the old module kept a separate, unused VOICES list here; it
    # was dead code superseded by LANGUAGE_CONFIG and has been removed.

    def handle_text_query(text_input):
        """Handle text-only queries; returns (response, None) to match the
        (text, audio) output signature shared with the voice handler."""
        if not text_input or text_input.strip() == "":
            return "Please enter a question.", None

        try:
            response = chain.invoke({"query": text_input})
            return response, None
        except Exception as e:
            return f"Error: {str(e)}", None

    def update_voices(language):
        """Update voice dropdown based on selected output language."""
        voices = LANGUAGE_CONFIG.get(language, {}).get("voices", ["lina"])
        return gr.Dropdown(choices=voices, value=voices[0])

    def handle_voice_interaction(audio, input_lang, output_lang, voice):
        """Gradio handler for the full voice pipeline:
        transcribe -> translate to English -> RAG -> translate back -> TTS.

        Returns (text_response, audio_file_path_or_None).
        """
        print("="*60)
        print("VOICE INTERACTION STARTED")
        print(f"Audio input: {audio}")
        print(f"Input language: {input_lang}")
        print(f"Output language: {output_lang}")
        print(f"Voice: {voice}")
        print("="*60)

        if audio is None:
            return "Please record or upload audio.", None

        # Get language codes and voices (fall back to English for any
        # unrecognized dropdown value)
        input_config = LANGUAGE_CONFIG.get(input_lang, LANGUAGE_CONFIG["English"])
        output_config = LANGUAGE_CONFIG.get(output_lang, LANGUAGE_CONFIG["English"])

        input_code = input_config["code"]
        output_code = output_config["code"]

        # Validate voice for output language — Spitch voices are
        # language-specific, so silently substitute the first valid one.
        available_voices = output_config["voices"]
        if voice not in available_voices:
            voice = available_voices[0]
            print(f"⚠️ Voice changed to {voice} for {output_lang}")

        try:
            # Process voice query
            print("\n🎤 Processing voice query...")

            # Step 1: Transcribe
            transcribed_text = assistant.voice_handler.transcribe_audio(
                audio,
                source_language=input_code
            )
            print(f"📝 Transcribed ({input_lang}): {transcribed_text}")

            # Check if transcription failed — transcribe_audio signals
            # failure via an error string rather than an exception.
            if "Error" in transcribed_text or "Sorry" in transcribed_text:
                return transcribed_text, None

            # Step 2: Translate to English if needed (RAG corpus is English)
            if input_code != "en":
                print("🌍 Translating to English...")
                english_query = assistant.voice_handler.translate_to_english(
                    transcribed_text,
                    source_lang=input_code
                )
                print(f"🇬🇧 English query: {english_query}")
            else:
                english_query = transcribed_text

            # Step 3: Get RAG response (ALWAYS in English first)
            print("🔍 Querying RAG system...")
            try:
                response_text = assistant.chain.invoke({"query": english_query})
                print(f"✅ RAG response (English): {response_text[:200]}...")
            except Exception as e:
                error_msg = f"Error getting response: {str(e)}"
                print(f"❌ RAG Error: {error_msg}")
                return error_msg, None

            # Step 4: Decide what to do with translation.
            # tts_* variables drive speech generation; translation_note is
            # appended to the displayed text when we had to deviate.
            if output_code != "en":
                print(f"🌍 Translating response from English to {output_lang}...")

                # Long technical responses translate poorly, so keep them
                # in English instead of risking a garbled translation.
                if len(response_text) > 500:
                    print(f"⚠️ Response is long ({len(response_text)} chars), keeping English for accuracy")
                    final_text = response_text
                    tts_text = response_text
                    tts_language = "en"
                    tts_voice = "lina"
                    # FIX: the old note wrongly claimed a full translation
                    # was shown above — no translation is produced on this
                    # path, so say what actually happened.
                    translation_note = f"\n\n⚠️ (Response kept in English for accuracy; {output_lang} translation of long answers is skipped.)"
                else:
                    try:
                        translation = assistant.voice_handler.client.text.translate(
                            text=response_text,
                            source="en",
                            target=output_code
                        )
                        translated_text = translation.text
                        print(f"✅ Translated to {output_lang}: {translated_text[:200]}...")

                        final_text = translated_text
                        tts_text = translated_text
                        tts_language = output_code
                        tts_voice = voice
                        translation_note = ""

                    except Exception as e:
                        print(f"⚠️ Translation failed: {e}, using English")
                        final_text = response_text
                        tts_text = response_text
                        tts_language = "en"
                        tts_voice = "lina"
                        translation_note = f"\n\n⚠️ (Translation to {output_lang} failed, showing English response)"
            else:
                final_text = response_text
                tts_text = response_text
                tts_language = "en"
                tts_voice = voice
                translation_note = ""

            # Step 5: Generate speech
            print(f"🔊 Generating speech in {tts_language} with voice {tts_voice}...")
            print(f"🔊 TTS Text preview: {tts_text[:100]}...")

            audio_bytes = assistant.voice_handler.synthesize_speech(
                tts_text,
                target_language=tts_language,
                voice=tts_voice
            )

            print(f"🔊 Audio bytes type: {type(audio_bytes)}")
            print(f"🔊 Audio bytes length: {len(audio_bytes) if audio_bytes else 0}")

            # Convert audio bytes to a file path, since the output
            # gr.Audio component is declared with type="filepath".
            audio_file_path = None
            if audio_bytes:
                print("\n💾 Saving audio to temp file...")
                audio_file_path = save_audio_to_temp_file(audio_bytes)
                print(f"✅ Audio saved to: {audio_file_path}")

                # Verify file exists and has content
                if audio_file_path and os.path.exists(audio_file_path):
                    file_size = os.path.getsize(audio_file_path)
                    print(f"✅ File size: {file_size} bytes")
                else:
                    print("❌ File was not created properly!")
            else:
                print("❌ No audio bytes received from TTS")

            # Add translation note if needed
            final_text = final_text + translation_note

            print("="*60)
            return final_text, audio_file_path

        except Exception as e:
            error_msg = f"Error processing voice: {str(e)}"
            print(f"\n❌ ERROR: {error_msg}")
            import traceback
            traceback.print_exc()
            print("="*60)
            return error_msg, None

    # Create Gradio interface with BOTH text and voice
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🏦 Wema Bank AI Assistant
        ### Powered by Spitch AI & LangChain RAG

        Choose how you want to interact: Type or Speak!
        """)

        with gr.Tabs():
            # TEXT TAB
            with gr.Tab("💬 Text Chat"):
                gr.Markdown("### Type your banking questions")

                text_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask me anything about Wema Bank products and services...",
                    lines=3
                )

                text_submit_btn = gr.Button("📤 Send", variant="primary", size="lg")

                text_output = gr.Textbox(
                    label="Response",
                    lines=10,
                    interactive=False
                )

                # Examples for text
                gr.Examples(
                    examples=[
                        ["What is ALAT?"],
                        ["How do I open a savings account?"],
                        ["Tell me about Wema Kiddies Account"],
                        ["How can I avoid phishing scams?"],
                        ["What loans does Wema Bank offer?"]
                    ],
                    inputs=text_input,
                    label="💡 Try these questions"
                )

                # handle_text_query returns (text, None); the hidden Audio
                # component absorbs the unused second output.
                text_submit_btn.click(
                    fn=handle_text_query,
                    inputs=text_input,
                    outputs=[text_output, gr.Audio(visible=False)]
                )

                # Also submit on Enter
                text_input.submit(
                    fn=handle_text_query,
                    inputs=text_input,
                    outputs=[text_output, gr.Audio(visible=False)]
                )

            # VOICE TAB
            with gr.Tab("🎤 Voice Chat"):
                gr.Markdown("""
                ### Speak your banking questions in your language!

                **✅ Fully Supported Nigerian Languages:**
                - 🇬🇧 **English** - 11 voices available
                - 🇳🇬 **Yoruba** - 4 voices (Sade, Funmi, Segun, Femi)
                - 🇳🇬 **Igbo** - 4 voices (Obinna, Ngozi, Amara, Ebuka)
                - 🇳🇬 **Hausa** - 4 voices (Hasan, Amina, Zainab, Aliyu)

                **💡 Translation Tips:**
                - Simple questions translate best (e.g., "What is ALAT?", "How do I save money?")
                - Long technical responses may be kept in English for accuracy
                - You can always ask in your language and get text in both languages!
                """)

                with gr.Row():
                    with gr.Column():
                        audio_input = gr.Audio(
                            sources=["microphone", "upload"],
                            type="filepath",
                            label="🎙️ Record or Upload Audio"
                        )

                        input_language = gr.Dropdown(
                            choices=ALL_LANGUAGES,
                            value="English",
                            label="Your Language (Speech Input)"
                        )

                    with gr.Column():
                        output_language = gr.Dropdown(
                            choices=ALL_LANGUAGES,
                            value="English",
                            label="Response Language (Audio Output)"
                        )

                        voice_selection = gr.Dropdown(
                            choices=LANGUAGE_CONFIG["English"]["voices"],
                            value="lina",
                            label="Voice"
                        )

                # Update voices when output language changes
                output_language.change(
                    fn=update_voices,
                    inputs=output_language,
                    outputs=voice_selection
                )

                voice_submit_btn = gr.Button("🚀 Ask Wema Assist", variant="primary", size="lg")

                voice_text_output = gr.Textbox(
                    label="📝 Text Response",
                    lines=8,
                    interactive=False
                )

                voice_audio_output = gr.Audio(
                    label="🔊 Audio Response",
                    type="filepath"  # must be filepath: handler returns a temp-file path
                )

                voice_submit_btn.click(
                    fn=handle_voice_interaction,
                    inputs=[audio_input, input_language, output_language, voice_selection],
                    outputs=[voice_text_output, voice_audio_output]
                )

        gr.Markdown("""
        ---
        ### 📌 Features
        - **Text Chat**: Fast and simple - just type and get instant responses
        - **Voice Chat**: Full support for Nigerian languages!

        ### 🇳🇬 Supported Nigerian Languages
        ✅ **English** - 11 different voices (male & female)
        ✅ **Yoruba** - E ku ọjọ! (4 authentic Yoruba voices)
        ✅ **Igbo** - Nnọọ! (4 authentic Igbo voices)
        ✅ **Hausa** - Sannu! (4 authentic Hausa voices)

        💡 **All features work in every language:**
        - 🎤 Speak your question in your language
        - 📝 Get text response translated
        - 🔊 Hear authentic audio response in your language
        - 🔄 Seamless translation between languages
        """)

    return demo
2224
-
2225
-
2226
- # ============================================================================
2227
- # ALTERNATIVE: Simpler Hybrid Interface
2228
- # ============================================================================
2229
-
2230
def create_hybrid_interface(
    rag_system,
    chain,
    spitch_api_key: str
):
    """
    Creates a simpler interface supporting both text and voice input.

    Lighter-weight alternative to create_voice_gradio_interface: fixed
    voice list, no per-language voice validation, and delegates the whole
    voice pipeline to WemaVoiceAssistant.process_voice_query.

    Args:
        rag_system: Your initialized WemaRAGSystem
        chain: Your LangChain RAG chain (already created)
        spitch_api_key: Spitch API key

    Returns:
        Gradio Blocks interface (call .launch() on it)
    """

    assistant = WemaVoiceAssistant(rag_system, chain, spitch_api_key)

    def handle_text_query(text_input):
        """Handle text-only query; returns (response, None) to match the
        (text, audio) output pair expected by the click wiring below."""
        try:
            response = chain.invoke({"query": text_input})
            return response, None
        except Exception as e:
            return f"Error: {str(e)}", None

    def handle_voice_query(audio, input_lang, output_lang, voice):
        """Handle voice query end-to-end; returns (text, audio_path)."""
        if audio is None:
            return "Please provide audio input.", None

        # Display-name -> Spitch language-code mapping for the dropdowns
        LANGUAGES = {
            "English": "en",
            "Yoruba": "yo",
            "Igbo": "ig",
            "Hausa": "ha"
        }

        input_code = LANGUAGES.get(input_lang, "en")
        output_code = LANGUAGES.get(output_lang, "en")

        # Process voice query (never raises; returns error text on failure)
        text_response, audio_bytes = assistant.process_voice_query(
            audio,
            input_language=input_code,
            output_language=output_code,
            voice=voice
        )

        # Convert audio bytes to a file path because the output gr.Audio
        # component below is declared with type="filepath"
        audio_file_path = None
        if audio_bytes:
            audio_file_path = save_audio_to_temp_file(audio_bytes)

        return text_response, audio_file_path

    # Create tabbed interface
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🏦 Wema Bank AI Assistant")

        with gr.Tabs():
            # Text Tab
            with gr.Tab("💬 Text Chat"):
                text_input = gr.Textbox(
                    label="Type your question",
                    placeholder="Ask about Wema Bank products and services..."
                )
                text_submit = gr.Button("Send")
                text_output = gr.Textbox(label="Response", lines=10)

                # Hidden Audio component absorbs the unused second output
                text_submit.click(
                    fn=handle_text_query,
                    inputs=text_input,
                    outputs=[text_output, gr.Audio(visible=False)]
                )

            # Voice Tab
            with gr.Tab("🎤 Voice Chat"):
                audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")

                with gr.Row():
                    input_lang = gr.Dropdown(
                        ["English", "Yoruba", "Igbo", "Hausa"],
                        value="English",
                        label="Input Language"
                    )
                    output_lang = gr.Dropdown(
                        ["English", "Yoruba", "Igbo", "Hausa"],
                        value="English",
                        label="Output Language"
                    )
                    # NOTE(review): this voice list is not filtered per
                    # output language, unlike create_voice_gradio_interface;
                    # confirm these voices exist for all four languages.
                    voice = gr.Dropdown(
                        ["lina", "ada", "kofi"],
                        value="lina",
                        label="Voice"
                    )

                voice_submit = gr.Button("Ask")
                voice_text_output = gr.Textbox(label="Response Text", lines=8)
                voice_audio_output = gr.Audio(label="Audio Response", type="filepath")

                voice_submit.click(
                    fn=handle_voice_query,
                    inputs=[audio_input, input_lang, output_lang, voice],
                    outputs=[voice_text_output, voice_audio_output]
                )

    return demo
 
7
  https://colab.research.google.com/drive/17WecCovbP3TgYvHDyZ4Yckj77r2q5Nam
8
  """
9
 
 
 
 
 
10
 
11
  # Cell to add FIRST - Your Original WemaRAGSystem
12
  import json
 
1542
  )
1543
 
1544
  iface.launch(share=True, debug=True)