crevans committed on
Commit
eb1ddde
·
verified ·
1 Parent(s): 4b70236

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +320 -0
  2. requirements.txt +11 -0
app.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from google.cloud import speech, texttospeech
import os
import tempfile
import time
from pydub import AudioSegment  # For audio conversion

# ==============================================================================
# 1. CONFIGURE AND LOAD N-ATLaS MODEL
# ==============================================================================

MODEL_ID = "NCAIR1/N-ATLaS"

print(f"Loading model: {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Load model for local Mac testing (half precision, no quantization).
# NOTE(review): `dtype=` is the current transformers keyword; older releases
# spell it `torch_dtype=` — confirm against the installed transformers version.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=torch.float16,
    device_map="auto",
)
print("✅ N-ATLaS Model loaded.")

# ==============================================================================
# 2. INITIALIZE GOOGLE CLOUD CLIENTS
# Assumes GOOGLE_APPLICATION_CREDENTIALS is set in your environment
# ==============================================================================
try:
    speech_client = speech.SpeechClient()
    tts_client = texttospeech.TextToSpeechClient()
    print("✅ Google Cloud STT/TTS clients initialized.")
except Exception as e:
    print(f"🛑 CRITICAL: Could not initialize Google Cloud clients. {e}")
    print("   Make sure you have set the GOOGLE_APPLICATION_CREDENTIALS environment variable.")
    # Raise SystemExit rather than the interactive-only exit() builtin, which
    # is injected by the `site` module and may be absent (e.g. `python -S`,
    # frozen apps).
    raise SystemExit(1)
38
+
39
+ # ==============================================================================
40
+ # 3. HELPER FUNCTIONS (STT AND TTS)
41
+ # ==============================================================================
42
+
def transcribe_audio(audio_filepath: str, language_code: str):
    """
    Convert the recorded audio to 16 kHz mono LINEAR16 PCM and transcribe it
    with Google Cloud Speech-to-Text.

    Args:
        audio_filepath: Path to the file Gradio recorded (any container/codec
            pydub + ffmpeg can decode). A falsy value short-circuits to "".
        language_code: BCP-47 code passed straight to the STT API
            (e.g. "en-US", "ha-NG").

    Returns:
        The top transcript alternative of the first result, "" for no input,
        or a "[...]"-bracketed sentinel string on failure — callers detect
        errors with str.startswith("[").

    Side effects:
        Deletes `audio_filepath` in the `finally` block, so the input file is
        consumed even when transcription fails.
    """
    if not audio_filepath:
        return ""
    print(f"Loading audio file: {audio_filepath}")
    try:
        # Load audio using pydub (handles various input formats)
        audio = AudioSegment.from_file(audio_filepath)
        print(" -> AudioSegment loaded successfully.")

        # Rate/channel count must match the RecognitionConfig built below.
        target_sample_rate = 16000
        target_channels = 1  # Mono

        # Resample and convert to mono
        audio = audio.set_frame_rate(target_sample_rate).set_channels(target_channels)

        # Get raw PCM data (LINEAR16)
        wav_data = audio.raw_data

        print(f"Transcribing {len(wav_data)} bytes with language: {language_code} at {target_sample_rate} Hz...")

        # Configure Google STT for LINEAR16 (Default model)
        recognition_audio = speech.RecognitionAudio(content=wav_data)
        recognition_config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=target_sample_rate,
            language_code=language_code,
            audio_channel_count=target_channels
        )

        # Synchronous recognition — fine for short microphone clips.
        response = speech_client.recognize(config=recognition_config, audio=recognition_audio)

        if not response.results:
            return "[Could not understand audio]"

        # Only the first result's best alternative is used.
        transcribed_text = response.results[0].alternatives[0].transcript
        print(f" -> Transcribed: {transcribed_text}")
        return transcribed_text

    except Exception as e:
        print(f" -> 🛑 ERROR during audio processing or transcription: {e}")
        return f"[Error processing audio: {e}]"
    finally:
        # Clean up the temporary file created by Gradio
        if audio_filepath and os.path.exists(audio_filepath):
            try:
                os.remove(audio_filepath)
                print(f" -> Cleaned up temp file: {audio_filepath}")
            except OSError as e:
                # Best-effort cleanup: log and continue rather than masking
                # the function's return value with a cleanup failure.
                print(f" -> Error deleting temp file {audio_filepath}: {e}")
95
+
def synthesize_speech(text, voice_code):
    """Synthesizes speech using Google Cloud TTS with robust voice selection.

    Args:
        text: Plain text to synthesize.
        voice_code: Requested voice/language code. Any "en*" code maps to a
            fixed US WaveNet voice; other codes fall back to the base
            language (e.g. "ha") with a requested FEMALE default voice.

    Returns:
        Path to a temporary MP3 file containing the audio, or None on
        failure. The file is created with delete=False and is never removed
        here — the caller (Gradio) owns its lifetime.
    """
    print(f"Synthesizing speech with requested code: {voice_code}...")
    synthesis_input = texttospeech.SynthesisInput(text=text)

    # --- Robust Voice Selection Logic ---
    selected_voice_name = None
    selected_ssml_gender = None

    # Use high-quality US WaveNet for any English request
    if voice_code.startswith("en"):
        selected_language_code = "en-US"
        selected_voice_name = "en-US-Wavenet-A"
        print(f" -> Using high-quality English voice: {selected_voice_name}")
    else:
        # For non-English (ha, ig, yo), provide the BASE language code
        # and request a specific gender. Google should pick a default.
        selected_language_code = voice_code.split('-')[0]  # Use 'ha', 'ig', 'yo'
        selected_ssml_gender = texttospeech.SsmlVoiceGender.FEMALE  # Ask for a female voice
        print(f" -> Requesting default FEMALE voice for language: {selected_language_code}")

    # Set parameters, omitting 'name' if None. A named voice takes priority;
    # gender is only sent when no explicit voice name was chosen.
    voice_params = {"language_code": selected_language_code}
    if selected_voice_name:
        voice_params["name"] = selected_voice_name
    elif selected_ssml_gender:
        voice_params["ssml_gender"] = selected_ssml_gender

    voice = texttospeech.VoiceSelectionParams(**voice_params)
    # --- End Voice Selection Logic ---

    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    # Diagnostic check for non-English voices: logs which voices the API
    # actually offers for this base language. Purely informational — the
    # result does not change the request built above.
    if not voice_code.startswith("en"):
        try:
            print(f"--- Listing available voices for language code: {selected_language_code} ---")
            list_voices_response = tts_client.list_voices(language_code=selected_language_code)
            available_voices = [v.name for v in list_voices_response.voices]
            if available_voices:
                print(f"Available voices found: {available_voices}")
            else:
                print("No voices found for this language code.")
        except Exception as list_err:
            print(f" -> ERROR trying to list voices: {list_err}")

    try:
        response = tts_client.synthesize_speech(
            input=synthesis_input, voice=voice, audio_config=audio_config
        )
        # delete=False so the file survives past this function for Gradio
        # to serve; no cleanup is performed here.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
            fp.write(response.audio_content)
            temp_audio_path = fp.name
        print(f" -> Audio saved to: {temp_audio_path}")
        return temp_audio_path
    except Exception as e:
        print(f" -> 🛑 ERROR during speech synthesis: {e}")
        return None
156
+
157
+ # ==============================================================================
158
+ # 4. CORE CHAT FUNCTION (AS A GENERATOR) - DUAL RESPONSE
159
+ # ==============================================================================
def speech_to_speech_chat(audio_input, history, input_lang, output_voice):
    """
    Main Gradio generator callback: transcribe the user's recording, generate
    BOTH a conversational reply and a direct translation with N-ATLaS, then
    synthesize speech for the reply.

    Args:
        audio_input: Filepath of the recorded audio (gr.Audio type="filepath"),
            or None when the user submits without recording.
        history: Chatbot history as a list of (user, assistant) tuples
            (Gradio 3.x format); mutated in place.
        input_lang: STT language code for the user's speech (e.g. "en-US").
        output_voice: TTS voice/language code for the bot (e.g. "ha-NG").

    Yields:
        (history, bot_audio_path_or_None, None) — the trailing None clears
        the microphone component after each update.
    """

    # --- STAGE 0: Get Filepath ---
    user_audio_path = audio_input  # Gradio passes the filepath directly
    if user_audio_path is None:
        # Handle case where user clicks submit without recording
        yield history, None, None
        return  # Stop processing
    print(f"Received audio filepath: {user_audio_path}")

    # ----- STAGE 1: Transcribe User -----
    transcribed_text = transcribe_audio(user_audio_path, input_lang)  # Pass filepath

    if transcribed_text is None:
        print(" -> 🛑 Transcription function returned None unexpectedly.")
        transcribed_text = "[Error: Transcription failed internally]"

    history.append((transcribed_text, None))
    yield history, None, None  # Update UI with transcribed text

    # transcribe_audio signals failure with "[...]"-bracketed sentinels.
    if transcribed_text.startswith("["):
        return  # Stop processing if transcription failed

    # ----- STAGE 2: Get N-ATLaS Response (RUN 1: CONVERSATION) -----
    print("Generating N-ATLaS response (Run 1: Conversation)...")

    # Get target language name
    if output_voice.startswith("ha"):
        lang = "Hausa"
    elif output_voice.startswith("yo"):
        lang = "Yoruba"
    elif output_voice.startswith("ig"):
        lang = "Igbo"
    else:
        lang = "Nigerian English"

    # Create persona prompt for conversation
    system_prompt = f"You are a helpful, friendly assistant. Listen to what the user says and respond naturally. You must respond ONLY in {lang}."

    # Build conversation history
    messages = []
    for user_msg, assistant_msg in history:
        user_content = str(user_msg) if user_msg is not None else "[empty]"
        messages.append({"role": "user", "content": user_content})
        if assistant_msg:
            # Extract just the conversational part from previous turns, so the
            # model does not see its own "Direct Translation" scaffolding.
            if "**Conversational Reply:**" in str(assistant_msg):
                reply_text = str(assistant_msg).split("---")[0].replace("**Conversational Reply:**\n", "").strip()
                messages.append({"role": "assistant", "content": reply_text})
            else:
                messages.append({"role": "assistant", "content": str(assistant_msg)})

    # Add the final system prompt.
    # NOTE(review): the system message is appended AFTER the dialogue instead
    # of prepended — confirm this matches what the N-ATLaS chat template expects.
    conversation_messages = messages + [{"role": "system", "content": system_prompt}]
    conversation_prompt = tokenizer.apply_chat_template(conversation_messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(conversation_prompt, return_tensors="pt").to(model.device)
    input_length = inputs.input_ids.shape[1]

    outputs = model.generate(
        **inputs, max_new_tokens=256, eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id, do_sample=True, temperature=0.7, top_p=0.9
    )
    # Decode only the newly generated tokens (skip the echoed prompt).
    conversational_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()
    print(f" -> Conversational Reply: {conversational_text}")

    # ----- STAGE 3: Get N-ATLaS Response (RUN 2: TRANSLATION) -----
    print("Generating N-ATLaS response (Run 2: Translation)...")

    translation_system_prompt = f"Translate the following text to {lang}:"

    translation_messages = [
        {"role": "system", "content": translation_system_prompt},
        {"role": "user", "content": transcribed_text}  # Translate only the last user input
    ]
    translation_prompt = tokenizer.apply_chat_template(translation_messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(translation_prompt, return_tensors="pt").to(model.device)
    input_length = inputs.input_ids.shape[1]

    # Greedy decoding for a deterministic translation. With do_sample=False,
    # temperature/top_p are ignored by transformers (and emit warnings), so
    # they are omitted here.
    outputs = model.generate(
        **inputs, max_new_tokens=256, eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id, do_sample=False
    )
    translation_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()
    print(f" -> Direct Translation: {translation_text}")

    # ----- STAGE 4: Synthesize and Format Response -----

    # Synthesize speech for the CONVERSATIONAL reply only
    bot_audio_path = synthesize_speech(conversational_text, output_voice)

    # Format a single string for the chatbot UI
    bot_response_string = f"""
**Conversational Reply:**
{conversational_text}

---
**Direct Translation:**
{translation_text}
"""

    # Update the history with the user's text and the bot's combined text
    final_user_text = transcribed_text if transcribed_text is not None else "[Error]"
    history[-1] = (final_user_text, bot_response_string)

    # Yield the final history, the bot's audio, and clear the mic input
    yield history, bot_audio_path, None
268
+
269
+ # ==============================================================================
270
+ # 5. GRADIO UI (using Blocks) - Gradio 3.x compatible
271
+ # ==============================================================================
# Build the UI with Blocks; .queue() returns the Blocks instance, so the
# chained call still works as the `with` context manager (Gradio 3.x idiom).
with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test").queue() as iface:
    gr.Markdown("# 🇳🇬 N-ATLaS Multilingual Voice Test")
    gr.Markdown(
        "**Instructions:** Select your spoken language and desired response voice. "
        "Speak into the microphone, then press 'Submit'.\n"
        "**⚠️ IMPORTANT: Response from the N-ATLaS 8B model may take 30-90 seconds locally.**"
    )
    with gr.Row():
        # Dropdown choices are (label, value) tuples: the label is shown to
        # the user, the value string is what the callback receives.
        input_lang = gr.Dropdown(
            label="1. Language I am Speaking",
            choices=[
                ("American English", "en-US"),
                ("Nigerian Pidgin / English", "en-NG"),
                ("Hausa", "ha-NG"),
                ("Igbo", "ig-NG"),
                ("Yoruba", "yo-NG")
            ],
            value="en-US"  # Default to US English for local testing
        )
        output_voice = gr.Dropdown(
            label="2. Language for Bot to Speak",
            choices=[
                ("Nigerian English", "en-NG"),
                ("Hausa", "ha-NG"),
                ("Igbo", "ig-NG"),
                ("Yoruba", "yo-NG")
            ],
            value="en-NG"
        )
    chatbot = gr.Chatbot(label="Conversation", height=400)
    mic_input = gr.Audio(
        source="microphone",  # Use 'source' (singular) for Gradio 3.x
        type="filepath",      # Callback receives a temp-file path, not raw samples
        label="3. Press record and speak"
    )
    bot_audio_output = gr.Audio(
        label="Bot's Spoken Response",
        autoplay=True
    )
    submit_btn = gr.Button("Submit Audio")
    # Per-session conversation state: list of (user, assistant) tuples that
    # speech_to_speech_chat mutates and re-yields.
    chat_history = gr.State([])
    # speech_to_speech_chat is a generator, so the UI updates incrementally;
    # mic_input is also an output so each yield's trailing None clears it.
    submit_btn.click(
        fn=speech_to_speech_chat,
        inputs=[mic_input, chat_history, input_lang, output_voice],
        outputs=[chatbot, bot_audio_output, mic_input]
    )

print("Launching Gradio interface...")
iface.launch(share=True)  # share=True serves a public link in addition to localhost
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==3.50.2
2
+ transformers
3
+ torch
4
+ accelerate
5
+ bitsandbytes
6
+ sentencepiece
7
+ google-cloud-speech
8
+ google-cloud-texttospeech
9
+ ffmpeg-python
10
+ pydub
11
+ pydantic<2