File size: 13,108 Bytes
fadeb4a
 
 
1a50c77
 
 
 
 
 
5809432
1a50c77
 
fadeb4a
 
5809432
fadeb4a
 
 
ea86e19
5809432
1a50c77
 
 
 
5809432
1a50c77
 
 
5809432
 
 
 
1a50c77
5809432
1a50c77
5809432
 
 
 
 
1a50c77
5809432
 
 
1a50c77
 
5809432
 
1a50c77
5809432
1a50c77
 
 
 
 
5809432
1a50c77
 
 
 
 
 
 
5809432
1a50c77
 
 
 
 
 
 
 
 
 
 
 
5809432
1a50c77
 
 
5809432
1a50c77
5809432
1a50c77
5809432
1a50c77
 
 
5809432
1a50c77
 
5809432
 
1a50c77
 
 
5809432
 
1a50c77
5809432
 
 
1a50c77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5809432
fadeb4a
 
1a50c77
48fc7b4
fadeb4a
 
1a50c77
 
 
fadeb4a
 
 
1a50c77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e328c36
1a50c77
 
 
fadeb4a
1a50c77
fadeb4a
1a50c77
 
 
 
 
 
 
 
 
 
 
 
 
 
fadeb4a
 
1a50c77
fadeb4a
1a50c77
 
 
 
 
fadeb4a
 
1a50c77
fadeb4a
1a50c77
 
 
 
 
fadeb4a
 
1a50c77
 
 
fadeb4a
1a50c77
fadeb4a
 
1a50c77
fadeb4a
 
 
 
1a50c77
5809432
fadeb4a
1a50c77
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
import os
import gradio as gr
from groq import Groq
from gtts import gTTS
import torch
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
import io
import numpy as np
import soundfile as sf
# Removed: from google.colab import userdata # This library is specific to Google Colab
import uuid # For unique temporary audio filenames
import librosa # Ensure librosa is imported for resampling

# --- Configuration & Global Variables ---
# IMPORTANT: Ensure your GROQ_API_KEY is set in Hugging Face Space's Repository Secrets!
# It will be directly available via os.environ.get()

# Groq LLM model id used for every chat-completion request below.
GROQ_MODEL = "llama-3.3-70b-versatile" # A fast and capable model from Groq

# Whisper STT model; the "tiny" checkpoint keeps load time and memory low
# (smaller models are faster on free-tier hardware, both Colab and Spaces).
WHISPER_MODEL_ID = "openai/whisper-tiny"
# Use the first GPU when available, otherwise fall back to CPU.
WHISPER_DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
WHISPER_BATCH_SIZE = 8 # Adjust based on your Space's allocated GPU/CPU memory

# Global chat history for the LLM to maintain context across turns.
# Items are {"role": ..., "content": ...} dicts in the format the Groq
# chat-completions API expects. NOTE(review): module-level state is shared by
# ALL concurrent users of the Space — confirm that is acceptable.
llm_chat_history = []

# --- Initialization Functions ---
def initialize_groq_client():
    """Create the module-global Groq ``client`` from the GROQ_API_KEY env var.

    On any failure ``client`` is left as None and an actionable message is
    printed so the Space logs explain what to fix; callers must check for None.
    """
    global client
    client = None
    try:
        # Hugging Face Spaces exposes Repository Secrets as environment
        # variables, so a plain os.environ lookup is all that is needed.
        api_key = os.environ.get("GROQ_API_KEY")
        if not api_key:
            raise ValueError("GROQ_API_KEY environment variable is not set. Please add it to your Hugging Face Space's Repository Secrets.")
        client = Groq(api_key=api_key)
    except ValueError as ve:
        print(f"ERROR: Groq client initialization failed: {ve}")
        print("ACTION REQUIRED: Please ensure the 'GROQ_API_KEY' environment variable is set correctly in your Hugging Face Space's Repository Secrets.")
        return
    except Exception as e:
        print(f"ERROR: An unexpected error occurred during Groq client initialization: {e}")
        return
    print("Groq client initialized successfully.")

# --- Initialize Whisper STT Pipeline ---
whisper_pipeline = None
def initialize_whisper_pipeline():
    """Load the Whisper model and build the module-global ASR pipeline.

    Leaves ``whisper_pipeline`` as None when loading fails so the rest of the
    app can degrade gracefully instead of crashing at import time.
    """
    global whisper_pipeline
    try:
        print(f"Loading Whisper model: {WHISPER_MODEL_ID} on {WHISPER_DEVICE}...")
        # Half precision only makes sense on the GPU path.
        dtype = torch.float16 if WHISPER_DEVICE == "cuda:0" else torch.float32
        processor = AutoProcessor.from_pretrained(WHISPER_MODEL_ID)
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            WHISPER_MODEL_ID,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        ).to(WHISPER_DEVICE)
        whisper_pipeline = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            chunk_length_s=30,  # split long clips into 30 s windows
            batch_size=WHISPER_BATCH_SIZE,
            device=WHISPER_DEVICE,
            torch_dtype=dtype,
        )
        print("Whisper STT pipeline initialized successfully.")
    except Exception as e:
        print(f"ERROR: Whisper STT pipeline initialization failed: {e}")
        print("ACTION REQUIRED: Ensure all required 'transformers' dependencies (e.g., bitsandbytes, accelerate) are in requirements.txt. Check Hugging Face Space GPU availability if using 'cuda'.")
        whisper_pipeline = None

# Run both initializers at import time so the Gradio app below can reference
# the resulting globals (`client`, `whisper_pipeline`); each prints its own
# diagnostics and leaves its global as None on failure.
initialize_groq_client()
initialize_whisper_pipeline()

# --- Chatbot Logic Function ---
# Accepts both microphone audio and typed text; audio wins when both are present.
def speech_to_speech_chat(audio_file, text_message, current_chatbot_history):
    """
    Process one conversational turn: speech or text in, text + speech out.

    Transcribes `audio_file` with Whisper when provided, otherwise uses
    `text_message`; sends the running conversation to Groq for a reply, then
    synthesizes that reply to an MP3 with gTTS.

    Args:
        audio_file (str): Path to the recorded audio file from Gradio's microphone (will be None if text input is used).
        text_message (str): Text input from the textbox (will be None or empty string if audio input is used).
        current_chatbot_history (list): Gradio's chat history.

    Returns:
        tuple: (updated_chatbot_history, text_response_for_display, audio_response_path, audio_input_reset_value, text_input_reset_value)
    """
    global llm_chat_history  # module-level LLM context, shared across turns

    user_text = ""
    bot_text = ""
    bot_audio_path = None
    llm_ok = False  # True only when Groq produced a genuine reply; gates TTS

    # --- 1. Determine input source: prioritize audio, fall back to text ---
    if audio_file:
        if whisper_pipeline:
            try:
                audio_input_data, samplerate = sf.read(audio_file)
                # Whisper expects 16 kHz mono input.
                if samplerate != 16000:
                    audio_input_data = librosa.resample(y=audio_input_data, orig_sr=samplerate, target_sr=16000)
                if audio_input_data.ndim > 1:
                    audio_input_data = audio_input_data[:, 0]

                print(f"Transcribing audio file: {audio_file}")
                user_text = whisper_pipeline(audio_input_data)["text"]
                print(f"User Transcribed: {user_text}")

                if not user_text.strip():
                    user_text = "[No speech detected. Please try speaking clearer or louder.]"
            except Exception as e:
                user_text = f"[Transcription Error: {e}. Please check audio file or Whisper setup.]"
                print(f"Whisper Transcription Error: {e}")
        else:
            user_text = "[Whisper STT not initialized. Please check initialization errors in your Space logs.]"
            print("Error: Whisper pipeline is None. Cannot perform transcription.")
    elif text_message and text_message.strip():
        user_text = text_message.strip()
        print(f"User Input (Text): {user_text}")
    else:
        # No valid input at all: keep history unchanged, reset both inputs.
        return current_chatbot_history, "[Please provide input via speech or text.]", None, gr.update(value=None), ""

    # --- 2. LLM turn: record the user message, then query Groq with the full history ---
    llm_chat_history.append({"role": "user", "content": user_text})

    if client:
        try:
            print(f"Sending to Groq: {user_text}")
            chat_completion = client.chat.completions.create(
                messages=llm_chat_history, # Send the full history for context
                model=GROQ_MODEL,
                temperature=0.7,
                max_tokens=1024,
                top_p=1,
                stop=None,
                stream=False,
            )
            bot_text = chat_completion.choices[0].message.content
            llm_ok = True
            print(f"Groq Response: {bot_text}")
        except Exception as e:
            bot_text = f"An API error occurred: {e}. Please check your Groq API key and network."
            print(f"Groq API Error: {e}")
    else:
        bot_text = "[Groq client not initialized. Cannot generate response.]"

    # Record the bot message so history stays in strict user/assistant pairs.
    llm_chat_history.append({"role": "assistant", "content": bot_text})

    # --- 3. Text-to-Speech (gTTS): only synthesize genuine replies ---
    # BUGFIX: this was previously gated on string prefixes, and the
    # "[Groq client not initialized]" prefix never matched the actual message
    # (which reads "...initialized. Cannot..."), so error text was read aloud.
    # An explicit success flag avoids fragile string sniffing entirely.
    if llm_ok and bot_text:
        try:
            print("Generating speech with gTTS...")
            # Unique filename so concurrent turns don't clobber each other.
            # NOTE(review): these temp MP3s are never deleted — consider cleanup.
            bot_audio_path = f"temp_bot_response_{uuid.uuid4()}.mp3"
            gTTS(text=bot_text, lang='en', slow=False).save(bot_audio_path)
            print(f"Speech saved to {bot_audio_path}")
        except Exception as e:
            print(f"gTTS Error: {e}")
            bot_audio_path = None
    else:
        print("No valid text to convert to speech or an error occurred.")
        bot_audio_path = None  # Ensure no stale path is returned when TTS is skipped

    # --- 4. Rebuild the displayed history from the canonical LLM history ---
    updated_chatbot_history = []
    for i in range(0, len(llm_chat_history), 2):
        user_msg = llm_chat_history[i]["content"]
        bot_msg = llm_chat_history[i + 1]["content"] if (i + 1) < len(llm_chat_history) else ""
        updated_chatbot_history.append([user_msg, bot_msg])

    # Return the updated history, latest reply text, audio path, and values
    # that reset both the microphone widget and the textbox for the next turn.
    return updated_chatbot_history, bot_text, bot_audio_path, gr.update(value=None), ""


# --- Gradio Interface ---
# gr.Blocks gives full control over layout and event wiring (vs gr.Interface).
with gr.Blocks(theme=gr.themes.Soft(), title="Salman's Speech-Text-Speech Chatbot") as demo:
    gr.Markdown(
        """
        # 🗣️ Speech-to-Speech Chatbot 💬
        Speak into the microphone, type, and I'll respond in text and speech!
        Powered by Whisper (STT), Groq (LLM), and gTTS (TTS).
        """
    )

    # Chatbot widget displaying the full [user, bot] message-pair history.
    chatbot = gr.Chatbot(
        label="Conversation History",
        value=[], # Start with an empty history
        height=400,
        show_copy_button=True
    )

    # Read-only textbox mirroring the most recent LLM response.
    latest_response_text = gr.Textbox(
        label="Latest Bot Response (Text)",
        interactive=False, # Display only; the user cannot type here
        lines=3
    )

    # Input components: microphone audio and a free-text box, side by side.
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath", # Hand the handler a file path, not raw samples
            label="Speak Here",
            streaming=False, # Process the full recorded clip at once
            visible=True
        )
        text_input = gr.Textbox(
            label="Type your message here",
            placeholder="Type your message...",
            lines=3,
            scale=1 # Allows it to grow/shrink with the row
        )

    # Audio player for the bot's synthesized speech reply.
    audio_output = gr.Audio(
        label="Bot Response --Free IK -- (Speech)",
        autoplay=True, # Play the reply as soon as it arrives
        streaming=True # NOTE(review): output is a saved MP3 path, not a stream — confirm this flag is needed
    )

    # Control buttons.
    with gr.Row():
        # One Send button submits whichever input (audio or text) is filled in.
        submit_btn = gr.Button("Send Message ➡️")
        # Resets the visible conversation and the LLM context.
        reset_btn = gr.Button("Reset Chat 🗑️")

    # Event handling:
    # 1. Recording stops -> audio_input value changes -> submit the clip.
    # NOTE(review): the handler also resets audio_input via its outputs, which
    # may re-fire this change event with None (handled as the no-input branch)
    # — confirm this does not cause a redundant round trip.
    audio_input.change(
        fn=speech_to_speech_chat,
        inputs=[audio_input, gr.State(""), chatbot], # audio path, empty text placeholder, history
        # Outputs also reset both audio_input and text_input
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        api_name="speech_input_process"
    )

    # 2. Send button click — submits the textbox content (audio arg is None).
    submit_btn.click(
        fn=speech_to_speech_chat,
        inputs=[gr.State(None), text_input, chatbot], # None audio placeholder, text, history
        # Outputs also reset both audio_input and text_input
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        api_name="text_input_process"
    )

    # 3. Enter pressed inside the textbox — same handler and wiring as the button.
    text_input.submit(
        fn=speech_to_speech_chat,
        inputs=[gr.State(None), text_input, chatbot], # None audio placeholder, text, history
        # Outputs also reset both audio_input and text_input
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        api_name="text_input_submit"
    )

    # Reset: first clear every displayed widget, then (on success) wipe the
    # module-level LLM context so the next turn starts a fresh conversation.
    reset_btn.click(
        fn=lambda: ([], "", None, gr.update(value=None), ""), # Clear chatbot, text, audio, and reset inputs
        inputs=[],
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        queue=False
    ).success(
        fn=lambda: llm_chat_history.clear(), # Clear the actual LLM history list
        inputs=[],
        outputs=[]
    )


# Launch the Gradio app only when run as a script (Spaces imports this module
# and serves `demo` itself, so the guard keeps imports side-effect free).
if __name__ == "__main__":
    # `share=True` is unnecessary on Hugging Face Spaces — the Space is
    # already publicly reachable — so launch with defaults.
    demo.launch()