# speech2speech / app.py
# Hugging Face Space app (author: forever-sheikh; commit ea86e19, verified).
import os
import gradio as gr
from groq import Groq
from gtts import gTTS
import torch
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
import io
import numpy as np
import soundfile as sf
# Removed: from google.colab import userdata # This library is specific to Google Colab
import uuid # For unique temporary audio filenames
import librosa # Ensure librosa is imported for resampling
# --- Configuration & Global Variables ---
# IMPORTANT: Ensure your GROQ_API_KEY is set in Hugging Face Space's Repository Secrets!
# It will be directly available via os.environ.get()
# Groq LLM model used for chat completions.
GROQ_MODEL = "llama-3.3-70b-versatile" # A fast and capable model from Groq
# Whisper STT Model (smaller models are faster for Colab free tier, and also good for Spaces)
WHISPER_MODEL_ID = "openai/whisper-tiny"
# Run Whisper on the first GPU when available, otherwise fall back to CPU.
WHISPER_DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
WHISPER_BATCH_SIZE = 8 # Adjust based on your Space's allocated GPU/CPU memory
# Global chat history for the LLM to maintain context across turns.
# Stores dicts in the format expected by the Groq API:
# {"role": "user"|"assistant", "content": str}, appended in strict pairs.
llm_chat_history = []
# --- Initialization Functions ---
# Pre-declare the client so code that checks `if client:` before (or after a
# failed) initialization sees None instead of raising NameError. This mirrors
# the `whisper_pipeline = None` pre-declaration used for the STT pipeline.
client = None

def initialize_groq_client():
    """Initialize the module-level Groq ``client`` from the GROQ_API_KEY env var.

    On success ``client`` is a ready ``Groq`` instance; on any failure
    (missing key, unexpected library/network error) it is left as ``None``
    and actionable guidance is printed to the Space logs.
    """
    global client
    try:
        # Directly read the API key from environment variables.
        # Hugging Face Spaces populates this automatically from Repository Secrets.
        groq_api_key = os.environ.get("GROQ_API_KEY")
        if not groq_api_key:
            raise ValueError("GROQ_API_KEY environment variable is not set. Please add it to your Hugging Face Space's Repository Secrets.")
        client = Groq(api_key=groq_api_key)
        print("Groq client initialized successfully.")
    except ValueError as ve:
        # Missing/empty key: tell the operator exactly what to do.
        print(f"ERROR: Groq client initialization failed: {ve}")
        print("ACTION REQUIRED: Please ensure the 'GROQ_API_KEY' environment variable is set correctly in your Hugging Face Space's Repository Secrets.")
        client = None  # Set client to None if initialization fails
    except Exception as e:
        print(f"ERROR: An unexpected error occurred during Groq client initialization: {e}")
        client = None
# --- Initialize Whisper STT Pipeline ---
whisper_pipeline = None
def initialize_whisper_pipeline():
    """Load the Whisper model and wrap it in an ASR pipeline.

    On success the module-level ``whisper_pipeline`` is populated; on any
    failure it stays ``None`` and guidance is printed, so the app can keep
    running in text-only mode.
    """
    global whisper_pipeline
    # Half precision only makes sense on GPU; CPU inference stays in float32.
    compute_dtype = torch.float16 if WHISPER_DEVICE == "cuda:0" else torch.float32
    try:
        print(f"Loading Whisper model: {WHISPER_MODEL_ID} on {WHISPER_DEVICE}...")
        proc = AutoProcessor.from_pretrained(WHISPER_MODEL_ID)
        asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            WHISPER_MODEL_ID,
            torch_dtype=compute_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        ).to(WHISPER_DEVICE)
        whisper_pipeline = pipeline(
            "automatic-speech-recognition",
            model=asr_model,
            tokenizer=proc.tokenizer,
            feature_extractor=proc.feature_extractor,
            chunk_length_s=30,  # split long recordings into 30 s windows
            batch_size=WHISPER_BATCH_SIZE,
            device=WHISPER_DEVICE,
            torch_dtype=compute_dtype,
        )
        print("Whisper STT pipeline initialized successfully.")
    except Exception as e:
        print(f"ERROR: Whisper STT pipeline initialization failed: {e}")
        print("ACTION REQUIRED: Ensure all required 'transformers' dependencies (e.g., bitsandbytes, accelerate) are in requirements.txt. Check Hugging Face Space GPU availability if using 'cuda'.")
        whisper_pipeline = None  # leave unset so callers can degrade gracefully
# Call initialization functions when the app starts.
# Both are best-effort: on failure they log remediation steps and leave their
# globals (client / whisper_pipeline) as None; request handlers check for that.
initialize_groq_client()
initialize_whisper_pipeline()
# --- Chatbot Logic Function ---
# Accepts both audio_file and text_message inputs; audio wins when both are set.
def speech_to_speech_chat(audio_file, text_message, current_chatbot_history):
    """
    Process one user turn (speech or text), answer via Groq, and synthesize speech.

    Pipeline: Whisper STT (when audio is given) -> Groq chat completion -> gTTS.
    The module-level ``llm_chat_history`` carries LLM context across calls.

    Args:
        audio_file (str | None): Path to the recorded audio file from Gradio's
            microphone (None when text input is used).
        text_message (str | None): Text input from the textbox (None/empty when
            audio input is used).
        current_chatbot_history (list): Gradio's displayed chat history; returned
            unchanged when no valid input was provided.

    Returns:
        tuple: (updated_chatbot_history, text_response_for_display,
            audio_response_path, audio_input_reset_value, text_input_reset_value)
    """
    global llm_chat_history  # Access the global LLM context history

    user_text = ""
    bot_text = ""
    bot_audio_path = None

    # --- 1. Resolve the user's input: prioritize audio if available ---
    if audio_file:
        if whisper_pipeline:
            try:
                audio_input_data, samplerate = sf.read(audio_file)
                # Whisper expects 16 kHz audio; resample anything else.
                if samplerate != 16000:
                    audio_input_data = librosa.resample(
                        y=audio_input_data, orig_sr=samplerate, target_sr=16000
                    )
                if audio_input_data.ndim > 1:
                    audio_input_data = audio_input_data[:, 0]  # keep first channel only
                print(f"Transcribing audio file: {audio_file}")
                user_text = whisper_pipeline(audio_input_data)["text"]
                print(f"User Transcribed: {user_text}")
                if not user_text.strip():
                    user_text = "[No speech detected. Please try speaking clearer or louder.]"
            except Exception as e:
                user_text = f"[Transcription Error: {e}. Please check audio file or Whisper setup.]"
                print(f"Whisper Transcription Error: {e}")
        else:
            user_text = "[Whisper STT not initialized. Please check initialization errors in your Space logs.]"
            print("Error: Whisper pipeline is None. Cannot perform transcription.")
    elif text_message and text_message.strip():
        # Process text input.
        user_text = text_message.strip()
        print(f"User Input (Text): {user_text}")
    else:
        # No valid input: keep history untouched and clear both input widgets.
        return current_chatbot_history, "[Please provide input via speech or text.]", None, gr.update(value=None), ""

    # --- Record the user's turn in the LLM context ---
    llm_chat_history.append({"role": "user", "content": user_text})

    # --- 2. Groq Large Language Model ---
    llm_succeeded = False  # gates TTS on a genuine model response (see bug note below)
    if client:
        try:
            print(f"Sending to Groq: {user_text}")
            chat_completion = client.chat.completions.create(
                messages=llm_chat_history,  # send the full history for context
                model=GROQ_MODEL,
                temperature=0.7,
                max_tokens=1024,
                top_p=1,
                stop=None,
                stream=False,
            )
            bot_text = chat_completion.choices[0].message.content
            llm_succeeded = True
            print(f"Groq Response: {bot_text}")
        except Exception as e:
            bot_text = f"An API error occurred: {e}. Please check your Groq API key and network."
            print(f"Groq API Error: {e}")
    else:
        bot_text = "[Groq client not initialized. Cannot generate response.]"

    # --- Record the bot's turn so history stays in strict user/assistant pairs ---
    llm_chat_history.append({"role": "assistant", "content": bot_text})

    # --- 3. Text-to-Speech (gTTS) ---
    # BUG FIX: the original gated TTS with
    # bot_text.startswith("[Groq client not initialized]"), which never matched
    # the actual message "[Groq client not initialized. Cannot generate
    # response.]" — so that error text was synthesized and spoken aloud.
    # Gate on the explicit success flag instead.
    if llm_succeeded and bot_text:
        try:
            print("Generating speech with gTTS...")
            tts = gTTS(text=bot_text, lang='en', slow=False)
            # Unique temp filename so concurrent sessions don't clobber each other.
            # NOTE(review): these mp3 files are never deleted; consider periodic cleanup.
            bot_audio_path = f"temp_bot_response_{uuid.uuid4()}.mp3"
            tts.save(bot_audio_path)
            print(f"Speech saved to {bot_audio_path}")
        except Exception as e:
            print(f"gTTS Error: {e}")
            bot_audio_path = None
    else:
        print("No valid text to convert to speech or an error occurred.")
        bot_audio_path = None  # ensure bot_audio_path is None when TTS is skipped

    # --- Rebuild the Chatbot display from the authoritative LLM history ---
    # Turns are appended strictly in user/assistant pairs above, so step by 2.
    updated_chatbot_history = []
    for i in range(0, len(llm_chat_history), 2):
        user_msg = llm_chat_history[i]["content"]
        bot_msg = llm_chat_history[i + 1]["content"] if (i + 1) < len(llm_chat_history) else ""
        updated_chatbot_history.append([user_msg, bot_msg])

    # Return the updated history, the latest text, the audio path, and values
    # that reset both the audio_input and text_input widgets.
    return updated_chatbot_history, bot_text, bot_audio_path, gr.update(value=None), ""
# --- Gradio Interface ---
# We use gr.Blocks for a more flexible layout than gr.Interface.
with gr.Blocks(theme=gr.themes.Soft(), title="Salman's Speech-Text-Speech Chatbot") as demo:
    gr.Markdown(
        """
        # 🗣️ Speech-to-Speech Chatbot 💬
        Speak into the microphone, type, and I'll respond in text and speech!
        Powered by Whisper (STT), Groq (LLM), and gTTS (TTS).
        """
    )
    # Chatbot component displaying the [user, bot] message pairs that
    # speech_to_speech_chat rebuilds from llm_chat_history each turn.
    chatbot = gr.Chatbot(
        label="Conversation History",
        value=[], # Initialize with empty history
        height=400,
        show_copy_button=True
    )
    # Read-only textbox showing only the latest LLM response.
    latest_response_text = gr.Textbox(
        label="Latest Bot Response (Text)",
        interactive=False, # User can't type here
        lines=3
    )
    # Input components: Audio (microphone) and Textbox, side by side.
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath", # This ensures Gradio provides a file path
            label="Speak Here",
            streaming=False, # For simplicity, process the full audio clip at once
            visible=True # Ensure microphone is visible
        )
        text_input = gr.Textbox(
            label="Type your message here",
            placeholder="Type your message...",
            lines=3,
            scale=1 # Allows it to grow/shrink with the row
        )
    # Audio component playing back the bot's synthesized speech.
    audio_output = gr.Audio(
        label="Bot Response --Free IK -- (Speech)",
        autoplay=True, # Automatically play the bot's response
        streaming=True # Stream playback for better user experience
    )
    # Buttons for control.
    with gr.Row():
        # Unified Send button for both audio and text inputs.
        submit_btn = gr.Button("Send Message ➡️")
        # Renamed Clear Chat to Reset Chat.
        reset_btn = gr.Button("Reset Chat 🗑️")
    # Event handling:
    # 1. When audio input is received (recording stops), the change event
    #    automatically triggers submission of the recorded file.
    audio_input.change(
        fn=speech_to_speech_chat,
        inputs=[audio_input, gr.State(""), chatbot], # audio_file, empty text_message, chat history
        # Outputs also reset both audio_input and text_input after each turn.
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        api_name="speech_input_process"
    )
    # 2. When the submit button is clicked (primarily for text input, but acts
    #    as a general submit; audio_file is passed as None).
    submit_btn.click(
        fn=speech_to_speech_chat,
        inputs=[gr.State(None), text_input, chatbot], # None audio_file, text_message, chat history
        # Outputs also reset both audio_input and text_input after each turn.
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        api_name="text_input_process"
    )
    # 3. When the user presses Enter in the text input box (same handler/wiring
    #    as the submit button).
    text_input.submit(
        fn=speech_to_speech_chat,
        inputs=[gr.State(None), text_input, chatbot], # None audio_file, text_message, chat history
        # Outputs also reset both audio_input and text_input after each turn.
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        api_name="text_input_submit"
    )
    # Reset button: first clear all displayed outputs and both inputs, then —
    # only if that succeeded — clear the global LLM context list as well.
    reset_btn.click(
        fn=lambda: ([], "", None, gr.update(value=None), ""), # Clear chatbot, text, audio, and reset inputs
        inputs=[],
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        queue=False
    ).success(
        fn=lambda: llm_chat_history.clear(), # Clear the actual LLM history list
        inputs=[],
        outputs=[]
    )
# Launch the Gradio app
if __name__ == "__main__":
    # On Hugging Face Spaces the app is made public automatically, so
    # `share=True` is unnecessary and has been removed.
    demo.launch()