# speech2speech / app.py
# Hugging Face Space app (author: forever-sheikh; commit ea86e19, verified).
import os
import gradio as gr
from groq import Groq
from gtts import gTTS
import torch
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
import io
import numpy as np
import soundfile as sf
# Removed: from google.colab import userdata # This library is specific to Google Colab
import uuid # For unique temporary audio filenames
import librosa # Ensure librosa is imported for resampling
# --- Configuration & Global Variables ---
# IMPORTANT: Ensure your GROQ_API_KEY is set in Hugging Face Space's Repository Secrets!
# It will be directly available via os.environ.get()
# Groq LLM model used for chat completions.
GROQ_MODEL = "llama-3.3-70b-versatile" # A fast and capable model from Groq
# Whisper STT Model (smaller models are faster for Colab free tier, and also good for Spaces)
WHISPER_MODEL_ID = "openai/whisper-tiny"
# Run Whisper on the first GPU when available, otherwise fall back to CPU.
WHISPER_DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
WHISPER_BATCH_SIZE = 8 # Adjust based on your Space's allocated GPU/CPU memory
# Global chat history for the LLM to maintain context across turns.
# Stores dicts in the format expected by the Groq API:
# {"role": "user"|"assistant", "content": str}, appended in strict pairs.
llm_chat_history = []
# --- Initialization Functions ---
# Pre-declare the client so code that checks `if client:` before (or after a
# failed) initialization sees None instead of raising NameError. This mirrors
# the `whisper_pipeline = None` pre-declaration used for the STT pipeline.
client = None

def initialize_groq_client():
    """Initialize the module-level Groq ``client`` from the GROQ_API_KEY env var.

    On success ``client`` is a ready ``Groq`` instance; on any failure
    (missing key, unexpected library/network error) it is left as ``None``
    and actionable guidance is printed to the Space logs.
    """
    global client
    try:
        # Directly read the API key from environment variables.
        # Hugging Face Spaces populates this automatically from Repository Secrets.
        groq_api_key = os.environ.get("GROQ_API_KEY")
        if not groq_api_key:
            raise ValueError("GROQ_API_KEY environment variable is not set. Please add it to your Hugging Face Space's Repository Secrets.")
        client = Groq(api_key=groq_api_key)
        print("Groq client initialized successfully.")
    except ValueError as ve:
        # Missing/empty key: tell the operator exactly what to do.
        print(f"ERROR: Groq client initialization failed: {ve}")
        print("ACTION REQUIRED: Please ensure the 'GROQ_API_KEY' environment variable is set correctly in your Hugging Face Space's Repository Secrets.")
        client = None  # Set client to None if initialization fails
    except Exception as e:
        print(f"ERROR: An unexpected error occurred during Groq client initialization: {e}")
        client = None
# --- Initialize Whisper STT Pipeline ---
whisper_pipeline = None
def initialize_whisper_pipeline():
    """Load the Whisper model and wrap it in an ASR pipeline.

    On success the module-level ``whisper_pipeline`` is populated; on any
    failure it stays ``None`` and guidance is printed, so the app can keep
    running in text-only mode.
    """
    global whisper_pipeline
    # Half precision only makes sense on GPU; CPU inference stays in float32.
    compute_dtype = torch.float16 if WHISPER_DEVICE == "cuda:0" else torch.float32
    try:
        print(f"Loading Whisper model: {WHISPER_MODEL_ID} on {WHISPER_DEVICE}...")
        proc = AutoProcessor.from_pretrained(WHISPER_MODEL_ID)
        asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            WHISPER_MODEL_ID,
            torch_dtype=compute_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        ).to(WHISPER_DEVICE)
        whisper_pipeline = pipeline(
            "automatic-speech-recognition",
            model=asr_model,
            tokenizer=proc.tokenizer,
            feature_extractor=proc.feature_extractor,
            chunk_length_s=30,  # split long recordings into 30 s windows
            batch_size=WHISPER_BATCH_SIZE,
            device=WHISPER_DEVICE,
            torch_dtype=compute_dtype,
        )
        print("Whisper STT pipeline initialized successfully.")
    except Exception as e:
        print(f"ERROR: Whisper STT pipeline initialization failed: {e}")
        print("ACTION REQUIRED: Ensure all required 'transformers' dependencies (e.g., bitsandbytes, accelerate) are in requirements.txt. Check Hugging Face Space GPU availability if using 'cuda'.")
        whisper_pipeline = None  # leave unset so callers can degrade gracefully
# Call initialization functions when the app starts.
# Both are best-effort: on failure they log remediation steps and leave their
# globals (client / whisper_pipeline) as None; request handlers check for that.
initialize_groq_client()
initialize_whisper_pipeline()
# --- Chatbot Logic Function ---
# Accepts both audio_file and text_message inputs; audio wins when both are set.
def speech_to_speech_chat(audio_file, text_message, current_chatbot_history):
    """
    Process one user turn (speech or text), answer via Groq, and synthesize speech.

    Pipeline: Whisper STT (when audio is given) -> Groq chat completion -> gTTS.
    The module-level ``llm_chat_history`` carries LLM context across calls.

    Args:
        audio_file (str | None): Path to the recorded audio file from Gradio's
            microphone (None when text input is used).
        text_message (str | None): Text input from the textbox (None/empty when
            audio input is used).
        current_chatbot_history (list): Gradio's displayed chat history; returned
            unchanged when no valid input was provided.

    Returns:
        tuple: (updated_chatbot_history, text_response_for_display,
            audio_response_path, audio_input_reset_value, text_input_reset_value)
    """
    global llm_chat_history  # Access the global LLM context history

    user_text = ""
    bot_text = ""
    bot_audio_path = None

    # --- 1. Resolve the user's input: prioritize audio if available ---
    if audio_file:
        if whisper_pipeline:
            try:
                audio_input_data, samplerate = sf.read(audio_file)
                # Whisper expects 16 kHz audio; resample anything else.
                if samplerate != 16000:
                    audio_input_data = librosa.resample(
                        y=audio_input_data, orig_sr=samplerate, target_sr=16000
                    )
                if audio_input_data.ndim > 1:
                    audio_input_data = audio_input_data[:, 0]  # keep first channel only
                print(f"Transcribing audio file: {audio_file}")
                user_text = whisper_pipeline(audio_input_data)["text"]
                print(f"User Transcribed: {user_text}")
                if not user_text.strip():
                    user_text = "[No speech detected. Please try speaking clearer or louder.]"
            except Exception as e:
                user_text = f"[Transcription Error: {e}. Please check audio file or Whisper setup.]"
                print(f"Whisper Transcription Error: {e}")
        else:
            user_text = "[Whisper STT not initialized. Please check initialization errors in your Space logs.]"
            print("Error: Whisper pipeline is None. Cannot perform transcription.")
    elif text_message and text_message.strip():
        # Process text input.
        user_text = text_message.strip()
        print(f"User Input (Text): {user_text}")
    else:
        # No valid input: keep history untouched and clear both input widgets.
        return current_chatbot_history, "[Please provide input via speech or text.]", None, gr.update(value=None), ""

    # --- Record the user's turn in the LLM context ---
    llm_chat_history.append({"role": "user", "content": user_text})

    # --- 2. Groq Large Language Model ---
    llm_succeeded = False  # gates TTS on a genuine model response (see bug note below)
    if client:
        try:
            print(f"Sending to Groq: {user_text}")
            chat_completion = client.chat.completions.create(
                messages=llm_chat_history,  # send the full history for context
                model=GROQ_MODEL,
                temperature=0.7,
                max_tokens=1024,
                top_p=1,
                stop=None,
                stream=False,
            )
            bot_text = chat_completion.choices[0].message.content
            llm_succeeded = True
            print(f"Groq Response: {bot_text}")
        except Exception as e:
            bot_text = f"An API error occurred: {e}. Please check your Groq API key and network."
            print(f"Groq API Error: {e}")
    else:
        bot_text = "[Groq client not initialized. Cannot generate response.]"

    # --- Record the bot's turn so history stays in strict user/assistant pairs ---
    llm_chat_history.append({"role": "assistant", "content": bot_text})

    # --- 3. Text-to-Speech (gTTS) ---
    # BUG FIX: the original gated TTS with
    # bot_text.startswith("[Groq client not initialized]"), which never matched
    # the actual message "[Groq client not initialized. Cannot generate
    # response.]" — so that error text was synthesized and spoken aloud.
    # Gate on the explicit success flag instead.
    if llm_succeeded and bot_text:
        try:
            print("Generating speech with gTTS...")
            tts = gTTS(text=bot_text, lang='en', slow=False)
            # Unique temp filename so concurrent sessions don't clobber each other.
            # NOTE(review): these mp3 files are never deleted; consider periodic cleanup.
            bot_audio_path = f"temp_bot_response_{uuid.uuid4()}.mp3"
            tts.save(bot_audio_path)
            print(f"Speech saved to {bot_audio_path}")
        except Exception as e:
            print(f"gTTS Error: {e}")
            bot_audio_path = None
    else:
        print("No valid text to convert to speech or an error occurred.")
        bot_audio_path = None  # ensure bot_audio_path is None when TTS is skipped

    # --- Rebuild the Chatbot display from the authoritative LLM history ---
    # Turns are appended strictly in user/assistant pairs above, so step by 2.
    updated_chatbot_history = []
    for i in range(0, len(llm_chat_history), 2):
        user_msg = llm_chat_history[i]["content"]
        bot_msg = llm_chat_history[i + 1]["content"] if (i + 1) < len(llm_chat_history) else ""
        updated_chatbot_history.append([user_msg, bot_msg])

    # Return the updated history, the latest text, the audio path, and values
    # that reset both the audio_input and text_input widgets.
    return updated_chatbot_history, bot_text, bot_audio_path, gr.update(value=None), ""
# --- Gradio Interface ---
# We use gr.Blocks for a more flexible layout than gr.Interface.
with gr.Blocks(theme=gr.themes.Soft(), title="Salman's Speech-Text-Speech Chatbot") as demo:
    gr.Markdown(
        """
        # 🗣️ Speech-to-Speech Chatbot 💬
        Speak into the microphone, type, and I'll respond in text and speech!
        Powered by Whisper (STT), Groq (LLM), and gTTS (TTS).
        """
    )
    # Chatbot component displaying the [user, bot] message pairs that
    # speech_to_speech_chat rebuilds from llm_chat_history each turn.
    chatbot = gr.Chatbot(
        label="Conversation History",
        value=[], # Initialize with empty history
        height=400,
        show_copy_button=True
    )
    # Read-only textbox showing only the latest LLM response.
    latest_response_text = gr.Textbox(
        label="Latest Bot Response (Text)",
        interactive=False, # User can't type here
        lines=3
    )
    # Input components: Audio (microphone) and Textbox, side by side.
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath", # This ensures Gradio provides a file path
            label="Speak Here",
            streaming=False, # For simplicity, process the full audio clip at once
            visible=True # Ensure microphone is visible
        )
        text_input = gr.Textbox(
            label="Type your message here",
            placeholder="Type your message...",
            lines=3,
            scale=1 # Allows it to grow/shrink with the row
        )
    # Audio component playing back the bot's synthesized speech.
    audio_output = gr.Audio(
        label="Bot Response --Free IK -- (Speech)",
        autoplay=True, # Automatically play the bot's response
        streaming=True # Stream playback for better user experience
    )
    # Buttons for control.
    with gr.Row():
        # Unified Send button for both audio and text inputs.
        submit_btn = gr.Button("Send Message ➡️")
        # Renamed Clear Chat to Reset Chat.
        reset_btn = gr.Button("Reset Chat 🗑️")
    # Event handling:
    # 1. When audio input is received (recording stops), the change event
    #    automatically triggers submission of the recorded file.
    audio_input.change(
        fn=speech_to_speech_chat,
        inputs=[audio_input, gr.State(""), chatbot], # audio_file, empty text_message, chat history
        # Outputs also reset both audio_input and text_input after each turn.
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        api_name="speech_input_process"
    )
    # 2. When the submit button is clicked (primarily for text input, but acts
    #    as a general submit; audio_file is passed as None).
    submit_btn.click(
        fn=speech_to_speech_chat,
        inputs=[gr.State(None), text_input, chatbot], # None audio_file, text_message, chat history
        # Outputs also reset both audio_input and text_input after each turn.
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        api_name="text_input_process"
    )
    # 3. When the user presses Enter in the text input box (same handler/wiring
    #    as the submit button).
    text_input.submit(
        fn=speech_to_speech_chat,
        inputs=[gr.State(None), text_input, chatbot], # None audio_file, text_message, chat history
        # Outputs also reset both audio_input and text_input after each turn.
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        api_name="text_input_submit"
    )
    # Reset button: first clear all displayed outputs and both inputs, then —
    # only if that succeeded — clear the global LLM context list as well.
    reset_btn.click(
        fn=lambda: ([], "", None, gr.update(value=None), ""), # Clear chatbot, text, audio, and reset inputs
        inputs=[],
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        queue=False
    ).success(
        fn=lambda: llm_chat_history.clear(), # Clear the actual LLM history list
        inputs=[],
        outputs=[]
    )
# Launch the Gradio app
if __name__ == "__main__":
    # On Hugging Face Spaces the app is made public automatically, so
    # `share=True` is unnecessary and has been removed.
    demo.launch()