# Hugging Face Space app: speech-to-speech chatbot (Whisper STT + Groq LLM + gTTS).
# (Non-code page-scrape residue "Spaces: / Sleeping / Sleeping" removed.)
# Standard library
import io
import os
import uuid  # unique temporary filenames for synthesized audio

# Third-party
import gradio as gr
import librosa  # resampling microphone audio to 16 kHz for Whisper
import numpy as np
import soundfile as sf
import torch
from groq import Groq
from gtts import gTTS
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# NOTE: `from google.colab import userdata` was removed — it is Colab-specific.
# --- Configuration & Global Variables ---
# IMPORTANT: set GROQ_API_KEY in the Hugging Face Space's Repository Secrets;
# Spaces exposes secrets directly through os.environ.get().

# Groq-hosted LLM used for chat completions (fast and capable).
GROQ_MODEL = "llama-3.3-70b-versatile"

# Whisper speech-to-text model; the tiny variant keeps latency low on
# free-tier hardware (Colab or Spaces).
WHISPER_MODEL_ID = "openai/whisper-tiny"
# Prefer the GPU when one is available, otherwise run on CPU.
WHISPER_DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Transcription batch size; tune to the Space's allocated GPU/CPU memory.
WHISPER_BATCH_SIZE = 8

# Conversation context sent to Groq on every turn, stored in the
# [{"role": ..., "content": ...}, ...] format the Groq API expects.
llm_chat_history = []
| # --- Initialization Functions --- | |
def initialize_groq_client():
    """Create the module-level Groq ``client`` from the GROQ_API_KEY env var.

    On any failure ``client`` is set to None so the chat handler can
    degrade gracefully instead of crashing.
    """
    global client
    try:
        # Hugging Face Spaces injects Repository Secrets as environment
        # variables, so a plain os.environ lookup is all that is needed.
        api_key = os.environ.get("GROQ_API_KEY")
        if not api_key:
            raise ValueError("GROQ_API_KEY environment variable is not set. Please add it to your Hugging Face Space's Repository Secrets.")
        client = Groq(api_key=api_key)
        print("Groq client initialized successfully.")
    except ValueError as err:
        print(f"ERROR: Groq client initialization failed: {err}")
        print("ACTION REQUIRED: Please ensure the 'GROQ_API_KEY' environment variable is set correctly in your Hugging Face Space's Repository Secrets.")
        client = None  # sentinel checked by the chat handler
    except Exception as err:
        print(f"ERROR: An unexpected error occurred during Groq client initialization: {err}")
        client = None
# --- Whisper STT pipeline (populated by initialize_whisper_pipeline) ---
whisper_pipeline = None


def initialize_whisper_pipeline():
    """Load the Whisper model and build the ASR pipeline into the module global.

    Leaves ``whisper_pipeline`` as None when loading fails so callers can
    report the problem instead of crashing.
    """
    global whisper_pipeline
    try:
        print(f"Loading Whisper model: {WHISPER_MODEL_ID} on {WHISPER_DEVICE}...")
        # Half precision on GPU, full precision on CPU (same dtype for the
        # model weights and the pipeline).
        dtype = torch.float16 if WHISPER_DEVICE == "cuda:0" else torch.float32
        processor = AutoProcessor.from_pretrained(WHISPER_MODEL_ID)
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            WHISPER_MODEL_ID,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        model.to(WHISPER_DEVICE)
        whisper_pipeline = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            chunk_length_s=30,  # split long recordings into 30 s chunks
            batch_size=WHISPER_BATCH_SIZE,
            device=WHISPER_DEVICE,
            torch_dtype=dtype,
        )
        print("Whisper STT pipeline initialized successfully.")
    except Exception as e:
        print(f"ERROR: Whisper STT pipeline initialization failed: {e}")
        print("ACTION REQUIRED: Ensure all required 'transformers' dependencies (e.g., bitsandbytes, accelerate) are in requirements.txt. Check Hugging Face Space GPU availability if using 'cuda'.")
        whisper_pipeline = None  # sentinel checked by the chat handler
# Eagerly set up the Groq client and the Whisper pipeline at import time so
# the Gradio event handlers can rely on the module-level globals being set.
initialize_groq_client()
initialize_whisper_pipeline()
# --- Chatbot Logic ---

def _transcribe_audio(audio_file):
    """Transcribe a recorded clip with Whisper; returns text or a bracketed error string."""
    if not whisper_pipeline:
        print("Error: Whisper pipeline is None. Cannot perform transcription.")
        return "[Whisper STT not initialized. Please check initialization errors in your Space logs.]"
    try:
        audio_data, samplerate = sf.read(audio_file)
        # Whisper models expect 16 kHz mono input.
        if samplerate != 16000:
            audio_data = librosa.resample(y=audio_data, orig_sr=samplerate, target_sr=16000)
        if audio_data.ndim > 1:
            audio_data = audio_data[:, 0]  # keep the first channel only
        print(f"Transcribing audio file: {audio_file}")
        text = whisper_pipeline(audio_data)["text"]
        print(f"User Transcribed: {text}")
        if not text.strip():
            return "[No speech detected. Please try speaking clearer or louder.]"
        return text
    except Exception as e:
        print(f"Whisper Transcription Error: {e}")
        return f"[Transcription Error: {e}. Please check audio file or Whisper setup.]"


def _generate_reply(user_text):
    """Ask Groq for the next assistant message given the full llm_chat_history."""
    if not client:
        return "[Groq client not initialized. Cannot generate response.]"
    try:
        print(f"Sending to Groq: {user_text}")
        chat_completion = client.chat.completions.create(
            messages=llm_chat_history,  # full context, not just the latest turn
            model=GROQ_MODEL,
            temperature=0.7,
            max_tokens=1024,
            top_p=1,
            stop=None,
            stream=False,
        )
        reply = chat_completion.choices[0].message.content
        print(f"Groq Response: {reply}")
        return reply
    except Exception as e:
        print(f"Groq API Error: {e}")
        return f"An API error occurred: {e}. Please check your Groq API key and network."


def _synthesize_speech(bot_text):
    """Render bot_text to a uniquely-named MP3 via gTTS; returns the path or None.

    NOTE(review): these temp files are never deleted, so they accumulate on
    disk for the lifetime of the Space — consider a cleanup strategy.
    """
    try:
        print("Generating speech with gTTS...")
        tts = gTTS(text=bot_text, lang='en', slow=False)
        path = f"temp_bot_response_{uuid.uuid4()}.mp3"
        tts.save(path)
        print(f"Speech saved to {path}")
        return path
    except Exception as e:
        print(f"gTTS Error: {e}")
        return None


def speech_to_speech_chat(audio_file, text_message, current_chatbot_history):
    """
    Handle one chat turn from either speech or text input.

    Pipeline: transcribe audio with Whisper (if a clip was recorded), send the
    user text plus prior context to Groq for the reply, then synthesize the
    reply with gTTS.

    Args:
        audio_file (str | None): Path to the recorded clip from Gradio's
            microphone; None when the turn came from the textbox.
        text_message (str | None): Text typed by the user; ignored whenever
            audio_file is present.
        current_chatbot_history (list): Gradio's displayed chat history.

    Returns:
        tuple: (updated_chatbot_history, bot_text_for_display,
            bot_audio_path_or_None, audio_input_reset, text_input_reset)
    """
    global llm_chat_history  # module-level LLM context shared across turns

    # --- 1. Resolve the user's input (audio wins over text) ---
    if audio_file:
        user_text = _transcribe_audio(audio_file)
    elif text_message and text_message.strip():
        user_text = text_message.strip()
        print(f"User Input (Text): {user_text}")
    else:
        # Nothing usable was submitted: leave history untouched, clear inputs.
        return current_chatbot_history, "[Please provide input via speech or text.]", None, gr.update(value=None), ""

    llm_chat_history.append({"role": "user", "content": user_text})

    # --- 2. Groq Large Language Model ---
    # Errors come back as bracketed/prefixed strings rather than raising.
    bot_text = _generate_reply(user_text)
    llm_chat_history.append({"role": "assistant", "content": bot_text})

    # --- 3. Text-to-Speech (gTTS) ---
    # BUGFIX: the original tested startswith("[Groq client not initialized]"),
    # which never matches the actual error string
    # "[Groq client not initialized. Cannot generate response.]" — the error
    # text would have been spoken aloud. Match the real prefix instead.
    is_error = (
        bot_text.startswith("An API error occurred")
        or bot_text.startswith("[Groq client not initialized")
    )
    if bot_text and not is_error:
        bot_audio_path = _synthesize_speech(bot_text)
    else:
        print("No valid text to convert to speech or an error occurred.")
        bot_audio_path = None

    # --- 4. Rebuild the displayed history from llm_chat_history ---
    # Entries alternate user/assistant, so walk the list two at a time.
    updated_chatbot_history = []
    for i in range(0, len(llm_chat_history), 2):
        user_msg = llm_chat_history[i]["content"]
        bot_msg = llm_chat_history[i + 1]["content"] if (i + 1) < len(llm_chat_history) else ""
        updated_chatbot_history.append([user_msg, bot_msg])

    # The final two values clear the audio and text input widgets.
    return updated_chatbot_history, bot_text, bot_audio_path, gr.update(value=None), ""
# --- Gradio Interface ---
# gr.Blocks gives full control over the multi-component layout.
with gr.Blocks(theme=gr.themes.Soft(), title="Salman's Speech-Text-Speech Chatbot") as demo:
    gr.Markdown(
        """
        # 🗣️ Speech-to-Speech Chatbot 💬
        Speak into the microphone, type, and I'll respond in text and speech!
        Powered by Whisper (STT), Groq (LLM), and gTTS (TTS).
        """
    )

    # Scrolling transcript of the whole conversation.
    chatbot = gr.Chatbot(
        label="Conversation History",
        value=[],  # start with an empty history
        height=400,
        show_copy_button=True,
    )

    # Read-only echo of the most recent LLM reply.
    latest_response_text = gr.Textbox(
        label="Latest Bot Response (Text)",
        interactive=False,
        lines=3,
    )

    # Two ways in: a microphone clip or a typed message.
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath",  # handler receives a path on disk
            label="Speak Here",
            streaming=False,  # process the whole clip at once
            visible=True,
        )
        text_input = gr.Textbox(
            label="Type your message here",
            placeholder="Type your message...",
            lines=3,
            scale=1,
        )

    # Spoken version of the bot's reply.
    audio_output = gr.Audio(
        label="Bot Response --Free IK -- (Speech)",
        autoplay=True,  # play the reply as soon as it is ready
        streaming=True,
    )

    with gr.Row():
        submit_btn = gr.Button("Send Message ➡️")
        reset_btn = gr.Button("Reset Chat 🗑️")

    # All three triggers funnel into the same handler; the last two outputs
    # clear the audio and text inputs after every turn.
    shared_outputs = [chatbot, latest_response_text, audio_output, audio_input, text_input]

    # 1. Finishing a microphone recording submits the clip automatically.
    audio_input.change(
        fn=speech_to_speech_chat,
        inputs=[audio_input, gr.State(""), chatbot],
        outputs=shared_outputs,
        api_name="speech_input_process",
    )

    # 2. Clicking Send submits the typed message.
    submit_btn.click(
        fn=speech_to_speech_chat,
        inputs=[gr.State(None), text_input, chatbot],
        outputs=shared_outputs,
        api_name="text_input_process",
    )

    # 3. Pressing Enter in the textbox behaves like Send.
    text_input.submit(
        fn=speech_to_speech_chat,
        inputs=[gr.State(None), text_input, chatbot],
        outputs=shared_outputs,
        api_name="text_input_submit",
    )

    # Reset wipes the visible widgets first, then clears the LLM context list.
    reset_btn.click(
        fn=lambda: ([], "", None, gr.update(value=None), ""),
        inputs=[],
        outputs=shared_outputs,
        queue=False,
    ).success(
        fn=lambda: llm_chat_history.clear(),
        inputs=[],
        outputs=[],
    )
# Launch the Gradio app. Hugging Face Spaces exposes the app publicly on its
# own, so share=True is unnecessary here.
if __name__ == "__main__":
    demo.launch()