File size: 13,108 Bytes
fadeb4a
 
 
1a50c77
 
 
 
 
 
5809432
1a50c77
 
fadeb4a
 
5809432
fadeb4a
 
 
ea86e19
5809432
1a50c77
 
 
 
5809432
1a50c77
 
 
5809432
 
 
 
1a50c77
5809432
1a50c77
5809432
 
 
 
 
1a50c77
5809432
 
 
1a50c77
 
5809432
 
1a50c77
5809432
1a50c77
 
 
 
 
5809432
1a50c77
 
 
 
 
 
 
5809432
1a50c77
 
 
 
 
 
 
 
 
 
 
 
5809432
1a50c77
 
 
5809432
1a50c77
5809432
1a50c77
5809432
1a50c77
 
 
5809432
1a50c77
 
5809432
 
1a50c77
 
 
5809432
 
1a50c77
5809432
 
 
1a50c77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5809432
fadeb4a
 
1a50c77
48fc7b4
fadeb4a
 
1a50c77
 
 
fadeb4a
 
 
1a50c77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e328c36
1a50c77
 
 
fadeb4a
1a50c77
fadeb4a
1a50c77
 
 
 
 
 
 
 
 
 
 
 
 
 
fadeb4a
 
1a50c77
fadeb4a
1a50c77
 
 
 
 
fadeb4a
 
1a50c77
fadeb4a
1a50c77
 
 
 
 
fadeb4a
 
1a50c77
 
 
fadeb4a
1a50c77
fadeb4a
 
1a50c77
fadeb4a
 
 
 
1a50c77
5809432
fadeb4a
1a50c77
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
import os
import gradio as gr
from groq import Groq
from gtts import gTTS
import torch
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
import io
import numpy as np
import soundfile as sf
# Removed: from google.colab import userdata # This library is specific to Google Colab
import uuid # For unique temporary audio filenames
import librosa # Ensure librosa is imported for resampling

# --- Configuration & Global Variables ---
# IMPORTANT: Ensure your GROQ_API_KEY is set in Hugging Face Space's Repository Secrets!
# It will be directly available via os.environ.get()

# Groq LLM model id used for every chat-completion request below.
GROQ_MODEL = "llama-3.3-70b-versatile" # A fast and capable model from Groq

# Whisper STT model; the "tiny" checkpoint keeps load time and memory low
# (smaller models are faster on free-tier hardware, both Colab and Spaces).
WHISPER_MODEL_ID = "openai/whisper-tiny"
# Use the first GPU when available, otherwise fall back to CPU.
WHISPER_DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
WHISPER_BATCH_SIZE = 8 # Adjust based on your Space's allocated GPU/CPU memory

# Global chat history for the LLM to maintain context across turns.
# Items are {"role": ..., "content": ...} dicts in the format the Groq
# chat-completions API expects. NOTE(review): module-level state is shared by
# ALL concurrent users of the Space — confirm that is acceptable.
llm_chat_history = []

# --- Initialization Functions ---
def initialize_groq_client():
    """Create the module-global Groq ``client`` from the GROQ_API_KEY env var.

    On any failure ``client`` is left as None and an actionable message is
    printed so the Space logs explain what to fix; callers must check for None.
    """
    global client
    client = None
    try:
        # Hugging Face Spaces exposes Repository Secrets as environment
        # variables, so a plain os.environ lookup is all that is needed.
        api_key = os.environ.get("GROQ_API_KEY")
        if not api_key:
            raise ValueError("GROQ_API_KEY environment variable is not set. Please add it to your Hugging Face Space's Repository Secrets.")
        client = Groq(api_key=api_key)
    except ValueError as ve:
        print(f"ERROR: Groq client initialization failed: {ve}")
        print("ACTION REQUIRED: Please ensure the 'GROQ_API_KEY' environment variable is set correctly in your Hugging Face Space's Repository Secrets.")
        return
    except Exception as e:
        print(f"ERROR: An unexpected error occurred during Groq client initialization: {e}")
        return
    print("Groq client initialized successfully.")

# --- Initialize Whisper STT Pipeline ---
whisper_pipeline = None
def initialize_whisper_pipeline():
    """Load the Whisper model and build the module-global ASR pipeline.

    Leaves ``whisper_pipeline`` as None when loading fails so the rest of the
    app can degrade gracefully instead of crashing at import time.
    """
    global whisper_pipeline
    try:
        print(f"Loading Whisper model: {WHISPER_MODEL_ID} on {WHISPER_DEVICE}...")
        # Half precision only makes sense on the GPU path.
        dtype = torch.float16 if WHISPER_DEVICE == "cuda:0" else torch.float32
        processor = AutoProcessor.from_pretrained(WHISPER_MODEL_ID)
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            WHISPER_MODEL_ID,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        ).to(WHISPER_DEVICE)
        whisper_pipeline = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            chunk_length_s=30,  # split long clips into 30 s windows
            batch_size=WHISPER_BATCH_SIZE,
            device=WHISPER_DEVICE,
            torch_dtype=dtype,
        )
        print("Whisper STT pipeline initialized successfully.")
    except Exception as e:
        print(f"ERROR: Whisper STT pipeline initialization failed: {e}")
        print("ACTION REQUIRED: Ensure all required 'transformers' dependencies (e.g., bitsandbytes, accelerate) are in requirements.txt. Check Hugging Face Space GPU availability if using 'cuda'.")
        whisper_pipeline = None

# Run both initializers at import time so the Gradio app below can reference
# the resulting globals (`client`, `whisper_pipeline`); each prints its own
# diagnostics and leaves its global as None on failure.
initialize_groq_client()
initialize_whisper_pipeline()

# --- Chatbot Logic Function ---
# Accepts both microphone audio and typed text; audio wins when both are present.
def speech_to_speech_chat(audio_file, text_message, current_chatbot_history):
    """
    Process one conversational turn: speech or text in, text + speech out.

    Transcribes `audio_file` with Whisper when provided, otherwise uses
    `text_message`; sends the running conversation to Groq for a reply, then
    synthesizes that reply to an MP3 with gTTS.

    Args:
        audio_file (str): Path to the recorded audio file from Gradio's microphone (will be None if text input is used).
        text_message (str): Text input from the textbox (will be None or empty string if audio input is used).
        current_chatbot_history (list): Gradio's chat history.

    Returns:
        tuple: (updated_chatbot_history, text_response_for_display, audio_response_path, audio_input_reset_value, text_input_reset_value)
    """
    global llm_chat_history  # module-level LLM context, shared across turns

    user_text = ""
    bot_text = ""
    bot_audio_path = None
    llm_ok = False  # True only when Groq produced a genuine reply; gates TTS

    # --- 1. Determine input source: prioritize audio, fall back to text ---
    if audio_file:
        if whisper_pipeline:
            try:
                audio_input_data, samplerate = sf.read(audio_file)
                # Whisper expects 16 kHz mono input.
                if samplerate != 16000:
                    audio_input_data = librosa.resample(y=audio_input_data, orig_sr=samplerate, target_sr=16000)
                if audio_input_data.ndim > 1:
                    audio_input_data = audio_input_data[:, 0]

                print(f"Transcribing audio file: {audio_file}")
                user_text = whisper_pipeline(audio_input_data)["text"]
                print(f"User Transcribed: {user_text}")

                if not user_text.strip():
                    user_text = "[No speech detected. Please try speaking clearer or louder.]"
            except Exception as e:
                user_text = f"[Transcription Error: {e}. Please check audio file or Whisper setup.]"
                print(f"Whisper Transcription Error: {e}")
        else:
            user_text = "[Whisper STT not initialized. Please check initialization errors in your Space logs.]"
            print("Error: Whisper pipeline is None. Cannot perform transcription.")
    elif text_message and text_message.strip():
        user_text = text_message.strip()
        print(f"User Input (Text): {user_text}")
    else:
        # No valid input at all: keep history unchanged, reset both inputs.
        return current_chatbot_history, "[Please provide input via speech or text.]", None, gr.update(value=None), ""

    # --- 2. LLM turn: record the user message, then query Groq with the full history ---
    llm_chat_history.append({"role": "user", "content": user_text})

    if client:
        try:
            print(f"Sending to Groq: {user_text}")
            chat_completion = client.chat.completions.create(
                messages=llm_chat_history, # Send the full history for context
                model=GROQ_MODEL,
                temperature=0.7,
                max_tokens=1024,
                top_p=1,
                stop=None,
                stream=False,
            )
            bot_text = chat_completion.choices[0].message.content
            llm_ok = True
            print(f"Groq Response: {bot_text}")
        except Exception as e:
            bot_text = f"An API error occurred: {e}. Please check your Groq API key and network."
            print(f"Groq API Error: {e}")
    else:
        bot_text = "[Groq client not initialized. Cannot generate response.]"

    # Record the bot message so history stays in strict user/assistant pairs.
    llm_chat_history.append({"role": "assistant", "content": bot_text})

    # --- 3. Text-to-Speech (gTTS): only synthesize genuine replies ---
    # BUGFIX: this was previously gated on string prefixes, and the
    # "[Groq client not initialized]" prefix never matched the actual message
    # (which reads "...initialized. Cannot..."), so error text was read aloud.
    # An explicit success flag avoids fragile string sniffing entirely.
    if llm_ok and bot_text:
        try:
            print("Generating speech with gTTS...")
            # Unique filename so concurrent turns don't clobber each other.
            # NOTE(review): these temp MP3s are never deleted — consider cleanup.
            bot_audio_path = f"temp_bot_response_{uuid.uuid4()}.mp3"
            gTTS(text=bot_text, lang='en', slow=False).save(bot_audio_path)
            print(f"Speech saved to {bot_audio_path}")
        except Exception as e:
            print(f"gTTS Error: {e}")
            bot_audio_path = None
    else:
        print("No valid text to convert to speech or an error occurred.")
        bot_audio_path = None  # Ensure no stale path is returned when TTS is skipped

    # --- 4. Rebuild the displayed history from the canonical LLM history ---
    updated_chatbot_history = []
    for i in range(0, len(llm_chat_history), 2):
        user_msg = llm_chat_history[i]["content"]
        bot_msg = llm_chat_history[i + 1]["content"] if (i + 1) < len(llm_chat_history) else ""
        updated_chatbot_history.append([user_msg, bot_msg])

    # Return the updated history, latest reply text, audio path, and values
    # that reset both the microphone widget and the textbox for the next turn.
    return updated_chatbot_history, bot_text, bot_audio_path, gr.update(value=None), ""


# --- Gradio Interface ---
# gr.Blocks gives full control over layout and event wiring (vs gr.Interface).
with gr.Blocks(theme=gr.themes.Soft(), title="Salman's Speech-Text-Speech Chatbot") as demo:
    gr.Markdown(
        """
        # 🗣️ Speech-to-Speech Chatbot 💬
        Speak into the microphone, type, and I'll respond in text and speech!
        Powered by Whisper (STT), Groq (LLM), and gTTS (TTS).
        """
    )

    # Chatbot widget displaying the full [user, bot] message-pair history.
    chatbot = gr.Chatbot(
        label="Conversation History",
        value=[], # Start with an empty history
        height=400,
        show_copy_button=True
    )

    # Read-only textbox mirroring the most recent LLM response.
    latest_response_text = gr.Textbox(
        label="Latest Bot Response (Text)",
        interactive=False, # Display only; the user cannot type here
        lines=3
    )

    # Input components: microphone audio and a free-text box, side by side.
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath", # Hand the handler a file path, not raw samples
            label="Speak Here",
            streaming=False, # Process the full recorded clip at once
            visible=True
        )
        text_input = gr.Textbox(
            label="Type your message here",
            placeholder="Type your message...",
            lines=3,
            scale=1 # Allows it to grow/shrink with the row
        )

    # Audio player for the bot's synthesized speech reply.
    audio_output = gr.Audio(
        label="Bot Response --Free IK -- (Speech)",
        autoplay=True, # Play the reply as soon as it arrives
        streaming=True # NOTE(review): output is a saved MP3 path, not a stream — confirm this flag is needed
    )

    # Control buttons.
    with gr.Row():
        # One Send button submits whichever input (audio or text) is filled in.
        submit_btn = gr.Button("Send Message ➡️")
        # Resets the visible conversation and the LLM context.
        reset_btn = gr.Button("Reset Chat 🗑️")

    # Event handling:
    # 1. Recording stops -> audio_input value changes -> submit the clip.
    # NOTE(review): the handler also resets audio_input via its outputs, which
    # may re-fire this change event with None (handled as the no-input branch)
    # — confirm this does not cause a redundant round trip.
    audio_input.change(
        fn=speech_to_speech_chat,
        inputs=[audio_input, gr.State(""), chatbot], # audio path, empty text placeholder, history
        # Outputs also reset both audio_input and text_input
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        api_name="speech_input_process"
    )

    # 2. Send button click — submits the textbox content (audio arg is None).
    submit_btn.click(
        fn=speech_to_speech_chat,
        inputs=[gr.State(None), text_input, chatbot], # None audio placeholder, text, history
        # Outputs also reset both audio_input and text_input
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        api_name="text_input_process"
    )

    # 3. Enter pressed inside the textbox — same handler and wiring as the button.
    text_input.submit(
        fn=speech_to_speech_chat,
        inputs=[gr.State(None), text_input, chatbot], # None audio placeholder, text, history
        # Outputs also reset both audio_input and text_input
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        api_name="text_input_submit"
    )

    # Reset: first clear every displayed widget, then (on success) wipe the
    # module-level LLM context so the next turn starts a fresh conversation.
    reset_btn.click(
        fn=lambda: ([], "", None, gr.update(value=None), ""), # Clear chatbot, text, audio, and reset inputs
        inputs=[],
        outputs=[chatbot, latest_response_text, audio_output, audio_input, text_input],
        queue=False
    ).success(
        fn=lambda: llm_chat_history.clear(), # Clear the actual LLM history list
        inputs=[],
        outputs=[]
    )


# Launch the Gradio app only when run as a script (Spaces imports this module
# and serves `demo` itself, so the guard keeps imports side-effect free).
if __name__ == "__main__":
    # `share=True` is unnecessary on Hugging Face Spaces — the Space is
    # already publicly reachable — so launch with defaults.
    demo.launch()