"""Voice chatbot: Whisper transcription -> Groq LLaMA-3 reply -> gTTS speech.

Pipeline: Gradio records/uploads audio, Whisper transcribes it, the Groq
chat API generates a text reply, and gTTS synthesizes that reply as WAV
audio played back through the Gradio UI.
"""

import asyncio
import os
import tempfile
from io import BytesIO

import gradio as gr
import whisper
from groq import Groq
from gtts import gTTS
from pydub import AudioSegment

# SECURITY FIX: the API key was previously hard-coded in this file. A key
# committed to source control must be treated as leaked and revoked.
# Read it from the environment instead (set GROQ_API_KEY before launching).
groq_api_key = os.environ.get("GROQ_API_KEY", "")

# Initialize Groq client
client = Groq(api_key=groq_api_key)

# Load Whisper model once at startup (alternatives: "small", "medium", "large").
model = whisper.load_model("base")


async def transcribe_audio_async(audio_file):
    """Transcribe a BytesIO audio buffer with Whisper.

    Returns the transcribed text, or an "Error in ..." string on failure
    (this module signals errors via strings rather than raising to callers).
    """
    try:
        if not isinstance(audio_file, BytesIO):
            raise ValueError("The provided audio file is not in the expected format.")
        audio_file.seek(0)  # Reset file pointer to the beginning
        # Whisper's transcribe() takes a path, so spill the buffer to a
        # uniquely-named temp file. FIX: the original used a fixed
        # "temp.wav" (races under concurrent requests) and only removed it
        # on success (leaked on failure).
        fd, temp_file_path = tempfile.mkstemp(suffix=".wav")
        try:
            with os.fdopen(fd, "wb") as temp_file:
                temp_file.write(audio_file.read())
            result = model.transcribe(temp_file_path)
        finally:
            os.remove(temp_file_path)  # always clean up, even if transcription fails
        return result["text"]
    except Exception as e:
        return f"Error in transcribing audio: {str(e)}"


def generate_response(text):
    """Send *text* to the Groq LLaMA-3 8B model and return its reply.

    Returns an "Error in ..." string on failure, matching the module's
    string-based error convention.
    """
    try:
        if not text:
            raise ValueError("No text provided for response generation.")
        # Use Groq API to get response from LLaMA 8b model
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": text}],
            model="llama3-8b-8192",
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error in generating response: {str(e)}"


def text_to_speech(text):
    """Synthesize *text* with gTTS and return WAV bytes.

    gTTS emits MP3, so the result is transcoded to WAV via pydub for broad
    playback compatibility. Returns an "Error in ..." string on failure.
    """
    try:
        if not text:
            raise ValueError("No text provided for text-to-speech.")
        tts = gTTS(text, lang='en')
        mp3_buffer = BytesIO()
        tts.write_to_fp(mp3_buffer)  # gTTS writes MP3 data
        mp3_buffer.seek(0)
        # Convert MP3 -> WAV for Gradio compatibility
        audio_segment = AudioSegment.from_mp3(mp3_buffer)
        wav_buffer = BytesIO()
        audio_segment.export(wav_buffer, format="wav")
        return wav_buffer.getvalue()
    except Exception as e:
        return f"Error in converting text to speech: {str(e)}"


async def chatbot(audio):
    """Run the full pipeline: audio in -> (reply text, path to reply WAV or None)."""
    try:
        if audio is None:
            return "No audio file provided.", None
        # Gradio (type="filepath") hands us a path string; normalize to BytesIO.
        if isinstance(audio, str):
            with open(audio, "rb") as file:
                audio = BytesIO(file.read())
        text = await transcribe_audio_async(audio)
        if "Error" in text:
            return text, None
        response_text = generate_response(text)
        if "Error" in response_text:
            return response_text, None
        audio_response = text_to_speech(response_text)
        if isinstance(audio_response, bytes):
            # BUG FIX: the output component is gr.Audio(type="filepath"),
            # which expects a file path, not raw bytes. Write the WAV bytes
            # to a temp file and return its path.
            fd, wav_path = tempfile.mkstemp(suffix=".wav")
            with os.fdopen(fd, "wb") as wav_out:
                wav_out.write(audio_response)
            return response_text, wav_path
        # text_to_speech returned an error string; surface only the reply text.
        return response_text, None
    except Exception as e:
        return f"Error in chatbot processing: {str(e)}", None


# Gradio calls the handler synchronously; bridge into the async pipeline.
iface = gr.Interface(
    fn=lambda audio: asyncio.run(chatbot(audio)),
    inputs=gr.Audio(type="filepath"),  # Allow file upload / mic recording
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],
)

# Guard the launch so importing this module doesn't start the server.
if __name__ == "__main__":
    iface.launch()