# Hugging Face Space: real-time voice-to-voice chatbot (Whisper -> Groq LLM -> gTTS).
import io
import os
import tempfile
import wave

import gradio as gr
import numpy as np
import torch
import whisper
from groq import Groq
from gtts import gTTS
# Load the Whisper model on GPU when available, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base", device=device)

# SECURITY: never hard-code API keys in source. The previous revision both
# leaked a live Groq key AND then built the working client from a placeholder
# string, so every request would have been rejected. Read the key from the
# environment and fail fast with a clear message if it is missing.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError("Set the GROQ_API_KEY environment variable before launching.")
client = Groq(api_key=GROQ_API_KEY)
# Function to transcribe audio using Whisper
def transcribe(audio_data):
    """Transcribe recorded audio to text with the local Whisper model.

    Args:
        audio_data: Either raw WAV bytes, or a ``(sample_rate, samples)``
            tuple as produced by ``gr.Audio(type="numpy")`` (the original
            ``f.write(audio_data)`` crashed on that tuple).

    Returns:
        The transcribed text, or a string starting with "Error" on failure.
    """
    audio_path = None
    try:
        # Unique temp file: a fixed "temp_audio.wav" name would collide
        # between concurrent requests.
        fd, audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

        if isinstance(audio_data, tuple):
            # Gradio numpy input: write a proper WAV container.
            sample_rate, samples = audio_data
            if samples.dtype != np.int16:
                # Assume float samples in [-1, 1]; convert to 16-bit PCM.
                samples = np.clip(samples, -1.0, 1.0)
                samples = (samples * 32767.0).astype(np.int16)
            with wave.open(audio_path, "wb") as wf:
                wf.setnchannels(samples.shape[1] if samples.ndim > 1 else 1)
                wf.setsampwidth(2)  # int16 -> 2 bytes per sample
                wf.setframerate(int(sample_rate))
                wf.writeframes(samples.tobytes())
        else:
            # Raw bytes: assume a complete WAV payload.
            with open(audio_path, "wb") as f:
                f.write(audio_data)

        result = model.transcribe(audio_path)
        return result["text"]
    except Exception as e:
        return f"Error during transcription: {e}"
    finally:
        # Clean up even when transcription raised (the original leaked
        # the temp file on any exception).
        if audio_path and os.path.exists(audio_path):
            os.remove(audio_path)
# Function to get response from Groq's LLM
def get_llm_response(text):
    """Send *text* as a single user message to Groq's Llama model.

    Returns the model's reply text, or a string starting with "Error"
    when the API call fails.
    """
    try:
        completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": text}],
        )
    except Exception as e:
        return f"Error during LLM response generation: {e}"
    return completion.choices[0].message.content
# Function to convert text to speech
def text_to_speech(text):
    """Synthesize *text* to spoken English audio with gTTS.

    Returns the path to a generated MP3 file, or a string starting with
    "Error" on failure.
    """
    try:
        tts = gTTS(text, lang="en")
        # Unique temp file: the original fixed "response.mp3" name was
        # overwritten whenever two requests ran concurrently.
        fd, audio_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        return f"Error during text-to-speech conversion: {e}"
# Combined function for processing audio input and generating audio output
def process_audio(audio_data):
    """Run the full voice pipeline: transcribe -> LLM -> text-to-speech.

    Returns a ``(transcription, llm_text, audio_path)`` tuple. Later
    stages are ``None`` when an earlier stage reported an error.
    """
    transcription = transcribe(audio_data)
    # startswith instead of the original substring test: a legitimate
    # transcription merely *containing* the word "Error" must not abort.
    if transcription.startswith("Error"):
        return transcription, None, None

    llm_response = get_llm_response(transcription)
    if llm_response.startswith("Error"):
        return transcription, llm_response, None

    # text_to_speech returns either a file path or an "Error..." string;
    # either way it is passed straight through (the original had two
    # identical return branches here).
    return transcription, llm_response, text_to_speech(llm_response)
# ---- Gradio UI -----------------------------------------------------------
with gr.Blocks() as app:
    gr.Markdown("## Real-Time Voice-to-Voice Chatbot")

    with gr.Row():
        with gr.Column():
            # Microphone/file input delivered as (sample_rate, ndarray).
            audio_input = gr.Audio(type="numpy", label="Speak", interactive=True)
        with gr.Column():
            transcription_output = gr.Textbox(label="Transcription (Text)", lines=2)
            response_output = gr.Textbox(label="Response (LLM Text)", lines=2)
            audio_output = gr.Audio(label="Response (Audio)")

    submit_button = gr.Button("Submit")

    # Wire the button to the full pipeline: one audio in, three outputs.
    submit_button.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[transcription_output, response_output, audio_output],
    )

# Launch the app
app.launch()