import os
import tempfile

import gradio as gr
import requests
import soundfile as sf
import speech_recognition as sr
from TTS.api import TTS

# Initialize TTS model once at startup (model files are fetched on first run).
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)

# Groq API key — must be exported in the environment before launching the app.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"
GROQ_MODEL = "llama3-8b-8192"


def _transcribe(wav_path):
    """Transcribe the WAV file at *wav_path* using Google Speech Recognition.

    Returns a ``(text, error)`` pair; exactly one element is ``None``.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(wav_path) as source:
        audio_data = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio_data), None
    except sr.UnknownValueError:
        return None, "Could not understand audio."
    except sr.RequestError as e:
        return None, f"Speech Recognition error: {e}"


def _ask_groq(prompt):
    """Send *prompt* to the Groq chat-completions endpoint and return the reply.

    Returns a ``(reply, error)`` pair; exactly one element is ``None``.
    Network failures, non-2xx responses, and unexpected response shapes are
    reported as error strings instead of raising inside the Gradio handler.
    """
    if not GROQ_API_KEY:
        return None, "GROQ_API_KEY environment variable is not set."
    try:
        response = requests.post(
            GROQ_URL,
            headers={
                "Authorization": f"Bearer {GROQ_API_KEY}",
                "Content-Type": "application/json",
            },
            json={
                "model": GROQ_MODEL,
                "messages": [{"role": "user", "content": prompt}],
            },
            timeout=60,  # don't let a stalled request hang the UI worker
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"], None
    except (requests.RequestException, KeyError, IndexError, ValueError) as e:
        return None, f"Groq API error: {e}"


def voice_chat(audio):
    """Gradio handler: recorded speech -> transcription -> Groq LLM -> TTS audio.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray] | None
        Gradio ``type="numpy"`` audio: ``(sample_rate, data)``. ``None`` when
        the user submitted without recording/uploading.

    Returns
    -------
    tuple[str, str | None]
        The response (or error) text, and the path to the synthesized WAV
        reply (``None`` on any error).
    """
    if audio is None:
        return "No audio input detected.", None

    # Gradio's numpy audio is (sample_rate, data) — NOT (data, rate).
    # Unpacking it the other way around passes an int to sf.write as the
    # waveform, which was the original bug.
    sr_rate, audio_array = audio

    # Per-request temp file (a fixed "temp.wav" in CWD races between
    # concurrent requests and is never cleaned up).
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
        sf.write(tmp_path, audio_array, sr_rate)
        text, err = _transcribe(tmp_path)
    finally:
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)

    if err:
        return err, None

    llm_text, err = _ask_groq(text)
    if err:
        return err, None

    # Synthesize the reply; Gradio serves the file path back to the client.
    tts.tts_to_file(text=llm_text, file_path="response.wav")
    return llm_text, "response.wav"


demo = gr.Interface(
    fn=voice_chat,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Speak or upload"),
    outputs=[gr.Textbox(label="Groq Response"), gr.Audio(label="AI Voice")],
    title="📚 Speech-to-Text-to-Speech with Groq LLM and TTS",
)

if __name__ == "__main__":
    demo.launch()