"""Voice-to-voice AI demo: speech -> Whisper STT -> Groq LLM -> Coqui TTS -> speech."""

import os
import tempfile

import gradio as gr
import torch
import soundfile as sf
from transformers import pipeline
from groq import Groq
from TTS.api import TTS

# ----------------------------
# Load models (once, at startup)
# ----------------------------
# Whisper (Speech -> Text)
stt = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small"
)

# Groq client — fail fast with an actionable message instead of a bare KeyError.
_api_key = os.environ.get("GROQ_API_KEY")
if not _api_key:
    raise RuntimeError("GROQ_API_KEY environment variable is not set")
client = Groq(api_key=_api_key)

# Text -> Speech
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)

# ----------------------------
# Core function
# ----------------------------
def voice_to_voice(audio):
    """Transcribe *audio*, query the LLM, and synthesize the reply.

    Parameters
    ----------
    audio : str or None
        Filepath of the recorded audio (Gradio ``type="filepath"``);
        ``None`` when the user submits without recording.

    Returns
    -------
    tuple[str, str or None]
        (LLM reply text, path to the synthesized WAV file).

    Raises
    ------
    gr.Error
        If no audio was provided.
    """
    if audio is None:
        # Surface a friendly message in the UI instead of crashing in the pipeline.
        raise gr.Error("Please record some audio first.")

    # Speech -> Text
    text = stt(audio)["text"]

    # LLM response
    completion = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": text}]
    )
    reply = completion.choices[0].message.content

    # Text -> Speech. A unique temp file per request avoids clobbering
    # "response.wav" when multiple requests are in flight.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        output_path = f.name
    tts.tts_to_file(text=reply, file_path=output_path)

    return reply, output_path

# ----------------------------
# UI
# ----------------------------
ui = gr.Interface(
    fn=voice_to_voice,
    inputs=gr.Audio(type="filepath", label="🎤 Speak"),
    outputs=[
        gr.Textbox(label="🧠 AI Response"),
        gr.Audio(label="🔊 Voice Reply")
    ],
    title="Voice to Voice AI (Groq + Hugging Face)",
    description="Speak → AI thinks → AI speaks back"
)

if __name__ == "__main__":
    ui.launch()