import gradio as gr import tempfile import torch # ========================= # ASR (Faster Whisper - CPU) # ========================= from faster_whisper import WhisperModel asr_model = WhisperModel( "base", device="cpu", compute_type="int8" ) def transcribe(audio): if audio is None: return "" segments, _ = asr_model.transcribe(audio) text = " ".join([seg.text for seg in segments]) return text.strip() # ========================= # LLM (Qwen 0.5B - CPU) # ========================= from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct" tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) model = AutoModelForCausalLM.from_pretrained( MODEL_NAME, dtype=torch.float32, low_cpu_mem_usage=True ) def generate_response(text): if not text: return "Say something..." prompt = f"User: {text}\nAssistant:" inputs = tokenizer(prompt, return_tensors="pt") with torch.no_grad(): output = model.generate( **inputs, max_new_tokens=120, do_sample=True, temperature=0.7, top_p=0.9 ) response = tokenizer.decode(output[0], skip_special_tokens=True) return response.split("Assistant:")[-1].strip() # ========================= # TTS (CPU - pyttsx3) # ========================= import pyttsx3 engine = pyttsx3.init() engine.setProperty("rate", 170) def text_to_speech(text): with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f: path = f.name engine.save_to_file(text, path) engine.runAndWait() return path # ========================= # FULL PIPELINE # ========================= def full_pipeline(audio): text = transcribe(audio) response = generate_response(text) audio_out = text_to_speech(response) return text, response, audio_out # ========================= # GRADIO UI # ========================= with gr.Blocks() as demo: gr.Markdown("# 🎙️ CPU Voice Agent (ASR + LLM + TTS)") audio_input = gr.Audio( sources=["microphone"], type="filepath", label="Speak" ) btn = gr.Button("Run") text_out = gr.Textbox(label="Transcription") response_out = gr.Textbox(label="LLM Response") audio_out = gr.Audio(label="Response Audio") btn.click( fn=full_pipeline, inputs=audio_input, outputs=[text_out, response_out, audio_out] ) demo.launch()