parthmax's picture
Update app.py
94a75ec verified
import gradio as gr
import tempfile
import torch
# =========================
# ASR (Faster Whisper - CPU)
# =========================
from faster_whisper import WhisperModel
asr_model = WhisperModel(
"base",
device="cpu",
compute_type="int8"
)
def transcribe(audio):
if audio is None:
return ""
segments, _ = asr_model.transcribe(audio)
text = " ".join([seg.text for seg in segments])
return text.strip()
# =========================
# LLM (Qwen 0.5B - CPU)
# =========================
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
dtype=torch.float32,
low_cpu_mem_usage=True
)
def generate_response(text):
if not text:
return "Say something..."
prompt = f"User: {text}\nAssistant:"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
output = model.generate(
**inputs,
max_new_tokens=120,
do_sample=True,
temperature=0.7,
top_p=0.9
)
response = tokenizer.decode(output[0], skip_special_tokens=True)
return response.split("Assistant:")[-1].strip()
# =========================
# TTS (CPU - pyttsx3)
# =========================
import pyttsx3
engine = pyttsx3.init()
engine.setProperty("rate", 170)
def text_to_speech(text):
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
path = f.name
engine.save_to_file(text, path)
engine.runAndWait()
return path
# =========================
# FULL PIPELINE
# =========================
def full_pipeline(audio):
text = transcribe(audio)
response = generate_response(text)
audio_out = text_to_speech(response)
return text, response, audio_out
# =========================
# GRADIO UI
# =========================
with gr.Blocks() as demo:
gr.Markdown("# 🎙️ CPU Voice Agent (ASR + LLM + TTS)")
audio_input = gr.Audio(
sources=["microphone"],
type="filepath",
label="Speak"
)
btn = gr.Button("Run")
text_out = gr.Textbox(label="Transcription")
response_out = gr.Textbox(label="LLM Response")
audio_out = gr.Audio(label="Response Audio")
btn.click(
fn=full_pipeline,
inputs=audio_input,
outputs=[text_out, response_out, audio_out]
)
demo.launch()