File size: 4,546 Bytes
588b72b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import tempfile
import warnings

import gradio as gr
import numpy as np
import scipy.io.wavfile as wav
import torch
from datasets import load_dataset
from transformers import pipeline, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

warnings.filterwarnings("ignore")

# --- 1. THE BOT CLASS (Logic) ---
class ResumeVoiceBot:
    """Fully local voice-assistant pipeline.

    One conversational turn flows: speech-to-text (Whisper tiny.en) ->
    reply generation (SmolLM2-360M-Instruct) -> speech synthesis
    (SpeechT5 + HiFi-GAN vocoder). All models run on CPU and are loaded
    once at construction time.
    """

    def __init__(self):
        print("⚙️ Loading Models... (This runs only once)")
        self.device = "cpu"

        # Ears (Whisper): English-only tiny model keeps CPU latency low.
        self.stt_pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-tiny.en",
            device=self.device
        )

        # Brain (SmolLM2): small instruct model, float32 for CPU inference.
        self.llm_pipe = pipeline(
            "text-generation",
            model="HuggingFaceTB/SmolLM2-360M-Instruct",
            device=self.device,
            torch_dtype=torch.float32
        )

        # Mouth (SpeechT5): acoustic model + HiFi-GAN vocoder.
        self.tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)
        # Fixed speaker voice: xvector #7306 of the CMU Arctic embeddings,
        # shaped (1, 512) as generate_speech expects.
        self.speaker_embeddings = torch.tensor(
            load_dataset("regisss/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
        ).unsqueeze(0).to(self.device)

        print("✅ Models Loaded!")

    def process_conversation(self, audio_path):
        """Run one full voice-chat turn.

        1. Takes audio file path from UI
        2. Transcribes (STT)
        3. Generates Reply (LLM)
        4. Synthesizes Speech (TTS)

        Args:
            audio_path: Filesystem path of the recorded audio clip, or
                None when the user submitted without recording.

        Returns:
            tuple[str, str | None]: (conversation log text for the UI,
            path of the synthesized reply WAV — None on error/silence).
        """
        if audio_path is None:
            return "Please record something!", None

        # --- A. STT (Transcribe) ---
        try:
            text = self.stt_pipe(audio_path)["text"].strip()
        except Exception as e:
            # Best-effort: surface the problem in the UI instead of crashing.
            return f"Error reading audio: {e}", None

        # --- Hallucination Filter ---
        # If Whisper hears silence, it often outputs these phrases. We block them.
        hallucinations = ["end of the video", "thanks for watching", "subscribe", "subtitles"]
        if not text or len(text) < 2 or any(h in text.lower() for h in hallucinations):
            return "(Silence or Background Noise detected - Try Speaking Louder)", None

        print(f"User said: {text}")

        # --- B. LLM (Think) ---
        messages = [{"role": "user", "content": text}]
        prompt = self.llm_pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # FIX: return_full_text=False makes the pipeline return only the newly
        # generated completion instead of prompt+completion. The previous
        # `response.split("assistant\n")[-1]` hack broke whenever the user's
        # words contained "assistant" or the chat template changed.
        bot_reply = self.llm_pipe(
            prompt,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.6,
            return_full_text=False
        )[0]['generated_text'].strip()
        print(f"Bot reply: {bot_reply}")

        # --- C. TTS (Speak) ---
        inputs = self.tts_processor(text=bot_reply, return_tensors="pt").to(self.device)
        with torch.no_grad():
            speech = self.tts_model.generate_speech(
                inputs["input_ids"],
                self.speaker_embeddings,
                vocoder=self.vocoder
            )

        # FIX: write each reply to a fresh temp file instead of the shared,
        # hard-coded "response.wav" — concurrent Gradio sessions were
        # overwriting each other's audio. delete=False so the file survives
        # for the UI player to fetch.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        # SpeechT5 emits 16 kHz mono float32 audio.
        wav.write(output_path, rate=16000, data=speech.cpu().numpy())

        return f"👤 You: {text}\n🤖 Bot: {bot_reply}", output_path

# --- 2. INITIALIZE BOT ---
# Constructed at import time so model loading (the slow part) happens once,
# before the web server starts accepting requests.
bot = ResumeVoiceBot()

# --- 3. THE UI (Gradio) ---
with gr.Blocks(title="AI Voice Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Edge AI Voice Assistant")
    gr.Markdown("Runs 100% locally on CPU using Whisper, SmolLM2, and SpeechT5.")
    
    with gr.Row():
        with gr.Column():
            # Input: Microphone
            # type="filepath" hands process_conversation a WAV path on disk.
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Your Voice")
            submit_btn = gr.Button("Talk to Bot", variant="primary")
        
        with gr.Column():
            # Output: Text Log + Audio Response
            chat_log = gr.Textbox(label="Conversation Log")
            # autoplay=True so the bot's reply is spoken without a click.
            audio_output = gr.Audio(label="Bot Response", type="filepath", autoplay=True)

    # Link the button to the function: process_conversation returns
    # (log_text, wav_path), mapped onto [chat_log, audio_output] in order.
    submit_btn.click(
        fn=bot.process_conversation,
        inputs=audio_input,
        outputs=[chat_log, audio_output]
    )

# Launch the Web App
# NOTE(review): share=True also opens a public gradio.live tunnel —
# confirm that exposure is intended for a "100% local" assistant.
if __name__ == "__main__":
    demo.launch(share=True)