# File size: 5,135 Bytes
# 588b72b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
from transformers import pipeline, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import warnings
import sys

# Silence all Python warnings (transformers/torch emit noisy deprecation and
# config warnings on load) so the console shows only the bot's own messages.
warnings.filterwarnings("ignore")

class CPUBot:
    """Local voice assistant (Listen -> Think -> Speak) tuned for CPU-only machines.

    Components:
      * Ears:  openai/whisper-tiny.en          (speech-to-text)
      * Brain: HuggingFaceTB/SmolLM2-360M      (text generation)
      * Mouth: microsoft/speecht5_tts + HiFi-GAN vocoder (text-to-speech)
    """

    # Whisper's feature extractor and the SpeechT5 vocoder both operate at
    # 16 kHz; recording and playback must share this rate, so it lives in one
    # constant instead of two independent magic numbers.
    SAMPLE_RATE = 16000

    def __init__(self):
        """Load all three models onto the CPU (downloads weights on first run)."""
        print("⚙️  Initializing CPU-Optimized Bot...")

        # 1. Force CPU device — every model below is small enough to run there.
        self.device = "cpu"

        # 2. STT (Ears): "tiny" is the fastest Whisper variant, ideal for CPU.
        print(" Loading Ears (Whisper)...")
        self.stt_pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-tiny.en",
            device=self.device,
        )

        # 3. LLM (Brain): the 360M SmolLM2 (instead of 1.7B) keeps CPU latency low.
        print(" Loading Brain (SmolLM2-360M)...")
        self.llm_pipe = pipeline(
            "text-generation",
            model="HuggingFaceTB/SmolLM2-360M-Instruct",
            device=self.device,
            torch_dtype=torch.float32,  # CPU works best with float32
        )

        # 4. TTS (Mouth): SpeechT5 acoustic model + HiFi-GAN vocoder.
        print(" Loading Mouth (SpeechT5)...")
        self.tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)

        # Default speaker embedding (voice). May download a small dataset on
        # first run; this parquet mirror works with current `datasets` releases.
        embeddings_dataset = load_dataset("regisss/cmu-arctic-xvectors", split="validation")
        self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(self.device)

        print("\n Bot is ready! Press Ctrl+C to stop.")

    def record_audio(self, duration=5, samplerate=SAMPLE_RATE):
        """Record mono float32 audio from the default microphone.

        Args:
            duration: seconds to record.
            samplerate: capture rate in Hz (must match what the STT model expects).

        Returns:
            1-D numpy array of `duration * samplerate` samples.
        """
        print("\n🎤 Listening... (Speak now)")
        recording = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype='float32')
        sd.wait()  # block until the capture buffer is full
        return recording.squeeze()  # (n, 1) -> (n,) as the STT pipeline expects

    def speak(self, text):
        """Synthesize `text` with SpeechT5 and play it through the speakers."""
        if not text:
            return  # nothing to say (e.g. the LLM produced an empty reply)
        print(f"🤖 Speaking: {text}")

        inputs = self.tts_processor(text=text, return_tensors="pt").to(self.device)

        # Inference only — no_grad avoids building an autograd graph.
        with torch.no_grad():
            speech = self.tts_model.generate_speech(
                inputs["input_ids"],
                self.speaker_embeddings,
                vocoder=self.vocoder,
            )

        # Play at the vocoder's native rate.
        sd.play(speech.cpu().numpy(), samplerate=self.SAMPLE_RATE)
        sd.wait()

    def run(self, listen_duration=4):
        """Main loop: Listen -> Transcribe -> Generate -> Speak until Ctrl+C.

        Args:
            listen_duration: seconds of audio captured per turn (default 4;
                previously hard-coded — raise it if your sentences get cut off).
        """
        print("------------------------------------------------")
        print("  Starting Conversation Loop")
        print("  (Adjust 'duration' in code if 4s is too short)")
        print("------------------------------------------------")

        while True:
            try:
                # 1. Listen
                audio_data = self.record_audio(duration=listen_duration)

                # 2. Transcribe (STT) — best-effort: a bad capture just retries.
                try:
                    result = self.stt_pipe(audio_data)["text"]
                except Exception:
                    continue  # skip this turn if audio was empty/errored

                if not result.strip():
                    print("... (Silence detected)")
                    continue

                print(f"👤 You said: {result}")

                # 3. Think (LLM): wrap the utterance in SmolLM's chat template.
                messages = [{"role": "user", "content": result}]
                prompt = self.llm_pipe.tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )

                # Cap at 40 new tokens so replies stay fast on CPU.
                response = self.llm_pipe(
                    prompt,
                    max_new_tokens=40,
                    do_sample=True,
                    temperature=0.6,
                    top_k=50,
                )[0]['generated_text']

                # generated_text echoes the prompt; keep only the text after the
                # final "assistant\n" template marker.
                bot_reply = response.split("assistant\n")[-1].strip()

                # 4. Speak (TTS)
                self.speak(bot_reply)

            except KeyboardInterrupt:
                print("\n👋 Exiting...")
                break
            except Exception as e:
                # Log and keep looping — one bad turn shouldn't kill the bot.
                print(f"⚠️ Error: {e}")

# Script entry point: build the bot (loads all models) and start the loop.
if __name__ == "__main__":
    CPUBot().run()