Mohit0708 committed on
Commit
588b72b
·
verified ·
1 Parent(s): 78e95a8

Upload 5 files

Browse files
Files changed (5) hide show
  1. Dockerfile +19 -0
  2. app.py +121 -0
  3. bot.py +132 -0
  4. packages.txt +1 -0
  5. requirements.txt +8 -0
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: slim Python 3.10 keeps the final image small.
FROM python:3.10-slim

WORKDIR /app

# Flush Python stdout/stderr straight to the container log stream so
# startup progress ("Loading Models...") is visible immediately.
ENV PYTHONUNBUFFERED=1

# System dependencies for audio processing:
#   ffmpeg        - audio decoding for the Whisper/transformers pipeline
#   libportaudio2 - native runtime library needed by the sounddevice package
# --no-install-recommends keeps the layer minimal; the apt cache is removed
# in the same layer so it never bloats the image.
RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg libportaudio2 && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer is cached when only the
# application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Gradio serves on 7860 (also the port Hugging Face Spaces expects).
EXPOSE 7860

# Run the application
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import scipy.io.wavfile as wav
5
+ from transformers import pipeline, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
6
+ from datasets import load_dataset
7
+ import warnings
8
+
9
+ warnings.filterwarnings("ignore")
10
+
11
# --- 1. THE BOT CLASS (Logic) ---
class ResumeVoiceBot:
    """Voice assistant pipeline: speech-to-text -> LLM -> text-to-speech.

    All three models are loaded once in ``__init__`` and pinned to CPU.
    ``process_conversation`` is the single entry point wired to the UI.
    """

    def __init__(self):
        print("⚙️ Loading Models... (This runs only once)")
        self.device = "cpu"

        # Ears (Whisper) — smallest English-only checkpoint, fastest on CPU.
        self.stt_pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-tiny.en",
            device=self.device
        )

        # Brain (SmolLM2) — 360M instruct model; float32 is the CPU fast path.
        self.llm_pipe = pipeline(
            "text-generation",
            model="HuggingFaceTB/SmolLM2-360M-Instruct",
            device=self.device,
            torch_dtype=torch.float32
        )

        # Mouth (SpeechT5) + HiFi-GAN vocoder, with one fixed speaker voice
        # (x-vector #7306 from the CMU Arctic embeddings mirror).
        self.tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)
        self.speaker_embeddings = torch.tensor(
            load_dataset("regisss/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
        ).unsqueeze(0).to(self.device)

        print("✅ Models Loaded!")

    def process_conversation(self, audio_path):
        """
        1. Takes audio file path from UI
        2. Transcribes (STT)
        3. Generates Reply (LLM)
        4. Synthesizes Speech (TTS)

        Returns a ``(log_text, wav_path_or_None)`` tuple matching the two
        Gradio output components.
        """
        if audio_path is None:
            return "Please record something!", None

        # --- A. STT (Transcribe) ---
        try:
            text = self.stt_pipe(audio_path)["text"].strip()
        except Exception as e:
            return f"Error reading audio: {e}", None

        # --- Hallucination Filter ---
        # If Whisper hears silence, it often outputs these phrases. We block them.
        hallucinations = ["end of the video", "thanks for watching", "subscribe", "subtitles"]
        if not text or len(text) < 2 or any(h in text.lower() for h in hallucinations):
            return "(Silence or Background Noise detected - Try Speaking Louder)", None

        print(f"User said: {text}")

        # --- B. LLM (Think) ---
        messages = [{"role": "user", "content": text}]
        prompt = self.llm_pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        response = self.llm_pipe(
            prompt,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.6
        )[0]['generated_text']

        # The generated text echoes the whole prompt; keep only the portion
        # after the final "assistant\n" marker of the chat template.
        bot_reply = response.split("assistant\n")[-1].strip()
        print(f"Bot reply: {bot_reply}")

        # FIX: never feed an empty string to SpeechT5 — skip TTS instead.
        if not bot_reply:
            return f"👤 You: {text}\n🤖 Bot: (no reply generated)", None

        # --- C. TTS (Speak) ---
        inputs = self.tts_processor(text=bot_reply, return_tensors="pt").to(self.device)
        with torch.no_grad():
            speech = self.tts_model.generate_speech(
                inputs["input_ids"],
                self.speaker_embeddings,
                vocoder=self.vocoder
            )

        # FIX: write each response to a unique temp file instead of a shared
        # "response.wav", so concurrent Gradio sessions can't overwrite each
        # other's audio mid-playback.
        import tempfile
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp.close()  # scipy's wav.write reopens the path itself
        output_path = tmp.name
        wav.write(output_path, rate=16000, data=speech.cpu().numpy())

        return f"👤 You: {text}\n🤖 Bot: {bot_reply}", output_path
92
+
93
# --- 2. INITIALIZE BOT ---
bot = ResumeVoiceBot()

# --- 3. THE UI (Gradio) ---
with gr.Blocks(title="AI Voice Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Edge AI Voice Assistant")
    gr.Markdown("Runs 100% locally on CPU using Whisper, SmolLM2, and SpeechT5.")

    with gr.Row():
        with gr.Column():
            # Input: Microphone
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Your Voice")
            submit_btn = gr.Button("Talk to Bot", variant="primary")

        with gr.Column():
            # Output: Text Log + Audio Response
            chat_log = gr.Textbox(label="Conversation Log")
            audio_output = gr.Audio(label="Bot Response", type="filepath", autoplay=True)

    # Link the button to the function
    submit_btn.click(
        fn=bot.process_conversation,
        inputs=audio_input,
        outputs=[chat_log, audio_output]
    )

# Launch the Web App
if __name__ == "__main__":
    # FIX: bind to 0.0.0.0 on port 7860 so the server is reachable from
    # outside the Docker container (the Dockerfile EXPOSEs 7860, but Gradio's
    # default bind of 127.0.0.1 would make the container look dead).
    # share=True is dropped: it tries to open a public tunnel, which fails
    # inside most containers and is unnecessary when the port is published.
    demo.launch(server_name="0.0.0.0", server_port=7860)
bot.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import sounddevice as sd
3
+ import numpy as np
4
+ import scipy.io.wavfile as wav
5
+ from transformers import pipeline, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
6
+ from datasets import load_dataset
7
+ import warnings
8
+ import sys
9
+
10
+ # Suppress warnings for cleaner output
11
+ warnings.filterwarnings("ignore")
12
+
13
class CPUBot:
    """Offline voice assistant: microphone -> Whisper -> SmolLM2 -> SpeechT5.

    The whole pipeline is pinned to the CPU and uses the smallest practical
    checkpoints so the conversation loop stays responsive without a GPU.
    """

    # Mono 16 kHz audio everywhere: the rate Whisper's feature extractor
    # expects and the rate SpeechT5 synthesizes at.
    SAMPLE_RATE = 16000

    def __init__(self):
        print("⚙️ Initializing CPU-Optimized Bot...")

        # 1. Force CPU Device
        self.device = "cpu"

        # 2. Initialize STT (Ears) - Whisper Tiny
        # "tiny" is the fastest model, perfect for CPU
        print(" Loading Ears (Whisper)...")
        self.stt_pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-tiny.en",
            device=self.device
        )

        # 3. Initialize LLM (Brain) - SmolLM2-360M
        # We use the 360M version instead of 1.7B so it runs fast on CPU
        print(" Loading Brain (SmolLM2-360M)...")
        self.llm_pipe = pipeline(
            "text-generation",
            model="HuggingFaceTB/SmolLM2-360M-Instruct",
            device=self.device,
            torch_dtype=torch.float32  # CPU works best with float32
        )

        # 4. Initialize TTS (Mouth) - SpeechT5
        print(" Loading Mouth (SpeechT5)...")
        self.tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)

        # Load a default speaker embedding (voice).
        # Note: this may download a small dataset on first run; the "regisss"
        # parquet mirror works with current `datasets` versions.
        embeddings_dataset = load_dataset("regisss/cmu-arctic-xvectors", split="validation")
        self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(self.device)

        print("\n Bot is ready! Press Ctrl+C to stop.")

    def record_audio(self, duration=5, samplerate=16000):
        """Record `duration` seconds of mono float32 audio; return a 1-D array."""
        print("\n🎤 Listening... (Speak now)")
        recording = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype='float32')
        sd.wait()  # Block until the recording is finished
        return recording.squeeze()

    def speak(self, text):
        """Synthesize `text` with SpeechT5 and play it through the speakers."""
        if not text:
            return  # SpeechT5 cannot synthesize an empty string
        print(f"🤖 Speaking: {text}")

        inputs = self.tts_processor(text=text, return_tensors="pt").to(self.device)

        # Generate audio
        with torch.no_grad():
            speech = self.tts_model.generate_speech(
                inputs["input_ids"],
                self.speaker_embeddings,
                vocoder=self.vocoder
            )

        # Play audio (SpeechT5 outputs 16 kHz waveforms)
        sd.play(speech.cpu().numpy(), samplerate=self.SAMPLE_RATE)
        sd.wait()

    def run(self):
        """Main Loop: Listen -> Think -> Speak, until Ctrl+C."""
        print("------------------------------------------------")
        print(" Starting Conversation Loop")
        print(" (Adjust 'duration' in code if 4s is too short)")
        print("------------------------------------------------")

        while True:
            try:
                # 1. Listen
                audio_data = self.record_audio(duration=4)  # Record for 4 seconds

                # 2. Transcribe (STT)
                # FIX: pass the sampling rate explicitly — a bare numpy array
                # would be assumed to already match the model's rate, which
                # silently breaks if the recording rate ever changes.
                try:
                    result = self.stt_pipe({"raw": audio_data, "sampling_rate": self.SAMPLE_RATE})["text"]
                except Exception:
                    continue  # Skip if audio was empty/error

                if not result.strip():
                    print("... (Silence detected)")
                    continue

                print(f"👤 You said: {result}")

                # 3. Think (LLM) — wrap the transcript in SmolLM's chat template
                messages = [{"role": "user", "content": result}]
                prompt = self.llm_pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

                # Generate response (Limited to 40 tokens for speed)
                response = self.llm_pipe(
                    prompt,
                    max_new_tokens=40,
                    do_sample=True,
                    temperature=0.6,
                    top_k=50
                )[0]['generated_text']

                # The output echoes the prompt; keep only the assistant's part
                bot_reply = response.split("assistant\n")[-1].strip()

                # 4. Speak (TTS)
                self.speak(bot_reply)

            except KeyboardInterrupt:
                print("\n👋 Exiting...")
                break
            except Exception as e:
                # Print the error but keep the loop alive
                print(f"⚠️ Error: {e}")
129
+
130
+ if __name__ == "__main__":
131
+ bot = CPUBot()
132
+ bot.run()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
torch
transformers
gradio
soundfile
scipy
datasets
sentencepiece
accelerate
sounddevice