Spaces:

Mohit0708
/

Edge-Voice-Assistant

Sleeping

App Files Files Community

Mohit0708 committed on 18 days ago

Commit

588b72b

verified ·

1 Parent(s): 78e95a8

Upload 5 files

Browse files

Files changed (5) hide show

Dockerfile +19 -0
app.py +121 -0
bot.py +132 -0
packages.txt +1 -0
requirements.txt +8 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,19 @@

FROM python:3.10-slim
# Install system dependencies for audio processing.
# --no-install-recommends keeps the image small; the apt lists are removed in
# the same layer so they never end up in the final image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ffmpeg libportaudio2 \
    && rm -rf /var/lib/apt/lists/*
# Run as an unprivileged user (uid 1000) instead of root so the app can only
# write to directories it owns (its home for the model cache, /app for output).
RUN useradd --create-home --uid 1000 user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860
WORKDIR /app
# WORKDIR creates the directory as root; hand it over to the runtime user.
RUN chown user:user /app
USER user
# Install Python dependencies
COPY --chown=user:user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY --chown=user:user . .
# Expose Gradio port
EXPOSE 7860
# Run the application
CMD ["python", "app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import gradio as gr
+import torch
+import numpy as np
+import scipy.io.wavfile as wav
+from transformers import pipeline, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+import warnings
+warnings.filterwarnings("ignore")
+# --- 1. THE BOT CLASS (Logic) ---
class ResumeVoiceBot:
    """CPU-only voice assistant chaining speech-to-text, a chat LLM and text-to-speech.

    All models are loaded once in ``__init__``. ``process_conversation`` is the
    single public entry point and is what the Gradio button calls.
    """

    # Phrases Whisper often outputs when it hears silence; a transcript that
    # contains any of them is treated as "nothing was said".
    _HALLUCINATIONS = (
        "end of the video",
        "thanks for watching",
        "subscribe",
        "subtitles",
    )
    # Sample rate used when writing the synthesized reply to disk.
    _TTS_SAMPLE_RATE = 16000

    def __init__(self):
        """Load the STT, LLM and TTS models plus the speaker embedding onto the CPU."""
        print("⚙️ Loading Models... (This runs only once)")
        self.device = "cpu"
        # Ears (Whisper)
        self.stt_pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-tiny.en",
            device=self.device
        )
        # Brain (SmolLM2)
        self.llm_pipe = pipeline(
            "text-generation",
            model="HuggingFaceTB/SmolLM2-360M-Instruct",
            device=self.device,
            torch_dtype=torch.float32
        )
        # Mouth (SpeechT5)
        self.tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)
        # Fixed speaker voice: one x-vector taken from the embeddings dataset.
        embeddings = load_dataset("regisss/cmu-arctic-xvectors", split="validation")
        self.speaker_embeddings = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0).to(self.device)
        print("✅ Models Loaded!")

    def process_conversation(self, audio_path):
        """Run one listen -> think -> speak turn.

        Args:
            audio_path: Path of the recorded audio file handed over by the UI,
                or ``None`` when nothing was recorded.

        Returns:
            A ``(log_text, audio_file_path)`` tuple. ``audio_file_path`` is
            ``None`` whenever no spoken reply was produced (no recording,
            transcription error, silence/noise, or an empty LLM reply).
        """
        if audio_path is None:
            return "Please record something!", None
        # --- A. STT (Transcribe) ---
        try:
            text = self.stt_pipe(audio_path)["text"].strip()
        except Exception as e:
            # Surface the problem in the UI instead of crashing the request.
            return f"Error reading audio: {e}", None
        if self._is_noise(text):
            return "(Silence or Background Noise detected - Try Speaking Louder)", None
        print(f"User said: {text}")
        # --- B. LLM (Think) ---
        bot_reply = self._generate_reply(text)
        print(f"Bot reply: {bot_reply}")
        if not bot_reply:
            # Nothing to synthesize; still show the user what was heard.
            return f"👤 You: {text}\n🤖 Bot: (no reply generated)", None
        # --- C. TTS (Speak) ---
        output_path = self._synthesize(bot_reply)
        return f"👤 You: {text}\n🤖 Bot: {bot_reply}", output_path

    def _is_noise(self, text):
        """Return True when the transcript is empty, too short, or a known Whisper hallucination."""
        if not text or len(text) < 2:
            return True
        lowered = text.lower()
        return any(phrase in lowered for phrase in self._HALLUCINATIONS)

    def _generate_reply(self, text):
        """Ask the LLM for a short answer to ``text`` and return it stripped."""
        messages = [{"role": "user", "content": text}]
        prompt = self.llm_pipe.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        outputs = self.llm_pipe(
            prompt,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.6,
            # Return only the newly generated text, so the reply does not have
            # to be cut out of the prompt by searching for "assistant\n".
            return_full_text=False,
        )
        return outputs[0]["generated_text"].strip()

    def _synthesize(self, reply):
        """Synthesize ``reply`` to a fresh WAV file and return that file's path."""
        import tempfile  # local import: only needed here

        inputs = self.tts_processor(text=reply, return_tensors="pt").to(self.device)
        with torch.no_grad():
            speech = self.tts_model.generate_speech(
                inputs["input_ids"],
                self.speaker_embeddings,
                vocoder=self.vocoder
            )
        # One file per request: a single shared "response.wav" would let
        # concurrent requests overwrite each other's audio.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            output_path = handle.name
        wav.write(output_path, rate=self._TTS_SAMPLE_RATE, data=speech.cpu().numpy())
        return output_path
# --- 2. INITIALIZE BOT ---
# Created at import time so the models are loaded once and shared by all requests.
bot = ResumeVoiceBot()
# --- 3. THE UI (Gradio) ---
with gr.Blocks(title="AI Voice Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Edge AI Voice Assistant")
    gr.Markdown("Runs 100% locally on CPU using Whisper, SmolLM2, and SpeechT5.")
    with gr.Row():
        with gr.Column():
            # Input: Microphone
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Your Voice")
            submit_btn = gr.Button("Talk to Bot", variant="primary")
        with gr.Column():
            # Output: Text Log + Audio Response
            chat_log = gr.Textbox(label="Conversation Log")
            audio_output = gr.Audio(label="Bot Response", type="filepath", autoplay=True)
    # Link the button to the function: one click runs a full STT -> LLM -> TTS turn.
    submit_btn.click(
        fn=bot.process_conversation,
        inputs=audio_input,
        outputs=[chat_log, audio_output]
    )
# Launch the Web App
if __name__ == "__main__":
    # Listen on all interfaces on port 7860 so the app is reachable from
    # outside a container. No share=True: a public tunnel is unnecessary for a
    # hosted app and would expose a local run to the internet.
    demo.launch(server_name="0.0.0.0", server_port=7860)

bot.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import torch
+import sounddevice as sd
+import numpy as np
+import scipy.io.wavfile as wav
+from transformers import pipeline, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+import warnings
+import sys
+# Suppress warnings for cleaner output
+warnings.filterwarnings("ignore")
class CPUBot:
    """Terminal voice assistant: records from the microphone, replies through the speakers.

    Pipeline per turn: record -> Whisper (STT) -> SmolLM2 (LLM) -> SpeechT5 (TTS).
    Everything runs on the CPU.
    """

    # Sample rate used for recording, for the STT input and for playback.
    _SAMPLE_RATE = 16000
    # Phrases Whisper often outputs when it hears silence; transcripts that
    # contain any of them are treated like silence.
    _HALLUCINATIONS = (
        "end of the video",
        "thanks for watching",
        "subscribe",
        "subtitles",
    )

    def __init__(self):
        """Load the STT, LLM and TTS models plus the speaker embedding onto the CPU."""
        print("⚙️  Initializing CPU-Optimized Bot...")
        # 1. Force CPU Device
        self.device = "cpu"
        # 2. Initialize STT (Ears) - Whisper Tiny
        print(" Loading Ears (Whisper)...")
        self.stt_pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-tiny.en",
            device=self.device
        )
        # 3. Initialize LLM (Brain) - SmolLM2-360M
        # We use the 360M version instead of 1.7B so it runs fast on CPU
        print(" Loading Brain (SmolLM2-360M)...")
        self.llm_pipe = pipeline(
            "text-generation",
            model="HuggingFaceTB/SmolLM2-360M-Instruct",
            device=self.device,
            torch_dtype=torch.float32 # CPU works best with float32
        )
        # 4. Initialize TTS (Mouth) - SpeechT5
        print(" Loading Mouth (SpeechT5)...")
        self.tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)
        # Load a default speaker embedding (voice)
        # Note: This might download a small dataset on first run
        embeddings_dataset = load_dataset("regisss/cmu-arctic-xvectors", split="validation")
        self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(self.device)
        print("\n Bot is ready! Press Ctrl+C to stop.")

    def record_audio(self, duration=5, samplerate=16000):
        """Record ``duration`` seconds of mono float32 audio from the microphone.

        Blocks until the recording is finished and returns the samples with
        the channel axis squeezed away.
        """
        print("\n🎤 Listening... (Speak now)")
        recording = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype='float32')
        sd.wait()  # Wait until recording is finished
        return recording.squeeze()

    def speak(self, text):
        """Convert ``text`` to speech and play it; does nothing for empty text."""
        if not text:
            return
        print(f"🤖 Speaking: {text}")
        inputs = self.tts_processor(text=text, return_tensors="pt").to(self.device)
        # Generate audio
        with torch.no_grad():
            speech = self.tts_model.generate_speech(
                inputs["input_ids"],
                self.speaker_embeddings,
                vocoder=self.vocoder
            )
        # Play audio and block until playback ends, so the next recording
        # does not start while the bot is still talking.
        sd.play(speech.cpu().numpy(), samplerate=self._SAMPLE_RATE)
        sd.wait()

    def run(self, duration=4):
        """Main loop: Listen -> Think -> Speak, until Ctrl+C.

        Args:
            duration: Seconds of audio to record per turn.
        """
        print("------------------------------------------------")
        print("  Starting Conversation Loop")
        print("  (Adjust 'duration' in code if 4s is too short)")
        print("------------------------------------------------")
        while True:
            try:
                # 1. Listen
                audio_data = self.record_audio(duration=duration, samplerate=self._SAMPLE_RATE)
                # 2. Transcribe (STT)
                try:
                    # Pass the sampling rate explicitly rather than relying on
                    # the pipeline's assumption for bare arrays.
                    result = self.stt_pipe(
                        {"raw": audio_data, "sampling_rate": self._SAMPLE_RATE}
                    )["text"].strip()
                except Exception as e:
                    # Report the failure (it used to be swallowed silently)
                    # and move on to the next recording.
                    print(f"⚠️ STT error: {e}")
                    continue
                if self._is_noise(result):
                    print("... (Silence detected)")
                    continue
                print(f"👤 You said: {result}")
                # 3. Think (LLM)
                bot_reply = self._generate_reply(result)
                # 4. Speak (TTS)
                self.speak(bot_reply)
            except KeyboardInterrupt:
                print("\n👋 Exiting...")
                break
            except Exception as e:
                # Print error but keep running
                print(f"⚠️ Error: {e}")

    def _is_noise(self, text):
        """Return True when the transcript is empty or a known Whisper hallucination."""
        if not text:
            return True
        lowered = text.lower()
        return any(phrase in lowered for phrase in self._HALLUCINATIONS)

    def _generate_reply(self, text):
        """Ask the LLM for a short answer to ``text`` and return it stripped."""
        # Chat template for SmolLM
        messages = [{"role": "user", "content": text}]
        prompt = self.llm_pipe.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # Generate response (Limited to 40 tokens for speed)
        outputs = self.llm_pipe(
            prompt,
            max_new_tokens=40,
            do_sample=True,
            temperature=0.6,
            top_k=50,
            # Return only the newly generated text, so the reply does not have
            # to be cut out of the prompt by searching for "assistant\n".
            return_full_text=False,
        )
        return outputs[0]["generated_text"].strip()


if __name__ == "__main__":
    bot = CPUBot()
    bot.run()

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

torch
transformers
gradio
soundfile
scipy
datasets
sentencepiece
accelerate
# Imported directly by app.py and bot.py
numpy
# Imported by bot.py for microphone capture and playback (needs libportaudio2)
sounddevice