petergits commited on
Commit
c08e928
·
1 Parent(s): 56ce875

2nd checking push speech2text

Browse files
Dockerfile CHANGED
@@ -10,8 +10,17 @@ ENV PATH="/home/user/.local/bin:$PATH"
10
  WORKDIR /app
11
 
12
  COPY --chown=user ./requirements.txt requirements.txt
 
 
 
 
 
 
 
 
13
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
 
15
  COPY --chown=user . /app
16
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
17
 
 
10
  WORKDIR /app
11
 
12
  COPY --chown=user ./requirements.txt requirements.txt
13
+ COPY create_test_audio.py .
14
+ COPY generate_test_audio.py .
15
+ COPY mcp_speech_client.py .
16
+ COPY mcp_speech_service.py .
17
+ COPY requirements.txt .
18
+ COPY startup_script.py .
19
+ COPY testClient.py .
20
+ COPY whisper_http_wrapper.py .
21
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
22
 
23
  COPY --chown=user . /app
24
+ CMD ["python", "startup_script.py"]
25
+ #CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
26
 
create_test_audio.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Create a test audio file for testing the MPC Speech service
4
+ """
5
+
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from pathlib import Path
9
+
10
def create_test_audio(filename="test_audio.wav", duration=3.0, sample_rate=16000):
    """
    Create a simple test audio file containing a decaying three-note chord.

    Args:
        filename: Output WAV filename
        duration: Duration in seconds
        sample_rate: Sample rate in Hz

    Returns:
        The filename that was written.
    """
    # BUG FIX: the progress messages printed the literal text "(unknown)"
    # instead of interpolating the target filename.
    print(f"🎵 Creating test audio file: {filename}")

    # Generate time array
    t = np.linspace(0, duration, int(sample_rate * duration), False)

    # Create a simple tone (440 Hz - A note) with some variation
    frequency1 = 440  # A note
    frequency2 = 523  # C note

    # Create a chord-like sound
    audio = (
        0.3 * np.sin(2 * np.pi * frequency1 * t) +  # A note
        0.2 * np.sin(2 * np.pi * frequency2 * t) +  # C note
        0.1 * np.sin(2 * np.pi * 660 * t)           # E note
    )

    # Exponential-decay envelope so the tone sounds less synthetic
    envelope = np.exp(-t * 0.5)
    audio = audio * envelope

    # Normalize to prevent clipping (peak at 80% of full scale)
    audio = audio / np.max(np.abs(audio)) * 0.8

    # Save as WAV file
    sf.write(filename, audio, sample_rate)

    file_size = Path(filename).stat().st_size
    print(f"✅ Created {filename}")
    print(f"   Duration: {duration}s")
    print(f"   Sample rate: {sample_rate} Hz")
    print(f"   File size: {file_size:,} bytes")

    return filename
52
+
53
def create_silent_audio(filename="silent_audio.wav", duration=2.0, sample_rate=16000):
    """Create a silent (all-zeros) audio file for edge-case testing.

    Returns the filename that was written.
    """
    # BUG FIX: print the actual filename instead of the garbled "(unknown)".
    print(f"🔇 Creating silent audio file: {filename}")

    # Create silent audio (zeros)
    audio = np.zeros(int(sample_rate * duration))

    # Save as WAV file
    sf.write(filename, audio, sample_rate)

    file_size = Path(filename).stat().st_size
    print(f"✅ Created {filename}")
    print(f"   Duration: {duration}s (silent)")
    print(f"   Sample rate: {sample_rate} Hz")
    print(f"   File size: {file_size:,} bytes")

    return filename
70
+
71
def create_speech_like_audio(filename="speech_test.wav", duration=5.0, sample_rate=16000):
    """Create a more speech-like test audio with varying frequencies.

    Simulates formants, adds noise, and segments the signal into word-like
    bursts separated by quiet pauses. Returns the filename written.
    Note: uses the global numpy RNG, so output is not reproducible across runs.
    """
    # BUG FIX: print the actual filename instead of the garbled "(unknown)".
    print(f"🎤 Creating speech-like audio file: {filename}")

    # Generate time array
    t = np.linspace(0, duration, int(sample_rate * duration), False)

    # Human speech typically has formants around 500 Hz, 1500 Hz, 2500 Hz
    formant1 = 500
    formant2 = 1500
    formant3 = 2500

    # Amplitude-modulated formant tones to create a more complex waveform
    audio = (
        0.4 * np.sin(2 * np.pi * formant1 * t) * (1 + 0.3 * np.sin(2 * np.pi * 3 * t)) +
        0.3 * np.sin(2 * np.pi * formant2 * t) * (1 + 0.2 * np.sin(2 * np.pi * 5 * t)) +
        0.2 * np.sin(2 * np.pi * formant3 * t) * (1 + 0.1 * np.sin(2 * np.pi * 7 * t))
    )

    # Add some noise to make it more realistic
    noise = 0.05 * np.random.normal(0, 1, len(audio))
    audio = audio + noise

    # Create segments (like words): 0.8 s "word" + 0.2 s quieter "pause"
    segment_duration = 0.8
    pause_duration = 0.2
    segment_samples = int(segment_duration * sample_rate)
    pause_samples = int(pause_duration * sample_rate)

    # Apply segmentation: attenuate the pause region after each segment
    for i in range(0, len(audio), segment_samples + pause_samples):
        segment_end = min(i + segment_samples, len(audio))
        pause_start = segment_end
        pause_end = min(pause_start + pause_samples, len(audio))
        if pause_end > pause_start:
            audio[pause_start:pause_end] *= 0.1  # Reduce volume for pause

    # Normalize to 70% of full scale
    audio = audio / np.max(np.abs(audio)) * 0.7

    # Save as WAV file
    sf.write(filename, audio, sample_rate)

    file_size = Path(filename).stat().st_size
    print(f"✅ Created {filename}")
    print(f"   Duration: {duration}s")
    print(f"   Sample rate: {sample_rate} Hz")
    print(f"   File size: {file_size:,} bytes")
    print(f"   Note: This is synthetic audio for testing")

    return filename
125
+
126
def main():
    """Generate the full set of test audio files in the working directory."""
    print("🎧 Creating test audio files for MPC Speech service")
    print("=" * 50)

    try:
        # One file per scenario: plain tone, silence, synthetic speech.
        create_test_audio("audio.wav", duration=3.0)
        create_silent_audio("silent_test.wav", duration=2.0)
        create_speech_like_audio("speech_test.wav", duration=4.0)
    except Exception as exc:
        # Best-effort script: report the failure instead of raising a traceback.
        print(f"❌ Error creating test audio files: {exc}")
        print(f"💡 Make sure you have soundfile installed: pip install soundfile")
        return

    print(f"\n✅ All test audio files created successfully!")
    print(f"📁 Files created in current directory:")
    print(f"   • audio.wav - Simple tone (for basic testing)")
    print(f"   • silent_test.wav - Silent audio (edge case testing)")
    print(f"   • speech_test.wav - Speech-like audio (more realistic)")
    print(f"\n🧪 You can now test with: python testClient.py")

if __name__ == "__main__":
    main()
generate_test_audio.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generate test audio files for testing the MPC Speech service
4
+ """
5
+
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from scipy import signal
9
+ import math
10
+
11
def generate_sine_wave(frequency=440, duration=3.0, sample_rate=16000, amplitude=0.3):
    """Return a pure sine tone as a float32 numpy array.

    frequency in Hz, duration in seconds, amplitude as linear peak level.
    """
    n_samples = int(sample_rate * duration)
    times = np.linspace(0, duration, n_samples, False)
    return (amplitude * np.sin(2 * np.pi * frequency * times)).astype(np.float32)
16
+
17
def generate_chirp(duration=3.0, sample_rate=16000, f0=200, f1=2000, amplitude=0.3):
    """Return a linear frequency sweep from f0 to f1 Hz as float32 samples."""
    times = np.linspace(0, duration, int(sample_rate * duration), False)
    sweep = signal.chirp(times, f0, duration, f1)
    return (amplitude * sweep).astype(np.float32)
22
+
23
def generate_white_noise(duration=3.0, sample_rate=16000, amplitude=0.1):
    """Return Gaussian white noise (float32).

    Note: draws from the global numpy RNG, so output varies between calls.
    """
    n_samples = int(sample_rate * duration)
    noise = np.random.normal(0, 1, n_samples) * amplitude
    return noise.astype(np.float32)
28
+
29
+ def generate_speech_like_signal(duration=5.0, sample_rate=16000):
30
+ """Generate a speech-like signal with multiple frequency components"""
31
+ t = np.linspace(0, duration, int(sample_rate * duration), False)
32
+
33
+ # Fundamental frequency (varies like speech)
34
+ f0 = 120 + 30 * np.sin(2 * np.pi * 0.5 * t) # Varying pitch
35
+
36
+ # Multiple harmonics
37
+ signal_wave = np.zeros_like(t)
38
+ for harmonic in range(1, 6):
39
+ amplitude = 0.3 / harmonic # Decreasing amplitude for higher harmonics
40
+ signal_wave += amplitude * np.sin(2 * np.pi * harmonic * f0 * t)
41
+
42
+ # Add some formant-like filtering
43
+ # Simple bandpass filtering to simulate formants
44
+ b, a = signal.butter(4, [300, 3000], btype='band', fs=sample_rate)
45
+ signal_wave = signal.filtfilt(b, a, signal_wave)
46
+
47
+ # Add envelope to make it more speech-like
48
+ envelope = np.exp(-0.5 * t) * (1 + 0.5 * np.sin(2 * np.pi * 2 * t))
49
+ signal_wave *= envelope
50
+
51
+ return signal_wave.astype(np.float32)
52
+
53
def create_test_audio_files():
    """Write the full suite of synthetic test WAV files to the current directory."""
    sample_rate = 16000  # Common sample rate for speech recognition

    # 1. Simple sine wave (440 Hz - A note)
    tone = generate_sine_wave(440, 3.0, sample_rate)
    sf.write('test_sine_440hz.wav', tone, sample_rate)
    print("Created: test_sine_440hz.wav (3 seconds, 440 Hz sine wave)")

    # 2. Frequency sweep
    sweep = generate_chirp(3.0, sample_rate)
    sf.write('test_chirp.wav', sweep, sample_rate)
    print("Created: test_chirp.wav (3 seconds, frequency sweep 200-2000 Hz)")

    # 3. White noise (for testing noise handling)
    hiss = generate_white_noise(2.0, sample_rate)
    sf.write('test_noise.wav', hiss, sample_rate)
    print("Created: test_noise.wav (2 seconds, white noise)")

    # 4. Speech-like signal
    voiced = generate_speech_like_signal(5.0, sample_rate)
    sf.write('test_speech_like.wav', voiced, sample_rate)
    print("Created: test_speech_like.wav (5 seconds, speech-like signal)")

    # 5. Mixed signal (sine + noise)
    mixed_duration = 4.0
    blend = generate_sine_wave(300, mixed_duration, sample_rate, 0.4) + \
            generate_white_noise(mixed_duration, sample_rate, 0.1)
    sf.write('test_mixed.wav', blend, sample_rate)
    print("Created: test_mixed.wav (4 seconds, 300 Hz sine + noise)")

    # 6. Multi-tone signal (C major triad)
    chord = (
        0.3 * generate_sine_wave(261.63, 3.0, sample_rate, 1.0) +  # C4
        0.2 * generate_sine_wave(329.63, 3.0, sample_rate, 1.0) +  # E4
        0.2 * generate_sine_wave(392.00, 3.0, sample_rate, 1.0)    # G4
    ) / 3
    sf.write('test_chord.wav', chord, sample_rate)
    print("Created: test_chord.wav (3 seconds, C major chord)")
93
+
94
def create_simple_test_audio():
    """Create a simple test audio file named 'audio.wav' for the examples."""
    sample_rate = 16000
    duration = 3.0

    # Five equal-length notes of an ascending C-major run
    frequencies = [261.63, 293.66, 329.63, 349.23, 392.00]  # C, D, E, F, G
    note_duration = duration / len(frequencies)

    # Seed with an empty float64 array so the concatenated result keeps the
    # same dtype the original incremental concatenation produced.
    notes = [np.array([])]
    notes.extend(generate_sine_wave(freq, note_duration, sample_rate, 0.3)
                 for freq in frequencies)
    full_signal = np.concatenate(notes)

    sf.write('audio.wav', full_signal, sample_rate)
    print("Created: audio.wav (3 seconds, simple melody for testing)")
111
+
112
if __name__ == '__main__':
    # Script entry point: writes a set of synthetic WAV files into the
    # current directory for exercising the MPC Speech service.
    print("Generating test audio files...")
    print(f"Sample rate: 16000 Hz (common for speech recognition)")
    print("-" * 50)

    # Install required package if not present.
    # NOTE(review): this guard is effectively dead code — the module already
    # does `from scipy import signal` at import time, so reaching this point
    # means scipy imported successfully; the except branch (pip install via
    # subprocess) can never run from this script.
    try:
        from scipy import signal
    except ImportError:
        print("Installing scipy for signal processing...")
        import subprocess
        subprocess.check_call(['pip', 'install', 'scipy'])
        from scipy import signal

    # Create the simple test file first
    create_simple_test_audio()
    print("-" * 50)

    # Create various test files
    create_test_audio_files()
    print("-" * 50)
    print("All test audio files created successfully!")
    print("\nYou can now use these files to test your MPC Speech service:")
    print("- audio.wav (for basic examples)")
    print("- test_*.wav files (for various test scenarios)")
mcp_speech_client.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Client library for MCP Speech-to-Text service
4
+ """
5
+
6
+ import asyncio
7
+ import aiohttp
8
+ import json
9
+ import base64
10
+ import logging
11
+ from pathlib import Path
12
+ from typing import Dict, Any, Optional
13
+
14
+ # Configure logging
15
+ logging.basicConfig(level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
+
18
class MCPSpeechClient:
    """Client for MCP Speech-to-Text service.

    Thin async wrapper over the service's HTTP API:
      * POST /auth        -> bearer token + encryption key
      * POST /transcribe  -> multipart audio upload, JSON transcription back
      * GET  /status      -> service health/info

    Intended for use as an async context manager, which owns the underlying
    aiohttp session; methods also lazily create a session when the class is
    used without the context manager.
    """

    def __init__(self, service_url: str = "http://localhost:8081"):
        # Normalize the base URL so endpoint paths can be appended directly.
        self.service_url = service_url.rstrip('/')
        self.token = None            # bearer token from /auth (None until authenticated)
        self.encryption_key = None   # key returned by /auth; stored but not used client-side
        self.client_session = None   # aiohttp.ClientSession, created lazily

    async def __aenter__(self):
        # Context-manager entry: open the HTTP session.
        self.client_session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Context-manager exit: release the HTTP session if one was opened.
        if self.client_session:
            await self.client_session.close()

    async def authenticate(self, client_id: str = "test_client") -> bool:
        """Authenticate with the MCP service.

        Stores the returned bearer token and encryption key on success.
        Returns True on success, False on any failure (logged, not raised).
        """
        try:
            if not self.client_session:
                self.client_session = aiohttp.ClientSession()

            async with self.client_session.post(
                f"{self.service_url}/auth",
                json={"client_id": client_id}
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    self.token = data.get("token")
                    self.encryption_key = data.get("encryption_key")
                    logger.info("Authentication successful")
                    return True
                else:
                    error = await response.text()
                    logger.error(f"Authentication failed: {error}")
                    return False
        except Exception as e:
            logger.error(f"Authentication error: {e}")
            return False

    async def transcribe_file(self, audio_file_path: str, session_id: Optional[str] = None) -> Dict[str, Any]:
        """Transcribe an audio file.

        Authenticates on demand if no token is held yet. Returns the service's
        JSON payload on success, or a {"status": "error", ...} dict on failure.
        """
        if not self.token:
            if not await self.authenticate():
                return {"status": "error", "error": "Authentication failed"}

        try:
            if not Path(audio_file_path).exists():
                raise FileNotFoundError(f"Audio file not found: {audio_file_path}")

            headers = {
                "Authorization": f"Bearer {self.token}"
            }

            # Optional correlation id, forwarded to the service verbatim.
            if session_id:
                headers["X-Session-ID"] = session_id

            with open(audio_file_path, 'rb') as f:
                form_data = aiohttp.FormData()
                form_data.add_field('audio', f, filename=Path(audio_file_path).name)

                async with self.client_session.post(
                    f"{self.service_url}/transcribe",
                    data=form_data,
                    headers=headers
                ) as response:
                    if response.status == 200:
                        return await response.json()
                    else:
                        error_text = await response.text()
                        return {
                            "status": "error",
                            "error": f"HTTP {response.status}: {error_text}"
                        }
        except Exception as e:
            logger.error(f"Transcription error: {e}")
            return {"status": "error", "error": str(e)}

    async def get_status(self) -> Dict[str, Any]:
        """Get service status.

        Unauthenticated; returns the /status JSON body, or an error dict.
        """
        try:
            if not self.client_session:
                self.client_session = aiohttp.ClientSession()

            async with self.client_session.get(f"{self.service_url}/status") as response:
                if response.status == 200:
                    return await response.json()
                else:
                    return {
                        "status": "error",
                        "error": f"HTTP {response.status}: {await response.text()}"
                    }
        except Exception as e:
            logger.error(f"Status check error: {e}")
            return {"status": "error", "error": str(e)}
114
+
115
+ # Convenience functions for backwards compatibility
116
async def check_service_status(service_url: str = "http://localhost:8081") -> Dict[str, Any]:
    """Probe the MCP service's /status endpoint using a throwaway client."""
    async with MCPSpeechClient(service_url) as probe:
        status = await probe.get_status()
    return status
120
+
121
async def transcribe_audio_file(audio_file_path: str, service_url: str = "http://localhost:8081") -> Dict[str, Any]:
    """Transcribe a single audio file with a one-shot client session."""
    async with MCPSpeechClient(service_url) as speech_client:
        outcome = await speech_client.transcribe_file(audio_file_path, session_id="single_request")
    return outcome
mcp_speech_service.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ MCP Speech-to-Text Service
4
+ Exposes realtime-whisper-macbook service through a secure MCP interface
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import logging
10
+ import os
11
+ import subprocess
12
+ import tempfile
13
+ import time
14
+ from datetime import datetime
15
+ from pathlib import Path
16
+ from typing import Dict, List, Optional, Any
17
+ import hashlib
18
+ import hmac
19
+ import base64
20
+
21
+ import aiohttp
22
+ from aiohttp import web, WSMsgType
23
+ import aiofiles
24
+ import numpy as np
25
+ import soundfile as sf
26
+ from cryptography.fernet import Fernet
27
+
28
+ # Configure logging
29
+ logging.basicConfig(level=logging.INFO)
30
+ logger = logging.getLogger(__name__)
31
+
32
class MCPSpeechService:
    """MCP Service for Speech-to-Text processing.

    Fronts a whisper HTTP service with:
      * POST /auth        -- HMAC-signed bearer-token issuance
      * POST /transcribe  -- authenticated multipart audio upload
      * GET  /ws          -- authenticated WebSocket for streaming chunks
      * GET  /status      -- unauthenticated health endpoint

    The Fernet key doubles as the HMAC secret for token signing and as the
    optional response-encryption key.
    """

    def __init__(self, whisper_service_url: str = "http://localhost:8000",
                 encryption_key: Optional[bytes] = None):
        self.whisper_service_url = whisper_service_url
        # Generate a fresh key per process unless the caller supplies one.
        self.encryption_key = encryption_key or Fernet.generate_key()
        self.cipher = Fernet(self.encryption_key)
        self.active_sessions: Dict[str, Dict] = {}  # session_id -> websocket session info
        # NOTE(review): issued tokens are recorded here but never read back or
        # pruned (verification is stateless), so this dict grows unbounded.
        self.auth_tokens: Dict[str, Dict] = {}

    def generate_session_id(self) -> str:
        """Generate a unique session ID (first 16 hex chars of a SHA-256 digest)."""
        return hashlib.sha256(f"{time.time()}{os.urandom(16)}".encode()).hexdigest()[:16]

    def generate_auth_token(self, client_id: str) -> str:
        """Generate JWT-like auth token for client.

        Token format: base64(json payload) + "." + hex HMAC-SHA256 signature,
        keyed with the service's encryption key.
        """
        payload = {
            "client_id": client_id,
            "issued_at": int(time.time()),
            "expires_at": int(time.time()) + 3600  # 1 hour expiry
        }
        token_data = json.dumps(payload).encode()
        signature = hmac.new(self.encryption_key, token_data, hashlib.sha256).hexdigest()
        token = base64.b64encode(token_data).decode() + "." + signature
        self.auth_tokens[token] = payload
        return token

    def verify_auth_token(self, token: str) -> Optional[Dict]:
        """Verify and decode auth token.

        Returns the payload dict when the HMAC signature matches and the token
        has not expired; otherwise None.
        """
        try:
            if "." not in token:
                return None

            token_b64, signature = token.rsplit(".", 1)
            token_data = base64.b64decode(token_b64.encode())

            # Verify signature (constant-time compare to avoid timing leaks)
            expected_sig = hmac.new(self.encryption_key, token_data, hashlib.sha256).hexdigest()
            if not hmac.compare_digest(signature, expected_sig):
                return None

            payload = json.loads(token_data.decode())

            # Check expiry
            if payload.get("expires_at", 0) < int(time.time()):
                return None

            return payload
        except Exception as e:
            logger.error(f"Token verification failed: {e}")
            return None

    async def encrypt_data(self, data: bytes) -> bytes:
        """Encrypt sensitive data with the service's Fernet key."""
        return self.cipher.encrypt(data)

    async def decrypt_data(self, encrypted_data: bytes) -> bytes:
        """Decrypt sensitive data with the service's Fernet key."""
        return self.cipher.decrypt(encrypted_data)

    async def process_audio_chunk(self, audio_data: bytes, session_id: str,
                                  format: str = "wav") -> Dict[str, Any]:
        """Process audio chunk through whisper service.

        Writes the raw bytes to a temp file, forwards it to the whisper
        service's /transcribe endpoint, and wraps the transcription (or error)
        in a dict carrying session_id / timestamp / status.
        """
        try:
            # Create temporary file for audio data
            with tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False) as temp_file:
                temp_file.write(audio_data)
                temp_file_path = temp_file.name

            try:
                # Call the whisper service
                async with aiohttp.ClientSession() as session:
                    with open(temp_file_path, 'rb') as f:
                        form_data = aiohttp.FormData()
                        form_data.add_field('audio', f, filename=f'audio.{format}')

                        async with session.post(
                            f"{self.whisper_service_url}/transcribe",
                            data=form_data
                        ) as response:
                            if response.status == 200:
                                result = await response.json()
                                return {
                                    "session_id": session_id,
                                    "timestamp": datetime.utcnow().isoformat(),
                                    "transcription": result,
                                    "status": "success"
                                }
                            else:
                                error_text = await response.text()
                                logger.error(f"Whisper service error: {error_text}")
                                return {
                                    "session_id": session_id,
                                    "timestamp": datetime.utcnow().isoformat(),
                                    "error": f"Service error: {response.status}",
                                    "status": "error"
                                }
            finally:
                # Clean up temporary file
                os.unlink(temp_file_path)

        except Exception as e:
            logger.error(f"Audio processing error: {e}")
            return {
                "session_id": session_id,
                "timestamp": datetime.utcnow().isoformat(),
                "error": str(e),
                "status": "error"
            }

    async def handle_auth(self, request):
        """Handle authentication endpoint.

        Expects JSON {"client_id": ...}; returns a bearer token plus the
        base64 encryption key (which also lets clients decrypt responses).
        """
        try:
            data = await request.json()
            client_id = data.get('client_id')

            if not client_id:
                return web.Response(
                    text=json.dumps({"error": "client_id required"}),
                    status=400,
                    content_type='application/json'
                )

            token = self.generate_auth_token(client_id)

            return web.Response(
                text=json.dumps({
                    "token": token,
                    "encryption_key": base64.b64encode(self.encryption_key).decode(),
                    "expires_in": 3600
                }),
                content_type='application/json'
            )

        except Exception as e:
            logger.error(f"Auth error: {e}")
            return web.Response(
                text=json.dumps({"error": "Authentication failed"}),
                status=500,
                content_type='application/json'
            )

    async def handle_transcribe(self, request):
        """Handle transcription endpoint.

        Requires a Bearer token; accepts a multipart upload with an 'audio'
        field. Optionally Fernet-encrypts the JSON response when the client
        sends X-Encrypt-Response: true.
        """
        try:
            # Verify authentication
            auth_header = request.headers.get('Authorization', '')
            if not auth_header.startswith('Bearer '):
                return web.Response(
                    text=json.dumps({"error": "Missing or invalid authorization"}),
                    status=401,
                    content_type='application/json'
                )

            token = auth_header[7:]  # Remove 'Bearer ' prefix
            auth_payload = self.verify_auth_token(token)
            if not auth_payload:
                return web.Response(
                    text=json.dumps({"error": "Invalid or expired token"}),
                    status=401,
                    content_type='application/json'
                )

            # Get session ID (client-supplied or freshly generated)
            session_id = request.headers.get('X-Session-ID')
            if not session_id:
                session_id = self.generate_session_id()

            # Handle multipart form data (audio file)
            reader = await request.multipart()
            audio_data = None

            async for part in reader:
                if part.name == 'audio':
                    audio_data = await part.read()
                    break

            if not audio_data:
                return web.Response(
                    text=json.dumps({"error": "No audio data provided"}),
                    status=400,
                    content_type='application/json'
                )

            # Process audio
            result = await self.process_audio_chunk(audio_data, session_id)

            # Encrypt sensitive data if requested
            if request.headers.get('X-Encrypt-Response') == 'true':
                encrypted_result = await self.encrypt_data(json.dumps(result).encode())
                return web.Response(
                    body=encrypted_result,
                    content_type='application/octet-stream',
                    headers={'X-Encrypted': 'true'}
                )

            return web.Response(
                text=json.dumps(result),
                content_type='application/json'
            )

        except Exception as e:
            logger.error(f"Transcription error: {e}")
            return web.Response(
                text=json.dumps({"error": "Transcription failed"}),
                status=500,
                content_type='application/json'
            )

    async def handle_websocket(self, request):
        """Handle WebSocket connections for real-time transcription.

        Protocol: the client first sends {"type": "auth", "token": ...};
        after auth_success it may send JSON audio_chunk messages (base64) or
        raw binary frames, each answered with a transcription_result message.

        BUG FIX: aiohttp's WebSocketResponse exposes send_str(), not
        send_text() (that name belongs to Starlette/FastAPI); every reply in
        the original raised AttributeError at runtime.
        """
        ws = web.WebSocketResponse()
        await ws.prepare(request)

        session_id = None
        auth_payload = None

        try:
            async for msg in ws:
                if msg.type == WSMsgType.TEXT:
                    try:
                        data = json.loads(msg.data)

                        if data.get('type') == 'auth':
                            # Handle authentication
                            token = data.get('token')
                            auth_payload = self.verify_auth_token(token)
                            if auth_payload:
                                session_id = self.generate_session_id()
                                self.active_sessions[session_id] = {
                                    'client_id': auth_payload['client_id'],
                                    'connected_at': time.time(),
                                    'ws': ws
                                }
                                await ws.send_str(json.dumps({
                                    'type': 'auth_success',
                                    'session_id': session_id
                                }))
                            else:
                                await ws.send_str(json.dumps({
                                    'type': 'auth_error',
                                    'message': 'Invalid token'
                                }))

                        elif data.get('type') == 'audio_chunk':
                            # Handle audio chunk (base64 payload)
                            if not auth_payload or not session_id:
                                await ws.send_str(json.dumps({
                                    'type': 'error',
                                    'message': 'Not authenticated'
                                }))
                                continue

                            # Decode base64 audio data
                            audio_b64 = data.get('audio_data', '')
                            audio_data = base64.b64decode(audio_b64)

                            # Process audio
                            result = await self.process_audio_chunk(audio_data, session_id)

                            await ws.send_str(json.dumps({
                                'type': 'transcription_result',
                                'data': result
                            }))

                    except json.JSONDecodeError:
                        await ws.send_str(json.dumps({
                            'type': 'error',
                            'message': 'Invalid JSON'
                        }))

                elif msg.type == WSMsgType.BINARY:
                    # Handle binary audio data
                    if not auth_payload or not session_id:
                        await ws.send_str(json.dumps({
                            'type': 'error',
                            'message': 'Not authenticated'
                        }))
                        continue

                    # Process binary audio data
                    result = await self.process_audio_chunk(msg.data, session_id)

                    await ws.send_str(json.dumps({
                        'type': 'transcription_result',
                        'data': result
                    }))

                elif msg.type == WSMsgType.ERROR:
                    logger.error(f'WebSocket error: {ws.exception()}')
                    break

        except Exception as e:
            logger.error(f"WebSocket error: {e}")
        finally:
            # Clean up session
            if session_id and session_id in self.active_sessions:
                del self.active_sessions[session_id]

        return ws

    async def handle_status(self, request):
        """Handle status endpoint (unauthenticated health/info snapshot)."""
        return web.Response(
            text=json.dumps({
                "service": "MCP Speech-to-Text",
                "status": "running",
                "active_sessions": len(self.active_sessions),
                "whisper_service": self.whisper_service_url,
                "timestamp": datetime.utcnow().isoformat()
            }),
            content_type='application/json'
        )

    def create_app(self) -> web.Application:
        """Create the web application with routes and CORS middleware."""
        app = web.Application()

        # Add routes
        app.router.add_post('/auth', self.handle_auth)
        app.router.add_post('/transcribe', self.handle_transcribe)
        app.router.add_get('/ws', self.handle_websocket)
        app.router.add_get('/status', self.handle_status)

        # CORS middleware: answers preflight OPTIONS directly and stamps
        # permissive CORS headers on every response.
        @web.middleware
        async def cors_middleware(request, handler):
            if request.method == 'OPTIONS':
                # Handle preflight requests
                response = web.Response()
            else:
                response = await handler(request)

            response.headers['Access-Control-Allow-Origin'] = '*'
            response.headers['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS'
            response.headers['Access-Control-Allow-Headers'] = 'Content-Type, Authorization, X-Session-ID, X-Encrypt-Response'
            return response

        app.middlewares.append(cors_middleware)

        return app
374
+
375
async def main():
    """Main entry point for the MCP service.

    Reads host/port and the downstream whisper URL from the environment,
    starts the aiohttp server, then idles until interrupted.
    """
    # Configuration from environment (with local-dev defaults)
    host = os.getenv('MCP_HOST', '0.0.0.0')
    port = int(os.getenv('MCP_PORT', '8081'))
    whisper_url = os.getenv('WHISPER_SERVICE_URL', 'http://localhost:8000')

    # Create service and its web application
    speech_service = MCPSpeechService(whisper_service_url=whisper_url)
    web_app = speech_service.create_app()

    logger.info(f"Starting MCP Speech-to-Text service on {host}:{port}")
    logger.info(f"Whisper service URL: {whisper_url}")
    # NOTE(review): this logs the auth/crypto secret in plaintext — consider
    # removing in production.
    logger.info(f"Encryption key: {base64.b64encode(speech_service.encryption_key).decode()}")

    # Run the service
    app_runner = web.AppRunner(web_app)
    await app_runner.setup()
    await web.TCPSite(app_runner, host, port).start()

    logger.info("MCP Service is running...")

    try:
        # Keep the process alive until interrupted
        while True:
            await asyncio.sleep(1)
    except KeyboardInterrupt:
        logger.info("Shutting down MCP service...")
    finally:
        await app_runner.cleanup()

if __name__ == '__main__':
    asyncio.run(main())
requirements.txt CHANGED
@@ -1,3 +1,8 @@
1
- fastapi
2
- uvicorn[standard]
3
-
 
 
 
 
 
 
1
+ aiohttp==3.8.5
2
+ aiofiles==23.2.0
3
+ numpy==1.24.3
4
+ soundfile==0.12.1
5
+ cryptography==41.0.3
6
+ openai-whisper==20231117
7
+ torch>=1.13.0
8
+ torchaudio>=0.13.0
startup_script.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Startup script to run both Whisper HTTP service and MCP service
4
+ """
5
+
6
+ import asyncio
7
+ import subprocess
8
+ import sys
9
+ import time
10
+ import signal
11
+ import os
12
+ from pathlib import Path
13
+
14
class ServiceManager:
    """Launch and supervise the Whisper HTTP service and the MCP speech
    service as child processes, restarting both if a health check fails.
    """

    def __init__(self):
        # Popen handles for the two child services; None until started.
        self.whisper_process = None
        self.mcp_process = None
        # True while the monitor loop in run() should keep going.
        self.running = False

    async def start_whisper_service(self):
        """Start the Whisper HTTP service as a subprocess.

        Returns:
            bool: True if the process is still alive ~3s after launch.
        """
        print("🔄 Starting Whisper HTTP service...")

        # The wrapper script must live in the current working directory.
        if not Path("whisper_http_wrapper.py").exists():
            print("❌ whisper_http_wrapper.py not found in current directory")
            return False

        try:
            self.whisper_process = subprocess.Popen([
                sys.executable, "whisper_http_wrapper.py"
            ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

            # Give the service a moment to either bind its port or crash.
            await asyncio.sleep(3)

            if self.whisper_process.poll() is None:
                print("✅ Whisper HTTP service started successfully")
                return True
            else:
                stdout, stderr = self.whisper_process.communicate()
                print(f"❌ Whisper HTTP service failed to start:")
                print(f"STDOUT: {stdout}")
                print(f"STDERR: {stderr}")
                return False

        except Exception as e:
            print(f"❌ Error starting Whisper HTTP service: {e}")
            return False

    async def start_mcp_service(self):
        """Start the MCP speech service as a subprocess.

        Returns:
            bool: True if the process is still alive ~2s after launch.
        """
        print("🔄 Starting MCP Speech service...")

        if not Path("mcp_speech_service.py").exists():
            print("❌ mcp_speech_service.py not found in current directory")
            return False

        try:
            self.mcp_process = subprocess.Popen([
                sys.executable, "mcp_speech_service.py"
            ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

            # Give the service a moment to either bind its port or crash.
            await asyncio.sleep(2)

            if self.mcp_process.poll() is None:
                print("✅ MCP Speech service started successfully")
                return True
            else:
                # FIX: was `self.mc_process` — an AttributeError on this
                # failure path masked the real startup error.
                stdout, stderr = self.mcp_process.communicate()
                print(f"❌ MCP Speech service failed to start:")
                print(f"STDOUT: {stdout}")
                print(f"STDERR: {stderr}")
                return False

        except Exception as e:
            print(f"❌ Error starting MCP Speech service: {e}")
            return False

    async def check_services(self):
        """Probe both services' HTTP health endpoints.

        Returns:
            tuple[bool, bool]: (whisper_ok, mcp_ok); False on any error.
        """
        import aiohttp

        # Check Whisper service
        whisper_ok = False
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get("http://localhost:8000/health", timeout=5) as response:
                    if response.status == 200:
                        whisper_ok = True
        except Exception:
            # Connection refused / timeout simply means "not healthy".
            pass

        # Check MCP service
        mcp_ok = False
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get("http://localhost:8081/status", timeout=5) as response:
                    if response.status == 200:
                        mcp_ok = True
        except Exception:
            pass

        return whisper_ok, mcp_ok

    def stop_services(self):
        """Terminate both child processes, escalating to kill after 5s."""
        print("\n🛑 Stopping services...")

        if self.mcp_process and self.mcp_process.poll() is None:
            print("   Stopping MCP service...")
            self.mcp_process.terminate()
            try:
                self.mcp_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                self.mcp_process.kill()

        if self.whisper_process and self.whisper_process.poll() is None:
            print("   Stopping Whisper service...")
            self.whisper_process.terminate()
            try:
                self.whisper_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                self.whisper_process.kill()

        print("✅ Services stopped")

    def signal_handler(self, signum, frame):
        """Handle SIGINT/SIGTERM: stop children and exit the process."""
        print(f"\n📡 Received signal {signum}")
        self.running = False
        self.stop_services()
        sys.exit(0)

    async def run(self):
        """Start both services, then monitor and restart them until interrupted."""
        # Set up signal handlers
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

        print("🚀 Starting MCP Speech-to-Text Services")
        print("=" * 50)

        # Start Whisper service first — the MCP service depends on it.
        if not await self.start_whisper_service():
            print("❌ Failed to start Whisper service. Exiting.")
            return

        # Start MCP service
        if not await self.start_mcp_service():
            print("❌ Failed to start MCP service. Stopping Whisper service.")
            self.stop_services()
            return

        print("\n✅ Both services are running!")
        print("📋 Service URLs:")
        print("   • Whisper HTTP: http://localhost:8000")
        print("   • MCP Service: http://localhost:8081")
        print("\n🧪 You can now run: python testClient.py")
        print("⏹️  Press Ctrl+C to stop all services")

        self.running = True

        # Monitor services
        try:
            while self.running:
                await asyncio.sleep(10)  # Check every 10 seconds

                whisper_ok, mcp_ok = await self.check_services()

                if not whisper_ok:
                    print("⚠️  Whisper service appears to be down")
                if not mcp_ok:
                    print("⚠️  MCP service appears to be down")

                if not whisper_ok or not mcp_ok:
                    print("🔄 Attempting to restart services...")
                    # FIX: stop_services() terminates BOTH children, so both
                    # must be restarted; previously only the unhealthy one
                    # was relaunched, leaving the other dead.
                    self.stop_services()
                    await asyncio.sleep(2)
                    await self.start_whisper_service()
                    await self.start_mcp_service()

        except KeyboardInterrupt:
            pass
        finally:
            self.stop_services()
192
+
193
async def main():
    """Entry coroutine: show usage for --help, otherwise run the ServiceManager."""
    if len(sys.argv) > 1 and sys.argv[1] == "--help":
        print("MCP Speech-to-Text Service Manager")
        # FIX: usage previously said "startup.py", but this file is
        # startup_script.py (see the repository file list).
        print("Usage: python startup_script.py")
        print("\nThis script will:")
        print("1. Start the Whisper HTTP service on port 8000")
        print("2. Start the MCP Speech service on port 8081")
        print("3. Monitor both services and restart if needed")
        print("4. Stop all services when you press Ctrl+C")
        return

    manager = ServiceManager()
    await manager.run()
207
+
208
# Script entry point: run the async service manager; a Ctrl+C during startup
# (before the manager's own signal handlers are installed) is caught here.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
testClient.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test client for MPC Speech-to-Text service
4
+ """
5
+
6
+ import asyncio
7
+ import sys
8
+ from pathlib import Path
9
+ from mcp_speech_client import transcribe_audio_file, check_service_status
10
+
11
async def main():
    """End-to-end smoke test: check service health, then transcribe one local audio file."""
    # NOTE: "MPC" in the original strings was a typo for "MCP" (the files are
    # mcp_speech_client.py / mcp_speech_service.py); corrected throughout.
    print("🧪 Testing MCP Speech-to-Text Client")
    print("=" * 50)

    # First, check if the services are running
    print("1️⃣ Checking service status...")
    status = await check_service_status()
    print(f"   Service status: {status.get('status', 'unknown')}")

    if status.get("status") == "error":
        print("\n⚠️  Service appears to be down!")
        print("💡 To start the services, run:")
        # FIX: referenced nonexistent "startup.py"; the launcher is startup_script.py.
        print("   python startup_script.py")
        print("\n   Or start them manually:")
        print("   1. Start Whisper HTTP service: python whisper_http_wrapper.py")
        # FIX: referenced nonexistent "mpc_speech_service.py".
        print("   2. Start MCP service: python mcp_speech_service.py")
        return

    print("✅ Services are running!")
    print(f"   Active sessions: {status.get('active_sessions', 0)}")

    print("\n2️⃣ Testing audio file transcription...")

    # Look for available audio files
    test_files = [
        "audio.wav",
        "speech_test.wav",
        "test_audio.wav",
        "silent_test.wav"
    ]

    # Use the first candidate that actually exists on disk.
    audio_file = None
    for file in test_files:
        if Path(file).exists():
            audio_file = file
            break

    if not audio_file:
        print("❌ No test audio files found!")
        print("💡 Create test audio files by running:")
        print("   python create_test_audio.py")
        print("\n   Or provide your own audio file and update the test_files list.")
        return

    print(f"🎵 Using audio file: {audio_file}")

    try:
        # Test transcription
        print("   Sending transcription request...")
        result = await transcribe_audio_file(audio_file)

        print(f"\n📋 Transcription result:")
        print(f"   Status: {result.get('status')}")
        print(f"   Session ID: {result.get('session_id')}")

        if result.get("status") == "success":
            transcription = result.get("transcription", {})

            if isinstance(transcription, dict):
                # The service nests the Whisper output under "result".
                text = transcription.get("result", {}).get("text", "No text found")
                processing_time = transcription.get("result", {}).get("processing_time", 0)
                model_info = transcription.get("result", {}).get("model_info", {})

                print(f"✅ Transcription successful!")
                print(f"   Text: '{text}'")
                print(f"   Processing time: {processing_time:.2f}s")
                print(f"   Model: {model_info.get('model', 'unknown')}")
                print(f"   Device: {model_info.get('device', 'unknown')}")

                # If it's synthetic audio, explain the result
                if "test" in audio_file.lower() or "speech" in audio_file.lower():
                    print(f"\n💡 Note: This was synthetic test audio, so the transcription")
                    print(f"   result may not be meaningful. Try with real speech audio")
                    print(f"   for better results.")
            else:
                print(f"   Raw transcription: {transcription}")

        else:
            error_msg = result.get('error', 'Unknown error')
            print(f"❌ Transcription failed: {error_msg}")

            # Provide specific help for common errors
            if "Cannot connect" in error_msg and "8000" in error_msg:
                print("\n💡 The Whisper HTTP service (port 8000) is not running.")
                print("   Start it with: python whisper_http_wrapper.py")
            elif "Authentication" in error_msg:
                print("\n💡 Authentication issue with the MCP service.")
                print("   Check if the MCP service is running properly.")

    except FileNotFoundError:
        print(f"❌ Audio file '{audio_file}' not found.")
    except Exception as e:
        print(f"❌ Unexpected error during transcription: {e}")

    print(f"\n🏁 Test completed!")
107
+
108
def check_dependencies():
    """Check if required third-party dependencies can be imported.

    Returns:
        bool: True when both aiohttp and soundfile are importable,
        False otherwise (after printing install instructions).
    """
    missing_deps = []

    # Probe each dependency by attempting an import.
    for dep_name in ("aiohttp", "soundfile"):
        try:
            __import__(dep_name)
        except ImportError:
            missing_deps.append(dep_name)

    if missing_deps:
        print("❌ Missing dependencies:")
        for dep in missing_deps:
            print(f"   • {dep}")
        print(f"\n💡 Install them with: pip install {' '.join(missing_deps)}")
        return False

    return True
130
+
131
# Script entry point: verify third-party deps are installed before running
# the async smoke test; exit non-zero on missing deps or test failure.
if __name__ == "__main__":
    if not check_dependencies():
        sys.exit(1)

    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n👋 Test interrupted by user")
    except Exception as e:
        print(f"\n❌ Test failed with error: {e}")
        sys.exit(1)
whisper_http_wrapper.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ HTTP API Wrapper for realtime-whisper-macbook
4
+ This adds HTTP endpoints to the existing whisper functionality
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import logging
10
+ import os
11
+ import tempfile
12
+ import threading
13
+ import time
14
+ from pathlib import Path
15
+ from typing import Dict, Any, Optional
16
+
17
+ import numpy as np
18
+ import soundfile as sf
19
+ import torch
20
+ import whisper
21
+ from aiohttp import web
22
+ import aiofiles
23
+
24
+ # Configure logging
25
+ logging.basicConfig(level=logging.INFO)
26
+ logger = logging.getLogger(__name__)
27
+
28
class WhisperHTTPService:
    """HTTP wrapper for Whisper transcription service.

    Exposes POST /transcribe, GET /health and GET /models over aiohttp,
    delegating the actual speech-to-text work to openai-whisper.
    """

    def __init__(self, model_name: str = "base", device: str = "auto"):
        """
        Initialize the Whisper HTTP service

        Args:
            model_name: Whisper model to use (tiny, base, small, medium, large)
            device: Device to run on (cpu, cuda, mps, auto)
        """
        self.model_name = model_name

        # Auto-detect device if not specified
        if device == "auto":
            if torch.cuda.is_available():
                self.device = "cuda"
            # MPS (Apple Silicon) intentionally disabled by the author:
            #elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
            #    self.device = "mps"
            else:
                self.device = "cpu"
        else:
            self.device = device

        logger.info(f"Using device: {self.device}")

        # Load Whisper model (downloads weights on first use)
        logger.info(f"Loading Whisper model: {model_name}")
        self.model = whisper.load_model(model_name, device=self.device)
        logger.info("Whisper model loaded successfully")

        # Aggregate counters exposed by /health.
        self.stats = {
            "requests_processed": 0,
            "total_audio_duration": 0.0,
            "average_processing_time": 0.0,
            "start_time": time.time()
        }

    def transcribe_audio_file(self, audio_file_path: str, **kwargs) -> Dict[str, Any]:
        """
        Transcribe audio file using Whisper (blocking — run off the event loop).

        Args:
            audio_file_path: Path to audio file
            **kwargs: Additional Whisper parameters

        Returns:
            Transcription result dictionary with "success", "result"/"error",
            "processing_time" and "model_info" keys.
        """
        try:
            start_time = time.time()

            # Default Whisper options; any kwarg overrides its default.
            options = {
                "language": kwargs.get("language"),  # None for auto-detection
                "task": kwargs.get("task", "transcribe"),  # transcribe or translate
                "temperature": kwargs.get("temperature", 0.0),
                "best_of": kwargs.get("best_of", 5),
                "beam_size": kwargs.get("beam_size", 5),
                "patience": kwargs.get("patience", 1.0),
                "length_penalty": kwargs.get("length_penalty", 1.0),
                "suppress_tokens": kwargs.get("suppress_tokens", "-1"),
                "initial_prompt": kwargs.get("initial_prompt"),
                "condition_on_previous_text": kwargs.get("condition_on_previous_text", True),
                # fp16 only makes sense on CUDA; CPU inference requires fp32.
                "fp16": kwargs.get("fp16", True if self.device == "cuda" else False),
                "compression_ratio_threshold": kwargs.get("compression_ratio_threshold", 2.4),
                "logprob_threshold": kwargs.get("logprob_threshold", -1.0),
                "no_speech_threshold": kwargs.get("no_speech_threshold", 0.6),
            }

            # Remove None values so Whisper applies its own defaults.
            options = {k: v for k, v in options.items() if v is not None}

            # Transcribe
            result = self.model.transcribe(audio_file_path, **options)

            processing_time = time.time() - start_time

            # Update statistics
            self.stats["requests_processed"] += 1
            if "segments" in result:
                # Audio duration approximated by the end of the last segment.
                audio_duration = max([seg["end"] for seg in result["segments"]], default=0)
                self.stats["total_audio_duration"] += audio_duration

            # Running average over all processed requests.
            total_requests = self.stats["requests_processed"]
            self.stats["average_processing_time"] = (
                (self.stats["average_processing_time"] * (total_requests - 1) + processing_time) / total_requests
            )

            # Add metadata
            result["processing_time"] = processing_time
            result["model"] = self.model_name
            result["device"] = self.device

            logger.info(f"Transcribed audio in {processing_time:.2f}s: '{result['text'][:100]}...'")

            return {
                "success": True,
                "result": result,
                "processing_time": processing_time,
                "model_info": {
                    "model": self.model_name,
                    "device": self.device
                }
            }

        except Exception as e:
            logger.error(f"Transcription error: {e}")
            return {
                "success": False,
                "error": str(e),
                "model_info": {
                    "model": self.model_name,
                    "device": self.device
                }
            }

    async def handle_transcribe(self, request):
        """Handle transcription HTTP requests (multipart: 'audio' + options)."""
        try:
            # Handle multipart form data
            reader = await request.multipart()
            audio_data = None
            options = {}

            async for part in reader:
                if part.name == 'audio':
                    audio_data = await part.read()
                elif part.name == 'options':
                    # A JSON blob of Whisper options; ignored if malformed.
                    options_text = await part.text()
                    try:
                        options = json.loads(options_text)
                    except json.JSONDecodeError:
                        pass
                elif part.name in ['language', 'task', 'temperature', 'beam_size']:
                    # Handle individual parameters
                    options[part.name] = await part.text()

            if not audio_data:
                return web.Response(
                    text=json.dumps({"error": "No audio data provided"}),
                    status=400,
                    content_type='application/json'
                )

            # Save audio data to temporary file
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                temp_file.write(audio_data)
                temp_file_path = temp_file.name

            try:
                # Convert form-text options to the types Whisper expects.
                if 'temperature' in options:
                    options['temperature'] = float(options['temperature'])
                if 'beam_size' in options:
                    options['beam_size'] = int(options['beam_size'])

                # FIX: model.transcribe() is CPU/GPU-bound and previously ran
                # directly in the async handler, blocking the event loop (and
                # every other request) for the whole transcription. Run it in
                # the default executor instead.
                loop = asyncio.get_event_loop()
                result = await loop.run_in_executor(
                    None, lambda: self.transcribe_audio_file(temp_file_path, **options)
                )

                return web.Response(
                    text=json.dumps(result),
                    content_type='application/json'
                )

            finally:
                # Clean up temporary file; ignore races/permission errors only.
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    pass

        except Exception as e:
            logger.error(f"Request handling error: {e}")
            return web.Response(
                text=json.dumps({"error": f"Request processing failed: {str(e)}"}),
                status=500,
                content_type='application/json'
            )

    async def handle_health(self, request):
        """Health check endpoint: model/device info plus request statistics."""
        uptime = time.time() - self.stats["start_time"]

        health_info = {
            "status": "healthy",
            "model": self.model_name,
            "device": self.device,
            "uptime_seconds": uptime,
            "statistics": self.stats.copy()
        }

        return web.Response(
            text=json.dumps(health_info),
            content_type='application/json'
        )

    async def handle_models(self, request):
        """List available Whisper model names and the currently loaded one."""
        available_models = ["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"]

        return web.Response(
            text=json.dumps({
                "available_models": available_models,
                "current_model": self.model_name,
                "device": self.device
            }),
            content_type='application/json'
        )

    def create_app(self) -> web.Application:
        """Create the web application with routes and CORS middleware."""
        app = web.Application(client_max_size=50*1024*1024)  # 50MB max file size

        # Add routes
        app.router.add_post('/transcribe', self.handle_transcribe)
        app.router.add_get('/health', self.handle_health)
        app.router.add_get('/models', self.handle_models)

        # Add CORS middleware.
        # FIX: the decorator was missing; without @web.middleware aiohttp
        # treats the coroutine as a deprecated old-style middleware factory
        # (the MCP service's create_app already uses the decorator).
        @web.middleware
        async def cors_middleware(request, handler):
            if request.method == 'OPTIONS':
                # Handle preflight requests
                response = web.Response()
            else:
                response = await handler(request)

            response.headers['Access-Control-Allow-Origin'] = '*'
            response.headers['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS'
            response.headers['Access-Control-Allow-Headers'] = 'Content-Type, Authorization'
            return response

        app.middlewares.append(cors_middleware)

        return app
265
+
266
async def main():
    """Main function to run the Whisper HTTP service"""
    # Pull configuration from environment variables, defaulting to localhost.
    host = os.getenv('WHISPER_HOST', '127.0.0.1')
    port = int(os.getenv('WHISPER_PORT', '8000'))
    model_name = os.getenv('WHISPER_MODEL', 'base')
    device = os.getenv('WHISPER_DEVICE', 'auto')

    # Create service (loads the Whisper model up front).
    logger.info("Initializing Whisper HTTP service...")
    service = WhisperHTTPService(model_name=model_name, device=device)
    app = service.create_app()

    logger.info(f"Starting Whisper HTTP service on {host}:{port}")
    logger.info(f"Model: {model_name}, Device: {device}")

    # Bring the HTTP server up.
    runner = web.AppRunner(app)
    await runner.setup()
    await web.TCPSite(runner, host, port).start()

    logger.info("Whisper HTTP service is running!")
    logger.info(f"Endpoints available:")
    logger.info(f"  POST http://{host}:{port}/transcribe - Transcribe audio")
    logger.info(f"  GET  http://{host}:{port}/health - Health check")
    logger.info(f"  GET  http://{host}:{port}/models - List models")

    try:
        # Idle loop keeps the process alive while aiohttp serves requests.
        while True:
            await asyncio.sleep(1)
    except KeyboardInterrupt:
        logger.info("Shutting down Whisper HTTP service...")
    finally:
        await runner.cleanup()
302
+
303
# Script entry point: run the Whisper HTTP service's async main loop.
if __name__ == '__main__':
    asyncio.run(main())