petergits commited on
Commit
c08e928
·
1 Parent(s): 56ce875

2nd checking push speech2text

Browse files
Dockerfile CHANGED
@@ -10,8 +10,17 @@ ENV PATH="/home/user/.local/bin:$PATH"
10
  WORKDIR /app
11
 
12
  COPY --chown=user ./requirements.txt requirements.txt
 
 
 
 
 
 
 
 
13
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
 
15
  COPY --chown=user . /app
16
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
17
 
 
10
  WORKDIR /app
11
 
12
  COPY --chown=user ./requirements.txt requirements.txt
13
+ COPY create_test_audio.py .
14
+ COPY generate_test_audio.py .
15
+ COPY mcp_speech_client.py .
16
+ COPY mcp_speech_service.py .
17
+ COPY requirements.txt .
18
+ COPY startup_script.py .
19
+ COPY testClient.py .
20
+ COPY whisper_http_wrapper.py .
21
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
22
 
23
  COPY --chown=user . /app
24
+ CMD ["python", "startup_script.py"]
25
+ #CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
26
 
create_test_audio.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Create a test audio file for testing the MPC Speech service
4
+ """
5
+
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from pathlib import Path
9
+
10
def create_test_audio(filename="test_audio.wav", duration=3.0, sample_rate=16000):
    """
    Create a simple test audio file containing a decaying three-note chord.

    Args:
        filename: Output WAV filename
        duration: Duration in seconds
        sample_rate: Sample rate in Hz

    Returns:
        The filename that was written.
    """
    # BUG FIX: the progress messages printed the literal text "(unknown)"
    # instead of interpolating the target filename.
    print(f"🎵 Creating test audio file: {filename}")

    # Generate time array
    t = np.linspace(0, duration, int(sample_rate * duration), False)

    # Create a simple tone (440 Hz - A note) with some variation
    frequency1 = 440  # A note
    frequency2 = 523  # C note

    # Create a chord-like sound
    audio = (
        0.3 * np.sin(2 * np.pi * frequency1 * t) +  # A note
        0.2 * np.sin(2 * np.pi * frequency2 * t) +  # C note
        0.1 * np.sin(2 * np.pi * 660 * t)           # E note
    )

    # Exponential-decay envelope so the tone sounds less synthetic
    envelope = np.exp(-t * 0.5)
    audio = audio * envelope

    # Normalize to prevent clipping (peak at 80% of full scale)
    audio = audio / np.max(np.abs(audio)) * 0.8

    # Save as WAV file
    sf.write(filename, audio, sample_rate)

    file_size = Path(filename).stat().st_size
    print(f"✅ Created {filename}")
    print(f"   Duration: {duration}s")
    print(f"   Sample rate: {sample_rate} Hz")
    print(f"   File size: {file_size:,} bytes")

    return filename
52
+
53
def create_silent_audio(filename="silent_audio.wav", duration=2.0, sample_rate=16000):
    """Create a silent (all-zeros) audio file for edge-case testing.

    Returns the filename that was written.
    """
    # BUG FIX: print the actual filename instead of the garbled "(unknown)".
    print(f"🔇 Creating silent audio file: {filename}")

    # Create silent audio (zeros)
    audio = np.zeros(int(sample_rate * duration))

    # Save as WAV file
    sf.write(filename, audio, sample_rate)

    file_size = Path(filename).stat().st_size
    print(f"✅ Created {filename}")
    print(f"   Duration: {duration}s (silent)")
    print(f"   Sample rate: {sample_rate} Hz")
    print(f"   File size: {file_size:,} bytes")

    return filename
70
+
71
def create_speech_like_audio(filename="speech_test.wav", duration=5.0, sample_rate=16000):
    """Create a more speech-like test audio with varying frequencies.

    Simulates formants, adds noise, and segments the signal into word-like
    bursts separated by quiet pauses. Returns the filename written.
    Note: uses the global numpy RNG, so output is not reproducible across runs.
    """
    # BUG FIX: print the actual filename instead of the garbled "(unknown)".
    print(f"🎤 Creating speech-like audio file: {filename}")

    # Generate time array
    t = np.linspace(0, duration, int(sample_rate * duration), False)

    # Human speech typically has formants around 500 Hz, 1500 Hz, 2500 Hz
    formant1 = 500
    formant2 = 1500
    formant3 = 2500

    # Amplitude-modulated formant tones to create a more complex waveform
    audio = (
        0.4 * np.sin(2 * np.pi * formant1 * t) * (1 + 0.3 * np.sin(2 * np.pi * 3 * t)) +
        0.3 * np.sin(2 * np.pi * formant2 * t) * (1 + 0.2 * np.sin(2 * np.pi * 5 * t)) +
        0.2 * np.sin(2 * np.pi * formant3 * t) * (1 + 0.1 * np.sin(2 * np.pi * 7 * t))
    )

    # Add some noise to make it more realistic
    noise = 0.05 * np.random.normal(0, 1, len(audio))
    audio = audio + noise

    # Create segments (like words): 0.8 s "word" + 0.2 s quieter "pause"
    segment_duration = 0.8
    pause_duration = 0.2
    segment_samples = int(segment_duration * sample_rate)
    pause_samples = int(pause_duration * sample_rate)

    # Apply segmentation: attenuate the pause region after each segment
    for i in range(0, len(audio), segment_samples + pause_samples):
        segment_end = min(i + segment_samples, len(audio))
        pause_start = segment_end
        pause_end = min(pause_start + pause_samples, len(audio))
        if pause_end > pause_start:
            audio[pause_start:pause_end] *= 0.1  # Reduce volume for pause

    # Normalize to 70% of full scale
    audio = audio / np.max(np.abs(audio)) * 0.7

    # Save as WAV file
    sf.write(filename, audio, sample_rate)

    file_size = Path(filename).stat().st_size
    print(f"✅ Created {filename}")
    print(f"   Duration: {duration}s")
    print(f"   Sample rate: {sample_rate} Hz")
    print(f"   File size: {file_size:,} bytes")
    print(f"   Note: This is synthetic audio for testing")

    return filename
125
+
126
def main():
    """Generate the full set of test audio files in the working directory."""
    print("🎧 Creating test audio files for MPC Speech service")
    print("=" * 50)

    try:
        # One file per scenario: plain tone, silence, synthetic speech.
        create_test_audio("audio.wav", duration=3.0)
        create_silent_audio("silent_test.wav", duration=2.0)
        create_speech_like_audio("speech_test.wav", duration=4.0)
    except Exception as exc:
        # Best-effort script: report the failure instead of raising a traceback.
        print(f"❌ Error creating test audio files: {exc}")
        print(f"💡 Make sure you have soundfile installed: pip install soundfile")
        return

    print(f"\n✅ All test audio files created successfully!")
    print(f"📁 Files created in current directory:")
    print(f"   • audio.wav - Simple tone (for basic testing)")
    print(f"   • silent_test.wav - Silent audio (edge case testing)")
    print(f"   • speech_test.wav - Speech-like audio (more realistic)")
    print(f"\n🧪 You can now test with: python testClient.py")

if __name__ == "__main__":
    main()
generate_test_audio.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generate test audio files for testing the MPC Speech service
4
+ """
5
+
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from scipy import signal
9
+ import math
10
+
11
def generate_sine_wave(frequency=440, duration=3.0, sample_rate=16000, amplitude=0.3):
    """Return a pure sine tone as a float32 numpy array.

    frequency in Hz, duration in seconds, amplitude as linear peak level.
    """
    n_samples = int(sample_rate * duration)
    times = np.linspace(0, duration, n_samples, False)
    return (amplitude * np.sin(2 * np.pi * frequency * times)).astype(np.float32)
16
+
17
def generate_chirp(duration=3.0, sample_rate=16000, f0=200, f1=2000, amplitude=0.3):
    """Return a linear frequency sweep from f0 to f1 Hz as float32 samples."""
    times = np.linspace(0, duration, int(sample_rate * duration), False)
    sweep = signal.chirp(times, f0, duration, f1)
    return (amplitude * sweep).astype(np.float32)
22
+
23
def generate_white_noise(duration=3.0, sample_rate=16000, amplitude=0.1):
    """Return Gaussian white noise (float32).

    Note: draws from the global numpy RNG, so output varies between calls.
    """
    n_samples = int(sample_rate * duration)
    noise = np.random.normal(0, 1, n_samples) * amplitude
    return noise.astype(np.float32)
28
+
29
+ def generate_speech_like_signal(duration=5.0, sample_rate=16000):
30
+ """Generate a speech-like signal with multiple frequency components"""
31
+ t = np.linspace(0, duration, int(sample_rate * duration), False)
32
+
33
+ # Fundamental frequency (varies like speech)
34
+ f0 = 120 + 30 * np.sin(2 * np.pi * 0.5 * t) # Varying pitch
35
+
36
+ # Multiple harmonics
37
+ signal_wave = np.zeros_like(t)
38
+ for harmonic in range(1, 6):
39
+ amplitude = 0.3 / harmonic # Decreasing amplitude for higher harmonics
40
+ signal_wave += amplitude * np.sin(2 * np.pi * harmonic * f0 * t)
41
+
42
+ # Add some formant-like filtering
43
+ # Simple bandpass filtering to simulate formants
44
+ b, a = signal.butter(4, [300, 3000], btype='band', fs=sample_rate)
45
+ signal_wave = signal.filtfilt(b, a, signal_wave)
46
+
47
+ # Add envelope to make it more speech-like
48
+ envelope = np.exp(-0.5 * t) * (1 + 0.5 * np.sin(2 * np.pi * 2 * t))
49
+ signal_wave *= envelope
50
+
51
+ return signal_wave.astype(np.float32)
52
+
53
def create_test_audio_files():
    """Write the full suite of synthetic test WAV files to the current directory."""
    sample_rate = 16000  # Common sample rate for speech recognition

    # 1. Simple sine wave (440 Hz - A note)
    tone = generate_sine_wave(440, 3.0, sample_rate)
    sf.write('test_sine_440hz.wav', tone, sample_rate)
    print("Created: test_sine_440hz.wav (3 seconds, 440 Hz sine wave)")

    # 2. Frequency sweep
    sweep = generate_chirp(3.0, sample_rate)
    sf.write('test_chirp.wav', sweep, sample_rate)
    print("Created: test_chirp.wav (3 seconds, frequency sweep 200-2000 Hz)")

    # 3. White noise (for testing noise handling)
    hiss = generate_white_noise(2.0, sample_rate)
    sf.write('test_noise.wav', hiss, sample_rate)
    print("Created: test_noise.wav (2 seconds, white noise)")

    # 4. Speech-like signal
    voiced = generate_speech_like_signal(5.0, sample_rate)
    sf.write('test_speech_like.wav', voiced, sample_rate)
    print("Created: test_speech_like.wav (5 seconds, speech-like signal)")

    # 5. Mixed signal (sine + noise)
    mixed_duration = 4.0
    blend = generate_sine_wave(300, mixed_duration, sample_rate, 0.4) + \
            generate_white_noise(mixed_duration, sample_rate, 0.1)
    sf.write('test_mixed.wav', blend, sample_rate)
    print("Created: test_mixed.wav (4 seconds, 300 Hz sine + noise)")

    # 6. Multi-tone signal (C major triad)
    chord = (
        0.3 * generate_sine_wave(261.63, 3.0, sample_rate, 1.0) +  # C4
        0.2 * generate_sine_wave(329.63, 3.0, sample_rate, 1.0) +  # E4
        0.2 * generate_sine_wave(392.00, 3.0, sample_rate, 1.0)    # G4
    ) / 3
    sf.write('test_chord.wav', chord, sample_rate)
    print("Created: test_chord.wav (3 seconds, C major chord)")
93
+
94
def create_simple_test_audio():
    """Create a simple test audio file named 'audio.wav' for the examples."""
    sample_rate = 16000
    duration = 3.0

    # Five equal-length notes of an ascending C-major run
    frequencies = [261.63, 293.66, 329.63, 349.23, 392.00]  # C, D, E, F, G
    note_duration = duration / len(frequencies)

    # Seed with an empty float64 array so the concatenated result keeps the
    # same dtype the original incremental concatenation produced.
    notes = [np.array([])]
    notes.extend(generate_sine_wave(freq, note_duration, sample_rate, 0.3)
                 for freq in frequencies)
    full_signal = np.concatenate(notes)

    sf.write('audio.wav', full_signal, sample_rate)
    print("Created: audio.wav (3 seconds, simple melody for testing)")
111
+
112
if __name__ == '__main__':
    # Script entry point: writes a set of synthetic WAV files into the
    # current directory for exercising the MPC Speech service.
    print("Generating test audio files...")
    print(f"Sample rate: 16000 Hz (common for speech recognition)")
    print("-" * 50)

    # Install required package if not present.
    # NOTE(review): this guard is effectively dead code — the module already
    # does `from scipy import signal` at import time, so reaching this point
    # means scipy imported successfully; the except branch (pip install via
    # subprocess) can never run from this script.
    try:
        from scipy import signal
    except ImportError:
        print("Installing scipy for signal processing...")
        import subprocess
        subprocess.check_call(['pip', 'install', 'scipy'])
        from scipy import signal

    # Create the simple test file first
    create_simple_test_audio()
    print("-" * 50)

    # Create various test files
    create_test_audio_files()
    print("-" * 50)
    print("All test audio files created successfully!")
    print("\nYou can now use these files to test your MPC Speech service:")
    print("- audio.wav (for basic examples)")
    print("- test_*.wav files (for various test scenarios)")
mcp_speech_client.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Client library for MCP Speech-to-Text service
4
+ """
5
+
6
+ import asyncio
7
+ import aiohttp
8
+ import json
9
+ import base64
10
+ import logging
11
+ from pathlib import Path
12
+ from typing import Dict, Any, Optional
13
+
14
+ # Configure logging
15
+ logging.basicConfig(level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
+
18
class MCPSpeechClient:
    """Client for MCP Speech-to-Text service.

    Thin async wrapper over the service's HTTP API:
      * POST /auth        -> bearer token + encryption key
      * POST /transcribe  -> multipart audio upload, JSON transcription back
      * GET  /status      -> service health/info

    Intended for use as an async context manager, which owns the underlying
    aiohttp session; methods also lazily create a session when the class is
    used without the context manager.
    """

    def __init__(self, service_url: str = "http://localhost:8081"):
        # Normalize the base URL so endpoint paths can be appended directly.
        self.service_url = service_url.rstrip('/')
        self.token = None            # bearer token from /auth (None until authenticated)
        self.encryption_key = None   # key returned by /auth; stored but not used client-side
        self.client_session = None   # aiohttp.ClientSession, created lazily

    async def __aenter__(self):
        # Context-manager entry: open the HTTP session.
        self.client_session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Context-manager exit: release the HTTP session if one was opened.
        if self.client_session:
            await self.client_session.close()

    async def authenticate(self, client_id: str = "test_client") -> bool:
        """Authenticate with the MCP service.

        Stores the returned bearer token and encryption key on success.
        Returns True on success, False on any failure (logged, not raised).
        """
        try:
            if not self.client_session:
                self.client_session = aiohttp.ClientSession()

            async with self.client_session.post(
                f"{self.service_url}/auth",
                json={"client_id": client_id}
            ) as response:
                if response.status == 200:
                    data = await response.json()
                    self.token = data.get("token")
                    self.encryption_key = data.get("encryption_key")
                    logger.info("Authentication successful")
                    return True
                else:
                    error = await response.text()
                    logger.error(f"Authentication failed: {error}")
                    return False
        except Exception as e:
            logger.error(f"Authentication error: {e}")
            return False

    async def transcribe_file(self, audio_file_path: str, session_id: Optional[str] = None) -> Dict[str, Any]:
        """Transcribe an audio file.

        Authenticates on demand if no token is held yet. Returns the service's
        JSON payload on success, or a {"status": "error", ...} dict on failure.
        """
        if not self.token:
            if not await self.authenticate():
                return {"status": "error", "error": "Authentication failed"}

        try:
            if not Path(audio_file_path).exists():
                raise FileNotFoundError(f"Audio file not found: {audio_file_path}")

            headers = {
                "Authorization": f"Bearer {self.token}"
            }

            # Optional correlation id, forwarded to the service verbatim.
            if session_id:
                headers["X-Session-ID"] = session_id

            with open(audio_file_path, 'rb') as f:
                form_data = aiohttp.FormData()
                form_data.add_field('audio', f, filename=Path(audio_file_path).name)

                async with self.client_session.post(
                    f"{self.service_url}/transcribe",
                    data=form_data,
                    headers=headers
                ) as response:
                    if response.status == 200:
                        return await response.json()
                    else:
                        error_text = await response.text()
                        return {
                            "status": "error",
                            "error": f"HTTP {response.status}: {error_text}"
                        }
        except Exception as e:
            logger.error(f"Transcription error: {e}")
            return {"status": "error", "error": str(e)}

    async def get_status(self) -> Dict[str, Any]:
        """Get service status.

        Unauthenticated; returns the /status JSON body, or an error dict.
        """
        try:
            if not self.client_session:
                self.client_session = aiohttp.ClientSession()

            async with self.client_session.get(f"{self.service_url}/status") as response:
                if response.status == 200:
                    return await response.json()
                else:
                    return {
                        "status": "error",
                        "error": f"HTTP {response.status}: {await response.text()}"
                    }
        except Exception as e:
            logger.error(f"Status check error: {e}")
            return {"status": "error", "error": str(e)}
114
+
115
+ # Convenience functions for backwards compatibility
116
async def check_service_status(service_url: str = "http://localhost:8081") -> Dict[str, Any]:
    """Probe the MCP service's /status endpoint using a throwaway client."""
    async with MCPSpeechClient(service_url) as probe:
        status = await probe.get_status()
    return status
120
+
121
async def transcribe_audio_file(audio_file_path: str, service_url: str = "http://localhost:8081") -> Dict[str, Any]:
    """Transcribe a single audio file with a one-shot client session."""
    async with MCPSpeechClient(service_url) as speech_client:
        outcome = await speech_client.transcribe_file(audio_file_path, session_id="single_request")
    return outcome
mcp_speech_service.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ MCP Speech-to-Text Service
4
+ Exposes realtime-whisper-macbook service through a secure MCP interface
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import logging
10
+ import os
11
+ import subprocess
12
+ import tempfile
13
+ import time
14
+ from datetime import datetime
15
+ from pathlib import Path
16
+ from typing import Dict, List, Optional, Any
17
+ import hashlib
18
+ import hmac
19
+ import base64
20
+
21
+ import aiohttp
22
+ from aiohttp import web, WSMsgType
23
+ import aiofiles
24
+ import numpy as np
25
+ import soundfile as sf
26
+ from cryptography.fernet import Fernet
27
+
28
+ # Configure logging
29
+ logging.basicConfig(level=logging.INFO)
30
+ logger = logging.getLogger(__name__)
31
+
32
class MCPSpeechService:
    """MCP Service for Speech-to-Text processing.

    Fronts a whisper HTTP service with:
      * POST /auth        -- HMAC-signed bearer-token issuance
      * POST /transcribe  -- authenticated multipart audio upload
      * GET  /ws          -- authenticated WebSocket for streaming chunks
      * GET  /status      -- unauthenticated health endpoint

    The Fernet key doubles as the HMAC secret for token signing and as the
    optional response-encryption key.
    """

    def __init__(self, whisper_service_url: str = "http://localhost:8000",
                 encryption_key: Optional[bytes] = None):
        self.whisper_service_url = whisper_service_url
        # Generate a fresh key per process unless the caller supplies one.
        self.encryption_key = encryption_key or Fernet.generate_key()
        self.cipher = Fernet(self.encryption_key)
        self.active_sessions: Dict[str, Dict] = {}  # session_id -> websocket session info
        # NOTE(review): issued tokens are recorded here but never read back or
        # pruned (verification is stateless), so this dict grows unbounded.
        self.auth_tokens: Dict[str, Dict] = {}

    def generate_session_id(self) -> str:
        """Generate a unique session ID (first 16 hex chars of a SHA-256 digest)."""
        return hashlib.sha256(f"{time.time()}{os.urandom(16)}".encode()).hexdigest()[:16]

    def generate_auth_token(self, client_id: str) -> str:
        """Generate JWT-like auth token for client.

        Token format: base64(json payload) + "." + hex HMAC-SHA256 signature,
        keyed with the service's encryption key.
        """
        payload = {
            "client_id": client_id,
            "issued_at": int(time.time()),
            "expires_at": int(time.time()) + 3600  # 1 hour expiry
        }
        token_data = json.dumps(payload).encode()
        signature = hmac.new(self.encryption_key, token_data, hashlib.sha256).hexdigest()
        token = base64.b64encode(token_data).decode() + "." + signature
        self.auth_tokens[token] = payload
        return token

    def verify_auth_token(self, token: str) -> Optional[Dict]:
        """Verify and decode auth token.

        Returns the payload dict when the HMAC signature matches and the token
        has not expired; otherwise None.
        """
        try:
            if "." not in token:
                return None

            token_b64, signature = token.rsplit(".", 1)
            token_data = base64.b64decode(token_b64.encode())

            # Verify signature (constant-time compare to avoid timing leaks)
            expected_sig = hmac.new(self.encryption_key, token_data, hashlib.sha256).hexdigest()
            if not hmac.compare_digest(signature, expected_sig):
                return None

            payload = json.loads(token_data.decode())

            # Check expiry
            if payload.get("expires_at", 0) < int(time.time()):
                return None

            return payload
        except Exception as e:
            logger.error(f"Token verification failed: {e}")
            return None

    async def encrypt_data(self, data: bytes) -> bytes:
        """Encrypt sensitive data with the service's Fernet key."""
        return self.cipher.encrypt(data)

    async def decrypt_data(self, encrypted_data: bytes) -> bytes:
        """Decrypt sensitive data with the service's Fernet key."""
        return self.cipher.decrypt(encrypted_data)

    async def process_audio_chunk(self, audio_data: bytes, session_id: str,
                                  format: str = "wav") -> Dict[str, Any]:
        """Process audio chunk through whisper service.

        Writes the raw bytes to a temp file, forwards it to the whisper
        service's /transcribe endpoint, and wraps the transcription (or error)
        in a dict carrying session_id / timestamp / status.
        """
        try:
            # Create temporary file for audio data
            with tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False) as temp_file:
                temp_file.write(audio_data)
                temp_file_path = temp_file.name

            try:
                # Call the whisper service
                async with aiohttp.ClientSession() as session:
                    with open(temp_file_path, 'rb') as f:
                        form_data = aiohttp.FormData()
                        form_data.add_field('audio', f, filename=f'audio.{format}')

                        async with session.post(
                            f"{self.whisper_service_url}/transcribe",
                            data=form_data
                        ) as response:
                            if response.status == 200:
                                result = await response.json()
                                return {
                                    "session_id": session_id,
                                    "timestamp": datetime.utcnow().isoformat(),
                                    "transcription": result,
                                    "status": "success"
                                }
                            else:
                                error_text = await response.text()
                                logger.error(f"Whisper service error: {error_text}")
                                return {
                                    "session_id": session_id,
                                    "timestamp": datetime.utcnow().isoformat(),
                                    "error": f"Service error: {response.status}",
                                    "status": "error"
                                }
            finally:
                # Clean up temporary file
                os.unlink(temp_file_path)

        except Exception as e:
            logger.error(f"Audio processing error: {e}")
            return {
                "session_id": session_id,
                "timestamp": datetime.utcnow().isoformat(),
                "error": str(e),
                "status": "error"
            }

    async def handle_auth(self, request):
        """Handle authentication endpoint.

        Expects JSON {"client_id": ...}; returns a bearer token plus the
        base64 encryption key (which also lets clients decrypt responses).
        """
        try:
            data = await request.json()
            client_id = data.get('client_id')

            if not client_id:
                return web.Response(
                    text=json.dumps({"error": "client_id required"}),
                    status=400,
                    content_type='application/json'
                )

            token = self.generate_auth_token(client_id)

            return web.Response(
                text=json.dumps({
                    "token": token,
                    "encryption_key": base64.b64encode(self.encryption_key).decode(),
                    "expires_in": 3600
                }),
                content_type='application/json'
            )

        except Exception as e:
            logger.error(f"Auth error: {e}")
            return web.Response(
                text=json.dumps({"error": "Authentication failed"}),
                status=500,
                content_type='application/json'
            )

    async def handle_transcribe(self, request):
        """Handle transcription endpoint.

        Requires a Bearer token; accepts a multipart upload with an 'audio'
        field. Optionally Fernet-encrypts the JSON response when the client
        sends X-Encrypt-Response: true.
        """
        try:
            # Verify authentication
            auth_header = request.headers.get('Authorization', '')
            if not auth_header.startswith('Bearer '):
                return web.Response(
                    text=json.dumps({"error": "Missing or invalid authorization"}),
                    status=401,
                    content_type='application/json'
                )

            token = auth_header[7:]  # Remove 'Bearer ' prefix
            auth_payload = self.verify_auth_token(token)
            if not auth_payload:
                return web.Response(
                    text=json.dumps({"error": "Invalid or expired token"}),
                    status=401,
                    content_type='application/json'
                )

            # Get session ID (client-supplied or freshly generated)
            session_id = request.headers.get('X-Session-ID')
            if not session_id:
                session_id = self.generate_session_id()

            # Handle multipart form data (audio file)
            reader = await request.multipart()
            audio_data = None

            async for part in reader:
                if part.name == 'audio':
                    audio_data = await part.read()
                    break

            if not audio_data:
                return web.Response(
                    text=json.dumps({"error": "No audio data provided"}),
                    status=400,
                    content_type='application/json'
                )

            # Process audio
            result = await self.process_audio_chunk(audio_data, session_id)

            # Encrypt sensitive data if requested
            if request.headers.get('X-Encrypt-Response') == 'true':
                encrypted_result = await self.encrypt_data(json.dumps(result).encode())
                return web.Response(
                    body=encrypted_result,
                    content_type='application/octet-stream',
                    headers={'X-Encrypted': 'true'}
                )

            return web.Response(
                text=json.dumps(result),
                content_type='application/json'
            )

        except Exception as e:
            logger.error(f"Transcription error: {e}")
            return web.Response(
                text=json.dumps({"error": "Transcription failed"}),
                status=500,
                content_type='application/json'
            )

    async def handle_websocket(self, request):
        """Handle WebSocket connections for real-time transcription.

        Protocol: the client first sends {"type": "auth", "token": ...};
        after auth_success it may send JSON audio_chunk messages (base64) or
        raw binary frames, each answered with a transcription_result message.

        BUG FIX: aiohttp's WebSocketResponse exposes send_str(), not
        send_text() (that name belongs to Starlette/FastAPI); every reply in
        the original raised AttributeError at runtime.
        """
        ws = web.WebSocketResponse()
        await ws.prepare(request)

        session_id = None
        auth_payload = None

        try:
            async for msg in ws:
                if msg.type == WSMsgType.TEXT:
                    try:
                        data = json.loads(msg.data)

                        if data.get('type') == 'auth':
                            # Handle authentication
                            token = data.get('token')
                            auth_payload = self.verify_auth_token(token)
                            if auth_payload:
                                session_id = self.generate_session_id()
                                self.active_sessions[session_id] = {
                                    'client_id': auth_payload['client_id'],
                                    'connected_at': time.time(),
                                    'ws': ws
                                }
                                await ws.send_str(json.dumps({
                                    'type': 'auth_success',
                                    'session_id': session_id
                                }))
                            else:
                                await ws.send_str(json.dumps({
                                    'type': 'auth_error',
                                    'message': 'Invalid token'
                                }))

                        elif data.get('type') == 'audio_chunk':
                            # Handle audio chunk (base64 payload)
                            if not auth_payload or not session_id:
                                await ws.send_str(json.dumps({
                                    'type': 'error',
                                    'message': 'Not authenticated'
                                }))
                                continue

                            # Decode base64 audio data
                            audio_b64 = data.get('audio_data', '')
                            audio_data = base64.b64decode(audio_b64)

                            # Process audio
                            result = await self.process_audio_chunk(audio_data, session_id)

                            await ws.send_str(json.dumps({
                                'type': 'transcription_result',
                                'data': result
                            }))

                    except json.JSONDecodeError:
                        await ws.send_str(json.dumps({
                            'type': 'error',
                            'message': 'Invalid JSON'
                        }))

                elif msg.type == WSMsgType.BINARY:
                    # Handle binary audio data
                    if not auth_payload or not session_id:
                        await ws.send_str(json.dumps({
                            'type': 'error',
                            'message': 'Not authenticated'
                        }))
                        continue

                    # Process binary audio data
                    result = await self.process_audio_chunk(msg.data, session_id)

                    await ws.send_str(json.dumps({
                        'type': 'transcription_result',
                        'data': result
                    }))

                elif msg.type == WSMsgType.ERROR:
                    logger.error(f'WebSocket error: {ws.exception()}')
                    break

        except Exception as e:
            logger.error(f"WebSocket error: {e}")
        finally:
            # Clean up session
            if session_id and session_id in self.active_sessions:
                del self.active_sessions[session_id]

        return ws

    async def handle_status(self, request):
        """Handle status endpoint (unauthenticated health/info snapshot)."""
        return web.Response(
            text=json.dumps({
                "service": "MCP Speech-to-Text",
                "status": "running",
                "active_sessions": len(self.active_sessions),
                "whisper_service": self.whisper_service_url,
                "timestamp": datetime.utcnow().isoformat()
            }),
            content_type='application/json'
        )

    def create_app(self) -> web.Application:
        """Create the web application with routes and CORS middleware."""
        app = web.Application()

        # Add routes
        app.router.add_post('/auth', self.handle_auth)
        app.router.add_post('/transcribe', self.handle_transcribe)
        app.router.add_get('/ws', self.handle_websocket)
        app.router.add_get('/status', self.handle_status)

        # CORS middleware: answers preflight OPTIONS directly and stamps
        # permissive CORS headers on every response.
        @web.middleware
        async def cors_middleware(request, handler):
            if request.method == 'OPTIONS':
                # Handle preflight requests
                response = web.Response()
            else:
                response = await handler(request)

            response.headers['Access-Control-Allow-Origin'] = '*'
            response.headers['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS'
            response.headers['Access-Control-Allow-Headers'] = 'Content-Type, Authorization, X-Session-ID, X-Encrypt-Response'
            return response

        app.middlewares.append(cors_middleware)

        return app
374
+
375
async def main():
    """Main entry point for the MCP service.

    Reads host/port and the downstream whisper URL from the environment,
    starts the aiohttp server, then idles until interrupted.
    """
    # Configuration from environment (with local-dev defaults)
    host = os.getenv('MCP_HOST', '0.0.0.0')
    port = int(os.getenv('MCP_PORT', '8081'))
    whisper_url = os.getenv('WHISPER_SERVICE_URL', 'http://localhost:8000')

    # Create service and its web application
    speech_service = MCPSpeechService(whisper_service_url=whisper_url)
    web_app = speech_service.create_app()

    logger.info(f"Starting MCP Speech-to-Text service on {host}:{port}")
    logger.info(f"Whisper service URL: {whisper_url}")
    # NOTE(review): this logs the auth/crypto secret in plaintext — consider
    # removing in production.
    logger.info(f"Encryption key: {base64.b64encode(speech_service.encryption_key).decode()}")

    # Run the service
    app_runner = web.AppRunner(web_app)
    await app_runner.setup()
    await web.TCPSite(app_runner, host, port).start()

    logger.info("MCP Service is running...")

    try:
        # Keep the process alive until interrupted
        while True:
            await asyncio.sleep(1)
    except KeyboardInterrupt:
        logger.info("Shutting down MCP service...")
    finally:
        await app_runner.cleanup()

if __name__ == '__main__':
    asyncio.run(main())
requirements.txt CHANGED
@@ -1,3 +1,8 @@
1
- fastapi
2
- uvicorn[standard]
3
-
 
 
 
 
 
 
1
+ aiohttp==3.8.5
2
+ aiofiles==23.2.0
3
+ numpy==1.24.3
4
+ soundfile==0.12.1
5
+ cryptography==41.0.3
6
+ openai-whisper==20231117
7
+ torch>=1.13.0
8
+ torchaudio>=0.13.0
startup_script.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Startup script to run both Whisper HTTP service and MCP service
4
+ """
5
+
6
+ import asyncio
7
+ import subprocess
8
+ import sys
9
+ import time
10
+ import signal
11
+ import os
12
+ from pathlib import Path
13
+
14
class ServiceManager:
    """Launch and supervise the Whisper HTTP service and the MCP speech
    service as child processes, restarting both if a health check fails.
    """

    def __init__(self):
        # Popen handles for the two child services; None until started.
        self.whisper_process = None
        self.mcp_process = None
        # True while the monitor loop in run() should keep going.
        self.running = False

    async def start_whisper_service(self):
        """Start the Whisper HTTP service as a subprocess.

        Returns:
            bool: True if the process is still alive ~3s after launch.
        """
        print("🔄 Starting Whisper HTTP service...")

        # The wrapper script must live in the current working directory.
        if not Path("whisper_http_wrapper.py").exists():
            print("❌ whisper_http_wrapper.py not found in current directory")
            return False

        try:
            self.whisper_process = subprocess.Popen([
                sys.executable, "whisper_http_wrapper.py"
            ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

            # Give the service a moment to either bind its port or crash.
            await asyncio.sleep(3)

            if self.whisper_process.poll() is None:
                print("✅ Whisper HTTP service started successfully")
                return True
            else:
                stdout, stderr = self.whisper_process.communicate()
                print(f"❌ Whisper HTTP service failed to start:")
                print(f"STDOUT: {stdout}")
                print(f"STDERR: {stderr}")
                return False

        except Exception as e:
            print(f"❌ Error starting Whisper HTTP service: {e}")
            return False

    async def start_mcp_service(self):
        """Start the MCP speech service as a subprocess.

        Returns:
            bool: True if the process is still alive ~2s after launch.
        """
        print("🔄 Starting MCP Speech service...")

        if not Path("mcp_speech_service.py").exists():
            print("❌ mcp_speech_service.py not found in current directory")
            return False

        try:
            self.mcp_process = subprocess.Popen([
                sys.executable, "mcp_speech_service.py"
            ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

            # Give the service a moment to either bind its port or crash.
            await asyncio.sleep(2)

            if self.mcp_process.poll() is None:
                print("✅ MCP Speech service started successfully")
                return True
            else:
                # FIX: was `self.mc_process` — an AttributeError on this
                # failure path masked the real startup error.
                stdout, stderr = self.mcp_process.communicate()
                print(f"❌ MCP Speech service failed to start:")
                print(f"STDOUT: {stdout}")
                print(f"STDERR: {stderr}")
                return False

        except Exception as e:
            print(f"❌ Error starting MCP Speech service: {e}")
            return False

    async def check_services(self):
        """Probe both services' HTTP health endpoints.

        Returns:
            tuple[bool, bool]: (whisper_ok, mcp_ok); False on any error.
        """
        import aiohttp

        # Check Whisper service
        whisper_ok = False
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get("http://localhost:8000/health", timeout=5) as response:
                    if response.status == 200:
                        whisper_ok = True
        except Exception:
            # Connection refused / timeout simply means "not healthy".
            pass

        # Check MCP service
        mcp_ok = False
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get("http://localhost:8081/status", timeout=5) as response:
                    if response.status == 200:
                        mcp_ok = True
        except Exception:
            pass

        return whisper_ok, mcp_ok

    def stop_services(self):
        """Terminate both child processes, escalating to kill after 5s."""
        print("\n🛑 Stopping services...")

        if self.mcp_process and self.mcp_process.poll() is None:
            print("   Stopping MCP service...")
            self.mcp_process.terminate()
            try:
                self.mcp_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                self.mcp_process.kill()

        if self.whisper_process and self.whisper_process.poll() is None:
            print("   Stopping Whisper service...")
            self.whisper_process.terminate()
            try:
                self.whisper_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                self.whisper_process.kill()

        print("✅ Services stopped")

    def signal_handler(self, signum, frame):
        """Handle SIGINT/SIGTERM: stop children and exit the process."""
        print(f"\n📡 Received signal {signum}")
        self.running = False
        self.stop_services()
        sys.exit(0)

    async def run(self):
        """Start both services, then monitor and restart them until interrupted."""
        # Set up signal handlers
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

        print("🚀 Starting MCP Speech-to-Text Services")
        print("=" * 50)

        # Start Whisper service first — the MCP service depends on it.
        if not await self.start_whisper_service():
            print("❌ Failed to start Whisper service. Exiting.")
            return

        # Start MCP service
        if not await self.start_mcp_service():
            print("❌ Failed to start MCP service. Stopping Whisper service.")
            self.stop_services()
            return

        print("\n✅ Both services are running!")
        print("📋 Service URLs:")
        print("   • Whisper HTTP: http://localhost:8000")
        print("   • MCP Service: http://localhost:8081")
        print("\n🧪 You can now run: python testClient.py")
        print("⏹️  Press Ctrl+C to stop all services")

        self.running = True

        # Monitor services
        try:
            while self.running:
                await asyncio.sleep(10)  # Check every 10 seconds

                whisper_ok, mcp_ok = await self.check_services()

                if not whisper_ok:
                    print("⚠️  Whisper service appears to be down")
                if not mcp_ok:
                    print("⚠️  MCP service appears to be down")

                if not whisper_ok or not mcp_ok:
                    print("🔄 Attempting to restart services...")
                    # FIX: stop_services() terminates BOTH children, so both
                    # must be restarted; previously only the unhealthy one
                    # was relaunched, leaving the other dead.
                    self.stop_services()
                    await asyncio.sleep(2)
                    await self.start_whisper_service()
                    await self.start_mcp_service()

        except KeyboardInterrupt:
            pass
        finally:
            self.stop_services()
192
+
193
async def main():
    """Entry coroutine: show usage for --help, otherwise run the ServiceManager."""
    if len(sys.argv) > 1 and sys.argv[1] == "--help":
        print("MCP Speech-to-Text Service Manager")
        # FIX: usage previously said "startup.py", but this file is
        # startup_script.py (see the repository file list).
        print("Usage: python startup_script.py")
        print("\nThis script will:")
        print("1. Start the Whisper HTTP service on port 8000")
        print("2. Start the MCP Speech service on port 8081")
        print("3. Monitor both services and restart if needed")
        print("4. Stop all services when you press Ctrl+C")
        return

    manager = ServiceManager()
    await manager.run()
207
+
208
# Script entry point: run the async service manager; a Ctrl+C during startup
# (before the manager's own signal handlers are installed) is caught here.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
testClient.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test client for MPC Speech-to-Text service
4
+ """
5
+
6
+ import asyncio
7
+ import sys
8
+ from pathlib import Path
9
+ from mcp_speech_client import transcribe_audio_file, check_service_status
10
+
11
async def main():
    """End-to-end smoke test: check service health, then transcribe one local audio file."""
    # NOTE: "MPC" in the original strings was a typo for "MCP" (the files are
    # mcp_speech_client.py / mcp_speech_service.py); corrected throughout.
    print("🧪 Testing MCP Speech-to-Text Client")
    print("=" * 50)

    # First, check if the services are running
    print("1️⃣ Checking service status...")
    status = await check_service_status()
    print(f"   Service status: {status.get('status', 'unknown')}")

    if status.get("status") == "error":
        print("\n⚠️  Service appears to be down!")
        print("💡 To start the services, run:")
        # FIX: referenced nonexistent "startup.py"; the launcher is startup_script.py.
        print("   python startup_script.py")
        print("\n   Or start them manually:")
        print("   1. Start Whisper HTTP service: python whisper_http_wrapper.py")
        # FIX: referenced nonexistent "mpc_speech_service.py".
        print("   2. Start MCP service: python mcp_speech_service.py")
        return

    print("✅ Services are running!")
    print(f"   Active sessions: {status.get('active_sessions', 0)}")

    print("\n2️⃣ Testing audio file transcription...")

    # Look for available audio files
    test_files = [
        "audio.wav",
        "speech_test.wav",
        "test_audio.wav",
        "silent_test.wav"
    ]

    # Use the first candidate that actually exists on disk.
    audio_file = None
    for file in test_files:
        if Path(file).exists():
            audio_file = file
            break

    if not audio_file:
        print("❌ No test audio files found!")
        print("💡 Create test audio files by running:")
        print("   python create_test_audio.py")
        print("\n   Or provide your own audio file and update the test_files list.")
        return

    print(f"🎵 Using audio file: {audio_file}")

    try:
        # Test transcription
        print("   Sending transcription request...")
        result = await transcribe_audio_file(audio_file)

        print(f"\n📋 Transcription result:")
        print(f"   Status: {result.get('status')}")
        print(f"   Session ID: {result.get('session_id')}")

        if result.get("status") == "success":
            transcription = result.get("transcription", {})

            if isinstance(transcription, dict):
                # The service nests the Whisper output under "result".
                text = transcription.get("result", {}).get("text", "No text found")
                processing_time = transcription.get("result", {}).get("processing_time", 0)
                model_info = transcription.get("result", {}).get("model_info", {})

                print(f"✅ Transcription successful!")
                print(f"   Text: '{text}'")
                print(f"   Processing time: {processing_time:.2f}s")
                print(f"   Model: {model_info.get('model', 'unknown')}")
                print(f"   Device: {model_info.get('device', 'unknown')}")

                # If it's synthetic audio, explain the result
                if "test" in audio_file.lower() or "speech" in audio_file.lower():
                    print(f"\n💡 Note: This was synthetic test audio, so the transcription")
                    print(f"   result may not be meaningful. Try with real speech audio")
                    print(f"   for better results.")
            else:
                print(f"   Raw transcription: {transcription}")

        else:
            error_msg = result.get('error', 'Unknown error')
            print(f"❌ Transcription failed: {error_msg}")

            # Provide specific help for common errors
            if "Cannot connect" in error_msg and "8000" in error_msg:
                print("\n💡 The Whisper HTTP service (port 8000) is not running.")
                print("   Start it with: python whisper_http_wrapper.py")
            elif "Authentication" in error_msg:
                print("\n💡 Authentication issue with the MCP service.")
                print("   Check if the MCP service is running properly.")

    except FileNotFoundError:
        print(f"❌ Audio file '{audio_file}' not found.")
    except Exception as e:
        print(f"❌ Unexpected error during transcription: {e}")

    print(f"\n🏁 Test completed!")
107
+
108
def check_dependencies():
    """Check if required third-party dependencies can be imported.

    Returns:
        bool: True when both aiohttp and soundfile are importable,
        False otherwise (after printing install instructions).
    """
    missing_deps = []

    # Probe each dependency by attempting an import.
    for dep_name in ("aiohttp", "soundfile"):
        try:
            __import__(dep_name)
        except ImportError:
            missing_deps.append(dep_name)

    if missing_deps:
        print("❌ Missing dependencies:")
        for dep in missing_deps:
            print(f"   • {dep}")
        print(f"\n💡 Install them with: pip install {' '.join(missing_deps)}")
        return False

    return True
130
+
131
# Script entry point: verify third-party deps are installed before running
# the async smoke test; exit non-zero on missing deps or test failure.
if __name__ == "__main__":
    if not check_dependencies():
        sys.exit(1)

    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n👋 Test interrupted by user")
    except Exception as e:
        print(f"\n❌ Test failed with error: {e}")
        sys.exit(1)
whisper_http_wrapper.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ HTTP API Wrapper for realtime-whisper-macbook
4
+ This adds HTTP endpoints to the existing whisper functionality
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import logging
10
+ import os
11
+ import tempfile
12
+ import threading
13
+ import time
14
+ from pathlib import Path
15
+ from typing import Dict, Any, Optional
16
+
17
+ import numpy as np
18
+ import soundfile as sf
19
+ import torch
20
+ import whisper
21
+ from aiohttp import web
22
+ import aiofiles
23
+
24
+ # Configure logging
25
+ logging.basicConfig(level=logging.INFO)
26
+ logger = logging.getLogger(__name__)
27
+
28
class WhisperHTTPService:
    """HTTP wrapper for Whisper transcription service.

    Exposes POST /transcribe, GET /health and GET /models over aiohttp,
    delegating the actual speech-to-text work to openai-whisper.
    """

    def __init__(self, model_name: str = "base", device: str = "auto"):
        """
        Initialize the Whisper HTTP service

        Args:
            model_name: Whisper model to use (tiny, base, small, medium, large)
            device: Device to run on (cpu, cuda, mps, auto)
        """
        self.model_name = model_name

        # Auto-detect device if not specified
        if device == "auto":
            if torch.cuda.is_available():
                self.device = "cuda"
            # MPS (Apple Silicon) intentionally disabled by the author:
            #elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
            #    self.device = "mps"
            else:
                self.device = "cpu"
        else:
            self.device = device

        logger.info(f"Using device: {self.device}")

        # Load Whisper model (downloads weights on first use)
        logger.info(f"Loading Whisper model: {model_name}")
        self.model = whisper.load_model(model_name, device=self.device)
        logger.info("Whisper model loaded successfully")

        # Aggregate counters exposed by /health.
        self.stats = {
            "requests_processed": 0,
            "total_audio_duration": 0.0,
            "average_processing_time": 0.0,
            "start_time": time.time()
        }

    def transcribe_audio_file(self, audio_file_path: str, **kwargs) -> Dict[str, Any]:
        """
        Transcribe audio file using Whisper (blocking — run off the event loop).

        Args:
            audio_file_path: Path to audio file
            **kwargs: Additional Whisper parameters

        Returns:
            Transcription result dictionary with "success", "result"/"error",
            "processing_time" and "model_info" keys.
        """
        try:
            start_time = time.time()

            # Default Whisper options; any kwarg overrides its default.
            options = {
                "language": kwargs.get("language"),  # None for auto-detection
                "task": kwargs.get("task", "transcribe"),  # transcribe or translate
                "temperature": kwargs.get("temperature", 0.0),
                "best_of": kwargs.get("best_of", 5),
                "beam_size": kwargs.get("beam_size", 5),
                "patience": kwargs.get("patience", 1.0),
                "length_penalty": kwargs.get("length_penalty", 1.0),
                "suppress_tokens": kwargs.get("suppress_tokens", "-1"),
                "initial_prompt": kwargs.get("initial_prompt"),
                "condition_on_previous_text": kwargs.get("condition_on_previous_text", True),
                # fp16 only makes sense on CUDA; CPU inference requires fp32.
                "fp16": kwargs.get("fp16", True if self.device == "cuda" else False),
                "compression_ratio_threshold": kwargs.get("compression_ratio_threshold", 2.4),
                "logprob_threshold": kwargs.get("logprob_threshold", -1.0),
                "no_speech_threshold": kwargs.get("no_speech_threshold", 0.6),
            }

            # Remove None values so Whisper applies its own defaults.
            options = {k: v for k, v in options.items() if v is not None}

            # Transcribe
            result = self.model.transcribe(audio_file_path, **options)

            processing_time = time.time() - start_time

            # Update statistics
            self.stats["requests_processed"] += 1
            if "segments" in result:
                # Audio duration approximated by the end of the last segment.
                audio_duration = max([seg["end"] for seg in result["segments"]], default=0)
                self.stats["total_audio_duration"] += audio_duration

            # Running average over all processed requests.
            total_requests = self.stats["requests_processed"]
            self.stats["average_processing_time"] = (
                (self.stats["average_processing_time"] * (total_requests - 1) + processing_time) / total_requests
            )

            # Add metadata
            result["processing_time"] = processing_time
            result["model"] = self.model_name
            result["device"] = self.device

            logger.info(f"Transcribed audio in {processing_time:.2f}s: '{result['text'][:100]}...'")

            return {
                "success": True,
                "result": result,
                "processing_time": processing_time,
                "model_info": {
                    "model": self.model_name,
                    "device": self.device
                }
            }

        except Exception as e:
            logger.error(f"Transcription error: {e}")
            return {
                "success": False,
                "error": str(e),
                "model_info": {
                    "model": self.model_name,
                    "device": self.device
                }
            }

    async def handle_transcribe(self, request):
        """Handle transcription HTTP requests (multipart: 'audio' + options)."""
        try:
            # Handle multipart form data
            reader = await request.multipart()
            audio_data = None
            options = {}

            async for part in reader:
                if part.name == 'audio':
                    audio_data = await part.read()
                elif part.name == 'options':
                    # A JSON blob of Whisper options; ignored if malformed.
                    options_text = await part.text()
                    try:
                        options = json.loads(options_text)
                    except json.JSONDecodeError:
                        pass
                elif part.name in ['language', 'task', 'temperature', 'beam_size']:
                    # Handle individual parameters
                    options[part.name] = await part.text()

            if not audio_data:
                return web.Response(
                    text=json.dumps({"error": "No audio data provided"}),
                    status=400,
                    content_type='application/json'
                )

            # Save audio data to temporary file
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                temp_file.write(audio_data)
                temp_file_path = temp_file.name

            try:
                # Convert form-text options to the types Whisper expects.
                if 'temperature' in options:
                    options['temperature'] = float(options['temperature'])
                if 'beam_size' in options:
                    options['beam_size'] = int(options['beam_size'])

                # FIX: model.transcribe() is CPU/GPU-bound and previously ran
                # directly in the async handler, blocking the event loop (and
                # every other request) for the whole transcription. Run it in
                # the default executor instead.
                loop = asyncio.get_event_loop()
                result = await loop.run_in_executor(
                    None, lambda: self.transcribe_audio_file(temp_file_path, **options)
                )

                return web.Response(
                    text=json.dumps(result),
                    content_type='application/json'
                )

            finally:
                # Clean up temporary file; ignore races/permission errors only.
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    pass

        except Exception as e:
            logger.error(f"Request handling error: {e}")
            return web.Response(
                text=json.dumps({"error": f"Request processing failed: {str(e)}"}),
                status=500,
                content_type='application/json'
            )

    async def handle_health(self, request):
        """Health check endpoint: model/device info plus request statistics."""
        uptime = time.time() - self.stats["start_time"]

        health_info = {
            "status": "healthy",
            "model": self.model_name,
            "device": self.device,
            "uptime_seconds": uptime,
            "statistics": self.stats.copy()
        }

        return web.Response(
            text=json.dumps(health_info),
            content_type='application/json'
        )

    async def handle_models(self, request):
        """List available Whisper model names and the currently loaded one."""
        available_models = ["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"]

        return web.Response(
            text=json.dumps({
                "available_models": available_models,
                "current_model": self.model_name,
                "device": self.device
            }),
            content_type='application/json'
        )

    def create_app(self) -> web.Application:
        """Create the web application with routes and CORS middleware."""
        app = web.Application(client_max_size=50*1024*1024)  # 50MB max file size

        # Add routes
        app.router.add_post('/transcribe', self.handle_transcribe)
        app.router.add_get('/health', self.handle_health)
        app.router.add_get('/models', self.handle_models)

        # Add CORS middleware.
        # FIX: the decorator was missing; without @web.middleware aiohttp
        # treats the coroutine as a deprecated old-style middleware factory
        # (the MCP service's create_app already uses the decorator).
        @web.middleware
        async def cors_middleware(request, handler):
            if request.method == 'OPTIONS':
                # Handle preflight requests
                response = web.Response()
            else:
                response = await handler(request)

            response.headers['Access-Control-Allow-Origin'] = '*'
            response.headers['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS'
            response.headers['Access-Control-Allow-Headers'] = 'Content-Type, Authorization'
            return response

        app.middlewares.append(cors_middleware)

        return app
265
+
266
async def main():
    """Main function to run the Whisper HTTP service"""
    # Pull configuration from environment variables, defaulting to localhost.
    host = os.getenv('WHISPER_HOST', '127.0.0.1')
    port = int(os.getenv('WHISPER_PORT', '8000'))
    model_name = os.getenv('WHISPER_MODEL', 'base')
    device = os.getenv('WHISPER_DEVICE', 'auto')

    # Create service (loads the Whisper model up front).
    logger.info("Initializing Whisper HTTP service...")
    service = WhisperHTTPService(model_name=model_name, device=device)
    app = service.create_app()

    logger.info(f"Starting Whisper HTTP service on {host}:{port}")
    logger.info(f"Model: {model_name}, Device: {device}")

    # Bring the HTTP server up.
    runner = web.AppRunner(app)
    await runner.setup()
    await web.TCPSite(runner, host, port).start()

    logger.info("Whisper HTTP service is running!")
    logger.info(f"Endpoints available:")
    logger.info(f"  POST http://{host}:{port}/transcribe - Transcribe audio")
    logger.info(f"  GET  http://{host}:{port}/health - Health check")
    logger.info(f"  GET  http://{host}:{port}/models - List models")

    try:
        # Idle loop keeps the process alive while aiohttp serves requests.
        while True:
            await asyncio.sleep(1)
    except KeyboardInterrupt:
        logger.info("Shutting down Whisper HTTP service...")
    finally:
        await runner.cleanup()
302
+
303
# Script entry point: run the Whisper HTTP service's async main loop.
if __name__ == '__main__':
    asyncio.run(main())