Mike W committed on
Commit
9cbfee0
·
1 Parent(s): f46c356

Fix: Initial runtime errors with integration

Browse files
Files changed (4) hide show
  1. index.html +25 -22
  2. requirements.txt +0 -1
  3. server.py +1 -1
  4. working.py +0 -334
index.html CHANGED
@@ -18,11 +18,11 @@
18
  <button id="stopButton" disabled>Stop Translation</button>
19
  </div>
20
  <p id="status">Status: Not connected</p>
21
-
22
  <script>
23
  const startButton = document.getElementById('startButton');
24
  const stopButton = document.getElementById('stopButton');
25
- const statusDiv = document.getElementById('status');
26
  let socket;
27
  let mediaRecorder;
28
  let audioContext;
@@ -31,10 +31,12 @@
31
 
32
  const connectWebSocket = () => {
33
  const proto = window.location.protocol === "https:" ? "wss:" : "ws:";
34
- socket = new WebSocket(`${proto}//${window.location.host}/ws`);
 
 
35
 
36
  socket.onopen = () => {
37
- statusDiv.textContent = 'Status: Connected. Press Start.';
38
  startButton.disabled = false;
39
  };
40
 
@@ -43,7 +45,9 @@
43
  const reader = new FileReader();
44
  reader.onload = function() {
45
  const arrayBuffer = this.result;
46
- audioContext.decodeAudioData(arrayBuffer, (buffer) => {
 
 
47
  audioQueue.push(buffer);
48
  if (!isPlaying) {
49
  playNextInQueue();
@@ -51,13 +55,18 @@
51
  });
52
  };
53
  reader.readAsArrayBuffer(event.data);
 
 
 
 
 
54
  }
55
  };
56
 
57
  socket.onclose = () => {
58
  statusDiv.textContent = 'Status: Disconnected';
59
- startButton.disabled = true;
60
- stopButton.disabled = true;
61
  };
62
 
63
  socket.onerror = (error) => {
@@ -81,15 +90,11 @@
81
  }
82
  };
83
 
84
-
85
  startButton.onclick = async () => {
86
- if (!socket || socket.readyState !== WebSocket.OPEN) {
87
- connectWebSocket();
88
- }
89
-
90
- audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 });
91
-
92
- if (audioContext.state === 'suspended') {
93
  await audioContext.resume();
94
  }
95
 
@@ -117,19 +122,17 @@
117
  if (mediaRecorder) {
118
  mediaRecorder.stop();
119
  }
120
- if (socket && socket.readyState === WebSocket.OPEN) {
121
- socket.send(JSON.stringify({type: "stop"}));
122
- socket.close();
123
- }
124
  startButton.disabled = false;
125
  stopButton.disabled = true;
126
- statusDiv.textContent = 'Status: Stopped. Re-connect to start again.';
127
  };
128
 
129
  window.onload = () => {
130
- startButton.disabled = false;
131
  stopButton.disabled = true;
132
- statusDiv.textContent = 'Status: Ready to connect.';
133
  };
134
 
135
  </script>
 
18
  <button id="stopButton" disabled>Stop Translation</button>
19
  </div>
20
  <p id="status">Status: Not connected</p>
21
+ <div id="log"></div>
22
  <script>
23
  const startButton = document.getElementById('startButton');
24
  const stopButton = document.getElementById('stopButton');
25
+ const statusDiv = document.getElementById('status'); // Corrected from status to statusDiv
26
  let socket;
27
  let mediaRecorder;
28
  let audioContext;
 
31
 
32
  const connectWebSocket = () => {
33
  const proto = window.location.protocol === "https:" ? "wss:" : "ws:";
34
+ const wsUri = `${proto}//${window.location.host}/ws`;
35
+ statusDiv.textContent = `Status: Connecting to ${wsUri}...`;
36
+ socket = new WebSocket(wsUri);
37
 
38
  socket.onopen = () => {
39
+ statusDiv.textContent = 'Status: Connected. Ready to start.';
40
  startButton.disabled = false;
41
  };
42
 
 
45
  const reader = new FileReader();
46
  reader.onload = function() {
47
  const arrayBuffer = this.result;
48
+ // Ensure audioContext is initialized before decoding
49
+ if (audioContext) {
50
+ audioContext.decodeAudioData(arrayBuffer, (buffer) => {
51
  audioQueue.push(buffer);
52
  if (!isPlaying) {
53
  playNextInQueue();
 
55
  });
56
  };
57
  reader.readAsArrayBuffer(event.data);
58
+ } else {
59
+ // Handle text messages from server (e.g., for logging)
60
+ const logElement = document.createElement('p');
61
+ logElement.textContent = event.data;
62
+ document.getElementById('log').prepend(logElement);
63
  }
64
  };
65
 
66
  socket.onclose = () => {
67
  statusDiv.textContent = 'Status: Disconnected';
68
+ startButton.disabled = false; // Allow user to try starting again
69
+ stopButton.disabled = true;
70
  };
71
 
72
  socket.onerror = (error) => {
 
90
  }
91
  };
92
 
 
93
  startButton.onclick = async () => {
94
+ // AudioContext must be resumed by a user gesture.
95
+ if (!audioContext) {
96
+ audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 });
97
+ } else if (audioContext.state === 'suspended') {
 
 
 
98
  await audioContext.resume();
99
  }
100
 
 
122
  if (mediaRecorder) {
123
  mediaRecorder.stop();
124
  }
125
+ // Don't close the socket, just stop sending data.
126
+ // The user might want to start and stop multiple times in one session.
 
 
127
  startButton.disabled = false;
128
  stopButton.disabled = true;
129
+ statusDiv.textContent = 'Status: Stopped. Press Start to translate again.';
130
  };
131
 
132
  window.onload = () => {
133
+ startButton.disabled = true;
134
  stopButton.disabled = true;
135
+ connectWebSocket(); // Connect automatically on page load
136
  };
137
 
138
  </script>
requirements.txt CHANGED
@@ -4,5 +4,4 @@ websockets
4
  python-dotenv
5
  fastapi
6
  uvicorn
7
- python-multipart
8
  ffmpeg-python
 
4
  python-dotenv
5
  fastapi
6
  uvicorn
 
7
  ffmpeg-python
server.py CHANGED
@@ -61,7 +61,7 @@ async def handle_audio_input(websocket: WebSocket, input_queue: asyncio.Queue):
61
  print("Audio input handler stopped.")
62
 
63
 
64
- @app.websocket("/ws")
65
  async def websocket_endpoint(websocket: WebSocket):
66
  await websocket.accept()
67
  print("WebSocket connection accepted.")
 
61
  print("Audio input handler stopped.")
62
 
63
 
64
+ @app.websocket("/ws") # This was correct, the error was in the old HTML. No change needed here, but confirming it's /ws.
65
  async def websocket_endpoint(websocket: WebSocket):
66
  await websocket.accept()
67
  print("WebSocket connection accepted.")
working.py DELETED
@@ -1,334 +0,0 @@
1
- """
2
- Real-Time French/English Voice Translator - FIXED VERSION v4.2
3
- Improvements:
4
- - Removed noisy [audio_gen]/[tts] prints
5
- - Added TTS pre-buffer to eliminate start bursts
6
- - Added silence-based auto-finalization when no STT final detected
7
- - Switched to "latest_long" model for better segmentation
8
- - Added echo suppression (skip self-spoken TTS text)
9
- """
10
-
11
- import asyncio
12
- import json
13
- import queue
14
- import threading
15
- import time
16
- from typing import Optional, Dict, List
17
- import pyaudio
18
- import websockets
19
- from google.cloud import speech
20
- import deepl
21
- import os
22
- from dotenv import load_dotenv
23
- import base64
24
-
25
-
26
- class VoiceTranslator:
27
- def __init__(self, deepl_api_key: str, elevenlabs_api_key: str, elevenlabs_voice_id: str):
28
- self.stt_client = speech.SpeechClient()
29
- self.deepl_client = deepl.Translator(deepl_api_key)
30
- self.elevenlabs_api_key = elevenlabs_api_key
31
- self.voice_id = elevenlabs_voice_id
32
-
33
- self.audio_rate = 16000
34
- self.audio_chunk = 1024
35
-
36
- self.audio_queue_en = queue.Queue()
37
- self.audio_queue_fr = queue.Queue()
38
- self.result_queue = queue.Queue()
39
- self.is_recording = False
40
- self.processing_lock = threading.Lock()
41
-
42
- self.last_processed_transcript = ""
43
- self.last_tts_text = ""
44
-
45
- self.pyaudio_instance = pyaudio.PyAudio()
46
- self.audio_stream = None
47
-
48
- # ---------- AUDIO CAPTURE ----------
49
-
50
- def _audio_generator(self, audio_queue: queue.Queue):
51
- while self.is_recording:
52
- try:
53
- chunk = audio_queue.get(timeout=0.2)
54
- if chunk:
55
- yield chunk
56
- except queue.Empty:
57
- continue
58
-
59
- def _record_audio(self):
60
- try:
61
- stream = self.pyaudio_instance.open(
62
- format=pyaudio.paInt16,
63
- channels=1,
64
- rate=self.audio_rate,
65
- input=True,
66
- frames_per_buffer=self.audio_chunk,
67
- )
68
- print("🎤 Recording started...")
69
- while self.is_recording:
70
- try:
71
- data = stream.read(self.audio_chunk, exception_on_overflow=False)
72
- if not data:
73
- continue
74
- self.audio_queue_en.put(data)
75
- self.audio_queue_fr.put(data)
76
- except Exception as e:
77
- print(f"[recorder] error: {e}")
78
- break
79
- stream.stop_stream()
80
- stream.close()
81
- print("🎤 Recording stopped.")
82
- except Exception as e:
83
- print(f"[recorder] fatal: {e}")
84
-
85
- # ---------- TEXT TO SPEECH ----------
86
-
87
- async def _stream_tts(self, text: str):
88
- """Stream TTS with small pre-buffer to smooth playback."""
89
- uri = (
90
- f"wss://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}"
91
- f"/stream-input?model_id=eleven_flash_v2_5&output_format=pcm_16000"
92
- )
93
-
94
- try:
95
- async with websockets.connect(uri) as websocket:
96
- await websocket.send(json.dumps({
97
- "text": " ",
98
- "voice_settings": {"stability": 0.5, "similarity_boost": 0.8},
99
- "xi_api_key": self.elevenlabs_api_key,
100
- }))
101
- await websocket.send(json.dumps({"text": text, "try_trigger_generation": True}))
102
- await websocket.send(json.dumps({"text": ""}))
103
-
104
- if self.audio_stream is None:
105
- self.audio_stream = self.pyaudio_instance.open(
106
- format=pyaudio.paInt16,
107
- channels=1,
108
- rate=16000,
109
- output=True,
110
- frames_per_buffer=1024,
111
- )
112
-
113
- prebuffer = bytearray()
114
- playback_started = False
115
- last_chunk_time = time.time()
116
-
117
- async for message in websocket:
118
- if isinstance(message, bytes):
119
- prebuffer.extend(message)
120
- # Start playback after ~0.5 s of audio buffered
121
- if not playback_started and len(prebuffer) >= 16000:
122
- self.audio_stream.write(bytes(prebuffer))
123
- prebuffer.clear()
124
- playback_started = True
125
- elif playback_started:
126
- self.audio_stream.write(message)
127
- last_chunk_time = time.time()
128
- continue
129
-
130
- try:
131
- data = json.loads(message)
132
- except Exception:
133
- continue
134
-
135
- if data.get("audio"):
136
- audio_bytes = base64.b64decode(data["audio"])
137
- prebuffer.extend(audio_bytes)
138
- if not playback_started and len(prebuffer) >= 16000:
139
- self.audio_stream.write(bytes(prebuffer))
140
- prebuffer.clear()
141
- playback_started = True
142
- elif playback_started:
143
- self.audio_stream.write(audio_bytes)
144
- last_chunk_time = time.time()
145
- elif data.get("isFinal"):
146
- break
147
- elif data.get("error"):
148
- print("TTS error:", data["error"])
149
- break
150
-
151
- if prebuffer:
152
- self.audio_stream.write(bytes(prebuffer))
153
- except Exception as e:
154
- print(f"[tts] error: {e}")
155
-
156
- # ---------- TRANSLATION ----------
157
-
158
- async def _process_result(self, transcript: str, confidence: Optional[float], language: str):
159
- lang_flag = "🇫🇷" if language == "fr-FR" else "🇬🇧"
160
- conf_display = f"{confidence:.2f}" if confidence is not None else "n/a"
161
- print(f"{lang_flag} Heard ({language}, conf {conf_display}): {transcript}")
162
-
163
- # Simple echo suppression
164
- if transcript.strip().lower() == self.last_tts_text.strip().lower():
165
- return
166
-
167
- try:
168
- if language == "fr-FR":
169
- translated = self.deepl_client.translate_text(transcript, target_lang="EN-US").text
170
- print(f"🌐 FR → EN: {translated}")
171
- else:
172
- translated = self.deepl_client.translate_text(transcript, target_lang="FR").text
173
- print(f"🌐 EN → FR: {translated}")
174
-
175
- self.last_tts_text = translated
176
- print("🔊 Speaking...")
177
- await self._stream_tts(translated)
178
- print("✅ Done\n")
179
-
180
- except Exception as e:
181
- print(f"Translation error: {e}")
182
-
183
- # ---------- STT STREAMING ----------
184
-
185
- def _run_stt_stream(self, language: str, audio_queue: queue.Queue):
186
- print(f"[stt] Thread start for {language}")
187
-
188
- config = speech.RecognitionConfig(
189
- encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
190
- sample_rate_hertz=self.audio_rate,
191
- language_code=language,
192
- enable_automatic_punctuation=True,
193
- model="latest_long",
194
- )
195
-
196
- streaming_config = speech.StreamingRecognitionConfig(
197
- config=config, interim_results=True, single_utterance=False
198
- )
199
-
200
- def requests():
201
- for content in self._audio_generator(audio_queue):
202
- yield speech.StreamingRecognizeRequest(audio_content=content)
203
-
204
- try:
205
- responses = self.stt_client.streaming_recognize(streaming_config, requests())
206
-
207
- last_update_time = time.time()
208
- current_text = ""
209
- for response in responses:
210
- if not self.is_recording:
211
- break
212
- if not response.results:
213
- continue
214
-
215
- for result in response.results:
216
- if not result.alternatives:
217
- continue
218
- alt = result.alternatives[0]
219
- transcript = alt.transcript.strip()
220
- conf = getattr(alt, "confidence", None)
221
- current_text = transcript
222
- last_update_time = time.time()
223
-
224
- self.result_queue.put({
225
- "transcript": transcript,
226
- "confidence": conf,
227
- "language": language,
228
- "is_final": bool(result.is_final),
229
- })
230
-
231
- # If we haven’t heard anything new for 1.2 s, flush it as “final”
232
- if time.time() - last_update_time > 1.2 and current_text:
233
- self.result_queue.put({
234
- "transcript": current_text,
235
- "confidence": 0.5,
236
- "language": language,
237
- "is_final": True,
238
- })
239
- current_text = ""
240
-
241
- except Exception as e:
242
- print(f"[stt:{language}] exception: {e}")
243
-
244
- # ---------- RESULT AGGREGATION ----------
245
-
246
- async def _process_results_queue(self):
247
- while self.is_recording:
248
- try:
249
- r = self.result_queue.get(timeout=0.2)
250
- if r["is_final"] and r["transcript"] != self.last_processed_transcript:
251
- with self.processing_lock:
252
- self.last_processed_transcript = r["transcript"]
253
- await self._process_result(
254
- r["transcript"], r.get("confidence"), r["language"]
255
- )
256
- await asyncio.sleep(0.01)
257
- except queue.Empty:
258
- await asyncio.sleep(0.05)
259
- except Exception as e:
260
- print("Queue error:", e)
261
- await asyncio.sleep(0.1)
262
-
263
- # ---------- CONTROL ----------
264
-
265
- async def _run_dual_streams(self):
266
- print("🔄 Dual-stream: English ⇄ French\n")
267
- en_thread = threading.Thread(target=self._run_stt_stream, args=("en-US", self.audio_queue_en), daemon=True)
268
- fr_thread = threading.Thread(target=self._run_stt_stream, args=("fr-FR", self.audio_queue_fr), daemon=True)
269
- en_thread.start()
270
- fr_thread.start()
271
- await self._process_results_queue()
272
-
273
- def start_translation(self):
274
- if self.is_recording:
275
- print("Already recording!")
276
- return
277
- self.is_recording = True
278
- self.last_processed_transcript = ""
279
- while not self.result_queue.empty():
280
- try: self.result_queue.get_nowait()
281
- except: break
282
- threading.Thread(target=self._record_audio, daemon=True).start()
283
- try:
284
- asyncio.run(self._run_dual_streams())
285
- except KeyboardInterrupt:
286
- self.stop_translation()
287
-
288
- def stop_translation(self):
289
- print("\n⏹️ Stopping translation...")
290
- self.is_recording = False
291
- if self.audio_stream:
292
- try:
293
- self.audio_stream.stop_stream()
294
- self.audio_stream.close()
295
- except Exception:
296
- pass
297
- self.audio_stream = None
298
-
299
- def cleanup(self):
300
- self.stop_translation()
301
- try:
302
- self.pyaudio_instance.terminate()
303
- except Exception:
304
- pass
305
-
306
-
307
- # ---------- MAIN ----------
308
-
309
- def main():
310
- load_dotenv()
311
- google_creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
312
- deepl_key = os.getenv("DEEPL_API_KEY")
313
- eleven_key = os.getenv("ELEVENLABS_API_KEY")
314
- voice_id = os.getenv("ELEVENLABS_VOICE_ID")
315
-
316
- if not all([google_creds, deepl_key, eleven_key, voice_id]):
317
- print("Missing API keys or credentials.")
318
- return
319
-
320
- translator = VoiceTranslator(deepl_key, eleven_key, voice_id)
321
- print("Ready! Press ENTER to start, ENTER again to stop, Ctrl+C to quit.\n")
322
-
323
- try:
324
- while True:
325
- input("Press ENTER to start speaking...")
326
- threading.Thread(target=translator.start_translation, daemon=True).start()
327
- input("Press ENTER to stop...\n")
328
- translator.stop_translation()
329
- except KeyboardInterrupt:
330
- translator.cleanup()
331
-
332
-
333
- if __name__ == "__main__":
334
- main()