Sidak Singh committed on
Commit
7b7174c
·
1 Parent(s): 8b3bbb3

transcribing works

Browse files
Files changed (4) hide show
  1. __pycache__/transcriber.cpython-310.pyc +0 -0
  2. app.py +13 -12
  3. nodemon.json +27 -0
  4. transcriber.py +219 -71
__pycache__/transcriber.cpython-310.pyc CHANGED
Binary files a/__pycache__/transcriber.cpython-310.pyc and b/__pycache__/transcriber.cpython-310.pyc differ
 
app.py CHANGED
@@ -13,15 +13,16 @@ def process_mic_audio(audio):
13
  """Process audio from Gradio microphone and update transcription"""
14
  if audio is None:
15
  return gr.update(), gr.update()
16
-
17
  sr, y = audio
18
-
19
  # Add to processor and possibly trigger transcription
20
  buffer_size = processor.add_audio(y, sr)
21
-
22
  # Get current transcription
23
  transcription = processor.get_transcription()
24
-
 
25
  # Return status update and transcription
26
  buffer_seconds = buffer_size / processor.sample_rate
27
  return (
@@ -45,29 +46,29 @@ def force_transcribe():
45
  # Create Gradio interface
46
  with gr.Blocks(title="Live Speech Transcription") as demo:
47
  gr.Markdown("# Live Speech Recognition with Buffer Playback")
48
-
49
  with gr.Row():
50
  audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Microphone Input")
51
-
52
  with gr.Row():
53
  status_output = gr.Textbox(label="Buffer Status", interactive=False)
54
  buffer_audio = gr.Audio(label="Current Buffer (Click to Play)", interactive=False)
55
-
56
  with gr.Row():
57
  clear_btn = gr.Button("Clear Buffer")
58
  play_btn = gr.Button("Get Buffer for Playback")
59
  force_btn = gr.Button("Force Transcribe")
60
-
61
  with gr.Row():
62
  transcription_output = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
63
-
64
  # Connect components
65
  audio_input.stream(
66
- process_mic_audio,
67
- audio_input,
68
  [status_output, transcription_output]
69
  )
70
-
71
  clear_btn.click(clear_audio_buffer, None, [status_output, buffer_audio, transcription_output])
72
  play_btn.click(get_current_buffer, None, buffer_audio)
73
  force_btn.click(force_transcribe, None, transcription_output)
 
13
  """Process audio from Gradio microphone and update transcription"""
14
  if audio is None:
15
  return gr.update(), gr.update()
16
+
17
  sr, y = audio
18
+
19
  # Add to processor and possibly trigger transcription
20
  buffer_size = processor.add_audio(y, sr)
21
+
22
  # Get current transcription
23
  transcription = processor.get_transcription()
24
+ print(transcription)
25
+
26
  # Return status update and transcription
27
  buffer_seconds = buffer_size / processor.sample_rate
28
  return (
 
46
  # Create Gradio interface
47
  with gr.Blocks(title="Live Speech Transcription") as demo:
48
  gr.Markdown("# Live Speech Recognition with Buffer Playback")
49
+
50
  with gr.Row():
51
  audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Microphone Input")
52
+
53
  with gr.Row():
54
  status_output = gr.Textbox(label="Buffer Status", interactive=False)
55
  buffer_audio = gr.Audio(label="Current Buffer (Click to Play)", interactive=False)
56
+
57
  with gr.Row():
58
  clear_btn = gr.Button("Clear Buffer")
59
  play_btn = gr.Button("Get Buffer for Playback")
60
  force_btn = gr.Button("Force Transcribe")
61
+
62
  with gr.Row():
63
  transcription_output = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
64
+
65
  # Connect components
66
  audio_input.stream(
67
+ process_mic_audio,
68
+ audio_input,
69
  [status_output, transcription_output]
70
  )
71
+
72
  clear_btn.click(clear_audio_buffer, None, [status_output, buffer_audio, transcription_output])
73
  play_btn.click(get_current_buffer, None, buffer_audio)
74
  force_btn.click(force_transcribe, None, transcription_output)
nodemon.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "watch": [
3
+ "*.py",
4
+ "**/*.py"
5
+ ],
6
+ "ext": "py",
7
+ "ignore": [
8
+ "__pycache__/",
9
+ "*.pyc",
10
+ ".git/",
11
+ "node_modules/",
12
+ "venv/",
13
+ "env/",
14
+ ".pytest_cache/",
15
+ "*.log"
16
+ ],
17
+ "exec": "python3 transcriber.py",
18
+ "env": {
19
+ "PYTHONPATH": ".",
20
+ "PYTHONUNBUFFERED": "1"
21
+ },
22
+ "delay": 1000,
23
+ "verbose": true,
24
+ "restartable": "rs",
25
+ "colours": true,
26
+ "legacy-watch": false
27
+ }
transcriber.py CHANGED
@@ -11,25 +11,197 @@ class AudioProcessor:
11
  self.processed_length = 0 # Length of audio already processed
12
  self.sample_rate = 16000 # Default sample rate for whisper
13
  self.lock = threading.Lock() # Thread safety for buffer access
14
- self.transcription = [''] # List of transcription segments
15
  self.min_process_length = 1 * self.sample_rate # Process at least 1 second
16
  self.max_buffer_size = 30 * self.sample_rate # Maximum buffer size (30 seconds)
 
17
  self.last_process_time = time.time()
18
  self.process_interval = 1.0 # Process every 1 second
19
  self.is_processing = False # Flag to prevent concurrent processing
20
-
 
 
 
 
21
  # Initialize the whisper model
22
  self.audio_model = WhisperModel(model_size, device=device, compute_type=compute_type)
23
  print(f"Initialized {model_size} model on {device}")
24
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def add_audio(self, audio_data, sr):
26
  """
27
  Add audio to the buffer and process if needed
28
-
29
  Args:
30
  audio_data (numpy.ndarray): Audio data to add
31
  sr (int): Sample rate of the audio data
32
-
33
  Returns:
34
  int: Current buffer size in samples
35
  """
@@ -37,11 +209,11 @@ class AudioProcessor:
37
  # Convert to mono if stereo
38
  if audio_data.ndim > 1:
39
  audio_data = audio_data.mean(axis=1)
40
-
41
- # Keep original format without normalization
42
  audio_data = audio_data.astype(np.float32)
43
-
44
- # Resample properly if needed
45
  if sr != self.sample_rate:
46
  try:
47
  # Use scipy for proper resampling
@@ -49,106 +221,82 @@ class AudioProcessor:
49
  audio_data = signal.resample(audio_data, number_of_samples)
50
  except Exception as e:
51
  print(f"Resampling error: {e}")
52
- # Fallback to simple method if scipy fails
53
  ratio = self.sample_rate / sr
54
  audio_data = np.interp(
55
  np.arange(0, len(audio_data) * ratio, ratio),
56
  np.arange(0, len(audio_data)),
57
  audio_data
58
  )
59
-
60
- # Apply fade-in to prevent clicks at chunk boundaries (5ms fade)
61
  fade_samples = min(int(0.005 * self.sample_rate), len(audio_data))
62
  if fade_samples > 0:
63
  fade_in = np.linspace(0, 1, fade_samples)
64
- audio_data[:fade_samples] = audio_data[:fade_samples] * fade_in
65
-
66
  # Add to buffer
67
  if len(self.audio_buffer) == 0:
68
  self.audio_buffer = audio_data
69
  else:
70
  self.audio_buffer = np.concatenate([self.audio_buffer, audio_data])
71
-
72
- # Trim buffer if it gets too large
73
- if len(self.audio_buffer) > self.max_buffer_size:
74
- excess = len(self.audio_buffer) - self.max_buffer_size
75
- self.audio_buffer = self.audio_buffer[excess:]
76
- # Adjust processed length when trimming
77
- self.processed_length = max(0, self.processed_length - excess)
78
-
79
  # Check if we should process now
80
  should_process = (
81
  len(self.audio_buffer) >= self.min_process_length and
82
  time.time() - self.last_process_time >= self.process_interval and
83
  not self.is_processing
84
  )
85
-
86
  if should_process:
87
  self.last_process_time = time.time()
88
  self.is_processing = True
89
- # Process the buffer in a separate thread to avoid blocking
90
- threading.Thread(target=self._process_audio).start()
91
-
92
  return len(self.audio_buffer)
93
-
94
- def _process_audio(self):
95
- """Process the current audio buffer (should be called in a separate thread)"""
96
- try:
97
- with self.lock:
98
- # Get unprocessed portion of the buffer
99
- if self.processed_length >= len(self.audio_buffer):
100
- self.is_processing = False
101
- return
102
-
103
- # Make a copy of the full buffer for processing
104
- audio = self.audio_buffer.copy()
105
-
106
- # Normalize for transcription
107
- audio_norm = audio.astype(np.float32)
108
- if np.max(np.abs(audio_norm)) > 0:
109
- audio_norm = audio_norm / np.max(np.abs(audio_norm))
110
-
111
- # Transcribe with whisper
112
- segments, info = self.audio_model.transcribe(audio_norm, beam_size=5)
113
- result = list(segments)
114
-
115
- if result:
116
- with self.lock:
117
- # Update the transcription
118
- self.transcription = [seg.text for seg in result]
119
- # Mark the whole buffer as processed
120
- self.processed_length = len(self.audio_buffer)
121
- except Exception as e:
122
- print(f"Transcription error: {e}")
123
- finally:
124
- # Reset processing flag
125
- self.is_processing = False
126
-
127
- def get_transcription(self):
128
- """Get the current transcription text"""
129
- with self.lock:
130
- return " ".join(self.transcription)
131
-
132
  def clear_buffer(self):
133
- """Clear the audio buffer"""
134
  with self.lock:
135
  self.audio_buffer = np.array([])
136
  self.processed_length = 0
137
- self.transcription = ['']
 
 
138
  self.is_processing = False
139
  return "Buffers cleared"
140
-
 
 
 
 
 
141
  def get_playback_audio(self):
142
  """Get properly formatted audio for Gradio playback"""
143
  with self.lock:
144
  if len(self.audio_buffer) == 0:
145
  return None
146
-
147
  # Make a copy and ensure proper format for Gradio
148
  audio = self.audio_buffer.copy()
149
-
150
  # Ensure audio is in the correct range for playback (-1 to 1)
151
  if np.max(np.abs(audio)) > 0:
152
  audio = audio / max(1.0, np.max(np.abs(audio)))
153
-
154
  return (self.sample_rate, audio)
 
 
 
 
 
 
 
 
 
 
 
 
11
  self.processed_length = 0 # Length of audio already processed
12
  self.sample_rate = 16000 # Default sample rate for whisper
13
  self.lock = threading.Lock() # Thread safety for buffer access
 
14
  self.min_process_length = 1 * self.sample_rate # Process at least 1 second
15
  self.max_buffer_size = 30 * self.sample_rate # Maximum buffer size (30 seconds)
16
+ self.overlap_size = 3 * self.sample_rate # Keep 3 seconds of overlap when trimming
17
  self.last_process_time = time.time()
18
  self.process_interval = 1.0 # Process every 1 second
19
  self.is_processing = False # Flag to prevent concurrent processing
20
+
21
+ self.full_transcription = "" # Complete history of transcription
22
+ self.last_segment_text = "" # Last segment that was transcribed
23
+ self.confirmed_transcription = "" # Transcription that won't change (beyond overlap zone)
24
+
25
  # Initialize the whisper model
26
  self.audio_model = WhisperModel(model_size, device=device, compute_type=compute_type)
27
  print(f"Initialized {model_size} model on {device}")
28
+
def _trim_buffer_intelligently(self):
    """Drop the oldest audio once the buffer exceeds ``max_buffer_size``.

    When the buffer is over the limit it is cut back to
    ``max_buffer_size - overlap_size`` samples (never below ``overlap_size``),
    which leaves headroom so the trim does not run again on every incoming
    chunk. ``processed_length`` is shifted by the amount removed so it keeps
    pointing at the same audio.

    ``last_segment_text`` is only cleared when no processed audio remains in
    the buffer. While ``processed_length`` is still positive, the next
    transcription pass re-reads up to ``overlap_size`` samples in front of it,
    and the merge step needs the previous segment text to strip the words that
    repeat; clearing it unconditionally made every trim duplicate text.

    NOTE(review): this method does not take ``self.lock`` itself; presumably
    the caller already holds it -- confirm at the call site.
    """
    buffer_len = len(self.audio_buffer)
    if buffer_len <= self.max_buffer_size:
        return

    # Remove the excess plus one overlap window, but always keep at least
    # ``overlap_size`` samples in the buffer.
    trim_amount = buffer_len - self.max_buffer_size + self.overlap_size
    trim_amount = min(trim_amount, buffer_len - self.overlap_size)
    if trim_amount <= 0:
        return

    self.audio_buffer = self.audio_buffer[trim_amount:]
    self.processed_length = max(0, self.processed_length - trim_amount)

    if self.processed_length == 0:
        # Nothing already-transcribed is left, so the next pass starts at the
        # head of the buffer with no overlap; stale text could only cause a
        # false overlap match.
        self.last_segment_text = ""
60
+
def _process_audio_chunk(self):
    """Transcribe the unprocessed tail of the buffer and extend the transcript.

    Intended to run on a worker thread. The pass has three phases:

    1. Under ``self.lock``: snapshot the audio from ``overlap_size`` samples
       before ``processed_length`` to the end of the buffer.
    2. Without the lock: peak-normalise the snapshot and run the model, so
       other threads are not blocked while transcription runs.
    3. Under ``self.lock`` again: merge the new text into
       ``full_transcription`` and advance ``processed_length``.

    Phase 3 is done under the lock because the transcript fields are read and
    reset under that same lock elsewhere in the class.
    ``_merge_transcription_intelligently`` must not acquire ``self.lock``
    itself, since ``threading.Lock`` is not re-entrant.

    Returns:
        The full transcription after this pass, or ``None`` when there is
        less than ``min_process_length`` of new audio, the model produced no
        text, or an error occurred (the error is printed).
    """
    try:
        with self.lock:
            processed_before = self.processed_length
            buffer_len = len(self.audio_buffer)

            # Not enough new audio yet.
            if buffer_len - processed_before < self.min_process_length:
                return None

            # Re-feed some already-processed audio to give the model context.
            overlap_samples = min(self.overlap_size, processed_before)
            start_pos = max(0, processed_before - overlap_samples)
            audio_to_process = self.audio_buffer[start_pos:].copy()

            # Number of new samples this pass covers. Kept relative rather
            # than as an absolute end index because the buffer may be trimmed
            # (shifting every index) while the model is running.
            consumed = buffer_len - processed_before

        # Peak-normalise for transcription.
        audio_norm = audio_to_process.astype(np.float32)
        peak = np.max(np.abs(audio_norm))
        if peak > 0:
            audio_norm = audio_norm / peak

        # Greedy decoding and VAD keep latency low for live use.
        segments, _info = self.audio_model.transcribe(
            audio_norm,
            beam_size=1,
            word_timestamps=False,
            vad_filter=True,
            vad_parameters=dict(min_silence_duration_ms=500),
        )

        stripped = (seg.text.strip() for seg in segments)
        current_segment_text = " ".join(text for text in stripped if text)
        if not current_segment_text:
            return None

        with self.lock:
            new_text = self._merge_transcription_intelligently(current_segment_text)
            if new_text:
                if self.full_transcription and not self.full_transcription.endswith(" "):
                    self.full_transcription += " "
                self.full_transcription += new_text

            self.last_segment_text = current_segment_text
            # Advance from the *current* processed_length (already shifted by
            # any trim that happened meanwhile) and never point past the end
            # of the buffer.
            # NOTE(review): if the buffer was cleared during transcription
            # this still applies the stale result; a generation counter would
            # be needed to detect that.
            self.processed_length = min(
                self.processed_length + consumed, len(self.audio_buffer)
            )
            return self.full_transcription

    except Exception as e:
        print(f"Transcription error: {e}")
        return None
    finally:
        # Single place where the in-flight flag is released, on every path.
        self.is_processing = False
126
+
def _merge_transcription_intelligently(self, new_segment_text):
    """Return the part of ``new_segment_text`` not already in the last segment.

    Comparison is done on lower-cased words with punctuation removed:

    1. If the last 3-15 words of ``self.last_segment_text`` equal the first
       words of the new text, that leading overlap is cut off.
    2. Otherwise, if both texts contain numbers and the new text looks like a
       continuation of a count, the text from the first number greater than
       the previous maximum is returned.
    3. Otherwise, if the new words appear as a contiguous run of words in the
       previous text, nothing is new and ``""`` is returned.
    4. Otherwise the new text is returned unchanged.

    Args:
        new_segment_text (str): Text produced by the latest transcription pass.

    Returns:
        str: Text to append to the running transcription (may be empty).
    """
    # Local import: the module's import block is not visible from this block.
    import re

    if not new_segment_text or not new_segment_text.strip():
        return ""

    # First transcription, or context was reset: nothing to merge against.
    if not self.last_segment_text:
        return new_segment_text

    def normalize_for_comparison(text):
        # Lower-case and drop punctuation so "Store," matches "store".
        return re.sub(r'[^\w\s]', '', text.lower()).strip()

    norm_prev = normalize_for_comparison(self.last_segment_text)
    norm_new = normalize_for_comparison(new_segment_text)
    if not norm_prev or not norm_new:
        return new_segment_text

    prev_words = norm_prev.split()
    raw_new_words = new_segment_text.split()

    # Positions of the raw tokens that survive normalisation. Tokens that are
    # pure punctuation (e.g. a lone "-") vanish from the normalised word
    # list, so an overlap measured in normalised words has to be mapped back
    # to a raw-token index before slicing.
    word_positions = [
        idx for idx, token in enumerate(raw_new_words)
        if normalize_for_comparison(token)
    ]
    new_words = [normalize_for_comparison(raw_new_words[idx]) for idx in word_positions]

    # Longest suffix-of-previous / prefix-of-new match, 3 to 15 words.
    max_overlap = min(len(prev_words), len(new_words), 15)
    overlap_found = 0
    for size in range(max_overlap, 2, -1):
        if prev_words[-size:] == new_words[:size]:
            overlap_found = size
            break

    if overlap_found:
        cut = word_positions[overlap_found - 1] + 1
        return " ".join(raw_new_words[cut:])

    # Counting sequences ("1 2 3" then "3 4 5") rarely share 3 equal words,
    # so resume from the first number beyond the previous maximum.
    prev_numbers = [int(x) for x in re.findall(r'\b\d+\b', norm_prev)]
    new_numbers = [int(x) for x in re.findall(r'\b\d+\b', norm_new)]
    if prev_numbers and new_numbers:
        max_prev = max(prev_numbers)
        if min(new_numbers) <= max_prev + 5:  # allow a small gap in the count
            for idx, word in enumerate(raw_new_words):
                if re.search(r'\b\d+\b', word):
                    num = int(re.search(r'\d+', word).group())
                    if num > max_prev:
                        return " ".join(raw_new_words[idx:])

    # Whole-word containment: padding with spaces stops "he" from matching
    # inside "the".
    if f" {' '.join(new_words)} " in f" {' '.join(prev_words)} ":
        return ""

    return new_segment_text
196
+
197
  def add_audio(self, audio_data, sr):
198
  """
199
  Add audio to the buffer and process if needed
200
+
201
  Args:
202
  audio_data (numpy.ndarray): Audio data to add
203
  sr (int): Sample rate of the audio data
204
+
205
  Returns:
206
  int: Current buffer size in samples
207
  """
 
209
  # Convert to mono if stereo
210
  if audio_data.ndim > 1:
211
  audio_data = audio_data.mean(axis=1)
212
+
213
+ # Convert to float32
214
  audio_data = audio_data.astype(np.float32)
215
+
216
+ # Resample if needed
217
  if sr != self.sample_rate:
218
  try:
219
  # Use scipy for proper resampling
 
221
  audio_data = signal.resample(audio_data, number_of_samples)
222
  except Exception as e:
223
  print(f"Resampling error: {e}")
224
+ # Fallback resampling
225
  ratio = self.sample_rate / sr
226
  audio_data = np.interp(
227
  np.arange(0, len(audio_data) * ratio, ratio),
228
  np.arange(0, len(audio_data)),
229
  audio_data
230
  )
231
+
232
+ # Apply fade-in to prevent clicks (5ms fade)
233
  fade_samples = min(int(0.005 * self.sample_rate), len(audio_data))
234
  if fade_samples > 0:
235
  fade_in = np.linspace(0, 1, fade_samples)
236
+ audio_data[:fade_samples] *= fade_in
237
+
238
  # Add to buffer
239
  if len(self.audio_buffer) == 0:
240
  self.audio_buffer = audio_data
241
  else:
242
  self.audio_buffer = np.concatenate([self.audio_buffer, audio_data])
243
+
244
+ # Intelligently trim buffer if it gets too large
245
+ self._trim_buffer_intelligently()
246
+
 
 
 
 
247
  # Check if we should process now
248
  should_process = (
249
  len(self.audio_buffer) >= self.min_process_length and
250
  time.time() - self.last_process_time >= self.process_interval and
251
  not self.is_processing
252
  )
253
+
254
  if should_process:
255
  self.last_process_time = time.time()
256
  self.is_processing = True
257
+ # Process in a separate thread
258
+ threading.Thread(target=self._process_audio_chunk, daemon=True).start()
259
+
260
  return len(self.audio_buffer)
261
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def clear_buffer(self):
    """Reset the audio buffer and all transcription state.

    Everything is reset while holding ``self.lock``.

    Returns:
        str: The fixed status message ``"Buffers cleared"``.
    """
    with self.lock:
        # Audio state
        self.audio_buffer = np.array([])
        self.processed_length = 0
        # Transcript state
        self.full_transcription = self.last_segment_text = self.confirmed_transcription = ""
        # Allow a new transcription pass to be scheduled
        self.is_processing = False
        return "Buffers cleared"
272
+
def get_transcription(self):
    """Return the accumulated transcription text, read under ``self.lock``."""
    with self.lock:
        text = self.full_transcription
    return text
277
+
def get_playback_audio(self):
    """Return a copy of the buffer as a ``(sample_rate, samples)`` tuple.

    The copy is scaled down so its peak magnitude is at most 1.0; audio that
    is already within [-1, 1] is returned unscaled.

    Returns:
        tuple | None: ``(self.sample_rate, samples)``, or ``None`` when the
        buffer is empty.
    """
    with self.lock:
        if len(self.audio_buffer) == 0:
            return None

        # Work on a copy so the live buffer is never modified.
        samples = self.audio_buffer.copy()

        peak = np.max(np.abs(samples))
        if peak > 0:
            # Dividing by max(1.0, peak) only ever attenuates.
            samples = samples / max(1.0, peak)

        return (self.sample_rate, samples)
292
+
def get_buffer_info(self):
    """Return a snapshot of buffer and transcription statistics.

    Returns:
        dict: Buffer, processed and unprocessed lengths in seconds, the
        ``is_processing`` flag, and the character length of the transcription.
    """
    with self.lock:
        rate = self.sample_rate
        total_samples = len(self.audio_buffer)
        done_samples = self.processed_length
        return {
            "buffer_length_seconds": total_samples / rate,
            "processed_length_seconds": done_samples / rate,
            "unprocessed_length_seconds": (total_samples - done_samples) / rate,
            "is_processing": self.is_processing,
            "transcription_length": len(self.full_transcription),
        }