andrijdavid Qwen-Coder committed on
Commit
5bff499
·
1 Parent(s): ddae84a

Fix transcription errors by improving audio segment handling

- Add checks for empty audio segments to avoid creating invalid files
- Pad very short audio segments to ensure Whisper compatibility
- Use explicit WAV format with PCM_16 subtype for better compatibility
- Add error handling around transcription to gracefully handle segment errors

Browse files
Files changed (1) hide show
  1. app.py +25 -9
app.py CHANGED
@@ -189,19 +189,35 @@ class DiarizationTranscriptionTranslation:
189
  end_sample = int(segment["end"] * orig_sr)
190
  segment_audio = audio[start_sample:end_sample]
191
 
 
 
 
 
 
 
 
 
 
 
192
  # Save the segment as a temporary file for Whisper
193
  temp_file = f"temp_segment_{segment['start']}_{segment['end']}.wav"
194
- sf.write(temp_file, segment_audio, orig_sr)
 
195
 
196
  # Transcribe the segment
197
- transcription_result = self.transcribe_audio(temp_file)
198
- # Handle both possible return formats
199
- if isinstance(transcription_result, dict) and "text" in transcription_result:
200
- transcribed_text = transcription_result["text"]
201
- elif isinstance(transcription_result, str):
202
- transcribed_text = transcription_result
203
- else:
204
- transcribed_text = str(transcription_result)
 
 
 
 
 
205
 
206
  # Translate if necessary
207
  translated_text = self.translate_text(transcribed_text)
 
189
  end_sample = int(segment["end"] * orig_sr)
190
  segment_audio = audio[start_sample:end_sample]
191
 
192
+ # Ensure segment_audio is not empty
193
+ if len(segment_audio) == 0:
194
+ continue # Skip empty segments
195
+
196
+ # Add a small amount of silence if segment is too short for Whisper
197
+ if len(segment_audio) < orig_sr * 0.1: # Less than 0.1 seconds
198
+ min_samples = int(orig_sr * 0.1)
199
+ zeros_to_add = min_samples - len(segment_audio)
200
+ segment_audio = np.pad(segment_audio, (0, zeros_to_add), mode='constant')
201
+
202
  # Save the segment as a temporary file for Whisper
203
  temp_file = f"temp_segment_{segment['start']}_{segment['end']}.wav"
204
+ # Use subtype parameter to ensure proper WAV format
205
+ sf.write(temp_file, segment_audio, orig_sr, format='WAV', subtype='PCM_16')
206
 
207
  # Transcribe the segment
208
+ try:
209
+ transcription_result = self.transcribe_audio(temp_file)
210
+ # Handle both possible return formats
211
+ if isinstance(transcription_result, dict) and "text" in transcription_result:
212
+ transcribed_text = transcription_result["text"]
213
+ elif isinstance(transcription_result, str):
214
+ transcribed_text = transcription_result
215
+ else:
216
+ transcribed_text = str(transcription_result)
217
+ except Exception as e:
218
+ print(f"Error transcribing segment {temp_file}: {str(e)}")
219
+ transcribed_text = f"Transcription error: {str(e)}"
220
+ # Continue with the error message as the transcription
221
 
222
  # Translate if necessary
223
  translated_text = self.translate_text(transcribed_text)