Spaces:

DineshJ96
/

multilanguage-transcript

Sleeping

App Files Community

DineshJ96 committed on Jul 27, 2025

Commit

d926715

1 Parent(s): 127cc86

app file updated - process audio

Browse files

Files changed (1) hide show

app.py +22 -6

app.py CHANGED Viewed

@@ -144,9 +144,25 @@ def process_audio_for_web(audio_input):
         # 2. Align
         print("Aligning transcription with audio...")
         transcription_result = whisperx.align(transcription_result["segments"], align_model_local, audio, return_char_alignments=False)
-        transcription_result = whisperx.align(transcription_result["segments"], align_model_local, audio, device, return_char_alignments=False)
-        del align_model_local
         gc.collect()
         if device == "cuda":
             torch.cuda.empty_cache()
@@ -157,7 +173,7 @@ def process_audio_for_web(audio_input):
         final_result = whisperx.assign_word_speakers(diarize_segments, transcription_result)
         speaker_transcripts_raw = {}
-        # Prepare for display in dianzed_transcription_output
         diarized_display_lines = []
         for segment in final_result["segments"]:
@@ -205,8 +221,8 @@ def process_audio_for_web(audio_input):
                         "translated_text": translated_text_output
                     })
                     translated_display_lines.append(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] Original: {original_text}")
-                    translated_display_lines.append(f"                                   Translated: {translated_text_output}")
             translated_output_str = "\n".join(translated_display_lines)
         else:
             translated_output_str = "Translation model not loaded. Skipping translation."
@@ -228,7 +244,7 @@ def process_audio_for_web(audio_input):
                     f.write(f"\n### Speaker {speaker} ###\n")
                     for seg in segments:
                         f.write(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] Original: {seg['original_text']}\n")
-                        f.write(f"                                   Translated: {seg['translated_text']}\n")
             else:
                 f.write("Translation output not available or translation model not loaded.\n")

         # 2. Align
         print("Aligning transcription with audio...")
+        align_model_local = None # Initialize to None to prevent UnboundLocalError in outer except
+        try:
+            # Load the alignment model based on the detected language
+            # The 'device' parameter is passed here, not to whisperx.align
+            align_model_local, metadata = whisperx.load_align_model(language_code=detected_language, device=device)
+        except Exception as e:
+            # Handle cases where the alignment model for the detected language cannot be loaded
+            print(f"Error loading alignment model for language '{detected_language}': {e}")
+            import traceback
+            print(traceback.format_exc())
+            # Provide a user-friendly message, possibly suggesting supported languages
+            return f"Error: Could not load alignment model for language '{detected_language}'. Alignment is typically supported for English, French, German, Spanish, Italian, Japanese, Chinese, Dutch, and Portuguese. Details: {e}", "", "", None
+        # Perform alignment using the loaded model
+        # Removed 'device' from here as the model itself is already on the correct device
         transcription_result = whisperx.align(transcription_result["segments"], align_model_local, audio, return_char_alignments=False)
+        # Removed the duplicate whisperx.align call and 'del align_model_local'
+        # as it can cause issues if an error occurs later.
         gc.collect()
         if device == "cuda":
             torch.cuda.empty_cache()
         final_result = whisperx.assign_word_speakers(diarize_segments, transcription_result)
         speaker_transcripts_raw = {}
+        # Prepare for display in diarized_transcription_output
         diarized_display_lines = []
         for segment in final_result["segments"]:
                         "translated_text": translated_text_output
                     })
                     translated_display_lines.append(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] Original: {original_text}")
+                    translated_display_lines.append(f"           Translated: {translated_text_output}")
             translated_output_str = "\n".join(translated_display_lines)
         else:
             translated_output_str = "Translation model not loaded. Skipping translation."
                     f.write(f"\n### Speaker {speaker} ###\n")
                     for seg in segments:
                         f.write(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] Original: {seg['original_text']}\n")
+                        f.write(f"           Translated: {seg['translated_text']}\n")
             else:
                 f.write("Translation output not available or translation model not loaded.\n")