clementBE committed on
Commit
6e566c7
·
verified ·
1 Parent(s): 4091834

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -25
app.py CHANGED
@@ -6,6 +6,7 @@ import datetime
6
  import time
7
  from transformers import pipeline
8
  from docx import Document
 
9
 
10
  # Define the available models and their approximate relative speeds
11
  MODEL_SIZES = {
@@ -35,7 +36,9 @@ def get_model_pipeline(model_name, progress):
35
  model_cache[model_name] = pipeline(
36
  "automatic-speech-recognition",
37
  model=model_id,
38
- device=device
 
 
39
  )
40
  progress(0.5, desc="βœ… Model loaded successfully!")
41
  return model_cache[model_name]
@@ -47,7 +50,7 @@ def create_vtt(segments, file_path):
47
  with open(file_path, "w", encoding="utf-8") as f:
48
  f.write("WEBVTT\n\n")
49
  for i, segment in enumerate(segments):
50
- # Calculate time strings in "HH:MM:SS.mmm" format (though VTT only strictly requires up to milliseconds)
51
  start_ms = int(segment.get('start', 0) * 1000)
52
  end_ms = int(segment.get('end', 0) * 1000)
53
 
@@ -85,9 +88,10 @@ def create_docx(segments, file_path, with_timestamps):
85
  document.save(file_path)
86
 
87
  @spaces.GPU
88
- def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_output, docx_no_timestamp_output, progress=gr.Progress()):
89
  """
90
  Main function to transcribe audio and export to selected formats.
 
91
  """
92
  if audio_file is None:
93
  return (None, None, None, "Please upload an audio file.")
@@ -96,53 +100,95 @@ def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_out
96
 
97
  pipe = get_model_pipeline(model_size, progress)
98
 
99
- progress(0.75, desc="🎀 Transcribing audio...")
100
-
101
- # Check if the French-specific model option was selected
102
  if model_size == "Distil-Large-v3-FR (French-Specific)":
103
  # Force French for this specific option
104
- raw_output = pipe(
105
- audio_file,
106
- return_timestamps="word", # Use word-level timestamps for more detail if needed, but 'True' works for chunk timestamps too
107
- generate_kwargs={"language": "fr"}
108
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  else:
110
- # For other models, let the model auto-detect the language
 
111
  raw_output = pipe(
112
  audio_file,
113
  return_timestamps="word",
 
114
  )
115
-
116
- # Use 'chunks' if available, otherwise default to the whole text
117
- segments = raw_output.get("chunks", [])
118
 
119
- # If no chunks are returned (e.g., if return_timestamps=False was used, though not in this code),
120
- # create a single segment from the full text.
121
- if not segments and 'text' in raw_output:
122
- segments = [{'text': raw_output['text'].strip(), 'start': 0.0, 'end': 0.0}]
123
 
124
  outputs = {}
125
 
126
  progress(0.85, desc="πŸ“ Generating output files...")
127
 
 
128
  if vtt_output:
129
  vtt_path = "transcription.vtt"
130
- create_vtt(segments, vtt_path)
131
  outputs["VTT"] = vtt_path
132
 
 
133
  if docx_timestamp_output:
134
  docx_ts_path = "transcription_with_timestamps.docx"
135
- create_docx(segments, docx_ts_path, with_timestamps=True)
136
  outputs["DOCX (with timestamps)"] = docx_ts_path
137
 
 
138
  if docx_no_timestamp_output:
139
  docx_no_ts_path = "transcription_without_timestamps.docx"
140
- create_docx(segments, docx_no_ts_path, with_timestamps=False)
141
  outputs["DOCX (without timestamps)"] = docx_no_ts_path
142
 
143
  end_time = time.time()
144
  total_time = end_time - start_time
145
- transcribed_text = raw_output['text']
146
  downloadable_files = [path for path in outputs.values()]
147
  status_message = f"βœ… Transcription complete! Total time: {total_time:.2f} seconds."
148
 
@@ -165,9 +211,15 @@ with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
165
  model_selector = gr.Dropdown(
166
  label="Choose Whisper Model Size",
167
  choices=list(MODEL_SIZES.keys()),
168
- # Default to the French-specific model, which now uses the correct ID
169
  value="Distil-Large-v3-FR (French-Specific)"
170
  )
 
 
 
 
 
 
 
171
  gr.Markdown("### Choose Output Formats")
172
  with gr.Row():
173
  vtt_checkbox = gr.Checkbox(label="VTT", value=True)
@@ -182,7 +234,8 @@ with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
182
 
183
  transcribe_btn.click(
184
  fn=transcribe_and_export,
185
- inputs=[audio_input, model_selector, vtt_checkbox, docx_ts_checkbox, docx_no_ts_checkbox],
 
186
  outputs=[transcription_output, downloadable_files_output, audio_input, status_text]
187
  )
188
 
 
6
  import time
7
  from transformers import pipeline
8
  from docx import Document
9
+ from pydub import AudioSegment
10
 
11
  # Define the available models and their approximate relative speeds
12
  MODEL_SIZES = {
 
36
  model_cache[model_name] = pipeline(
37
  "automatic-speech-recognition",
38
  model=model_id,
39
+ device=device,
40
+ # Set max_new_tokens for generation, common for ASR
41
+ max_new_tokens=128
42
  )
43
  progress(0.5, desc="βœ… Model loaded successfully!")
44
  return model_cache[model_name]
 
50
  with open(file_path, "w", encoding="utf-8") as f:
51
  f.write("WEBVTT\n\n")
52
  for i, segment in enumerate(segments):
53
+ # Calculate time strings in "HH:MM:SS.mmm" format
54
  start_ms = int(segment.get('start', 0) * 1000)
55
  end_ms = int(segment.get('end', 0) * 1000)
56
 
 
88
  document.save(file_path)
89
 
90
  @spaces.GPU
91
+ def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_output, docx_no_timestamp_output, sequence_5_min, progress=gr.Progress()):
92
  """
93
  Main function to transcribe audio and export to selected formats.
94
+ Added logic for 5-minute sequencing.
95
  """
96
  if audio_file is None:
97
  return (None, None, None, "Please upload an audio file.")
 
100
 
101
  pipe = get_model_pipeline(model_size, progress)
102
 
103
+ # Define generation arguments
104
+ generate_kwargs = {}
 
105
  if model_size == "Distil-Large-v3-FR (French-Specific)":
106
  # Force French for this specific option
107
+ generate_kwargs["language"] = "fr"
108
+
109
+ full_segments = []
110
+ full_text_list = []
111
+
112
+ # --- New 5-Minute Sequencing Logic ---
113
+ if sequence_5_min:
114
+ progress(0.70, desc="βœ‚οΈ Splitting audio into 5-minute chunks...")
115
+ audio = AudioSegment.from_file(audio_file)
116
+ chunk_length_ms = 5 * 60 * 1000 # 5 minutes in milliseconds
117
+ total_duration_ms = len(audio)
118
+ num_chunks = (total_duration_ms + chunk_length_ms - 1) // chunk_length_ms # Ceiling division
119
+
120
+ for i in range(num_chunks):
121
+ start_ms = i * chunk_length_ms
122
+ end_ms = min((i + 1) * chunk_length_ms, total_duration_ms)
123
+
124
+ progress_val = 0.70 + (i / num_chunks) * 0.15
125
+ progress(progress_val, desc=f"🎀 Transcribing chunk {i+1}/{num_chunks}...")
126
+
127
+ chunk = audio[start_ms:end_ms]
128
+ temp_chunk_path = f"/tmp/chunk_{i}.mp3" # Save as a temp file for the pipeline
129
+ chunk.export(temp_chunk_path, format="mp3")
130
+
131
+ # Transcribe the chunk
132
+ chunk_output = pipe(
133
+ temp_chunk_path,
134
+ return_timestamps="word",
135
+ generate_kwargs=generate_kwargs
136
+ )
137
+
138
+ # Adjust timestamps for the full file
139
+ offset = start_ms / 1000.0
140
+ chunk_segments = chunk_output.get("chunks", [])
141
+ for segment in chunk_segments:
142
+ segment['start'] = segment.get('start', 0.0) + offset
143
+ segment['end'] = segment.get('end', 0.0) + offset
144
+ full_segments.append(segment)
145
+
146
+ full_text_list.append(chunk_output.get('text', ''))
147
+
148
+ os.remove(temp_chunk_path) # Clean up temp file
149
+
150
+ transcribed_text = " ".join(full_text_list).strip()
151
+
152
  else:
153
+ # Standard transcription for the whole file at once
154
+ progress(0.75, desc="🎀 Transcribing full audio file...")
155
  raw_output = pipe(
156
  audio_file,
157
  return_timestamps="word",
158
+ generate_kwargs=generate_kwargs
159
  )
160
+ full_segments = raw_output.get("chunks", [])
161
+ transcribed_text = raw_output.get('text', '').strip()
 
162
 
163
+ # Ensure segments is not empty
164
+ if not full_segments and transcribed_text:
165
+ # Create a single segment from the full text if chunks were not generated for some reason
166
+ full_segments = [{'text': transcribed_text, 'start': 0.0, 'end': 0.0}]
167
 
168
  outputs = {}
169
 
170
  progress(0.85, desc="πŸ“ Generating output files...")
171
 
172
+ # Generate VTT
173
  if vtt_output:
174
  vtt_path = "transcription.vtt"
175
+ create_vtt(full_segments, vtt_path)
176
  outputs["VTT"] = vtt_path
177
 
178
+ # Generate DOCX with timestamps
179
  if docx_timestamp_output:
180
  docx_ts_path = "transcription_with_timestamps.docx"
181
+ create_docx(full_segments, docx_ts_path, with_timestamps=True)
182
  outputs["DOCX (with timestamps)"] = docx_ts_path
183
 
184
+ # Generate DOCX without timestamps
185
  if docx_no_timestamp_output:
186
  docx_no_ts_path = "transcription_without_timestamps.docx"
187
+ create_docx(full_segments, docx_no_ts_path, with_timestamps=False)
188
  outputs["DOCX (without timestamps)"] = docx_no_ts_path
189
 
190
  end_time = time.time()
191
  total_time = end_time - start_time
 
192
  downloadable_files = [path for path in outputs.values()]
193
  status_message = f"βœ… Transcription complete! Total time: {total_time:.2f} seconds."
194
 
 
211
  model_selector = gr.Dropdown(
212
  label="Choose Whisper Model Size",
213
  choices=list(MODEL_SIZES.keys()),
 
214
  value="Distil-Large-v3-FR (French-Specific)"
215
  )
216
+ gr.Markdown("### Processing Options")
217
+ # NEW CHECKBOX for 5-minute sequencing
218
+ sequence_checkbox = gr.Checkbox(
219
+ label="Process in 5-minute sequences (Recommended for files > 30 min or to prevent memory errors)",
220
+ value=False
221
+ )
222
+
223
  gr.Markdown("### Choose Output Formats")
224
  with gr.Row():
225
  vtt_checkbox = gr.Checkbox(label="VTT", value=True)
 
234
 
235
  transcribe_btn.click(
236
  fn=transcribe_and_export,
237
+ # UPDATED INPUTS list to include the new checkbox
238
+ inputs=[audio_input, model_selector, vtt_checkbox, docx_ts_checkbox, docx_no_ts_checkbox, sequence_checkbox],
239
  outputs=[transcription_output, downloadable_files_output, audio_input, status_text]
240
  )
241