clementBE committed on
Commit
7b256c2
·
verified ·
1 Parent(s): 6e566c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -82
app.py CHANGED
@@ -14,14 +14,15 @@ MODEL_SIZES = {
14
  "Base (Faster)": "openai/whisper-base",
15
  "Small (Balanced)": "openai/whisper-small",
16
  "Distil-Large-v3 (General Purpose)": "distil-whisper/distil-large-v3",
17
- # FIX: The model 'distil-whisper/distil-large-v3-fr' does not exist.
18
- # We use the general distil-large-v3 and rely on the code below to force French.
19
  "Distil-Large-v3-FR (French-Specific)": "distil-whisper/distil-large-v3"
20
  }
21
 
22
  # Use a dictionary to cache loaded models
23
  model_cache = {}
24
 
 
 
 
25
  def get_model_pipeline(model_name, progress):
26
  """
27
  Initializes and caches the ASR pipeline for a given model name.
@@ -29,7 +30,6 @@ def get_model_pipeline(model_name, progress):
29
  if model_name not in model_cache:
30
  progress(0, desc="πŸš€ Initializing ZeroGPU instance...")
31
  model_id = MODEL_SIZES[model_name]
32
- # Use GPU if available, otherwise fallback to CPU
33
  device = 0 if torch.cuda.is_available() else "cpu"
34
 
35
  progress(0.1, desc=f"⏳ Loading {model_name} model...")
@@ -37,12 +37,15 @@ def get_model_pipeline(model_name, progress):
37
  "automatic-speech-recognition",
38
  model=model_id,
39
  device=device,
40
- # Set max_new_tokens for generation, common for ASR
41
  max_new_tokens=128
42
  )
43
  progress(0.5, desc="βœ… Model loaded successfully!")
44
  return model_cache[model_name]
45
 
 
 
 
 
46
  def create_vtt(segments, file_path):
47
  """
48
  Creates a WebVTT (.vtt) file from transcription segments.
@@ -50,7 +53,6 @@ def create_vtt(segments, file_path):
50
  with open(file_path, "w", encoding="utf-8") as f:
51
  f.write("WEBVTT\n\n")
52
  for i, segment in enumerate(segments):
53
- # Calculate time strings in "HH:MM:SS.mmm" format
54
  start_ms = int(segment.get('start', 0) * 1000)
55
  end_ms = int(segment.get('end', 0) * 1000)
56
 
@@ -77,9 +79,8 @@ def create_docx(segments, file_path, with_timestamps):
77
  if with_timestamps:
78
  for segment in segments:
79
  text = segment.get('text', '').strip()
80
- # Format time as HH:MM:SS for DOCX
81
- start = str(datetime.timedelta(seconds=int(segment.get('start', 0))))
82
- end = str(datetime.timedelta(seconds=int(segment.get('end', 0))))
83
  document.add_paragraph(f"[{start} - {end}] {text}")
84
  else:
85
  full_text = " ".join([segment.get('text', '').strip() for segment in segments])
@@ -87,101 +88,123 @@ def create_docx(segments, file_path, with_timestamps):
87
 
88
  document.save(file_path)
89
 
90
- @spaces.GPU
91
- def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_output, docx_no_timestamp_output, sequence_5_min, progress=gr.Progress()):
92
  """
93
- Main function to transcribe audio and export to selected formats.
94
- Added logic for 5-minute sequencing.
95
  """
96
  if audio_file is None:
97
- return (None, None, None, "Please upload an audio file.")
98
-
99
- start_time = time.time()
100
-
101
- pipe = get_model_pipeline(model_size, progress)
102
-
103
- # Define generation arguments
104
- generate_kwargs = {}
105
- if model_size == "Distil-Large-v3-FR (French-Specific)":
106
- # Force French for this specific option
107
- generate_kwargs["language"] = "fr"
108
 
109
- full_segments = []
110
- full_text_list = []
111
-
112
- # --- New 5-Minute Sequencing Logic ---
113
- if sequence_5_min:
114
- progress(0.70, desc="βœ‚οΈ Splitting audio into 5-minute chunks...")
115
  audio = AudioSegment.from_file(audio_file)
116
- chunk_length_ms = 5 * 60 * 1000 # 5 minutes in milliseconds
117
  total_duration_ms = len(audio)
118
- num_chunks = (total_duration_ms + chunk_length_ms - 1) // chunk_length_ms # Ceiling division
119
 
 
120
  for i in range(num_chunks):
121
- start_ms = i * chunk_length_ms
122
- end_ms = min((i + 1) * chunk_length_ms, total_duration_ms)
123
 
124
- progress_val = 0.70 + (i / num_chunks) * 0.15
125
- progress(progress_val, desc=f"🎀 Transcribing chunk {i+1}/{num_chunks}...")
126
 
127
- chunk = audio[start_ms:end_ms]
128
- temp_chunk_path = f"/tmp/chunk_{i}.mp3" # Save as a temp file for the pipeline
129
- chunk.export(temp_chunk_path, format="mp3")
130
 
131
- # Transcribe the chunk
132
- chunk_output = pipe(
133
- temp_chunk_path,
134
- return_timestamps="word",
135
- generate_kwargs=generate_kwargs
136
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
- # Adjust timestamps for the full file
139
- offset = start_ms / 1000.0
140
- chunk_segments = chunk_output.get("chunks", [])
141
- for segment in chunk_segments:
142
- segment['start'] = segment.get('start', 0.0) + offset
143
- segment['end'] = segment.get('end', 0.0) + offset
144
- full_segments.append(segment)
145
 
146
- full_text_list.append(chunk_output.get('text', ''))
 
147
 
148
- os.remove(temp_chunk_path) # Clean up temp file
 
 
 
149
 
150
- transcribed_text = " ".join(full_text_list).strip()
 
151
 
152
- else:
153
- # Standard transcription for the whole file at once
154
- progress(0.75, desc="🎀 Transcribing full audio file...")
155
- raw_output = pipe(
156
- audio_file,
157
- return_timestamps="word",
158
- generate_kwargs=generate_kwargs
159
- )
160
- full_segments = raw_output.get("chunks", [])
161
- transcribed_text = raw_output.get('text', '').strip()
 
 
 
 
 
 
 
 
 
162
 
163
- # Ensure segments is not empty
164
- if not full_segments and transcribed_text:
165
- # Create a single segment from the full text if chunks were not generated for some reason
166
- full_segments = [{'text': transcribed_text, 'start': 0.0, 'end': 0.0}]
 
 
 
 
 
 
167
 
 
168
  outputs = {}
169
-
170
  progress(0.85, desc="πŸ“ Generating output files...")
171
 
172
- # Generate VTT
173
  if vtt_output:
174
  vtt_path = "transcription.vtt"
175
  create_vtt(full_segments, vtt_path)
176
  outputs["VTT"] = vtt_path
177
 
178
- # Generate DOCX with timestamps
179
  if docx_timestamp_output:
180
  docx_ts_path = "transcription_with_timestamps.docx"
181
  create_docx(full_segments, docx_ts_path, with_timestamps=True)
182
  outputs["DOCX (with timestamps)"] = docx_ts_path
183
 
184
- # Generate DOCX without timestamps
185
  if docx_no_timestamp_output:
186
  docx_no_ts_path = "transcription_without_timestamps.docx"
187
  create_docx(full_segments, docx_no_ts_path, with_timestamps=False)
@@ -195,14 +218,14 @@ def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_out
195
  return (
196
  transcribed_text,
197
  gr.Files(value=downloadable_files, label="Download Transcripts"),
198
- gr.Audio(value=None), # Clear the audio input
199
  status_message
200
  )
201
 
202
  # --- Gradio UI ---
203
  with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
204
  gr.Markdown("# πŸŽ™οΈ Whisper ZeroGPU Transcription")
205
- gr.Markdown("Transcribe audio with timestamps and choose your output format. The first run may take up to a minute due to cold start.")
206
 
207
  with gr.Row():
208
  audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio File")
@@ -213,13 +236,18 @@ with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
213
  choices=list(MODEL_SIZES.keys()),
214
  value="Distil-Large-v3-FR (French-Specific)"
215
  )
216
- gr.Markdown("### Processing Options")
217
- # NEW CHECKBOX for 5-minute sequencing
218
- sequence_checkbox = gr.Checkbox(
219
- label="Process in 5-minute sequences (Recommended for files > 30 min or to prevent memory errors)",
220
- value=False
221
- )
222
 
 
 
 
 
 
 
 
 
 
 
 
223
  gr.Markdown("### Choose Output Formats")
224
  with gr.Row():
225
  vtt_checkbox = gr.Checkbox(label="VTT", value=True)
@@ -232,10 +260,17 @@ with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
232
  transcription_output = gr.Textbox(label="Full Transcription", lines=10)
233
  downloadable_files_output = gr.Files(label="Download Transcripts")
234
 
 
 
 
 
 
 
 
 
235
  transcribe_btn.click(
236
  fn=transcribe_and_export,
237
- # UPDATED INPUTS list to include the new checkbox
238
- inputs=[audio_input, model_selector, vtt_checkbox, docx_ts_checkbox, docx_no_ts_checkbox, sequence_checkbox],
239
  outputs=[transcription_output, downloadable_files_output, audio_input, status_text]
240
  )
241
 
 
14
  "Base (Faster)": "openai/whisper-base",
15
  "Small (Balanced)": "openai/whisper-small",
16
  "Distil-Large-v3 (General Purpose)": "distil-whisper/distil-large-v3",
 
 
17
  "Distil-Large-v3-FR (French-Specific)": "distil-whisper/distil-large-v3"
18
  }
19
 
20
  # Use a dictionary to cache loaded models
21
  model_cache = {}
22
 
23
+ # Define the fixed chunk length (5 minutes in milliseconds)
24
+ CHUNK_LENGTH_MS = 5 * 60 * 1000
25
+
26
  def get_model_pipeline(model_name, progress):
27
  """
28
  Initializes and caches the ASR pipeline for a given model name.
 
30
  if model_name not in model_cache:
31
  progress(0, desc="πŸš€ Initializing ZeroGPU instance...")
32
  model_id = MODEL_SIZES[model_name]
 
33
  device = 0 if torch.cuda.is_available() else "cpu"
34
 
35
  progress(0.1, desc=f"⏳ Loading {model_name} model...")
 
37
  "automatic-speech-recognition",
38
  model=model_id,
39
  device=device,
 
40
  max_new_tokens=128
41
  )
42
  progress(0.5, desc="βœ… Model loaded successfully!")
43
  return model_cache[model_name]
44
 
45
+ # Helper function to format seconds to HH:MM:SS string
46
+ def format_seconds(seconds):
47
+ return str(datetime.timedelta(seconds=int(seconds)))
48
+
49
  def create_vtt(segments, file_path):
50
  """
51
  Creates a WebVTT (.vtt) file from transcription segments.
 
53
  with open(file_path, "w", encoding="utf-8") as f:
54
  f.write("WEBVTT\n\n")
55
  for i, segment in enumerate(segments):
 
56
  start_ms = int(segment.get('start', 0) * 1000)
57
  end_ms = int(segment.get('end', 0) * 1000)
58
 
 
79
  if with_timestamps:
80
  for segment in segments:
81
  text = segment.get('text', '').strip()
82
+ start = format_seconds(segment.get('start', 0))
83
+ end = format_seconds(segment.get('end', 0))
 
84
  document.add_paragraph(f"[{start} - {end}] {text}")
85
  else:
86
  full_text = " ".join([segment.get('text', '').strip() for segment in segments])
 
88
 
89
  document.save(file_path)
90
 
91
+ # --- NEW FUNCTION: Analyze Audio and Populate Dropdown ---
92
+ def analyze_audio_and_get_chunks(audio_file):
93
  """
94
+ Reads the audio file and generates chunk options for the dropdown.
 
95
  """
96
  if audio_file is None:
97
+ return gr.Dropdown(choices=["Full Audio"], value="Full Audio", interactive=False), "Please upload an audio file first."
 
 
 
 
 
 
 
 
 
 
98
 
99
+ try:
 
 
 
 
 
100
  audio = AudioSegment.from_file(audio_file)
 
101
  total_duration_ms = len(audio)
102
+ num_chunks = (total_duration_ms + CHUNK_LENGTH_MS - 1) // CHUNK_LENGTH_MS
103
 
104
+ chunk_options = ["Full Audio"]
105
  for i in range(num_chunks):
106
+ start_ms = i * CHUNK_LENGTH_MS
107
+ end_ms = min((i + 1) * CHUNK_LENGTH_MS, total_duration_ms)
108
 
109
+ start_sec = start_ms / 1000
110
+ end_sec = end_ms / 1000
111
 
112
+ start_time_str = format_seconds(start_sec).split('.')[0]
113
+ end_time_str = format_seconds(end_sec).split('.')[0]
 
114
 
115
+ option_name = f"Chunk {i+1} ({start_time_str} - {end_time_str})"
116
+ chunk_options.append(option_name)
117
+
118
+ status = f"Audio analyzed. Duration: {format_seconds(total_duration_ms/1000.0)}. Found {num_chunks} chunks."
119
+ return gr.Dropdown(choices=chunk_options, value="Full Audio", interactive=True), status
120
+
121
+ except Exception as e:
122
+ error_msg = f"Error analyzing audio: {e}"
123
+ return gr.Dropdown(choices=["Full Audio"], value="Full Audio", interactive=False), error_msg
124
+ # --------------------------------------------------------
125
+
126
+
127
+ @spaces.GPU
128
+ def transcribe_and_export(audio_file, model_size, chunk_choice, vtt_output, docx_timestamp_output, docx_no_timestamp_output, progress=gr.Progress()):
129
+ """
130
+ Main function to transcribe audio and export to selected formats.
131
+ Modified to process a single selected chunk or the full audio.
132
+ """
133
+ if audio_file is None:
134
+ return (None, None, None, "Please upload an audio file.")
135
+
136
+ start_time = time.time()
137
+ pipe = get_model_pipeline(model_size, progress)
138
+
139
+ # 1. Determine which segment to process
140
+ audio_segment_to_process = audio_file
141
+ offset = 0.0 # Time offset for segment timestamps
142
+
143
+ if chunk_choice != "Full Audio":
144
+ progress(0.70, desc="βœ‚οΈ Preparing audio segment...")
145
+ try:
146
+ # Parse chunk number from choice string (e.g., "Chunk 2 (5:00:00 - 10:00:00)")
147
+ chunk_num = int(chunk_choice.split(' ')[1]) - 1
148
 
149
+ full_audio = AudioSegment.from_file(audio_file)
150
+ total_duration_ms = len(full_audio)
 
 
 
 
 
151
 
152
+ start_ms = chunk_num * CHUNK_LENGTH_MS
153
+ end_ms = min((chunk_num + 1) * CHUNK_LENGTH_MS, total_duration_ms)
154
 
155
+ # Slice the audio
156
+ chunk = full_audio[start_ms:end_ms]
157
+ temp_chunk_path = "/tmp/selected_chunk.mp3"
158
+ chunk.export(temp_chunk_path, format="mp3")
159
 
160
+ audio_segment_to_process = temp_chunk_path
161
+ offset = start_ms / 1000.0 # Offset is the start time of the chunk in seconds
162
 
163
+ except Exception as e:
164
+ return (None, None, None, f"Error preparing audio chunk: {e}")
165
+
166
+ # 2. Define generation arguments (Language)
167
+ generate_kwargs = {}
168
+ if model_size == "Distil-Large-v3-FR (French-Specific)":
169
+ generate_kwargs["language"] = "fr"
170
+
171
+ # 3. Transcribe the segment
172
+ progress(0.75, desc=f"🎀 Transcribing {chunk_choice}...")
173
+ raw_output = pipe(
174
+ audio_segment_to_process,
175
+ return_timestamps="word",
176
+ generate_kwargs=generate_kwargs
177
+ )
178
+
179
+ # 4. Process and adjust segments
180
+ full_segments = raw_output.get("chunks", [])
181
+ transcribed_text = raw_output.get('text', '').strip()
182
 
183
+ # Adjust timestamps if a chunk was processed
184
+ if chunk_choice != "Full Audio":
185
+ for segment in full_segments:
186
+ # Add the offset to the segment start and end times
187
+ segment['start'] = segment.get('start', 0.0) + offset
188
+ segment['end'] = segment.get('end', 0.0) + offset
189
+
190
+ # Clean up the temporary file
191
+ if os.path.exists(audio_segment_to_process):
192
+ os.remove(audio_segment_to_process)
193
 
194
+ # 5. Generate output files
195
  outputs = {}
 
196
  progress(0.85, desc="πŸ“ Generating output files...")
197
 
 
198
  if vtt_output:
199
  vtt_path = "transcription.vtt"
200
  create_vtt(full_segments, vtt_path)
201
  outputs["VTT"] = vtt_path
202
 
 
203
  if docx_timestamp_output:
204
  docx_ts_path = "transcription_with_timestamps.docx"
205
  create_docx(full_segments, docx_ts_path, with_timestamps=True)
206
  outputs["DOCX (with timestamps)"] = docx_ts_path
207
 
 
208
  if docx_no_timestamp_output:
209
  docx_no_ts_path = "transcription_without_timestamps.docx"
210
  create_docx(full_segments, docx_no_ts_path, with_timestamps=False)
 
218
  return (
219
  transcribed_text,
220
  gr.Files(value=downloadable_files, label="Download Transcripts"),
221
+ gr.Audio(value=None),
222
  status_message
223
  )
224
 
225
  # --- Gradio UI ---
226
  with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
227
  gr.Markdown("# πŸŽ™οΈ Whisper ZeroGPU Transcription")
228
+ gr.Markdown("1. **Upload** an audio file. 2. Click **'Analyze Audio'** to load the 5-minute chunks. 3. Select a chunk or **'Full Audio'** and click **'Transcribe'**.")
229
 
230
  with gr.Row():
231
  audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio File")
 
236
  choices=list(MODEL_SIZES.keys()),
237
  value="Distil-Large-v3-FR (French-Specific)"
238
  )
 
 
 
 
 
 
239
 
240
+ # NEW: Button to analyze audio and populate chunk options
241
+ analyze_btn = gr.Button("Analyze Audio πŸ”Ž", variant="secondary")
242
+
243
+ # NEW: Dropdown for chunk selection
244
+ chunk_selector = gr.Dropdown(
245
+ label="Select Audio Segment (5-minute chunks)",
246
+ choices=["Full Audio"],
247
+ value="Full Audio",
248
+ interactive=False # Disabled until audio is uploaded and analyzed
249
+ )
250
+
251
  gr.Markdown("### Choose Output Formats")
252
  with gr.Row():
253
  vtt_checkbox = gr.Checkbox(label="VTT", value=True)
 
260
  transcription_output = gr.Textbox(label="Full Transcription", lines=10)
261
  downloadable_files_output = gr.Files(label="Download Transcripts")
262
 
263
+ # NEW: Link the analyze button to the analysis function
264
+ analyze_btn.click(
265
+ fn=analyze_audio_and_get_chunks,
266
+ inputs=[audio_input],
267
+ outputs=[chunk_selector, status_text]
268
+ )
269
+
270
+ # UPDATED: Link the transcribe button to the transcription function
271
  transcribe_btn.click(
272
  fn=transcribe_and_export,
273
+ inputs=[audio_input, model_selector, chunk_selector, vtt_checkbox, docx_ts_checkbox, docx_no_ts_checkbox],
 
274
  outputs=[transcription_output, downloadable_files_output, audio_input, status_text]
275
  )
276