Update app.py
app.py
CHANGED
```diff
@@ -13,16 +13,22 @@ MODEL_SIZES = {
     "Base (Faster)": "openai/whisper-base",
     "Small (Balanced)": "openai/whisper-small",
     "Distil-Large-v3 (General Purpose)": "distil-whisper/distil-large-v3",
-    "Distil-Large-v3-FR (French-Specific)": "distil-whisper/distil-large-v3-fr"
+    # FIX: The model 'distil-whisper/distil-large-v3-fr' does not exist.
+    # We use the general distil-large-v3 and rely on the code below to force French.
+    "Distil-Large-v3-FR (French-Specific)": "distil-whisper/distil-large-v3"
 }
 
 # Use a dictionary to cache loaded models
 model_cache = {}
 
 def get_model_pipeline(model_name, progress):
+    """
+    Initializes and caches the ASR pipeline for a given model name.
+    """
     if model_name not in model_cache:
         progress(0, desc="🚀 Initializing ZeroGPU instance...")
         model_id = MODEL_SIZES[model_name]
+        # Use GPU if available, otherwise fall back to CPU
         device = 0 if torch.cuda.is_available() else "cpu"
 
         progress(0.1, desc=f"⏳ Loading {model_name} model...")
```
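The pipeline construction between this hunk and the next (old lines 29-34) is collapsed in the diff view. For orientation, a minimal sketch of what such a cached loader typically looks like, assuming the standard `transformers.pipeline` API; the exact arguments in `app.py` may differ, and the `progress` calls are omitted here:

```python
import torch
from transformers import pipeline

MODEL_SIZES = {"Base (Faster)": "openai/whisper-base"}  # abbreviated
model_cache = {}

def get_model_pipeline(model_name, progress):
    """Initializes and caches the ASR pipeline for a given model name."""
    if model_name not in model_cache:
        model_id = MODEL_SIZES[model_name]
        # Use GPU device 0 if available, otherwise fall back to CPU
        device = 0 if torch.cuda.is_available() else "cpu"
        # The first call downloads the checkpoint; later calls reuse the cached object
        model_cache[model_name] = pipeline(
            "automatic-speech-recognition",
            model=model_id,
            device=device,
        )
    return model_cache[model_name]
```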
```diff
@@ -35,37 +41,54 @@ def get_model_pipeline(model_name, progress):
     return model_cache[model_name]
 
 def create_vtt(segments, file_path):
+    """
+    Creates a WebVTT (.vtt) file from transcription segments.
+    """
     with open(file_path, "w", encoding="utf-8") as f:
         f.write("WEBVTT\n\n")
         for i, segment in enumerate(segments):
-
-
-
-
+            # Build timestamps in the "HH:MM:SS.mmm" format that WebVTT requires
+            start_ms = int(segment.get('start', 0) * 1000)
+            end_ms = int(segment.get('end', 0) * 1000)
+
+            def format_time(ms):
+                hours, remainder = divmod(ms, 3600000)
+                minutes, remainder = divmod(remainder, 60000)
+                seconds, milliseconds = divmod(remainder, 1000)
+                return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}.{int(milliseconds):03}"
+
+            start = format_time(start_ms)
+            end = format_time(end_ms)
+
             f.write(f"{i+1}\n")
             f.write(f"{start} --> {end}\n")
             f.write(f"{segment.get('text', '').strip()}\n\n")
 
 def create_docx(segments, file_path, with_timestamps):
+    """
+    Creates a DOCX (.docx) file from transcription segments.
+    """
     document = Document()
     document.add_heading("Transcription", 0)
 
     if with_timestamps:
         for segment in segments:
             text = segment.get('text', '').strip()
-
-
-
-            end = str(datetime.timedelta(seconds=int(end_seconds)))
+            # Format times as HH:MM:SS for the DOCX
+            start = str(datetime.timedelta(seconds=int(segment.get('start', 0))))
+            end = str(datetime.timedelta(seconds=int(segment.get('end', 0))))
             document.add_paragraph(f"[{start} - {end}] {text}")
     else:
         full_text = " ".join([segment.get('text', '').strip() for segment in segments])
         document.add_paragraph(full_text)
-
+
     document.save(file_path)
 
 @spaces.GPU
 def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_output, docx_no_timestamp_output, progress=gr.Progress()):
+    """
+    Main function to transcribe audio and export to selected formats.
+    """
     if audio_file is None:
         return (None, None, None, "Please upload an audio file.")
 
```
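One caveat worth noting while reading these helpers: `transformers` ASR pipelines return each chunk as `{'text': ..., 'timestamp': (start, end)}` rather than with separate `start`/`end` keys, so `segment.get('start', 0)` and `segment.get('end', 0)` fall back to 0 for raw pipeline chunks. A hypothetical adapter (not part of `app.py`) that would bridge the two shapes before calling `create_vtt`/`create_docx`:

```python
def normalize_segments(chunks):
    """Hypothetical helper: map pipeline chunks, shaped like
    {"text": ..., "timestamp": (start, end)}, onto the
    {"text", "start", "end"} shape the export helpers read."""
    segments = []
    for chunk in chunks:
        start, end = chunk.get("timestamp") or (0.0, 0.0)
        segments.append({
            "text": chunk.get("text", ""),
            "start": start or 0.0,
            "end": end or 0.0,  # the final chunk's end can be None
        })
    return segments
```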
```diff
@@ -75,22 +98,29 @@ def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_output, docx_no_timestamp_output, progress=gr.Progress()):
 
     progress(0.75, desc="🎤 Transcribing audio...")
 
-    #
-    # Note: If the user picks a different model, the language auto-detection will work as normal.
+    # Check if the French-specific model option was selected
     if model_size == "Distil-Large-v3-FR (French-Specific)":
+        # Force French for this specific option
         raw_output = pipe(
             audio_file,
-            return_timestamps=True,
+            return_timestamps="word",  # word-level timestamps; True would return chunk-level timestamps instead
             generate_kwargs={"language": "fr"}
         )
     else:
-        # For other models, let the model auto-detect
+        # For other models, let the model auto-detect the language
         raw_output = pipe(
             audio_file,
-            return_timestamps=True,
+            return_timestamps="word",
         )
 
+    # Use 'chunks' if available, otherwise fall back to the whole text
     segments = raw_output.get("chunks", [])
+
+    # If no chunks are returned (e.g., if return_timestamps were disabled),
+    # create a single segment from the full text.
+    if not segments and 'text' in raw_output:
+        segments = [{'text': raw_output['text'].strip(), 'start': 0.0, 'end': 0.0}]
+
     outputs = {}
 
     progress(0.85, desc="📝 Generating output files...")
```
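The new fallback guards the shape of `raw_output` before the export step; a self-contained illustration, with a hypothetical result dict standing in for real pipeline output:

```python
raw_output = {"text": " Bonjour tout le monde."}  # hypothetical output with no "chunks" key
segments = raw_output.get("chunks", [])
if not segments and "text" in raw_output:
    segments = [{"text": raw_output["text"].strip(), "start": 0.0, "end": 0.0}]
assert segments == [{"text": "Bonjour tout le monde.", "start": 0.0, "end": 0.0}]
```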
```diff
@@ -119,7 +149,7 @@ def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_output, docx_no_timestamp_output, progress=gr.Progress()):
     return (
         transcribed_text,
         gr.Files(value=downloadable_files, label="Download Transcripts"),
-        gr.Audio(value=None),
+        gr.Audio(value=None),  # Clear the audio input
         status_message
     )
 
```
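Returning `gr.Audio(value=None)` uses Gradio's component-update idiom: an event handler may return a fresh component instance for one of its output slots to overwrite that component's value. A minimal sketch with hypothetical component names:

```python
import gradio as gr

def run_and_clear(audio_path):
    # ... transcription would happen here ...
    return "Done.", gr.Audio(value=None)  # the second return value resets the audio input

with gr.Blocks() as demo:
    audio_in = gr.Audio(type="filepath")
    status = gr.Textbox(label="Status")
    gr.Button("Run").click(run_and_clear, inputs=audio_in, outputs=[status, audio_in])
```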
```diff
@@ -135,7 +165,8 @@ with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
     model_selector = gr.Dropdown(
         label="Choose Whisper Model Size",
         choices=list(MODEL_SIZES.keys()),
-
+        # Default to the French-specific model, which now uses the correct ID
+        value="Distil-Large-v3-FR (French-Specific)"
     )
     gr.Markdown("### Choose Output Formats")
     with gr.Row():
```
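As a final sanity check on the timestamp math introduced in `create_vtt`, the `format_time` helper reproduced standalone with a worked example:

```python
def format_time(ms):
    hours, remainder = divmod(ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}.{int(milliseconds):03}"

# 3,725,250 ms = 1 h + 2 min + 5 s + 250 ms
print(format_time(3_725_250))  # -> 01:02:05.250
```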