Spaces:

clementBE
/

Audio_transcrib_base

Paused

App Files Files Community

clementBE committed on Sep 29, 2025

Commit

2b96b70

verified ·

1 Parent(s): 49df268

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -127

app.py CHANGED Viewed

@@ -1,165 +1,158 @@
 import os
-import tempfile
 import datetime
 import time
-import torch
-import gradio as gr
-import spaces
 from transformers import pipeline
 from docx import Document
-from pydub import AudioSegment
-from sumy.parsers.plaintext import PlaintextParser
-from sumy.nlp.tokenizers import Tokenizer
-from sumy.summarizers.lex_rank import LexRankSummarizer
-import nltk
-# --- Ensure NLTK punkt tokenizer is downloaded ---
-try:
-    nltk.data.find("tokenizers/punkt")
-except LookupError:
-    nltk.download("punkt")
-# --- Model definitions ---
 MODEL_SIZES = {
     "Tiny (Fastest)": "openai/whisper-tiny",
     "Base (Faster)": "openai/whisper-base",
     "Small (Balanced)": "openai/whisper-small",
     "Distil-Large-v3 (General Purpose)": "distil-whisper/distil-large-v3",
-    "Distil-Large-v3-FR (French-Specific)": "eustlb/distil-large-v3-fr"
 }
-# --- Caches ---
 model_cache = {}
-# --- Whisper pipeline loader ---
 def get_model_pipeline(model_name, progress):
     """Return the speech-recognition pipeline for ``model_name``, building it once.

     Args:
         model_name: A key of ``MODEL_SIZES`` (the label shown in the UI).
         progress: Callable invoked as ``progress(fraction, desc=...)`` to
             report loading status; only called when the model is not cached.

     Returns:
         The pipeline object stored in ``model_cache`` for ``model_name``.

     Raises:
         KeyError: If ``model_name`` is not a key of ``MODEL_SIZES``.
     """
     # Fast path: the pipeline was already built by an earlier call.
     try:
         return model_cache[model_name]
     except KeyError:
         pass

     progress(0, desc="🚀 Loading model...")
     hub_id = MODEL_SIZES[model_name]

     # Use the first CUDA device when one is visible, otherwise stay on CPU.
     if torch.cuda.is_available():
         target_device = 0
     else:
         target_device = "cpu"

     loaded = pipeline(
         "automatic-speech-recognition",
         model=hub_id,
         device=target_device,
     )
     model_cache[model_name] = loaded
     progress(0.5, desc=f"✅ {model_name} loaded")
     return model_cache[model_name]
-# --- Extractive summary ---
def extractive_summary(text, sentences_count=7):
    """Build an extractive summary of ``text`` with LexRank.

    The text is tokenized with the "french" tokenizer, ranked by
    ``LexRankSummarizer``, and the selected sentences are joined with single
    spaces.

    Args:
        text: Plain text to summarize.
        sentences_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences as one space-separated string.
    """
    document = PlaintextParser.from_string(text, Tokenizer("french")).document
    rank = LexRankSummarizer()
    picked = rank(document, sentences_count)
    return " ".join(map(str, picked))
-# --- Extract audio from video/audio ---
def extract_audio(file_path):
    """Return the path of an audio file usable for transcription.

    Files whose extension is already .wav, .mp3, .m4a or .flac (case
    insensitive) are returned untouched. Anything else is decoded with
    ``AudioSegment.from_file`` and exported to a new temporary ``.wav`` file
    that is NOT deleted automatically.

    Args:
        file_path: Path to the uploaded audio or video file.

    Returns:
        ``file_path`` itself, or the path of the newly written WAV file.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix not in (".wav", ".mp3", ".m4a", ".flac"):
        # Reserve a unique file name, then release the handle so the export
        # below can open the path itself.
        placeholder = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        placeholder.close()
        wav_path = placeholder.name
        decoded = AudioSegment.from_file(file_path)
        decoded.export(wav_path, format="wav")
        return wav_path
    return file_path
-# --- Split audio into 10-minute chunks ---
def split_audio(audio_path):
    """Split an audio file into consecutive 10-minute WAV chunks.

    Each chunk is exported to its own temporary ``.wav`` file. The files are
    created with ``delete=False``, so the caller is responsible for removing
    them.

    Args:
        audio_path: Path of the audio file to split.

    Returns:
        A ``(chunks, labels)`` tuple of equal-length lists: ``chunks`` holds
        the temporary file paths and ``labels`` the matching
        ``"<start>-<end> min"`` strings. Labels are derived from the chunk
        index only, so the last label states a full 10-minute range even when
        the final chunk is shorter.
    """
    audio = AudioSegment.from_file(audio_path)
    chunk_length_ms = 10 * 60 * 1000  # 10 minutes
    total_ms = len(audio)
    chunks = []
    labels = []
    for i, start in enumerate(range(0, total_ms, chunk_length_ms)):
        end = min(start + chunk_length_ms, total_ms)
        temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        # Only the unique name is needed; close the handle right away so it
        # is not leaked and the export can reopen the path on any platform.
        temp_file.close()
        audio[start:end].export(temp_file.name, format="wav")
        chunks.append(temp_file.name)
        labels.append(f"{i*10}-{(i+1)*10} min")
    return chunks, labels
-# --- Export transcription to DOCX ---
def export_transcription_docx(text, file_path="transcription_full.docx"):
    """Save ``text`` as a DOCX document and return the path written.

    The document gets a "Full Transcription" heading followed by one
    paragraph per newline-separated line of ``text``, each stripped of
    surrounding whitespace (blank lines become empty paragraphs).

    Args:
        text: The transcription text.
        file_path: Destination path of the DOCX file.

    Returns:
        ``file_path``.
    """
    docx_file = Document()
    docx_file.add_heading("Full Transcription", 0)
    for line in map(str.strip, text.split("\n")):
        docx_file.add_paragraph(line)
    docx_file.save(file_path)
    return file_path
-# --- Transcribe selected chunks ---
 @spaces.GPU
 def transcribe_selected(file, model_size, selected_chunks, generate_summary, progress=gr.Progress()):
     """Transcribe the selected 10-minute parts of an uploaded file.

     Args:
         file: Path of the uploaded audio/video file, or ``None``.
         model_size: Key of ``MODEL_SIZES`` choosing the model to run.
         selected_chunks: Collection of chunk labels (as produced by
             ``split_audio``) that should be transcribed.
         generate_summary: When truthy, also compute an extractive summary.
         progress: Progress reporter called as ``progress(fraction, desc=...)``.

     Returns:
         A 4-tuple ``(full_text, docx_path, summary_text, status_message)``.
         When no file is given the first three items are ``None``.
     """
     if file is None:
         return None, None, None, "Please upload a file."

     progress(0, desc="🎬 Extracting audio...")
     audio_file = extract_audio(file)
     chunks, labels = split_audio(audio_file)

     # Everything created above lives in temp files that nothing else removes:
     # every chunk, plus the converted WAV when extract_audio had to make one.
     temp_paths = list(chunks)
     if audio_file != file:
         temp_paths.append(audio_file)

     try:
         # Keep only the chunks whose label was ticked in the UI.
         chosen_files = [path for path, label in zip(chunks, labels) if label in selected_chunks]
         pipe = get_model_pipeline(model_size, progress)

         pipe_kwargs = {"return_timestamps": True}
         if model_size == "Distil-Large-v3-FR (French-Specific)":
             # The French-specific model is told the language explicitly.
             pipe_kwargs["generate_kwargs"] = {"language": "fr"}

         pieces = []
         total = len(chosen_files)
         for idx, chunk_file in enumerate(chosen_files, start=1):
             progress(idx / total, desc=f"🎤 Transcribing chunk {idx}/{total}...")
             output = pipe(chunk_file, **pipe_kwargs)
             pieces.append(output.get("text", "") + "\n")
         full_text = "".join(pieces)
     finally:
         for path in temp_paths:
             try:
                 os.remove(path)
             except OSError:
                 # Best-effort cleanup: a file that is already gone or still
                 # locked must not mask the transcription result or error.
                 pass

     # Export full transcription DOCX
     docx_path = export_transcription_docx(full_text)

     # Generate extractive summary (optional, not shown in UI)
     summary_text = None
     if generate_summary and full_text.strip():
         summary_text = extractive_summary(full_text, sentences_count=7)

     return full_text, docx_path, summary_text, f"✅ Done. Transcribed {len(chosen_files)} parts."
 # --- Gradio UI ---
# Components are created in display order; keep the statement order intact.
with gr.Blocks(title="Whisper Chunked Transcription") as demo:
    gr.Markdown("# 🎙️ Whisper Chunked Transcription")
    gr.Markdown("Upload audio/video, select 10-minute parts to transcribe, generate extractive summary (hidden), and export full transcription as DOCX.")

    # Inputs: the uploaded file and the model to run.
    with gr.Row():
        file_input = gr.Audio(label="Upload File", sources=["upload"], type="filepath")
        model_selector = gr.Dropdown(
            choices=list(MODEL_SIZES),
            value="Distil-Large-v3-FR (French-Specific)",
            label="Whisper Model",
        )

    # Starts empty; filled by update_chunks once a file is uploaded.
    chunk_selector = gr.CheckboxGroup(choices=[], label="Select 10-minute parts")
    summary_checkbox = gr.Checkbox(value=True, label="Generate Extractive Summary")
    transcribe_btn = gr.Button("Transcribe")

    # Outputs.
    transcription_output = gr.Textbox(lines=10, label="Transcription")
    docx_output = gr.File(label="Download DOCX")
    status_text = gr.Textbox(interactive=False, label="Status")

    def update_chunks(file):
        """Return a CheckboxGroup update listing the chunk labels of ``file``."""
        if file is not None:
            _, labels = split_audio(extract_audio(file))
            return gr.update(choices=labels, value=[])
        return gr.update(choices=[])

    file_input.change(update_chunks, inputs=file_input, outputs=chunk_selector)

    transcribe_btn.click(
        fn=transcribe_selected,
        inputs=[file_input, model_selector, chunk_selector, summary_checkbox],
        # The third output (the summary) goes to an invisible textbox.
        outputs=[transcription_output, docx_output, gr.Textbox(visible=False), status_text],
    )

if __name__ == "__main__":
    demo.launch()

+import gradio as gr
+import spaces
+import torch
 import os
 import datetime
 import time
 from transformers import pipeline
 from docx import Document
+# Map each label shown in the model dropdown (which notes its relative speed) to the Hugging Face model ID loaded for it
 MODEL_SIZES = {
     "Tiny (Fastest)": "openai/whisper-tiny",
     "Base (Faster)": "openai/whisper-base",
     "Small (Balanced)": "openai/whisper-small",
     "Distil-Large-v3 (General Purpose)": "distil-whisper/distil-large-v3",
+    "Distil-Large-v3-FR (French-Specific)": "eustlb/distil-large-v3-fr" # Corrected French-specific model
 }
+# Use a dictionary to cache loaded models
 model_cache = {}
 def get_model_pipeline(model_name, progress):
     """Fetch the cached ASR pipeline for ``model_name``, creating it on first use.

     Args:
         model_name: A key of ``MODEL_SIZES`` (the label chosen in the UI).
         progress: Callable invoked as ``progress(fraction, desc=...)``; it is
             only called while a model is being loaded.

     Returns:
         The pipeline stored in ``model_cache`` under ``model_name``.

     Raises:
         KeyError: If ``model_name`` is not a key of ``MODEL_SIZES``.
     """
     # Guard clause: nothing to do when the pipeline is already cached.
     if model_name in model_cache:
         return model_cache[model_name]

     progress(0, desc="🚀 Initializing ZeroGPU instance...")
     checkpoint = MODEL_SIZES[model_name]

     # First CUDA device when available, CPU otherwise.
     has_gpu = torch.cuda.is_available()
     run_on = 0 if has_gpu else "cpu"

     progress(0.1, desc=f"⏳ Loading {model_name} model...")
     asr = pipeline(
         "automatic-speech-recognition",
         model=checkpoint,
         device=run_on,
     )
     model_cache[model_name] = asr
     progress(0.5, desc="✅ Model loaded successfully!")
     return model_cache[model_name]
def create_vtt(segments, file_path):
    """Write ``segments`` to ``file_path`` as a WebVTT subtitle file.

    Each segment is a mapping with a ``"text"`` entry and its timing given
    either as a ``"timestamp"`` pair ``(start, end)`` in seconds (the layout
    of the ``"chunks"`` returned by the speech-recognition pipeline) or as
    separate ``"start"`` / ``"end"`` entries. Cue times are written in the
    ``HH:MM:SS.mmm`` form that WebVTT requires.

    Args:
        segments: Iterable of segment mappings, in playback order.
        file_path: Destination path; the file is overwritten (UTF-8).

    Returns:
        None.
    """

    def _bounds(segment):
        """Return ``(start, end)`` in seconds for one segment."""
        start = segment.get("start")
        end = segment.get("end")
        stamp = segment.get("timestamp")
        if stamp is not None:
            stamp_start, stamp_end = stamp
            if start is None:
                start = stamp_start
            if end is None:
                end = stamp_end
        start = 0.0 if start is None else max(0.0, float(start))
        # The end of the last chunk can be missing (None); fall back to the
        # start time rather than crashing or writing an end before the start.
        end = start if end is None else max(start, float(end))
        return start, end

    def _clock(seconds):
        """Format a number of seconds as a WebVTT ``HH:MM:SS.mmm`` timestamp."""
        total_ms = int(round(seconds * 1000))
        hours, remainder = divmod(total_ms, 3600000)
        minutes, remainder = divmod(remainder, 60000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"

    with open(file_path, "w", encoding="utf-8") as f:
        f.write("WEBVTT\n\n")
        for number, segment in enumerate(segments, start=1):
            start, end = _bounds(segment)
            text = (segment.get("text") or "").strip()
            f.write(f"{number}\n{_clock(start)} --> {_clock(end)}\n{text}\n\n")
def create_docx(segments, file_path, with_timestamps):
    """Save ``segments`` as a DOCX transcription at ``file_path``.

    Each segment is a mapping with a ``"text"`` entry and its timing given
    either as a ``"timestamp"`` pair ``(start, end)`` in seconds (the layout
    of the ``"chunks"`` returned by the speech-recognition pipeline) or as
    separate ``"start"`` / ``"end"`` entries.

    Args:
        segments: Iterable of segment mappings, in playback order.
        file_path: Destination path of the DOCX file.
        with_timestamps: When truthy, write one paragraph per segment
            prefixed with ``[start - end]`` (whole seconds, ``H:MM:SS``);
            otherwise write all segment texts as one space-joined paragraph.

    Returns:
        None.
    """

    def _bounds(segment):
        """Return ``(start, end)`` in seconds for one segment."""
        start = segment.get("start")
        end = segment.get("end")
        stamp = segment.get("timestamp")
        if stamp is not None:
            stamp_start, stamp_end = stamp
            if start is None:
                start = stamp_start
            if end is None:
                end = stamp_end
        start = 0.0 if start is None else max(0.0, float(start))
        # The end of the last chunk can be missing (None); reuse the start.
        end = start if end is None else max(start, float(end))
        return start, end

    document = Document()
    document.add_heading("Transcription", 0)

    if with_timestamps:
        for segment in segments:
            text = (segment.get("text") or "").strip()
            start_seconds, end_seconds = _bounds(segment)
            start = str(datetime.timedelta(seconds=int(start_seconds)))
            end = str(datetime.timedelta(seconds=int(end_seconds)))
            document.add_paragraph(f"[{start} - {end}] {text}")
    else:
        full_text = " ".join((segment.get("text") or "").strip() for segment in segments)
        document.add_paragraph(full_text)

    document.save(file_path)
 @spaces.GPU
 def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_output, docx_no_timestamp_output, progress=gr.Progress()):
     """Transcribe ``audio_file`` and write the requested transcript files.

     Args:
         audio_file: Path of the audio to transcribe, or ``None``.
         model_size: Key of ``MODEL_SIZES`` choosing the model to run.
         vtt_output: When truthy, produce ``transcription.vtt``.
         docx_timestamp_output: When truthy, produce
             ``transcription_with_timestamps.docx``.
         docx_no_timestamp_output: When truthy, produce
             ``transcription_without_timestamps.docx``.
         progress: Progress reporter called as ``progress(fraction, desc=...)``.

     Returns:
         A 4-tuple ``(text, files, audio, status)``: the transcribed text, a
         ``gr.Files`` holding the generated file paths, a ``gr.Audio`` with
         ``value=None`` (presumably wired by the caller to clear the audio
         input), and a status message. When ``audio_file`` is ``None`` the
         first three items are ``None``.
     """
     # Local import so this function does not depend on a module-level import.
     import tempfile

     if audio_file is None:
         return (None, None, None, "Please upload an audio file.")

     start_time = time.time()
     pipe = get_model_pipeline(model_size, progress)
     progress(0.75, desc="🎤 Transcribing audio...")

     pipe_kwargs = {"return_timestamps": True}
     if model_size == "Distil-Large-v3-FR (French-Specific)":
         # The French-specific model gets the language set explicitly; all
         # other models auto-detect it.
         pipe_kwargs["generate_kwargs"] = {"language": "fr"}
     raw_output = pipe(audio_file, **pipe_kwargs)
     segments = raw_output.get("chunks", [])

     progress(0.85, desc="📝 Generating output files...")
     # Write into a fresh directory per request: fixed names in the working
     # directory would let concurrent requests overwrite each other's files.
     output_dir = tempfile.mkdtemp(prefix="transcription_")
     downloadable_files = []

     if vtt_output:
         vtt_path = os.path.join(output_dir, "transcription.vtt")
         create_vtt(segments, vtt_path)
         downloadable_files.append(vtt_path)

     if docx_timestamp_output:
         docx_ts_path = os.path.join(output_dir, "transcription_with_timestamps.docx")
         create_docx(segments, docx_ts_path, with_timestamps=True)
         downloadable_files.append(docx_ts_path)

     if docx_no_timestamp_output:
         docx_no_ts_path = os.path.join(output_dir, "transcription_without_timestamps.docx")
         create_docx(segments, docx_no_ts_path, with_timestamps=False)
         downloadable_files.append(docx_no_ts_path)

     total_time = time.time() - start_time
     transcribed_text = raw_output['text']
     status_message = f"✅ Transcription complete! Total time: {total_time:.2f} seconds."

     return (
         transcribed_text,
         gr.Files(value=downloadable_files, label="Download Transcripts"),
         gr.Audio(value=None),
         status_message
     )
 # --- Gradio UI ---
# Components are created in display order; keep the statement order intact.
with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
    gr.Markdown("# 🎙️ Whisper ZeroGPU Transcription")
    gr.Markdown("Transcribe audio with timestamps and choose your output format. The first run may take up to a minute due to cold start.")

    with gr.Row():
        # Left: the audio to transcribe (recorded or uploaded).
        audio_input = gr.Audio(label="Audio File", sources=["microphone", "upload"], type="filepath")

        # Right: model choice, output formats, action button and status.
        with gr.Column(scale=2):
            model_selector = gr.Dropdown(
                choices=list(MODEL_SIZES),
                value="Distil-Large-v3-FR (French-Specific)",  # Default to the French-specific model
                label="Choose Whisper Model Size",
            )
            gr.Markdown("### Choose Output Formats")
            with gr.Row():
                vtt_checkbox = gr.Checkbox(value=True, label="VTT")
                docx_ts_checkbox = gr.Checkbox(value=False, label="DOCX (with timestamps)")
                docx_no_ts_checkbox = gr.Checkbox(value=True, label="DOCX (without timestamps)")

            transcribe_btn = gr.Button("Transcribe", variant="primary")
            status_text = gr.Textbox(interactive=False, label="Status")

    # Results below the row: the text and the downloadable files.
    transcription_output = gr.Textbox(lines=10, label="Full Transcription")
    downloadable_files_output = gr.Files(label="Download Transcripts")

    transcribe_btn.click(
        fn=transcribe_and_export,
        inputs=[audio_input, model_selector, vtt_checkbox, docx_ts_checkbox, docx_no_ts_checkbox],
        # audio_input is also an output: the handler returns a value for it.
        outputs=[transcription_output, downloadable_files_output, audio_input, status_text],
    )

if __name__ == "__main__":
    demo.launch()