Spaces:

clementBE
/

Audio_transcrib_base

Paused

App Files Files Community

clementBE committed on Sep 29, 2025

Commit

ded0018

verified ·

1 Parent(s): 7d4389c

Update app.py

Browse files

Files changed (1) hide show

app.py +145 -121

app.py CHANGED Viewed

@@ -1,135 +1,159 @@
-import os
-import uuid
-import shutil
-import whisper
-import librosa
 import gradio as gr
 from docx import Document
-MODEL_SPEED = {
-    "tiny": 10,
-    "base": 5,
-    "small": 3,
-    "medium": 2,
-    "large": 1
 }
-def format_timestamp(seconds):
-    h = int(seconds // 3600)
-    m = int((seconds % 3600) // 60)
-    s = seconds % 60
-    return f"{h:02d}:{m:02d}:{s:06.3f}"
-def write_vtt(segments, vtt_path):
-    with open(vtt_path, "w", encoding="utf-8") as f:
         f.write("WEBVTT\n\n")
-        for i, seg in enumerate(segments):
-            start = format_timestamp(seg['start'])
-            end = format_timestamp(seg['end'])
-            f.write(f"{i+1}\n{start} --> {end}\n{seg['text'].strip()}\n\n")
-def generate_docx(segments, docx_path):
-    doc = Document()
-    for seg in segments:
-        start = format_timestamp(seg['start'])
-        end = format_timestamp(seg['end'])
-        doc.add_paragraph(f"{start} - {end}: {seg['text']}")
-    doc.save(docx_path)
-def generate_docx_no_timestamps(segments, docx_path):
-    doc = Document()
-    full_text = " ".join(seg['text'].strip() for seg in segments)
-    doc.add_paragraph(full_text)
-    doc.save(docx_path)
-def process(audio_file_path, model_name):
-    session_id = str(uuid.uuid4())
-    base_dir = os.path.join("session_data", session_id)
-    os.makedirs(base_dir, exist_ok=True)
-    audio_path = os.path.join(base_dir, os.path.basename(audio_file_path))
-    shutil.copy(audio_file_path, audio_path)
-    duration = librosa.get_duration(path=audio_path)
-    speed_factor = MODEL_SPEED.get(model_name, 4)
-    estimated_time = round(duration / speed_factor, 2)
-    log = f"🔍 File: {os.path.basename(audio_path)}\n"
-    log += f"📏 Duration: {duration:.2f} sec\n"
-    log += f"🧠 Model: {model_name}\n"
-    log += f"⏱ Estimated time: ~{estimated_time} sec\n\n"
-    log += "🚀 Loading model...\n"
-    yield None, None, None, None, log
-    model = whisper.load_model(model_name)
-    log += "✅ Model loaded. Transcribing...\n"
-    yield None, None, None, None, log
-    result = model.transcribe(audio_path)
-    log += "📝 Transcription complete. Writing files...\n"
-    yield None, None, None, None, log
-    segments = result.get('segments', [{
-        'start': 0,
-        'end': result.get('duration', 0),
-        'text': result.get('text', '')
-    }])
-    audio_id = os.path.splitext(os.path.basename(audio_path))[0]
-    vtt_path = os.path.join(base_dir, f"{audio_id}.vtt")
-    docx_path = os.path.join(base_dir, f"{audio_id}.docx")
-    docx_no_ts_path = os.path.join(base_dir, f"{audio_id}_no_timestamps.docx")
-    html_path = os.path.join(base_dir, f"{audio_id}.html")
-    write_vtt(segments, vtt_path)
-    generate_docx(segments, docx_path)
-    generate_docx_no_timestamps(segments, docx_no_ts_path)
-    with open(html_path, "w", encoding="utf-8") as f:
-        f.write(f"<html><head><title>{audio_id} Transcript</title></head><body>\n")
-        f.write(f"<h1>Transcript for {audio_id}</h1>\n")
-        for seg in segments:
-            start_str = format_timestamp(seg['start'])
-            end_str = format_timestamp(seg['end'])
-            f.write(f"<p><b>{start_str} → {end_str}</b><br><span contenteditable='true'>{seg['text']}</span></p>\n")
-        f.write("</body></html>")
-    log += "✅ All done!"
-    yield html_path, vtt_path, docx_path, docx_no_ts_path, log
-# ===================== Gradio UI =====================
-with gr.Blocks(title="Whisper MP3 Transcription Tool") as app:
-    gr.Markdown("## 🎙️ Whisper Transcription Tool")
-    gr.Markdown("Upload an MP3 file, select a model, and download your transcript in multiple formats.")
-    gr.Markdown("We recommend the base model.")
-    with gr.Row():
-        audio_input = gr.Audio(type="filepath", label="🎵 Upload MP3")
-        model_selector = gr.Dropdown(
-            choices=["tiny", "base", "small", "medium", "large"],
-            value="base",
-            label="🧠 Whisper model"
         )
-    run_button = gr.Button("🚀 Transcribe")
     with gr.Row():
-        html_output = gr.File(label="📝 HTML Transcript")
-        vtt_output = gr.File(label="🔤 Subtitle (VTT)")
-        docx_output = gr.File(label="📄 Word Document (with timestamps)")
-        docx_no_ts_output = gr.File(label="📄 Word Document (no timestamps)")
-    progress_box = gr.Textbox(label="📡 Processing Log", lines=20)
-    run_button.click(
-        fn=process,
-        inputs=[audio_input, model_selector],
-        outputs=[html_output, vtt_output, docx_output, docx_no_ts_output, progress_box],
-        show_progress=True,
-        api_name="transcribe"
     )
 if __name__ == "__main__":
-    app.launch()

 import gradio as gr
+import spaces
+import torch
+import os
+import datetime
+import time
+from transformers import pipeline
 from docx import Document
# Maps each label shown in the model dropdown to the Hugging Face Hub model id
# that is passed as `model=` when the pipeline is built. The speed hints live
# in the labels themselves; the values are repository ids, not timings.
MODEL_SIZES = {
    "Tiny (Fastest)": "openai/whisper-tiny",
    "Base (Faster)": "openai/whisper-base",
    "Small (Balanced)": "openai/whisper-small",
    "Distil-Large-v3 (General Purpose)": "distil-whisper/distil-large-v3",
    "Distil-Large-v2-FR (French-Specific)": "distil-whisper/distil-large-v2-fr",  # French-specific model
}

# Already-built pipelines, keyed by the same dropdown label as MODEL_SIZES, so
# a model that was loaded once is reused instead of being loaded again.
model_cache = {}
def get_model_pipeline(model_name, progress):
    """Return the speech-recognition pipeline for ``model_name``, building it on first use.

    Args:
        model_name: A key of ``MODEL_SIZES`` (the label chosen in the dropdown).
        progress: Callable invoked as ``progress(fraction, desc=...)`` to report
            loading status; only called when the model is not cached yet.

    Returns:
        The pipeline object stored in ``model_cache`` for this label.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODEL_SIZES``.
    """
    # Fast path: this label has already been loaded.
    if model_name in model_cache:
        return model_cache[model_name]

    progress(0, desc="🚀 Initializing ZeroGPU instance...")
    hub_id = MODEL_SIZES[model_name]

    # Use the first CUDA device when one is visible, otherwise run on CPU.
    if torch.cuda.is_available():
        target_device = 0
    else:
        target_device = "cpu"

    progress(0.1, desc=f"⏳ Loading {model_name} model...")
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=hub_id,
        device=target_device,
    )
    model_cache[model_name] = asr_pipeline
    progress(0.5, desc="✅ Model loaded successfully!")
    return asr_pipeline
def _segment_bounds(segment):
    """Return ``(start, end)`` in seconds for one transcription segment.

    Accepts both shapes a segment may have:
    * ``{"timestamp": (start, end), ...}`` -- the chunk format produced by the
      transformers ASR pipeline when ``return_timestamps=True``;
    * ``{"start": ..., "end": ...}`` -- the flat format this module read before.

    A missing/``None`` start becomes 0, and a missing/``None`` end falls back
    to the start (the pipeline can leave the final chunk's end unset).
    """
    stamp = segment.get("timestamp")
    if stamp is not None:
        start, end = stamp
    else:
        start, end = segment.get("start", 0), segment.get("end", 0)
    if start is None:
        start = 0
    if end is None:
        end = start
    return start, end


def _format_vtt_timestamp(seconds):
    """Format a number of seconds as a WebVTT cue time, ``HH:MM:SS.mmm``."""
    # Work in integer milliseconds so rounding can never yield e.g. "59.1000".
    total_ms = max(0, round(float(seconds) * 1000))
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"


def create_vtt(segments, file_path):
    """Write ``segments`` to ``file_path`` as a WebVTT subtitle file.

    Args:
        segments: Iterable of dicts with a ``text`` entry and either a
            ``timestamp`` ``(start, end)`` tuple or ``start``/``end`` entries,
            all in seconds.
        file_path: Destination path; the file is created or overwritten
            (UTF-8).

    Each segment becomes one numbered cue (numbering starts at 1) with
    millisecond-precision ``HH:MM:SS.mmm`` times and whitespace-stripped text.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("WEBVTT\n\n")
        for cue_number, segment in enumerate(segments, start=1):
            start_seconds, end_seconds = _segment_bounds(segment)
            start = _format_vtt_timestamp(start_seconds)
            end = _format_vtt_timestamp(end_seconds)
            f.write(f"{cue_number}\n")
            f.write(f"{start} --> {end}\n")
            f.write(f"{segment.get('text', '').strip()}\n\n")


def create_docx(segments, file_path, with_timestamps):
    """Write ``segments`` to ``file_path`` as a Word document.

    Args:
        segments: Iterable of dicts with a ``text`` entry and either a
            ``timestamp`` ``(start, end)`` tuple or ``start``/``end`` entries,
            all in seconds.
        file_path: Destination path passed to ``Document.save``.
        with_timestamps: When true, emit one paragraph per segment prefixed
            with ``[start - end]`` (whole seconds, ``H:MM:SS``); otherwise emit
            a single paragraph with all segment texts joined by spaces.
    """
    document = Document()
    document.add_heading("Transcription", 0)
    if with_timestamps:
        for segment in segments:
            text = segment.get('text', '').strip()
            start_seconds, end_seconds = _segment_bounds(segment)
            start = str(datetime.timedelta(seconds=int(start_seconds)))
            end = str(datetime.timedelta(seconds=int(end_seconds)))
            document.add_paragraph(f"[{start} - {end}] {text}")
    else:
        full_text = " ".join(segment.get('text', '').strip() for segment in segments)
        document.add_paragraph(full_text)
    document.save(file_path)
@spaces.GPU
def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_output, docx_no_timestamp_output, progress=gr.Progress()):
    """Transcribe ``audio_file`` and write the requested transcript files.

    Args:
        audio_file: Path of the audio to transcribe, or ``None`` if nothing
            was provided.
        model_size: Dropdown label selecting the model (a key of
            ``MODEL_SIZES``).
        vtt_output: Whether to produce ``transcription.vtt``.
        docx_timestamp_output: Whether to produce
            ``transcription_with_timestamps.docx``.
        docx_no_timestamp_output: Whether to produce
            ``transcription_without_timestamps.docx``.
        progress: Gradio progress tracker used for status updates.

    Returns:
        A 4-tuple matching the click handler's outputs: the full transcribed
        text, a ``gr.Files`` update listing the generated files, a
        ``gr.Audio`` update that clears the input, and a status message.
        When ``audio_file`` is ``None`` the first three items are ``None``.
    """
    # Local import keeps this block self-contained; the file's top-level
    # imports do not include tempfile.
    import tempfile

    if audio_file is None:
        return (None, None, None, "Please upload an audio file.")

    start_time = time.time()
    pipe = get_model_pipeline(model_size, progress)

    progress(0.75, desc="🎤 Transcribing audio...")
    pipe_kwargs = {"return_timestamps": True}
    # Force French only for the French-specific model; every other model is
    # called without a language so it can auto-detect.
    if model_size == "Distil-Large-v2-FR (French-Specific)":
        pipe_kwargs["generate_kwargs"] = {"language": "fr"}
    raw_output = pipe(audio_file, **pipe_kwargs)
    segments = raw_output.get("chunks", [])

    progress(0.85, desc="📝 Generating output files...")
    # Each request gets its own directory. Writing fixed file names into the
    # working directory let concurrent requests overwrite (and then serve)
    # each other's transcripts. The directory is not removed here because the
    # files must outlive this call so they can be downloaded.
    output_dir = tempfile.mkdtemp(prefix="transcription_")
    downloadable_files = []

    if vtt_output:
        vtt_path = os.path.join(output_dir, "transcription.vtt")
        create_vtt(segments, vtt_path)
        downloadable_files.append(vtt_path)

    if docx_timestamp_output:
        docx_ts_path = os.path.join(output_dir, "transcription_with_timestamps.docx")
        create_docx(segments, docx_ts_path, with_timestamps=True)
        downloadable_files.append(docx_ts_path)

    if docx_no_timestamp_output:
        docx_no_ts_path = os.path.join(output_dir, "transcription_without_timestamps.docx")
        create_docx(segments, docx_no_ts_path, with_timestamps=False)
        downloadable_files.append(docx_no_ts_path)

    total_time = time.time() - start_time
    transcribed_text = raw_output['text']
    status_message = f"✅ Transcription complete! Total time: {total_time:.2f} seconds."

    return (
        transcribed_text,
        gr.Files(value=downloadable_files, label="Download Transcripts"),
        gr.Audio(value=None),
        status_message
    )
# --- Gradio UI ---
# Layout: one row with the audio input on the left and a wider column of
# controls (model choice, output-format checkboxes, button, status) on the
# right; below the row sit the full transcription text and the download list.
with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
    gr.Markdown("# 🎙️ Whisper ZeroGPU Transcription")
    gr.Markdown("Transcribe audio with timestamps and choose your output format. The first run may take up to a minute due to cold start.")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Audio File",
        )

        with gr.Column(scale=2):
            # The French-specific model is preselected.
            model_selector = gr.Dropdown(
                label="Choose Whisper Model Size",
                choices=list(MODEL_SIZES),
                value="Distil-Large-v2-FR (French-Specific)",
            )

            gr.Markdown("### Choose Output Formats")
            with gr.Row():
                vtt_checkbox = gr.Checkbox(label="VTT", value=True)
                docx_ts_checkbox = gr.Checkbox(label="DOCX (with timestamps)", value=False)
                docx_no_ts_checkbox = gr.Checkbox(label="DOCX (without timestamps)", value=True)

            transcribe_btn = gr.Button("Transcribe", variant="primary")
            status_text = gr.Textbox(label="Status", interactive=False)

    transcription_output = gr.Textbox(label="Full Transcription", lines=10)
    downloadable_files_output = gr.Files(label="Download Transcripts")

    # Inputs follow transcribe_and_export's positional parameters; outputs
    # follow the 4-tuple it returns (text, files, audio reset, status).
    transcribe_btn.click(
        fn=transcribe_and_export,
        inputs=[
            audio_input,
            model_selector,
            vtt_checkbox,
            docx_ts_checkbox,
            docx_no_ts_checkbox,
        ],
        outputs=[
            transcription_output,
            downloadable_files_output,
            audio_input,
            status_text,
        ],
    )

if __name__ == "__main__":
    demo.launch()