Transcriber

Runtime error

App Files Files Community

Jonascaps1 commited on Dec 14, 2025

Commit

e6c1f00

verified ·

1 Parent(s): e7f8285

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -56

app.py CHANGED Viewed

@@ -10,26 +10,28 @@ from pathlib import Path
 from tempfile import NamedTemporaryFile
 from datetime import timedelta
-# ---------------- LOGGING ----------------
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# ---------------- CONFIG ----------------
 MODEL_ID = "KBLab/kb-whisper-large"
 CHUNK_DURATION_MS = 10000
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
 SUPPORTED_FORMATS = {".wav", ".mp3", ".m4a"}
-# ---------------- FFMPEG CHECK ----------------
 def check_ffmpeg():
     try:
         subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
         return True
-    except Exception:
         return False
-# ---------------- LOAD MODEL ----------------
 def initialize_pipeline():
     model = AutoModelForSpeechSeq2Seq.from_pretrained(
         MODEL_ID,
@@ -48,66 +50,77 @@ def initialize_pipeline():
         torch_dtype=TORCH_DTYPE
     )
-PIPELINE = initialize_pipeline()
-# ---------------- AUDIO UTILS ----------------
 def convert_to_wav(audio_path: str) -> str:
     if not check_ffmpeg():
-        raise RuntimeError("ffmpeg not available")
-    ext = Path(audio_path).suffix.lower()
     if ext not in SUPPORTED_FORMATS:
-        raise ValueError("Unsupported audio format")
     if ext != ".wav":
         audio = AudioSegment.from_file(audio_path)
-        wav_path = str(Path(audio_path).with_suffix(".wav"))
         audio.export(wav_path, format="wav")
         return wav_path
     return audio_path
-def split_audio(audio_path: str):
     audio = AudioSegment.from_file(audio_path)
     return [audio[i:i + CHUNK_DURATION_MS] for i in range(0, len(audio), CHUNK_DURATION_MS)]
-def get_chunk_time(index: int) -> str:
-    return str(timedelta(milliseconds=index * CHUNK_DURATION_MS))
-# ---------------- TRANSCRIBE ----------------
 def transcribe(audio_path: str, include_timestamps: bool, progress=gr.Progress()):
     if not audio_path or not os.path.exists(audio_path):
-        yield "Please upload an audio file.", None
         return
     wav_path = convert_to_wav(audio_path)
     chunks = split_audio(wav_path)
     transcript = []
-    timestamped = []
     for i, chunk in enumerate(chunks):
-        with NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-            chunk.export(tmp.name, format="wav")
-            result = PIPELINE(
-                tmp.name,
-                generate_kwargs={"task": "transcribe", "language": "sv"}
-            )
-        os.remove(tmp.name)
-        text = result["text"].strip()
-        if text:
-            transcript.append(text)
-            if include_timestamps:
-                ts = get_chunk_time(i)
-                timestamped.append(f"[{ts}] {text}")
         progress((i + 1) / len(chunks))
-        yield " ".join(transcript), None
-    content = "\n".join(timestamped) if include_timestamps else " ".join(transcript)
     with NamedTemporaryFile(
         suffix=".txt",
@@ -118,27 +131,33 @@ def transcribe(audio_path: str, include_timestamps: bool, progress=gr.Progress()
         f.write(content)
         download_path = f.name
-    yield " ".join(transcript), download_path
-# ---------------- UI ----------------
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# Swedish Whisper Transcriber")
-    gr.Markdown("Upload an .m4a file and download the transcript with timestamps.")
-    with gr.Row():
-        with gr.Column():
-            audio_input = gr.Audio(type="filepath", label="Upload Audio (.m4a)")
-            timestamp_toggle = gr.Checkbox(label="Include timestamps in download")
-            transcribe_btn = gr.Button("Transcribe")
-        with gr.Column():
-            transcript_output = gr.Textbox(label="Live Transcription", lines=12)
-            download_output = gr.File(label="Download Transcript")
-    transcribe_btn.click(
-        fn=transcribe,
-        inputs=[audio_input, timestamp_toggle],
-        outputs=[transcript_output, download_output]
-    )
-demo.launch()

 from tempfile import NamedTemporaryFile
 from datetime import timedelta
+# Setup logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
+# Configuration
 MODEL_ID = "KBLab/kb-whisper-large"
 CHUNK_DURATION_MS = 10000
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
 SUPPORTED_FORMATS = {".wav", ".mp3", ".m4a"}
+# Check for ffmpeg availability
 def check_ffmpeg():
     try:
         subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
+        logger.info("ffmpeg is installed and accessible.")
         return True
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        logger.error("ffmpeg is not installed or not found in PATH.")
         return False
+# Initialize model and pipeline
 def initialize_pipeline():
     model = AutoModelForSpeechSeq2Seq.from_pretrained(
         MODEL_ID,
         torch_dtype=TORCH_DTYPE
     )
+# Convert audio if needed
 def convert_to_wav(audio_path: str) -> str:
     if not check_ffmpeg():
+        raise RuntimeError("ffmpeg is required")
+    ext = str(Path(audio_path).suffix).lower()
     if ext not in SUPPORTED_FORMATS:
+        raise ValueError(f"Unsupported format: {ext}")
     if ext != ".wav":
         audio = AudioSegment.from_file(audio_path)
+        wav_path = str(Path(audio_path).with_suffix(".converted.wav"))
         audio.export(wav_path, format="wav")
         return wav_path
     return audio_path
+# Split audio into chunks
+def split_audio(audio_path: str) -> list:
     audio = AudioSegment.from_file(audio_path)
     return [audio[i:i + CHUNK_DURATION_MS] for i in range(0, len(audio), CHUNK_DURATION_MS)]
+# Helper to compute chunk start time
+def get_chunk_time(index: int, chunk_duration_ms: int) -> str:
+    start_ms = index * chunk_duration_ms
+    return str(timedelta(milliseconds=start_ms))
+# Transcribe audio with streaming + working download
 def transcribe(audio_path: str, include_timestamps: bool, progress=gr.Progress()):
     if not audio_path or not os.path.exists(audio_path):
+        yield "Please upload a valid audio file.", None
         return
     wav_path = convert_to_wav(audio_path)
     chunks = split_audio(wav_path)
     transcript = []
+    timestamped_transcript = []
     for i, chunk in enumerate(chunks):
+        temp_file_path = None
+        try:
+            with NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+                temp_file_path = temp_file.name
+                chunk.export(temp_file.name, format="wav")
+                result = PIPELINE(
+                    temp_file.name,
+                    generate_kwargs={"task": "transcribe", "language": "sv"}
+                )
+                text = result["text"].strip()
+                if text:
+                    transcript.append(text)
+                    if include_timestamps:
+                        timestamp = get_chunk_time(i, CHUNK_DURATION_MS)
+                        timestamped_transcript.append(f"[{timestamp}] {text}")
+        finally:
+            if temp_file_path and os.path.exists(temp_file_path):
+                os.remove(temp_file_path)
         progress((i + 1) / len(chunks))
+        yield " ".join(transcript), None  # STREAM TEXT ONLY
+    # Create downloadable file ONLY ONCE (fix)
+    content = (
+        "\n".join(timestamped_transcript)
+        if include_timestamps
+        else " ".join(transcript)
+    )
     with NamedTemporaryFile(
         suffix=".txt",
         f.write(content)
         download_path = f.name
+    yield " ".join(transcript), download_path  # FINAL OUTPUT
+# Initialize pipeline globally
+PIPELINE = initialize_pipeline()
+# Gradio Interface
+def create_interface():
+    with gr.Blocks(theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# Swedish Whisper Transcriber")
+        with gr.Row():
+            with gr.Column():
+                audio_input = gr.Audio(type="filepath", label="Upload .m4a Audio")
+                timestamp_toggle = gr.Checkbox(label="Include Timestamps in Download")
+                transcribe_btn = gr.Button("Transcribe")
+            with gr.Column():
+                transcript_output = gr.Textbox(label="Live Transcription", lines=10)
+                download_output = gr.File(label="Download Transcript")
+        transcribe_btn.click(
+            fn=transcribe,
+            inputs=[audio_input, timestamp_toggle],
+            outputs=[transcript_output, download_output]
+        )
+    return demo
+if __name__ == "__main__":
+    create_interface().launch()