Spaces:

hevold
/

transkribering

Sleeping

App Files Files Community

hevold committed on Oct 29, 2025

Commit

ba0dc1d

verified ·

1 Parent(s): c3f1d45

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -93

app.py CHANGED Viewed

@@ -1,98 +1,116 @@
-# Install ffmpeg and pydub for audio extraction from video if needed
-!apt-get update -qq && apt-get install -qq -y ffmpeg
-!pip install pydub -q
-from pydub import AudioSegment
-# Initialize the transcription pipeline with a multilingual model
-# Note: openai/whisper-large-v3 is a very large model and might cause OutOfMemoryError
 try:
-    print("👂 Loading multilingual transcription pipeline with openai/whisper-large-v3...")
-    transcriber = pipeline(
-        "automatic-speech-recognition",
-        model="openai/whisper-large-v3",
-        return_timestamps=True, # Needed for long audio
-        device_map="auto" # Automatically chooses device
-    )
-    print("✅ Multilingual transcription pipeline loaded")
-    # Function to handle file upload, extract audio if necessary, and transcribe
-    def handle_upload_and_transcribe(file_obj):
-        """Handles uploaded file (audio or video), extracts audio, and transcribes."""
-        if file_obj is None:
-            return "Please upload an audio or video file."
-        input_path = file_obj # file_obj is already the file path string
-        output_audio_path = None
-        temp_dir = None # Initialize temp_dir to None
-        try:
-            # Check if the file is likely a video based on extension (a simple heuristic)
-            video_extensions = ['.mp4', '.avi', '.mov', '.mkv', '.webm']
-            is_video = any(input_path.lower().endswith(ext) for ext in video_extensions)
-            if is_video:
-                print(f"🎬 Detected video file: {input_path}. Extracting audio...")
-                # Use pydub and ffmpeg to extract audio
-                audio = AudioSegment.from_file(input_path)
-                # Create a temporary file for the extracted audio
-                temp_dir = tempfile.mkdtemp()
-                output_audio_path = os.path.join(temp_dir, "extracted_audio.wav")
-                audio.export(output_audio_path, format="wav")
-                print(f"🔊 Audio extracted to: {output_audio_path}")
-                audio_source_path = output_audio_path
-            else:
-                # Assume it's an audio file, use the original path
-                print(f"🎵 Detected audio file: {input_path}. Using directly for transcription.")
-                audio_source_path = input_path
-            # Now transcribe the audio source path
-            print(f" transcribe {audio_source_path}...")
-            transcription = transcriber(audio_source_path)
-            # Clean up temporary directory if audio was extracted and temp_dir was created
-            if temp_dir and os.path.exists(temp_dir):
-                 shutil.rmtree(temp_dir)
-                 print(f"🗑️ Cleaned up temporary directory {temp_dir}")
-            # The output format depends on return_timestamps. If True, it's a dict with 'text'.
-            if isinstance(transcription, dict) and 'text' in transcription:
-                 return transcription['text']
-            elif isinstance(transcription, list) and transcription:
-                 # Handle cases where output might be a list of dicts (e.g., without timestamps)
-                 return transcription[0].get('text', str(transcription)) # Return text from first item or string representation
-            else:
-                 return str(transcription) # Return string representation if format is unexpected
-        except Exception as e:
-            # Clean up temporary directory in case of error during transcription
-            if temp_dir and os.path.exists(temp_dir):
-                 shutil.rmtree(temp_dir)
-                 print(f"🗑️ Cleaned up temporary directory {temp_dir} after error")
-            return f"❌ Processing or Transcription failed: {e}"
-    # Create the Gradio interface
-    print("🚀 Creating Gradio interface...")
-    # Use gr.File for broader input type support, although gr.Audio often handles videos too
-    # gr.Audio(type="filepath") might be sufficient if ffmpeg handles the format
-    # Let's stick to gr.Audio with filepath type as it often works with ffmpeg installed
-    interface = gr.Interface(
-        fn=handle_upload_and_transcribe,
-        inputs=gr.Audio(type="filepath", label="Upload Audio or Video File"),
-        outputs=gr.Textbox(label="Transcription"),
-        title="Multilingual Audio/Video Transcription",
-        description="Upload an audio (.mp3, .wav, .m4a, etc.) or video (.mp4, .avi, etc.) file to get its transcription."
     )
-    # Launch the interface
-    print("Starting Gradio interface...")
-    interface.launch(debug=True) # Set debug=True for more detailed error messages
-except Exception as e:
-    print(f"❌ Error initializing the transcription pipeline or Gradio interface: {e}")
-    print("Please check the model name and available resources.")
-    display({"error": f"Initialization failed: {e}"})

+import os
+import shutil
+import tempfile
+from pathlib import Path
+import gradio as gr
+from transformers import pipeline
+# Try to support video via pydub + ffmpeg if available (optional)
 try:
+    from pydub import AudioSegment
+    HAS_PYDUB = True
+except Exception:
+    HAS_PYDUB = False
+# --- Configuration ---
+# CPU: use a smaller, multilingual model. (large-v3 on CPU will often crash.)
+# Alternatives: "openai/whisper-small", "openai/whisper-medium", "distil-whisper/distil-small.multilingual"
+ASR_MODEL_ID = "openai/whisper-small"
+def make_transcriber():
+    """Build the automatic-speech-recognition pipeline for ``ASR_MODEL_ID``."""
+    # device=-1 forces CPU. return_timestamps=True returns timestamps in the output.
+    return pipeline(
+        task="automatic-speech-recognition",
+        model=ASR_MODEL_ID,
+        device=-1,
+        return_timestamps=True
     )
+# Built once at import time; handle_upload_and_transcribe calls this shared instance.
+transcriber = make_transcriber()
+# Lower-case file suffixes: video files get their audio extracted first,
+# audio files are passed to the transcriber unchanged.
+VIDEO_EXTS = {".mp4", ".avi", ".mov", ".mkv", ".webm"}
+AUDIO_EXTS = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".opus", ".aac"}
+def extract_audio_if_needed(input_path: str) -> str:
+    """
+    Tar inn en filsti (audio eller video).
+    Hvis video og pydub+ffmpeg finnes, ekstraheres WAV til temp-katalog og
+    vi returnerer ny filsti. Hvis ikke, kastes en forklarende feil.
+    Hvis allerede audio, returneres originalstien.
+    """
+    suffix = Path(input_path).suffix.lower()
+    # Allerede audio?
+    if suffix in AUDIO_EXTS:
+        return input_path
+    # Video?
+    if suffix in VIDEO_EXTS:
+        if not HAS_PYDUB:
+            raise RuntimeError(
+                "Video oppdaget, men pydub/ffmpeg er ikke tilgjengelig. "
+                "Installer pydub og ffmpeg (se requirements.txt og apt.txt), "
+                "eller last opp en ren lydfil."
+            )
+        # Ekstraher WAV
+        temp_dir = tempfile.mkdtemp(prefix="asr_")
+        out_wav = os.path.join(temp_dir, "extracted_audio.wav")
+        audio = AudioSegment.from_file(input_path)
+        audio.export(out_wav, format="wav")
+        return out_wav
+    # Ukjent – la Whisper prøve; hvis det feiler, får brukeren feilmelding
+    return input_path
+def handle_upload_and_transcribe(file_path: str):
+    if not file_path:
+        return "Last opp en lyd- eller videofil."
+    tmp_to_cleanup = None
+    try:
+        # Kan generere en temp WAV (for video)
+        maybe_audio = extract_audio_if_needed(file_path)
+        if maybe_audio != file_path:
+            tmp_to_cleanup = os.path.dirname(maybe_audio)
+        # Tips: du kan sette språk eksplisitt for raskere/mer stabil dekoding:
+        # generate_kwargs={"task": "transcribe", "language": "no"}
+        result = transcriber(maybe_audio)
+        # Rydd temp
+        if tmp_to_cleanup and os.path.exists(tmp_to_cleanup):
+            shutil.rmtree(tmp_to_cleanup, ignore_errors=True)
+        # Normaliser utdata
+        if isinstance(result, dict):
+            # transformers>=4.30 gir ofte {"text": "...", "chunks": [...]}
+            text = result.get("text")
+            if text:
+                return text.strip()
+            # fallback
+            return str(result)
+        elif isinstance(result, list) and result:
+            return result[0].get("text", str(result))
+        return str(result)
+    except Exception as e:
+        # Rydd opp ved feil
+        if tmp_to_cleanup and os.path.exists(tmp_to_cleanup):
+            shutil.rmtree(tmp_to_cleanup, ignore_errors=True)
+        return f"❌ Feil under prosessering/transkripsjon: {e}"
+# UI: a Markdown header, one file input, one text output and a button that
+# runs handle_upload_and_transcribe on the uploaded file.
+with gr.Blocks(title="Multilingual Audio/Video Transcription") as demo:
+    gr.Markdown(
+        "## Multilingual Transcription (CPU)\n"
+        "Last opp en lydfil (.wav/.mp3/.m4a/…) eller videofil (.mp4/.mov/…). "
+        "På CPU brukes en mindre Whisper-modell for stabil kjøring."
+    )
+    # type="filepath": the handler receives the upload as a path string
+    inp = gr.Audio(type="filepath", label="Fil (audio eller video)")
+    out = gr.Textbox(label="Transkripsjon")
+    btn = gr.Button("Transkriber")
+    btn.click(handle_upload_and_transcribe, inputs=inp, outputs=out)
+if __name__ == "__main__":
+    # On HF Spaces you usually do not need server_name/server_port here.
+    demo.launch()