hevold committed on
Commit
ba0dc1d
·
verified ·
1 Parent(s): c3f1d45

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -93
app.py CHANGED
@@ -1,98 +1,116 @@
 
 
 
 
1
 
2
- # Install ffmpeg and pydub for audio extraction from video if needed
3
- !apt-get update -qq && apt-get install -qq -y ffmpeg
4
- !pip install pydub -q
5
 
6
-
7
- from pydub import AudioSegment
8
-
9
- # Initialize the transcription pipeline with a multilingual model
10
- # Note: openai/whisper-large-v3 is a very large model and might cause OutOfMemoryError
11
  try:
12
- print("👂 Loading multilingual transcription pipeline with openai/whisper-large-v3...")
13
- transcriber = pipeline(
14
- "automatic-speech-recognition",
15
- model="openai/whisper-large-v3",
16
- return_timestamps=True, # Needed for long audio
17
- device_map="auto" # Automatically chooses device
18
- )
19
- print("✅ Multilingual transcription pipeline loaded")
20
-
21
- # Function to handle file upload, extract audio if necessary, and transcribe
22
- def handle_upload_and_transcribe(file_obj):
23
- """Handles uploaded file (audio or video), extracts audio, and transcribes."""
24
- if file_obj is None:
25
- return "Please upload an audio or video file."
26
-
27
- input_path = file_obj # file_obj is already the file path string
28
- output_audio_path = None
29
- temp_dir = None # Initialize temp_dir to None
30
-
31
- try:
32
- # Check if the file is likely a video based on extension (a simple heuristic)
33
- video_extensions = ['.mp4', '.avi', '.mov', '.mkv', '.webm']
34
- is_video = any(input_path.lower().endswith(ext) for ext in video_extensions)
35
-
36
- if is_video:
37
- print(f"🎬 Detected video file: {input_path}. Extracting audio...")
38
- # Use pydub and ffmpeg to extract audio
39
- audio = AudioSegment.from_file(input_path)
40
- # Create a temporary file for the extracted audio
41
- temp_dir = tempfile.mkdtemp()
42
- output_audio_path = os.path.join(temp_dir, "extracted_audio.wav")
43
- audio.export(output_audio_path, format="wav")
44
- print(f"🔊 Audio extracted to: {output_audio_path}")
45
- audio_source_path = output_audio_path
46
- else:
47
- # Assume it's an audio file, use the original path
48
- print(f"🎵 Detected audio file: {input_path}. Using directly for transcription.")
49
- audio_source_path = input_path
50
-
51
- # Now transcribe the audio source path
52
- print(f" transcribe {audio_source_path}...")
53
- transcription = transcriber(audio_source_path)
54
-
55
- # Clean up temporary directory if audio was extracted and temp_dir was created
56
- if temp_dir and os.path.exists(temp_dir):
57
- shutil.rmtree(temp_dir)
58
- print(f"🗑️ Cleaned up temporary directory {temp_dir}")
59
-
60
-
61
- # The output format depends on return_timestamps. If True, it's a dict with 'text'.
62
- if isinstance(transcription, dict) and 'text' in transcription:
63
- return transcription['text']
64
- elif isinstance(transcription, list) and transcription:
65
- # Handle cases where output might be a list of dicts (e.g., without timestamps)
66
- return transcription[0].get('text', str(transcription)) # Return text from first item or string representation
67
- else:
68
- return str(transcription) # Return string representation if format is unexpected
69
-
70
- except Exception as e:
71
- # Clean up temporary directory in case of error during transcription
72
- if temp_dir and os.path.exists(temp_dir):
73
- shutil.rmtree(temp_dir)
74
- print(f"🗑️ Cleaned up temporary directory {temp_dir} after error")
75
- return f"❌ Processing or Transcription failed: {e}"
76
-
77
-
78
- # Create the Gradio interface
79
- print("🚀 Creating Gradio interface...")
80
- # Use gr.File for broader input type support, although gr.Audio often handles videos too
81
- # gr.Audio(type="filepath") might be sufficient if ffmpeg handles the format
82
- # Let's stick to gr.Audio with filepath type as it often works with ffmpeg installed
83
- interface = gr.Interface(
84
- fn=handle_upload_and_transcribe,
85
- inputs=gr.Audio(type="filepath", label="Upload Audio or Video File"),
86
- outputs=gr.Textbox(label="Transcription"),
87
- title="Multilingual Audio/Video Transcription",
88
- description="Upload an audio (.mp3, .wav, .m4a, etc.) or video (.mp4, .avi, etc.) file to get its transcription."
89
  )
90
 
91
- # Launch the interface
92
- print("Starting Gradio interface...")
93
- interface.launch(debug=True) # Set debug=True for more detailed error messages
94
-
95
- except Exception as e:
96
- print(f"❌ Error initializing the transcription pipeline or Gradio interface: {e}")
97
- print("Please check the model name and available resources.")
98
- display({"error": f"Initialization failed: {e}"})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import tempfile
4
+ from pathlib import Path
5
 
6
+ import gradio as gr
7
+ from transformers import pipeline
 
8
 
9
+ # Prøv å støtte video via pydub + ffmpeg hvis tilgjengelig (valgfritt)
 
 
 
 
10
  try:
11
+ from pydub import AudioSegment
12
+ HAS_PYDUB = True
13
+ except Exception:
14
+ HAS_PYDUB = False
15
+
16
+ # --- Konfigurasjon ---
17
+ # CPU: bruk en mindre, flerspråklig modell. (large-v3 på CPU vil ofte knekke.)
18
+ # Alternativer: "openai/whisper-small", "openai/whisper-medium", "distil-whisper/distil-small.multilingual"
19
+ ASR_MODEL_ID = "openai/whisper-small"
20
+
21
def make_transcriber():
    """Build the speech-recognition pipeline used by the app.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        ``automatic-speech-recognition`` task, using ``ASR_MODEL_ID``.
    """
    pipeline_options = {
        "task": "automatic-speech-recognition",
        "model": ASR_MODEL_ID,
        # -1 forces the pipeline onto the CPU.
        "device": -1,
        # Ask for time codes in the result alongside the text.
        "return_timestamps": True,
    }
    return pipeline(**pipeline_options)
29
 
30
+ transcriber = make_transcriber()
31
+
32
+ VIDEO_EXTS = {".mp4", ".avi", ".mov", ".mkv", ".webm"}
33
+ AUDIO_EXTS = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".opus", ".aac"}
34
+
35
def extract_audio_if_needed(input_path: str) -> str:
    """Return a path to audio that can be handed to the transcriber.

    Files whose suffix is in ``AUDIO_EXTS`` are returned unchanged. Files
    whose suffix is in ``VIDEO_EXTS`` are decoded with pydub and exported as
    WAV into a fresh temporary directory; the path of that WAV file is
    returned, and the caller is responsible for deleting its parent
    directory. Files with any other suffix are returned unchanged so the
    transcriber can attempt them and report its own error.

    Args:
        input_path: Path to the uploaded audio or video file.

    Returns:
        ``input_path`` itself, or the path of a newly extracted WAV file.

    Raises:
        RuntimeError: If a video file is given but pydub could not be
            imported (``HAS_PYDUB`` is false).
        Exception: Any error raised by pydub while decoding or exporting is
            re-raised after the temporary directory has been removed.
    """
    suffix = Path(input_path).suffix.lower()

    # Already audio: nothing to extract.
    if suffix in AUDIO_EXTS:
        return input_path

    # Unknown type: let the transcriber try it as-is.
    if suffix not in VIDEO_EXTS:
        return input_path

    if not HAS_PYDUB:
        raise RuntimeError(
            "Video oppdaget, men pydub/ffmpeg er ikke tilgjengelig. "
            "Installer pydub og ffmpeg (se requirements.txt og apt.txt), "
            "eller last opp en ren lydfil."
        )

    temp_dir = tempfile.mkdtemp(prefix="asr_")
    out_wav = os.path.join(temp_dir, "extracted_audio.wav")
    try:
        audio = AudioSegment.from_file(input_path)
        audio.export(out_wav, format="wav")
    except Exception:
        # The caller only learns about the temp directory through the return
        # value, so on failure it must be removed here or it would leak.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise
    return out_wav
65
+
66
def handle_upload_and_transcribe(file_path: str):
    """Transcribe an uploaded audio or video file and return the text.

    Args:
        file_path: Path of the uploaded file. A falsy value (``None`` or an
            empty string) means nothing was uploaded.

    Returns:
        The transcription text, a prompt asking for an upload when no file
        was given, or a user-facing error message if extraction or
        transcription raised an exception. Exceptions are not propagated.
    """
    if not file_path:
        return "Last opp en lyd- eller videofil."

    tmp_to_cleanup = None
    try:
        # For video input this may produce a temporary WAV file.
        maybe_audio = extract_audio_if_needed(file_path)
        if maybe_audio != file_path:
            tmp_to_cleanup = os.path.dirname(maybe_audio)

        # Tip: the language can be set explicitly for faster/more stable
        # decoding: generate_kwargs={"task": "transcribe", "language": "no"}
        result = transcriber(maybe_audio)
        return _transcription_text(result)
    except Exception as e:
        return f"❌ Feil under prosessering/transkripsjon: {e}"
    finally:
        # Single cleanup point for both the success and the failure path.
        if tmp_to_cleanup:
            shutil.rmtree(tmp_to_cleanup, ignore_errors=True)


def _transcription_text(result) -> str:
    """Normalise a pipeline result (dict, list of dicts, or other) to text."""
    if isinstance(result, dict):
        text = result.get("text")
        # An empty string is a valid transcription (e.g. silence); only fall
        # back to the raw representation when no text is present at all.
        if isinstance(text, str):
            return text.strip()
        return str(result)
    if isinstance(result, list) and result:
        first = result[0]
        if isinstance(first, dict):
            return first.get("text", str(result))
    return str(result)
102
+
103
# Gradio UI: one file input, one text output, and a button that runs the
# transcription handler on click.
with gr.Blocks(title="Multilingual Audio/Video Transcription") as demo:
    intro_text = "".join(
        [
            "## Multilingual Transcription (CPU)\n",
            "Last opp en lydfil (.wav/.mp3/.m4a/…) eller videofil (.mp4/.mov/…). ",
            "På CPU brukes en mindre Whisper-modell for stabil kjøring.",
        ]
    )
    gr.Markdown(intro_text)

    inp = gr.Audio(label="Fil (audio eller video)", type="filepath")
    out = gr.Textbox(label="Transkripsjon")
    btn = gr.Button("Transkriber")
    btn.click(fn=handle_upload_and_transcribe, inputs=inp, outputs=out)

if __name__ == "__main__":
    # On HF Spaces, server_name/server_port are usually not needed here.
    demo.launch()