clementBE committed on
Commit
ded0018
·
verified ·
1 Parent(s): 7d4389c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +145 -121
app.py CHANGED
@@ -1,135 +1,159 @@
1
- import os
2
- import uuid
3
- import shutil
4
- import whisper
5
- import librosa
6
  import gradio as gr
 
 
 
 
 
 
7
  from docx import Document
8
 
9
- MODEL_SPEED = {
10
- "tiny": 10,
11
- "base": 5,
12
- "small": 3,
13
- "medium": 2,
14
- "large": 1
 
15
  }
16
 
17
- def format_timestamp(seconds):
18
- h = int(seconds // 3600)
19
- m = int((seconds % 3600) // 60)
20
- s = seconds % 60
21
- return f"{h:02d}:{m:02d}:{s:06.3f}"
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- def write_vtt(segments, vtt_path):
24
- with open(vtt_path, "w", encoding="utf-8") as f:
25
  f.write("WEBVTT\n\n")
26
- for i, seg in enumerate(segments):
27
- start = format_timestamp(seg['start'])
28
- end = format_timestamp(seg['end'])
29
- f.write(f"{i+1}\n{start} --> {end}\n{seg['text'].strip()}\n\n")
30
-
31
- def generate_docx(segments, docx_path):
32
- doc = Document()
33
- for seg in segments:
34
- start = format_timestamp(seg['start'])
35
- end = format_timestamp(seg['end'])
36
- doc.add_paragraph(f"{start} - {end}: {seg['text']}")
37
- doc.save(docx_path)
38
-
39
- def generate_docx_no_timestamps(segments, docx_path):
40
- doc = Document()
41
- full_text = " ".join(seg['text'].strip() for seg in segments)
42
- doc.add_paragraph(full_text)
43
- doc.save(docx_path)
44
-
45
- def process(audio_file_path, model_name):
46
- session_id = str(uuid.uuid4())
47
- base_dir = os.path.join("session_data", session_id)
48
- os.makedirs(base_dir, exist_ok=True)
49
-
50
- audio_path = os.path.join(base_dir, os.path.basename(audio_file_path))
51
- shutil.copy(audio_file_path, audio_path)
52
-
53
- duration = librosa.get_duration(path=audio_path)
54
- speed_factor = MODEL_SPEED.get(model_name, 4)
55
- estimated_time = round(duration / speed_factor, 2)
56
-
57
- log = f"πŸ” File: {os.path.basename(audio_path)}\n"
58
- log += f"πŸ“ Duration: {duration:.2f} sec\n"
59
- log += f"🧠 Model: {model_name}\n"
60
- log += f"⏱ Estimated time: ~{estimated_time} sec\n\n"
61
- log += "πŸš€ Loading model...\n"
62
- yield None, None, None, None, log
63
-
64
- model = whisper.load_model(model_name)
65
- log += "βœ… Model loaded. Transcribing...\n"
66
- yield None, None, None, None, log
67
-
68
- result = model.transcribe(audio_path)
69
- log += "πŸ“ Transcription complete. Writing files...\n"
70
- yield None, None, None, None, log
71
-
72
- segments = result.get('segments', [{
73
- 'start': 0,
74
- 'end': result.get('duration', 0),
75
- 'text': result.get('text', '')
76
- }])
77
-
78
- audio_id = os.path.splitext(os.path.basename(audio_path))[0]
79
- vtt_path = os.path.join(base_dir, f"{audio_id}.vtt")
80
- docx_path = os.path.join(base_dir, f"{audio_id}.docx")
81
- docx_no_ts_path = os.path.join(base_dir, f"{audio_id}_no_timestamps.docx")
82
- html_path = os.path.join(base_dir, f"{audio_id}.html")
83
-
84
- write_vtt(segments, vtt_path)
85
- generate_docx(segments, docx_path)
86
- generate_docx_no_timestamps(segments, docx_no_ts_path)
87
-
88
- with open(html_path, "w", encoding="utf-8") as f:
89
- f.write(f"<html><head><title>{audio_id} Transcript</title></head><body>\n")
90
- f.write(f"<h1>Transcript for {audio_id}</h1>\n")
91
- for seg in segments:
92
- start_str = format_timestamp(seg['start'])
93
- end_str = format_timestamp(seg['end'])
94
- f.write(f"<p><b>{start_str} β†’ {end_str}</b><br><span contenteditable='true'>{seg['text']}</span></p>\n")
95
- f.write("</body></html>")
96
-
97
- log += "βœ… All done!"
98
- yield html_path, vtt_path, docx_path, docx_no_ts_path, log
99
-
100
-
101
- # ===================== Gradio UI =====================
102
-
103
- with gr.Blocks(title="Whisper MP3 Transcription Tool") as app:
104
- gr.Markdown("## πŸŽ™οΈ Whisper Transcription Tool")
105
- gr.Markdown("Upload an MP3 file, select a model, and download your transcript in multiple formats.")
106
- gr.Markdown("We recommend the base model.")
107
-
108
- with gr.Row():
109
- audio_input = gr.Audio(type="filepath", label="🎡 Upload MP3")
110
- model_selector = gr.Dropdown(
111
- choices=["tiny", "base", "small", "medium", "large"],
112
- value="base",
113
- label="🧠 Whisper model"
114
  )
115
 
116
- run_button = gr.Button("πŸš€ Transcribe")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
 
 
 
 
 
118
  with gr.Row():
119
- html_output = gr.File(label="πŸ“ HTML Transcript")
120
- vtt_output = gr.File(label="πŸ”€ Subtitle (VTT)")
121
- docx_output = gr.File(label="πŸ“„ Word Document (with timestamps)")
122
- docx_no_ts_output = gr.File(label="πŸ“„ Word Document (no timestamps)")
123
-
124
- progress_box = gr.Textbox(label="πŸ“‘ Processing Log", lines=20)
125
-
126
- run_button.click(
127
- fn=process,
128
- inputs=[audio_input, model_selector],
129
- outputs=[html_output, vtt_output, docx_output, docx_no_ts_output, progress_box],
130
- show_progress=True,
131
- api_name="transcribe"
 
 
 
 
 
 
 
 
 
 
 
132
  )
133
 
134
  if __name__ == "__main__":
135
- app.launch()
 
 
 
 
 
 
1
  import gradio as gr
2
+ import spaces
3
+ import torch
4
+ import os
5
+ import datetime
6
+ import time
7
+ from transformers import pipeline
8
  from docx import Document
9
 
10
# Available Whisper checkpoints, keyed by the human-readable label shown in
# the UI dropdown. Values are Hugging Face model ids fed to
# transformers.pipeline("automatic-speech-recognition", ...).
MODEL_SIZES = {
    "Tiny (Fastest)": "openai/whisper-tiny",
    "Base (Faster)": "openai/whisper-base",
    "Small (Balanced)": "openai/whisper-small",
    "Distil-Large-v3 (General Purpose)": "distil-whisper/distil-large-v3",
    "Distil-Large-v2-FR (French-Specific)": "distil-whisper/distil-large-v2-fr",  # French-specific model
}

# Cache of already-built ASR pipelines, keyed by the UI label, so each model
# is loaded at most once per process.
model_cache = {}
+
22
def get_model_pipeline(model_name, progress):
    """Return a cached ASR pipeline for *model_name*, loading it on first use.

    Args:
        model_name: Key into MODEL_SIZES (the UI label, not the HF model id).
        progress: Gradio progress callback used to report load status.

    Returns:
        A transformers automatic-speech-recognition pipeline.
    """
    # Fast path: the model was already loaded earlier in this process.
    if model_name in model_cache:
        return model_cache[model_name]

    progress(0, desc="πŸš€ Initializing ZeroGPU instance...")
    model_id = MODEL_SIZES[model_name]
    # Device 0 (first CUDA GPU) when available, otherwise CPU.
    device = 0 if torch.cuda.is_available() else "cpu"

    progress(0.1, desc=f"⏳ Loading {model_name} model...")
    model_cache[model_name] = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        device=device,
    )
    progress(0.5, desc="βœ… Model loaded successfully!")
    return model_cache[model_name]
36
 
37
def _vtt_timestamp(seconds):
    """Render a second offset as a WebVTT cue time: HH:MM:SS.mmm."""
    ms = int(round(seconds * 1000))
    h, rem = divmod(ms, 3_600_000)
    m, rem = divmod(rem, 60_000)
    s, ms = divmod(rem, 1000)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"


def _segment_bounds(segment):
    """Extract (start, end) seconds from one transcription segment.

    transformers ASR pipelines emit chunks shaped
    {'timestamp': (start, end), 'text': ...}; plain 'start'/'end' keys are
    also accepted for backward compatibility. The final chunk's end
    timestamp can be None — fall back to the start time in that case.
    """
    if 'timestamp' in segment:
        start, end = segment['timestamp']
    else:
        start = segment.get('start', 0)
        end = segment.get('end', 0)
    start = start or 0
    return start, (end if end is not None else start)


def create_vtt(segments, file_path):
    """Write *segments* to *file_path* as a WebVTT subtitle file.

    Fixes vs. the previous version:
    - str(datetime.timedelta(...)) produced e.g. "0:00:05", which is not a
      valid WebVTT timestamp; cues now use zero-padded HH:MM:SS.mmm.
    - Pipeline chunks store times under a 'timestamp' (start, end) tuple,
      so segment.get('start', 0) was always 0; both layouts now work.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("WEBVTT\n\n")
        for i, segment in enumerate(segments):
            start, end = _segment_bounds(segment)
            f.write(f"{i+1}\n")
            f.write(f"{_vtt_timestamp(start)} --> {_vtt_timestamp(end)}\n")
            f.write(f"{segment.get('text', '').strip()}\n\n")
48
+
49
def create_docx(segments, file_path, with_timestamps):
    """Write *segments* to *file_path* as a Word document.

    Args:
        segments: Transcription segments. Times may live under a
            'timestamp' (start, end) tuple (transformers pipeline chunks)
            or under 'start'/'end' keys.
        file_path: Destination .docx path.
        with_timestamps: When True, one paragraph per segment prefixed
            with "[start - end]"; otherwise a single paragraph of the
            joined text.

    Fix vs. the previous version: pipeline chunks store times in a
    'timestamp' tuple, so segment.get('start', 0) was always 0 and every
    paragraph was stamped [0:00:00 - 0:00:00].
    """
    def bounds(segment):
        # Accept both chunk layouts; the final chunk's end can be None.
        if 'timestamp' in segment:
            start, end = segment['timestamp']
        else:
            start = segment.get('start', 0)
            end = segment.get('end', 0)
        start = start or 0
        return start, (end if end is not None else start)

    document = Document()
    document.add_heading("Transcription", 0)

    if with_timestamps:
        for segment in segments:
            text = segment.get('text', '').strip()
            start_seconds, end_seconds = bounds(segment)
            start = str(datetime.timedelta(seconds=int(start_seconds)))
            end = str(datetime.timedelta(seconds=int(end_seconds)))
            document.add_paragraph(f"[{start} - {end}] {text}")
    else:
        full_text = " ".join(segment.get('text', '').strip() for segment in segments)
        document.add_paragraph(full_text)

    document.save(file_path)
66
+
67
@spaces.GPU
def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_output, docx_no_timestamp_output, progress=gr.Progress()):
    """Transcribe *audio_file* and export the selected output formats.

    Args:
        audio_file: Path to the uploaded/recorded audio, or None.
        model_size: Key into MODEL_SIZES selecting the checkpoint.
        vtt_output / docx_timestamp_output / docx_no_timestamp_output:
            Booleans from the UI checkboxes selecting which files to write.
        progress: Gradio progress reporter (injected by Gradio).

    Returns:
        (transcribed text, gr.Files update with download paths,
        gr.Audio update clearing the input, status message) — or a tuple
        of three Nones plus an error message when no audio was provided.
    """
    import tempfile  # local: only this function needs it

    if audio_file is None:
        return (None, None, None, "Please upload an audio file.")

    start_time = time.time()

    pipe = get_model_pipeline(model_size, progress)

    progress(0.75, desc="🎀 Transcribing audio...")

    # Force French decoding for the French-specific model; every other
    # model keeps the pipeline's automatic language detection.
    if model_size == "Distil-Large-v2-FR (French-Specific)":
        raw_output = pipe(
            audio_file,
            return_timestamps=True,
            generate_kwargs={"language": "fr"}
        )
    else:
        raw_output = pipe(
            audio_file,
            return_timestamps=True,
        )

    segments = raw_output.get("chunks", [])
    outputs = {}

    progress(0.85, desc="πŸ“ Generating output files...")

    # Fix: write into a fresh per-call directory instead of fixed relative
    # paths in the CWD, so concurrent sessions cannot overwrite each
    # other's transcripts. Download filenames are unchanged.
    out_dir = tempfile.mkdtemp(prefix="transcripts_")

    if vtt_output:
        vtt_path = os.path.join(out_dir, "transcription.vtt")
        create_vtt(segments, vtt_path)
        outputs["VTT"] = vtt_path

    if docx_timestamp_output:
        docx_ts_path = os.path.join(out_dir, "transcription_with_timestamps.docx")
        create_docx(segments, docx_ts_path, with_timestamps=True)
        outputs["DOCX (with timestamps)"] = docx_ts_path

    if docx_no_timestamp_output:
        docx_no_ts_path = os.path.join(out_dir, "transcription_without_timestamps.docx")
        create_docx(segments, docx_no_ts_path, with_timestamps=False)
        outputs["DOCX (without timestamps)"] = docx_no_ts_path

    total_time = time.time() - start_time
    # Robustness: .get avoids a KeyError if the pipeline omits 'text'.
    transcribed_text = raw_output.get('text', '')
    downloadable_files = list(outputs.values())
    status_message = f"βœ… Transcription complete! Total time: {total_time:.2f} seconds."

    return (
        transcribed_text,
        gr.Files(value=downloadable_files, label="Download Transcripts"),
        gr.Audio(value=None),
        status_message
    )
125
 
126
# --- Gradio UI ---
with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
    gr.Markdown("# πŸŽ™οΈ Whisper ZeroGPU Transcription")
    gr.Markdown("Transcribe audio with timestamps and choose your output format. The first run may take up to a minute due to cold start.")

    # Left: audio input; right: model choice and output-format checkboxes.
    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio File")

        with gr.Column(scale=2):
            model_selector = gr.Dropdown(
                label="Choose Whisper Model Size",
                choices=list(MODEL_SIZES.keys()),
                value="Distil-Large-v2-FR (French-Specific)",  # Default to the French-specific model
            )
            gr.Markdown("### Choose Output Formats")
            with gr.Row():
                vtt_checkbox = gr.Checkbox(label="VTT", value=True)
                docx_ts_checkbox = gr.Checkbox(label="DOCX (with timestamps)", value=False)
                docx_no_ts_checkbox = gr.Checkbox(label="DOCX (without timestamps)", value=True)

    transcribe_btn = gr.Button("Transcribe", variant="primary")
    status_text = gr.Textbox(label="Status", interactive=False)

    transcription_output = gr.Textbox(label="Full Transcription", lines=10)
    downloadable_files_output = gr.Files(label="Download Transcripts")

    # Note: audio_input appears in outputs so the handler can clear it
    # (it returns gr.Audio(value=None) in the third slot).
    transcribe_btn.click(
        fn=transcribe_and_export,
        inputs=[audio_input, model_selector, vtt_checkbox, docx_ts_checkbox, docx_no_ts_checkbox],
        outputs=[transcription_output, downloadable_files_output, audio_input, status_text]
    )

if __name__ == "__main__":
    demo.launch()