clementBE committed on
Commit
6e566c7
·
verified ·
1 Parent(s): 4091834

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -25
app.py CHANGED
@@ -6,6 +6,7 @@ import datetime
6
  import time
7
  from transformers import pipeline
8
  from docx import Document
 
9
 
10
  # Define the available models and their approximate relative speeds
11
  MODEL_SIZES = {
@@ -35,7 +36,9 @@ def get_model_pipeline(model_name, progress):
35
  model_cache[model_name] = pipeline(
36
  "automatic-speech-recognition",
37
  model=model_id,
38
- device=device
 
 
39
  )
40
  progress(0.5, desc="βœ… Model loaded successfully!")
41
  return model_cache[model_name]
@@ -47,7 +50,7 @@ def create_vtt(segments, file_path):
47
  with open(file_path, "w", encoding="utf-8") as f:
48
  f.write("WEBVTT\n\n")
49
  for i, segment in enumerate(segments):
50
- # Calculate time strings in "HH:MM:SS.mmm" format (though VTT only strictly requires up to milliseconds)
51
  start_ms = int(segment.get('start', 0) * 1000)
52
  end_ms = int(segment.get('end', 0) * 1000)
53
 
@@ -85,9 +88,10 @@ def create_docx(segments, file_path, with_timestamps):
85
  document.save(file_path)
86
 
87
  @spaces.GPU
88
- def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_output, docx_no_timestamp_output, progress=gr.Progress()):
89
  """
90
  Main function to transcribe audio and export to selected formats.
 
91
  """
92
  if audio_file is None:
93
  return (None, None, None, "Please upload an audio file.")
@@ -96,53 +100,95 @@ def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_out
96
 
97
  pipe = get_model_pipeline(model_size, progress)
98
 
99
- progress(0.75, desc="🎀 Transcribing audio...")
100
-
101
- # Check if the French-specific model option was selected
102
  if model_size == "Distil-Large-v3-FR (French-Specific)":
103
  # Force French for this specific option
104
- raw_output = pipe(
105
- audio_file,
106
- return_timestamps="word", # Use word-level timestamps for more detail if needed, but 'True' works for chunk timestamps too
107
- generate_kwargs={"language": "fr"}
108
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  else:
110
- # For other models, let the model auto-detect the language
 
111
  raw_output = pipe(
112
  audio_file,
113
  return_timestamps="word",
 
114
  )
115
-
116
- # Use 'chunks' if available, otherwise default to the whole text
117
- segments = raw_output.get("chunks", [])
118
 
119
- # If no chunks are returned (e.g., if return_timestamps=False was used, though not in this code),
120
- # create a single segment from the full text.
121
- if not segments and 'text' in raw_output:
122
- segments = [{'text': raw_output['text'].strip(), 'start': 0.0, 'end': 0.0}]
123
 
124
  outputs = {}
125
 
126
  progress(0.85, desc="πŸ“ Generating output files...")
127
 
 
128
  if vtt_output:
129
  vtt_path = "transcription.vtt"
130
- create_vtt(segments, vtt_path)
131
  outputs["VTT"] = vtt_path
132
 
 
133
  if docx_timestamp_output:
134
  docx_ts_path = "transcription_with_timestamps.docx"
135
- create_docx(segments, docx_ts_path, with_timestamps=True)
136
  outputs["DOCX (with timestamps)"] = docx_ts_path
137
 
 
138
  if docx_no_timestamp_output:
139
  docx_no_ts_path = "transcription_without_timestamps.docx"
140
- create_docx(segments, docx_no_ts_path, with_timestamps=False)
141
  outputs["DOCX (without timestamps)"] = docx_no_ts_path
142
 
143
  end_time = time.time()
144
  total_time = end_time - start_time
145
- transcribed_text = raw_output['text']
146
  downloadable_files = [path for path in outputs.values()]
147
  status_message = f"βœ… Transcription complete! Total time: {total_time:.2f} seconds."
148
 
@@ -165,9 +211,15 @@ with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
165
  model_selector = gr.Dropdown(
166
  label="Choose Whisper Model Size",
167
  choices=list(MODEL_SIZES.keys()),
168
- # Default to the French-specific model, which now uses the correct ID
169
  value="Distil-Large-v3-FR (French-Specific)"
170
  )
 
 
 
 
 
 
 
171
  gr.Markdown("### Choose Output Formats")
172
  with gr.Row():
173
  vtt_checkbox = gr.Checkbox(label="VTT", value=True)
@@ -182,7 +234,8 @@ with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
182
 
183
  transcribe_btn.click(
184
  fn=transcribe_and_export,
185
- inputs=[audio_input, model_selector, vtt_checkbox, docx_ts_checkbox, docx_no_ts_checkbox],
 
186
  outputs=[transcription_output, downloadable_files_output, audio_input, status_text]
187
  )
188
 
 
6
  import time
7
  from transformers import pipeline
8
  from docx import Document
9
+ from pydub import AudioSegment
10
 
11
  # Define the available models and their approximate relative speeds
12
  MODEL_SIZES = {
 
36
  model_cache[model_name] = pipeline(
37
  "automatic-speech-recognition",
38
  model=model_id,
39
+ device=device,
40
+ # Set max_new_tokens for generation, common for ASR
41
+ max_new_tokens=128
42
  )
43
  progress(0.5, desc="βœ… Model loaded successfully!")
44
  return model_cache[model_name]
 
50
  with open(file_path, "w", encoding="utf-8") as f:
51
  f.write("WEBVTT\n\n")
52
  for i, segment in enumerate(segments):
53
+ # Calculate time strings in "HH:MM:SS.mmm" format
54
  start_ms = int(segment.get('start', 0) * 1000)
55
  end_ms = int(segment.get('end', 0) * 1000)
56
 
 
88
  document.save(file_path)
89
 
90
  @spaces.GPU
91
+ def transcribe_and_export(audio_file, model_size, vtt_output, docx_timestamp_output, docx_no_timestamp_output, sequence_5_min, progress=gr.Progress()):
92
  """
93
  Main function to transcribe audio and export to selected formats.
94
+ Added logic for 5-minute sequencing.
95
  """
96
  if audio_file is None:
97
  return (None, None, None, "Please upload an audio file.")
 
100
 
101
  pipe = get_model_pipeline(model_size, progress)
102
 
103
+ # Define generation arguments
104
+ generate_kwargs = {}
 
105
  if model_size == "Distil-Large-v3-FR (French-Specific)":
106
  # Force French for this specific option
107
+ generate_kwargs["language"] = "fr"
108
+
109
+ full_segments = []
110
+ full_text_list = []
111
+
112
+ # --- New 5-Minute Sequencing Logic ---
113
+ if sequence_5_min:
114
+ progress(0.70, desc="βœ‚οΈ Splitting audio into 5-minute chunks...")
115
+ audio = AudioSegment.from_file(audio_file)
116
+ chunk_length_ms = 5 * 60 * 1000 # 5 minutes in milliseconds
117
+ total_duration_ms = len(audio)
118
+ num_chunks = (total_duration_ms + chunk_length_ms - 1) // chunk_length_ms # Ceiling division
119
+
120
+ for i in range(num_chunks):
121
+ start_ms = i * chunk_length_ms
122
+ end_ms = min((i + 1) * chunk_length_ms, total_duration_ms)
123
+
124
+ progress_val = 0.70 + (i / num_chunks) * 0.15
125
+ progress(progress_val, desc=f"🎀 Transcribing chunk {i+1}/{num_chunks}...")
126
+
127
+ chunk = audio[start_ms:end_ms]
128
+ temp_chunk_path = f"/tmp/chunk_{i}.mp3" # Save as a temp file for the pipeline
129
+ chunk.export(temp_chunk_path, format="mp3")
130
+
131
+ # Transcribe the chunk
132
+ chunk_output = pipe(
133
+ temp_chunk_path,
134
+ return_timestamps="word",
135
+ generate_kwargs=generate_kwargs
136
+ )
137
+
138
+ # Adjust timestamps for the full file
139
+ offset = start_ms / 1000.0
140
+ chunk_segments = chunk_output.get("chunks", [])
141
+ for segment in chunk_segments:
142
+ segment['start'] = segment.get('start', 0.0) + offset
143
+ segment['end'] = segment.get('end', 0.0) + offset
144
+ full_segments.append(segment)
145
+
146
+ full_text_list.append(chunk_output.get('text', ''))
147
+
148
+ os.remove(temp_chunk_path) # Clean up temp file
149
+
150
+ transcribed_text = " ".join(full_text_list).strip()
151
+
152
  else:
153
+ # Standard transcription for the whole file at once
154
+ progress(0.75, desc="🎀 Transcribing full audio file...")
155
  raw_output = pipe(
156
  audio_file,
157
  return_timestamps="word",
158
+ generate_kwargs=generate_kwargs
159
  )
160
+ full_segments = raw_output.get("chunks", [])
161
+ transcribed_text = raw_output.get('text', '').strip()
 
162
 
163
+ # Ensure segments is not empty
164
+ if not full_segments and transcribed_text:
165
+ # Create a single segment from the full text if chunks were not generated for some reason
166
+ full_segments = [{'text': transcribed_text, 'start': 0.0, 'end': 0.0}]
167
 
168
  outputs = {}
169
 
170
  progress(0.85, desc="πŸ“ Generating output files...")
171
 
172
+ # Generate VTT
173
  if vtt_output:
174
  vtt_path = "transcription.vtt"
175
+ create_vtt(full_segments, vtt_path)
176
  outputs["VTT"] = vtt_path
177
 
178
+ # Generate DOCX with timestamps
179
  if docx_timestamp_output:
180
  docx_ts_path = "transcription_with_timestamps.docx"
181
+ create_docx(full_segments, docx_ts_path, with_timestamps=True)
182
  outputs["DOCX (with timestamps)"] = docx_ts_path
183
 
184
+ # Generate DOCX without timestamps
185
  if docx_no_timestamp_output:
186
  docx_no_ts_path = "transcription_without_timestamps.docx"
187
+ create_docx(full_segments, docx_no_ts_path, with_timestamps=False)
188
  outputs["DOCX (without timestamps)"] = docx_no_ts_path
189
 
190
  end_time = time.time()
191
  total_time = end_time - start_time
 
192
  downloadable_files = [path for path in outputs.values()]
193
  status_message = f"βœ… Transcription complete! Total time: {total_time:.2f} seconds."
194
 
 
211
  model_selector = gr.Dropdown(
212
  label="Choose Whisper Model Size",
213
  choices=list(MODEL_SIZES.keys()),
 
214
  value="Distil-Large-v3-FR (French-Specific)"
215
  )
216
+ gr.Markdown("### Processing Options")
217
+ # NEW CHECKBOX for 5-minute sequencing
218
+ sequence_checkbox = gr.Checkbox(
219
+ label="Process in 5-minute sequences (Recommended for files > 30 min or to prevent memory errors)",
220
+ value=False
221
+ )
222
+
223
  gr.Markdown("### Choose Output Formats")
224
  with gr.Row():
225
  vtt_checkbox = gr.Checkbox(label="VTT", value=True)
 
234
 
235
  transcribe_btn.click(
236
  fn=transcribe_and_export,
237
+ # UPDATED INPUTS list to include the new checkbox
238
+ inputs=[audio_input, model_selector, vtt_checkbox, docx_ts_checkbox, docx_no_ts_checkbox, sequence_checkbox],
239
  outputs=[transcription_output, downloadable_files_output, audio_input, status_text]
240
  )
241