lucamartinelli commited on
Commit
ce8528d
·
1 Parent(s): 3d5ee3a

Aggiunto tasto download

Browse files
Files changed (1) hide show
  1. app.py +55 -29
app.py CHANGED
@@ -1,7 +1,9 @@
1
  """Whisper + Pyannote Transcription & Diarization Web Interface."""
 
2
  import logging
3
  import tempfile
4
  from pathlib import Path
 
5
 
6
  import gradio as gr
7
 
@@ -20,7 +22,7 @@ def process_audio(
20
  pyannote_model: str,
21
  openai_whisper_prompt: str,
22
  openai_whisper_language: str | None,
23
- progress=gr.Progress()
24
  ):
25
  """
26
  Process audio file with diarization and transcription.
@@ -37,16 +39,17 @@ def process_audio(
37
  transcription_model=transcription_model,
38
  pyannote_model=pyannote_model,
39
  whisper_prompt=openai_whisper_prompt,
40
- whisper_language=openai_whisper_language
41
  )
42
 
43
  return processor.process(
44
- audio_path=audio_path,
45
- progress_callback=lambda p, desc: progress(p, desc=desc)
46
  )
47
 
48
 
49
- def rename_speaker_in_vtt(vtt_content: str, transcripts_state, old_speaker: str, new_speaker: str):
 
 
50
  """Rename speaker and regenerate VTT."""
51
  if not vtt_content or not transcripts_state:
52
  return vtt_content
@@ -65,12 +68,17 @@ def prepare_download(vtt_content: str, audio_filename: str) -> str | None:
65
  Returns:
66
  Path to temporary VTT file, or None if inputs are invalid
67
  """
68
- if not vtt_content or not audio_filename:
69
  return None
70
 
71
- download_path = Path(tempfile.gettempdir()) / f"{audio_filename}.vtt"
 
 
 
 
 
72
 
73
- with open(download_path, 'w', encoding='utf-8') as f:
74
  f.write(vtt_content)
75
 
76
  return str(download_path)
@@ -78,10 +86,12 @@ def prepare_download(vtt_content: str, audio_filename: str) -> str | None:
78
 
79
  with gr.Blocks(title="Transcription & Diarization") as app:
80
 
81
- gr.Markdown("""
 
82
  # 🎙️ Transcription & Diarization
83
  Fill the required settings, upload an audio file, and start the transcription using Whisper and Pyannote!
84
- """)
 
85
 
86
  transcripts_state = gr.State([])
87
  audio_filename_state = gr.State("")
@@ -96,15 +106,22 @@ with gr.Blocks(title="Transcription & Diarization") as app:
96
  transcription_model = gr.Dropdown(
97
  label="Transcription model",
98
  choices=[("Whisper", "whisper-1")],
99
- value="whisper-1"
100
  )
101
  pyannote_model = gr.Dropdown(
102
  label="Pyannote model",
103
- choices=[("Speaker diarization community 1", "pyannote/speaker-diarization-community-1")],
104
- value="pyannote/speaker-diarization-community-1"
 
 
 
 
 
105
  )
106
 
107
- openai_whisper_prompt = gr.Textbox(label="Additional whisper prompt", value="")
 
 
108
  openai_whisper_language = gr.Dropdown(
109
  label="Whisper language",
110
  choices=[
@@ -115,7 +132,7 @@ with gr.Blocks(title="Transcription & Diarization") as app:
115
  ("🇪🇸 Spanish", "es"),
116
  ("🇫🇷 French", "fr"),
117
  ],
118
- value=None
119
  )
120
 
121
  audio_input = gr.Audio(type="filepath", label="Upload audio")
@@ -128,20 +145,27 @@ with gr.Blocks(title="Transcription & Diarization") as app:
128
  label="Transcription",
129
  lines=20,
130
  placeholder="Your transcription will appear here...",
131
- show_copy_button=True,
132
  )
133
 
134
  validation_status = gr.Markdown("⚪ No content", container=True)
135
 
 
 
 
 
136
  with gr.Accordion("🎭 Rename speakers", open=True):
137
  with gr.Row():
138
- old_speaker_name = gr.Textbox(label="Current speaker name (e.g., SPEAKER_00)", placeholder="SPEAKER_00", value="SPEAKER_00")
139
- new_speaker_name = gr.Textbox(label="New speaker name", placeholder="Davide")
 
 
 
 
 
 
140
 
141
  rename_btn = gr.Button("Rename")
142
 
143
- download_file = gr.File(label="Download VTT", visible=False)
144
-
145
  def check_inputs(openai_key: str, hf_key: str, audio) -> gr.Button:
146
  """
147
  Enable submit button only if both API keys and audio are provided.
@@ -175,29 +199,31 @@ with gr.Blocks(title="Transcription & Diarization") as app:
175
 
176
  # Prepare download file if valid
177
  file_path = None
178
- if is_valid and vtt_content and audio_filename:
179
  file_path = prepare_download(vtt_content, audio_filename)
180
 
181
  return (
182
  status,
183
- gr.File(value=file_path, visible=False) # download_file
 
 
184
  )
185
 
186
  # Enable/disable submit button based on API keys and audio input
187
  openapi_api_key.change(
188
  fn=check_inputs,
189
  inputs=[openapi_api_key, hf_api_key, audio_input],
190
- outputs=submit_btn
191
  )
192
  hf_api_key.change(
193
  fn=check_inputs,
194
  inputs=[openapi_api_key, hf_api_key, audio_input],
195
- outputs=submit_btn
196
  )
197
  audio_input.change(
198
  fn=check_inputs,
199
  inputs=[openapi_api_key, hf_api_key, audio_input],
200
- outputs=submit_btn
201
  )
202
 
203
  # Main transcription process
@@ -210,7 +236,7 @@ with gr.Blocks(title="Transcription & Diarization") as app:
210
  transcription_model,
211
  pyannote_model,
212
  openai_whisper_prompt,
213
- openai_whisper_language
214
  ],
215
  outputs=[output_vtt, transcripts_state, audio_filename_state],
216
  )
@@ -220,20 +246,20 @@ with gr.Blocks(title="Transcription & Diarization") as app:
220
  output_vtt.change(
221
  fn=update_validation,
222
  inputs=[output_vtt, audio_filename_state],
223
- outputs=[validation_status, download_file]
224
  )
225
 
226
  audio_filename_state.change(
227
  fn=update_validation,
228
  inputs=[output_vtt, audio_filename_state],
229
- outputs=[validation_status, download_file]
230
  )
231
 
232
  # Speaker renaming
233
  rename_btn.click(
234
  fn=rename_speaker_in_vtt,
235
  inputs=[output_vtt, transcripts_state, old_speaker_name, new_speaker_name],
236
- outputs=output_vtt
237
  )
238
 
239
  if __name__ == "__main__":
 
1
  """Whisper + Pyannote Transcription & Diarization Web Interface."""
2
+
3
  import logging
4
  import tempfile
5
  from pathlib import Path
6
+ from datetime import datetime
7
 
8
  import gradio as gr
9
 
 
22
  pyannote_model: str,
23
  openai_whisper_prompt: str,
24
  openai_whisper_language: str | None,
25
+ progress=gr.Progress(),
26
  ):
27
  """
28
  Process audio file with diarization and transcription.
 
39
  transcription_model=transcription_model,
40
  pyannote_model=pyannote_model,
41
  whisper_prompt=openai_whisper_prompt,
42
+ whisper_language=openai_whisper_language,
43
  )
44
 
45
  return processor.process(
46
+ audio_path=audio_path, progress_callback=lambda p, desc: progress(p, desc=desc)
 
47
  )
48
 
49
 
50
+ def rename_speaker_in_vtt(
51
+ vtt_content: str, transcripts_state, old_speaker: str, new_speaker: str
52
+ ):
53
  """Rename speaker and regenerate VTT."""
54
  if not vtt_content or not transcripts_state:
55
  return vtt_content
 
68
  Returns:
69
  Path to temporary VTT file, or None if inputs are invalid
70
  """
71
+ if not vtt_content:
72
  return None
73
 
74
+ if not audio_filename:
75
+ audio_filename = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
76
+
77
+ # Create a unique temp directory to avoid caching issues
78
+ temp_dir = Path(tempfile.mkdtemp())
79
+ download_path = temp_dir / f"{audio_filename}.vtt"
80
 
81
+ with open(download_path, "w", encoding="utf-8") as f:
82
  f.write(vtt_content)
83
 
84
  return str(download_path)
 
86
 
87
  with gr.Blocks(title="Transcription & Diarization") as app:
88
 
89
+ gr.Markdown(
90
+ """
91
  # 🎙️ Transcription & Diarization
92
  Fill the required settings, upload an audio file, and start the transcription using Whisper and Pyannote!
93
+ """
94
+ )
95
 
96
  transcripts_state = gr.State([])
97
  audio_filename_state = gr.State("")
 
106
  transcription_model = gr.Dropdown(
107
  label="Transcription model",
108
  choices=[("Whisper", "whisper-1")],
109
+ value="whisper-1",
110
  )
111
  pyannote_model = gr.Dropdown(
112
  label="Pyannote model",
113
+ choices=[
114
+ (
115
+ "Speaker diarization community 1",
116
+ "pyannote/speaker-diarization-community-1",
117
+ )
118
+ ],
119
+ value="pyannote/speaker-diarization-community-1",
120
  )
121
 
122
+ openai_whisper_prompt = gr.Textbox(
123
+ label="Additional whisper prompt", value=""
124
+ )
125
  openai_whisper_language = gr.Dropdown(
126
  label="Whisper language",
127
  choices=[
 
132
  ("🇪🇸 Spanish", "es"),
133
  ("🇫🇷 French", "fr"),
134
  ],
135
+ value=None,
136
  )
137
 
138
  audio_input = gr.Audio(type="filepath", label="Upload audio")
 
145
  label="Transcription",
146
  lines=20,
147
  placeholder="Your transcription will appear here...",
 
148
  )
149
 
150
  validation_status = gr.Markdown("⚪ No content", container=True)
151
 
152
+ download_btn = gr.DownloadButton(
153
+ "Download VTT", variant="primary", visible=False
154
+ )
155
+
156
  with gr.Accordion("🎭 Rename speakers", open=True):
157
  with gr.Row():
158
+ old_speaker_name = gr.Textbox(
159
+ label="Current speaker name (e.g., SPEAKER_00)",
160
+ placeholder="SPEAKER_00",
161
+ value="SPEAKER_00",
162
+ )
163
+ new_speaker_name = gr.Textbox(
164
+ label="New speaker name", placeholder="Davide"
165
+ )
166
 
167
  rename_btn = gr.Button("Rename")
168
 
 
 
169
  def check_inputs(openai_key: str, hf_key: str, audio) -> gr.Button:
170
  """
171
  Enable submit button only if both API keys and audio are provided.
 
199
 
200
  # Prepare download file if valid
201
  file_path = None
202
+ if is_valid and vtt_content:
203
  file_path = prepare_download(vtt_content, audio_filename)
204
 
205
  return (
206
  status,
207
+ gr.DownloadButton(
208
+ value=file_path, visible=bool(file_path), interactive=True
209
+ ),
210
  )
211
 
212
  # Enable/disable submit button based on API keys and audio input
213
  openapi_api_key.change(
214
  fn=check_inputs,
215
  inputs=[openapi_api_key, hf_api_key, audio_input],
216
+ outputs=submit_btn,
217
  )
218
  hf_api_key.change(
219
  fn=check_inputs,
220
  inputs=[openapi_api_key, hf_api_key, audio_input],
221
+ outputs=submit_btn,
222
  )
223
  audio_input.change(
224
  fn=check_inputs,
225
  inputs=[openapi_api_key, hf_api_key, audio_input],
226
+ outputs=submit_btn,
227
  )
228
 
229
  # Main transcription process
 
236
  transcription_model,
237
  pyannote_model,
238
  openai_whisper_prompt,
239
+ openai_whisper_language,
240
  ],
241
  outputs=[output_vtt, transcripts_state, audio_filename_state],
242
  )
 
246
  output_vtt.change(
247
  fn=update_validation,
248
  inputs=[output_vtt, audio_filename_state],
249
+ outputs=[validation_status, download_btn],
250
  )
251
 
252
  audio_filename_state.change(
253
  fn=update_validation,
254
  inputs=[output_vtt, audio_filename_state],
255
+ outputs=[validation_status, download_btn],
256
  )
257
 
258
  # Speaker renaming
259
  rename_btn.click(
260
  fn=rename_speaker_in_vtt,
261
  inputs=[output_vtt, transcripts_state, old_speaker_name, new_speaker_name],
262
+ outputs=output_vtt,
263
  )
264
 
265
  if __name__ == "__main__":