Update app.py
Browse files
app.py
CHANGED
|
@@ -41,14 +41,15 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
| 41 |
pipe = pipeline(
|
| 42 |
task="automatic-speech-recognition",
|
| 43 |
model=MODEL_NAME,
|
| 44 |
-
chunk_length_s=30,
|
| 45 |
device=device,
|
| 46 |
model_kwargs={"low_cpu_mem_usage": True},
|
|
|
|
| 47 |
)
|
| 48 |
|
| 49 |
|
| 50 |
|
| 51 |
|
|
|
|
| 52 |
def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.1, min_segment_duration=0.5):
|
| 53 |
word_segments = transcription_result['chunks']
|
| 54 |
diarization_segments = list(diarization.itertracks(yield_label=True))
|
|
@@ -123,9 +124,11 @@ def parse_simplified_diarization(simplified_text):
|
|
| 123 |
def process_transcription(*args):
|
| 124 |
generator = transcribe_and_diarize(*args)
|
| 125 |
for progress_message, raw_text, speaker_transcription in generator:
|
| 126 |
-
|
|
|
|
|
|
|
| 127 |
simplified_diarization = simplify_diarization_output(speaker_transcription)
|
| 128 |
-
|
| 129 |
|
| 130 |
def process_yt_transcription(*args):
|
| 131 |
html_embed, raw_text, speaker_transcription = yt_transcribe(*args)
|
|
@@ -173,6 +176,10 @@ def display_progress(progress_state):
|
|
| 173 |
""")
|
| 174 |
|
| 175 |
@spaces.GPU(duration=120)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
def transcribe_and_diarize(file_path, task, progress=gr.Progress()):
|
| 177 |
progress(0, desc="Initialisation...")
|
| 178 |
yield "Chargement du fichier...", None, None
|
|
@@ -180,10 +187,12 @@ def transcribe_and_diarize(file_path, task, progress=gr.Progress()):
|
|
| 180 |
progress(0.2, desc="Préparation de l'audio...")
|
| 181 |
yield "Préparation de l'audio...", None, None
|
| 182 |
|
| 183 |
-
progress(0.
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
|
|
|
|
|
|
| 187 |
progress(0.6, desc=" C'est fait 😮💨 ! Je m'active à fusionner tout ça, un instant, J'y suis presque...")
|
| 188 |
if diarization_pipeline:
|
| 189 |
diarization = diarization_pipeline(file_path)
|
|
@@ -391,7 +400,7 @@ with demo:
|
|
| 391 |
progress_display = gr.Markdown(label="État de la progression")
|
| 392 |
|
| 393 |
with gr.Accordion("Résultats 📊", open=True):
|
| 394 |
-
|
| 395 |
speaker_output = gr.Textbox(label="👥 Diarisation (format simplifié)", info="Identification des locuteurs. Format : 'SPEAKER_XX: texte'")
|
| 396 |
with gr.Accordion("Métadonnées (optionnel) 📌", open=False):
|
| 397 |
audio_duration = gr.Textbox(label="⏱️ Durée de l'audio (mm:ss)")
|
|
@@ -474,10 +483,11 @@ with demo:
|
|
| 474 |
""")
|
| 475 |
|
| 476 |
# Connexions des boutons aux fonctions appropriées
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
|
|
|
| 481 |
)
|
| 482 |
|
| 483 |
format_button.click(
|
|
|
|
| 41 |
# Shared speech-recognition pipeline used by the transcription helpers.
# Word-level timestamps are requested here because the speaker-association
# step reads the per-word entries from the result's 'chunks' list.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    model_kwargs={"low_cpu_mem_usage": True},
    return_timestamps="word",
)
|
| 48 |
|
| 49 |
|
| 50 |
|
| 51 |
|
| 52 |
+
|
| 53 |
def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.1, min_segment_duration=0.5):
|
| 54 |
word_segments = transcription_result['chunks']
|
| 55 |
diarization_segments = list(diarization.itertracks(yield_label=True))
|
|
|
|
| 124 |
def process_transcription(*args):
    """Relay progress from ``transcribe_and_diarize``, then emit the simplified diarization.

    All positional arguments are forwarded unchanged to
    ``transcribe_and_diarize``.

    Yields:
        ``(progress_message, raw_text, diarization_text)`` tuples. While the
        underlying generator is running, the third item is always ``""``.
        One last tuple carries the result of ``simplify_diarization_output``
        applied to the final speaker transcription received.
    """
    for status, text, speakers in transcribe_and_diarize(*args):
        # Stream the raw transcription; the diarization slot stays empty for now.
        yield status, text, ""

    # Once transcription has finished, simplify the last diarization result.
    # NOTE(review): assumes these two statements sit after the loop, as the
    # original comment indicates -- confirm against the real file.
    simplified = simplify_diarization_output(speakers)
    yield status, text, simplified
|
| 132 |
|
| 133 |
def process_yt_transcription(*args):
|
| 134 |
html_embed, raw_text, speaker_transcription = yt_transcribe(*args)
|
|
|
|
| 176 |
""")
|
| 177 |
|
| 178 |
@spaces.GPU(duration=120)
def stream_transcription(audio):
    """Yield the transcribed text of ``audio``, one string per pipeline result.

    The audio is passed to the module-level ``pipe`` with 10-second chunks
    and a (4, 2)-second stride.

    Args:
        audio: Input forwarded as-is to ``pipe``.

    Yields:
        The ``"text"`` field of each result returned by the pipeline.
    """
    outputs = pipe(audio, chunk_length_s=10, stride_length_s=(4, 2))

    # For a single input the pipeline returns one dict rather than a sequence
    # of dicts. Iterating that dict directly would walk its keys, and
    # indexing a key with "text" raises TypeError, so wrap it first.
    if isinstance(outputs, dict):
        outputs = [outputs]

    for result in outputs:
        yield result["text"]
|
| 182 |
+
|
| 183 |
def transcribe_and_diarize(file_path, task, progress=gr.Progress()):
|
| 184 |
progress(0, desc="Initialisation...")
|
| 185 |
yield "Chargement du fichier...", None, None
|
|
|
|
| 187 |
progress(0.2, desc="Préparation de l'audio...")
|
| 188 |
yield "Préparation de l'audio...", None, None
|
| 189 |
|
| 190 |
+
progress(0.3, desc="Laissez moi quelques minutes pour déchiffrer les voix et rédiger l'audio 🤓 ✍️ ...")
|
| 191 |
+
transcription = ""
|
| 192 |
+
for chunk in stream_transcription(audio_np):
|
| 193 |
+
transcription += chunk
|
| 194 |
+
yield "Transcription en cours...", transcription, []
|
| 195 |
+
|
| 196 |
progress(0.6, desc=" C'est fait 😮💨 ! Je m'active à fusionner tout ça, un instant, J'y suis presque...")
|
| 197 |
if diarization_pipeline:
|
| 198 |
diarization = diarization_pipeline(file_path)
|
|
|
|
| 400 |
progress_display = gr.Markdown(label="État de la progression")
|
| 401 |
|
| 402 |
with gr.Accordion("Résultats 📊", open=True):
|
| 403 |
+
transcription_output = gr.Textbox(label="📝 Transcription brute", info="Texte généré par le modèle. Modifiable si nécessaire.")
|
| 404 |
speaker_output = gr.Textbox(label="👥 Diarisation (format simplifié)", info="Identification des locuteurs. Format : 'SPEAKER_XX: texte'")
|
| 405 |
with gr.Accordion("Métadonnées (optionnel) 📌", open=False):
|
| 406 |
audio_duration = gr.Textbox(label="⏱️ Durée de l'audio (mm:ss)")
|
|
|
|
| 483 |
""")
|
| 484 |
|
| 485 |
# Wire the buttons to their handler functions.
# NOTE(review): the components visible in this file are named
# `progress_display` and `speaker_output`; confirm that `progress_output` and
# `diarization_output` are defined elsewhere, otherwise this raises NameError.
# NOTE(review): process_transcription forwards its arguments to
# transcribe_and_diarize(file_path, task, ...), but only `audio_input` is
# passed here -- confirm how `task` is supplied.
submit_button.click(
    fn=process_transcription,
    inputs=[audio_input],
    outputs=[progress_output, transcription_output, diarization_output],
    show_progress=True,
)
|
| 492 |
|
| 493 |
format_button.click(
|