Spaces:

evannh
/

test_diarization

Sleeping

App Files Community

evannh committed on Jun 2, 2025

Commit

703daa1

verified ·

1 Parent(s): d8167ea

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -22

app.py CHANGED Viewed

@@ -8,12 +8,22 @@ from pyannote.audio import Pipeline as DiarizationPipeline
 # Initialisation des modèles
 whisper_model = WhisperModel("large-v2", device="cpu", compute_type="int8")
 diari_pipeline = DiarizationPipeline.from_pretrained(
     "pyannote/speaker-diarization-3.1",
     use_auth_token="hf_YOUR_TOKEN_HERE"  # Remplace par ton token Hugging Face perso
 )
 def convert_mp3_to_wav(mp3_path):
     wav_path = tempfile.mktemp(suffix=".wav")
     audio = AudioSegment.from_file(mp3_path, format="mp3")
@@ -24,18 +34,10 @@ def convert_mp3_to_wav(mp3_path):
 def transcribe_and_diarize(audio_file):
     wav_path = convert_mp3_to_wav(audio_file)
-    # Transcription avec Whisper
     segments, _ = whisper_model.transcribe(wav_path, language="fr", beam_size=5)
-    transcript = []
-    for seg in segments:
-        transcript.append({
-            "start": seg.start,
-            "end": seg.end,
-            "text": seg.text.strip()
-        })
-    # Diarisation avec pyannote
     diarization = diari_pipeline(wav_path)
     speakers = []
     for turn, _, speaker in diarization.itertracks(yield_label=True):
@@ -47,29 +49,30 @@ def transcribe_and_diarize(audio_file):
     # Fusion transcription + speaker
     final_output = []
-    for t in transcript:
         speaker = "Inconnu"
-        for d in speakers:
-            if d["start"] <= t["start"] <= d["end"]:
-                speaker = d["speaker"]
                 break
         final_output.append({
-            "start": t["start"],
-            "end": t["end"],
             "speaker": speaker,
-            "text": t["text"]
         })
     df = pd.DataFrame(final_output)
-    # Export TXT format
     txt_lines = [f"[{row['start']:.2f}s - {row['end']:.2f}s] {row['speaker']} : {row['text']}" for _, row in df.iterrows()]
     txt_output = "\n".join(txt_lines)
     txt_path = tempfile.mktemp(suffix=".txt")
     with open(txt_path, "w", encoding="utf-8") as f:
         f.write(txt_output)
-    # Export CSV format
     csv_path = tempfile.mktemp(suffix=".csv")
     df.to_csv(csv_path, index=False)
@@ -85,5 +88,5 @@ gr.Interface(
         gr.File(label="Télécharger le TXT")
     ],
     title="Transcription + Diarisation (FR)",
-    description="Charge un fichier MP3. Transcription FR + séparation des locuteurs + export CSV et TXT."
 ).launch()

 # Initialisation des modèles
 whisper_model = WhisperModel("large-v2", device="cpu", compute_type="int8")
 diari_pipeline = DiarizationPipeline.from_pretrained(
     "pyannote/speaker-diarization-3.1",
     use_auth_token="hf_YOUR_TOKEN_HERE"  # Remplace par ton token Hugging Face perso
 )
+# Pipeline de traitement :
+# .mp3
+#  ↓ (converti .wav)
+# .wav
+#  ↓
+# faster-whisper → segments (texte + timestamps)
+#  ↓
+# pyannote-audio → diarisation (segments + speaker X)
+#  ↓
+# Fusion des deux → transcription enrichie avec speaker + timestamp
 def convert_mp3_to_wav(mp3_path):
     wav_path = tempfile.mktemp(suffix=".wav")
     audio = AudioSegment.from_file(mp3_path, format="mp3")
 def transcribe_and_diarize(audio_file):
     wav_path = convert_mp3_to_wav(audio_file)
+    # Transcription
     segments, _ = whisper_model.transcribe(wav_path, language="fr", beam_size=5)
+    # Diarisation
     diarization = diari_pipeline(wav_path)
     speakers = []
     for turn, _, speaker in diarization.itertracks(yield_label=True):
     # Fusion transcription + speaker
     final_output = []
+    for seg in segments:
+        seg_start = seg.start
+        seg_end = seg.end
+        text = seg.text.strip()
         speaker = "Inconnu"
+        for s in speakers:
+            if s["start"] <= seg_start <= s["end"]:
+                speaker = s["speaker"]
                 break
         final_output.append({
+            "start": seg_start,
+            "end": seg_end,
             "speaker": speaker,
+            "text": text
         })
     df = pd.DataFrame(final_output)
     txt_lines = [f"[{row['start']:.2f}s - {row['end']:.2f}s] {row['speaker']} : {row['text']}" for _, row in df.iterrows()]
     txt_output = "\n".join(txt_lines)
     txt_path = tempfile.mktemp(suffix=".txt")
     with open(txt_path, "w", encoding="utf-8") as f:
         f.write(txt_output)
     csv_path = tempfile.mktemp(suffix=".csv")
     df.to_csv(csv_path, index=False)
         gr.File(label="Télécharger le TXT")
     ],
     title="Transcription + Diarisation (FR)",
+    description="Charge un fichier MP3. Transcription FR + séparation des locuteurs + export CSV/TXT."
 ).launch()