evannh committed on
Commit
703daa1
·
verified ·
1 Parent(s): d8167ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -22
app.py CHANGED
@@ -8,12 +8,22 @@ from pyannote.audio import Pipeline as DiarizationPipeline
8
 
9
  # Initialisation des modèles
10
  whisper_model = WhisperModel("large-v2", device="cpu", compute_type="int8")
11
-
12
  diari_pipeline = DiarizationPipeline.from_pretrained(
13
  "pyannote/speaker-diarization-3.1",
14
  use_auth_token="hf_YOUR_TOKEN_HERE" # Remplace par ton token Hugging Face perso
15
  )
16
 
 
 
 
 
 
 
 
 
 
 
 
17
  def convert_mp3_to_wav(mp3_path):
18
  wav_path = tempfile.mktemp(suffix=".wav")
19
  audio = AudioSegment.from_file(mp3_path, format="mp3")
@@ -24,18 +34,10 @@ def convert_mp3_to_wav(mp3_path):
24
  def transcribe_and_diarize(audio_file):
25
  wav_path = convert_mp3_to_wav(audio_file)
26
 
27
- # Transcription avec Whisper
28
  segments, _ = whisper_model.transcribe(wav_path, language="fr", beam_size=5)
29
 
30
- transcript = []
31
- for seg in segments:
32
- transcript.append({
33
- "start": seg.start,
34
- "end": seg.end,
35
- "text": seg.text.strip()
36
- })
37
-
38
- # Diarisation avec pyannote
39
  diarization = diari_pipeline(wav_path)
40
  speakers = []
41
  for turn, _, speaker in diarization.itertracks(yield_label=True):
@@ -47,29 +49,30 @@ def transcribe_and_diarize(audio_file):
47
 
48
  # Fusion transcription + speaker
49
  final_output = []
50
- for t in transcript:
 
 
 
51
  speaker = "Inconnu"
52
- for d in speakers:
53
- if d["start"] <= t["start"] <= d["end"]:
54
- speaker = d["speaker"]
55
  break
56
  final_output.append({
57
- "start": t["start"],
58
- "end": t["end"],
59
  "speaker": speaker,
60
- "text": t["text"]
61
  })
62
 
63
  df = pd.DataFrame(final_output)
64
-
65
- # Export TXT format
66
  txt_lines = [f"[{row['start']:.2f}s - {row['end']:.2f}s] {row['speaker']} : {row['text']}" for _, row in df.iterrows()]
67
  txt_output = "\n".join(txt_lines)
 
68
  txt_path = tempfile.mktemp(suffix=".txt")
69
  with open(txt_path, "w", encoding="utf-8") as f:
70
  f.write(txt_output)
71
 
72
- # Export CSV format
73
  csv_path = tempfile.mktemp(suffix=".csv")
74
  df.to_csv(csv_path, index=False)
75
 
@@ -85,5 +88,5 @@ gr.Interface(
85
  gr.File(label="Télécharger le TXT")
86
  ],
87
  title="Transcription + Diarisation (FR)",
88
- description="Charge un fichier MP3. Transcription FR + séparation des locuteurs + export CSV et TXT."
89
  ).launch()
 
8
 
9
  # Initialisation des modèles
10
  whisper_model = WhisperModel("large-v2", device="cpu", compute_type="int8")
 
11
  diari_pipeline = DiarizationPipeline.from_pretrained(
12
  "pyannote/speaker-diarization-3.1",
13
  use_auth_token="hf_YOUR_TOKEN_HERE" # Remplace par ton token Hugging Face perso
14
  )
15
 
16
+ # Pipeline de traitement :
17
+ # .mp3
18
+ # ↓ (converti .wav)
19
+ # .wav
20
+ # ↓
21
+ # faster-whisper → segments (texte + timestamps)
22
+ # ↓
23
+ # pyannote-audio → diarisation (segments + speaker X)
24
+ # ↓
25
+ # Fusion des deux → transcription enrichie avec speaker + timestamp
26
+
27
  def convert_mp3_to_wav(mp3_path):
28
  wav_path = tempfile.mktemp(suffix=".wav")
29
  audio = AudioSegment.from_file(mp3_path, format="mp3")
 
34
  def transcribe_and_diarize(audio_file):
35
  wav_path = convert_mp3_to_wav(audio_file)
36
 
37
+ # Transcription
38
  segments, _ = whisper_model.transcribe(wav_path, language="fr", beam_size=5)
39
 
40
+ # Diarisation
 
 
 
 
 
 
 
 
41
  diarization = diari_pipeline(wav_path)
42
  speakers = []
43
  for turn, _, speaker in diarization.itertracks(yield_label=True):
 
49
 
50
  # Fusion transcription + speaker
51
  final_output = []
52
+ for seg in segments:
53
+ seg_start = seg.start
54
+ seg_end = seg.end
55
+ text = seg.text.strip()
56
  speaker = "Inconnu"
57
+ for s in speakers:
58
+ if s["start"] <= seg_start <= s["end"]:
59
+ speaker = s["speaker"]
60
  break
61
  final_output.append({
62
+ "start": seg_start,
63
+ "end": seg_end,
64
  "speaker": speaker,
65
+ "text": text
66
  })
67
 
68
  df = pd.DataFrame(final_output)
 
 
69
  txt_lines = [f"[{row['start']:.2f}s - {row['end']:.2f}s] {row['speaker']} : {row['text']}" for _, row in df.iterrows()]
70
  txt_output = "\n".join(txt_lines)
71
+
72
  txt_path = tempfile.mktemp(suffix=".txt")
73
  with open(txt_path, "w", encoding="utf-8") as f:
74
  f.write(txt_output)
75
 
 
76
  csv_path = tempfile.mktemp(suffix=".csv")
77
  df.to_csv(csv_path, index=False)
78
 
 
88
  gr.File(label="Télécharger le TXT")
89
  ],
90
  title="Transcription + Diarisation (FR)",
91
+ description="Charge un fichier MP3. Transcription FR + séparation des locuteurs + export CSV/TXT."
92
  ).launch()