evannh commited on
Commit
7201182
·
verified ·
1 Parent(s): 05d5de4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -50
app.py CHANGED
@@ -1,57 +1,88 @@
1
- # app.py
2
  import gradio as gr
3
- import spacy
4
  import torch
5
  import os
6
- from transformers import pipeline
7
- from pyannote.audio import Pipeline as DiarizationPipeline
8
-
9
- # Chargement du modèle Whisper via transformers
10
- asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small", chunk_length_s=30)
11
-
12
- # Chargement du modèle spaCy pour NER
13
- nlp = spacy.load("fr_core_news_md")
14
-
15
- # Diarisation avec PyAnnote (si HF_TOKEN dispo dans l'environnement)
16
- hf_token = os.getenv("HF_TOKEN")
17
- if hf_token:
18
- diar_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=hf_token)
19
- else:
20
- diar_pipeline = None
21
-
22
- def process_audio(file):
23
- # Transcription avec Whisper via transformers
24
- result = asr_pipeline(file)
25
- transcription = result["text"]
26
-
27
- # Diarisation
28
- if diar_pipeline:
29
- diar_result = diar_pipeline(file)
30
- diar_str = "\n".join([
31
- f"{turn.start:.1f}s - {turn.end:.1f}s : {speaker}"
32
- for turn, _, speaker in diar_result.itertracks(yield_label=True)
33
- ])
34
- else:
35
- diar_str = "Diarisation non disponible (ajoutez votre HF_TOKEN dans les secrets)"
36
-
37
- # NER
38
- doc = nlp(transcription)
39
- entities = [(ent.text, ent.label_) for ent in doc.ents]
40
- ent_str = "\n".join([f"{text} ({label})" for text, label in entities]) if entities else "Aucune entité détectée"
41
-
42
- return transcription, diar_str, ent_str
43
-
44
- demo = gr.Interface(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  fn=process_audio,
46
- inputs=gr.Audio(type="filepath", label="Audio (.mp3/.wav)"),
47
- outputs=[
48
- gr.Textbox(label="📝 Transcription Whisper"),
49
- gr.Textbox(label="🗣️ Diarisation (PyAnnote)"),
50
- gr.Textbox(label="🧠 Entités Nommées (spaCy)")
51
- ],
52
- title="🔎 Pipeline Audio Intelligent",
53
- description="Transcription, Diarisation, et Extraction d'Entités Nommées sur un fichier audio français."
54
  )
55
 
56
  if __name__ == "__main__":
57
- demo.launch()
 
 
1
  import gradio as gr
 
2
  import torch
3
  import os
4
+ from whisperx import load_model, load_align_model, align
5
+ from resemblyzer import preprocess_wav, VoiceEncoder
6
+ from sklearn.cluster import AgglomerativeClustering
7
+ import librosa
8
+ import numpy as np
9
# --- One-time model initialisation (import time) ---
# Prefer GPU with fp16; otherwise CPU with int8 quantisation.
use_cuda = torch.cuda.is_available()
device = "cuda" if use_cuda else "cpu"
compute_type = "float16" if use_cuda else "int8"

# WhisperX ASR model plus the French phoneme-alignment model it pairs with.
whisper_model = load_model("medium", device=device, compute_type=compute_type)
align_model, metadata = load_align_model(language_code="fr", device=device)

# Resemblyzer speaker-embedding encoder used for lightweight diarisation.
voice_encoder = VoiceEncoder()
+
17
+ def get_speaker_segments(audio_path, window_size=1.0, step_size=0.5, num_speakers=2):
18
+ wav, sr = librosa.load(audio_path, sr=16000, mono=True)
19
+ wav = librosa.util.normalize(wav)
20
+ duration = librosa.get_duration(y=wav, sr=sr)
21
+
22
+ segments = []
23
+ embeddings = []
24
+
25
+ for start in np.arange(0, duration - window_size, step_size):
26
+ end = start + window_size
27
+ clip = wav[int(start * sr):int(end * sr)]
28
+ if len(clip) == 0:
29
+ continue
30
+ try:
31
+ embed = voice_encoder.embed_utterance(clip)
32
+ embeddings.append(embed)
33
+ segments.append((start, end))
34
+ except Exception as e:
35
+ print(f"⚠️ Skipped segment {start}-{end}s: {e}")
36
+
37
+ if len(embeddings) < 2:
38
+ print("⚠️ Pas assez de segments pour la diarisation. Diarisation annulée.")
39
+ return [{"start": 0, "end": duration, "speaker": "speaker_00"}]
40
+
41
+ clustering = AgglomerativeClustering(n_clusters=num_speakers)
42
+ labels = clustering.fit_predict(embeddings)
43
+
44
+ speaker_segments = []
45
+ for (start, end), label in zip(segments, labels):
46
+ speaker_segments.append({"start": start, "end": end, "speaker": f"speaker_{label:02d}"})
47
+
48
+ return speaker_segments
49
def process_audio(audio_file):
    """Transcribe, diarise and word-align a French audio file.

    Args:
        audio_file: Filesystem path to the uploaded audio (gradio filepath).

    Returns:
        One line per aligned segment: ``[start - end] speaker: text``.
    """
    tmp_path = audio_file

    # Step 1: transcription with WhisperX. NOTE(review): the previous call
    # passed word_timestamps=/verbose=, which whisperx's transcribe API does
    # not accept and which raised TypeError at runtime.
    result = whisper_model.transcribe(tmp_path, language="fr")

    # Step 2: lightweight diarisation via Resemblyzer embeddings (no HF token).
    speaker_segments = get_speaker_segments(tmp_path)

    # Step 3: word-level alignment. whisperx.align takes the device as a
    # required positional argument — omitting it raised TypeError.
    result_aligned = align(result["segments"], align_model, metadata, tmp_path, device, return_char_alignments=False)

    # Attach a speaker label to each aligned segment: the first diarisation
    # window whose span contains the segment start wins; otherwise unknown.
    for segment in result_aligned["segments"]:
        segment_start = segment["start"]
        speaker_found = next(
            (sp["speaker"] for sp in speaker_segments if sp["start"] <= segment_start <= sp["end"]),
            "speaker_??",
        )
        segment["speaker"] = speaker_found

    # Final formatting: single join instead of quadratic string +=.
    lines = [
        f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['speaker']}: {seg['text'].strip()}"
        for seg in result_aligned["segments"]
    ]
    return "\n".join(lines) + "\n" if lines else ""
78
+
79
+ iface = gr.Interface(
80
  fn=process_audio,
81
+ inputs=gr.Audio(type="filepath", label="Audio (.wav, .mp3...)"),
82
+ outputs=gr.Textbox(label="Transcription + Diarisation + Alignement"),
83
+ title="🎙️ Transcription enrichie avec WhisperX + Resemblyzer",
84
+ description="Transcription française, diarisation légère (sans token), alignement mot à mot."
 
 
 
 
85
  )
86
 
87
  if __name__ == "__main__":
88
+ iface.launch()