import os
import tempfile

import pandas as pd
import gradio as gr
from pydub import AudioSegment
from faster_whisper import WhisperModel
from pyannote.audio import Pipeline as DiarizationPipeline

# Model initialisation
whisper_model = WhisperModel("large-v2", device="cpu", compute_type="int8")

token = os.getenv("HF_TOKEN")  # Add this variable to the Space secrets if needed
diari_pipeline = DiarizationPipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=token
)

# Processing pipeline:
# .mp3
#   ↓ (converted to .wav)
# .wav
#   ↓
# faster-whisper → segments (text + timestamps)
#   ↓
# pyannote-audio → diarization (segments + speaker X)
#   ↓
# merge of the two → transcription enriched with speaker + timestamps
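#
# For illustration only (values invented, not taken from a real run), each merged
# record produced below is a dict like:
#   {"start": 0.00, "end": 3.52, "speaker": "SPEAKER_00", "text": "Bonjour à tous"}
# and is rendered in the TXT export as:
#   [0.00s - 3.52s] SPEAKER_00 : Bonjour à tous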


def convert_mp3_to_wav(mp3_path):
    # pydub needs ffmpeg on the system to decode MP3
    wav_path = tempfile.mktemp(suffix=".wav")
    audio = AudioSegment.from_file(mp3_path, format="mp3")
    # Mono, 16 kHz: the format expected by Whisper and pyannote
    audio = audio.set_channels(1).set_frame_rate(16000)
    audio.export(wav_path, format="wav")
    return wav_path


def transcribe_and_diarize(audio_file):
    wav_path = convert_mp3_to_wav(audio_file)

    # Transcription
    segments, _ = whisper_model.transcribe(wav_path, language="fr", beam_size=5)

    # Diarization
    diarization = diari_pipeline(wav_path)
    speakers = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        speakers.append({
            "start": turn.start,
            "end": turn.end,
            "speaker": speaker
        })

    # Merge transcription and speaker turns
    final_output = []
    for seg in segments:
        seg_start = seg.start
        seg_end = seg.end
        text = seg.text.strip()
        # Take the speaker whose turn contains the start of the segment
        speaker = "Inconnu"
        for s in speakers:
            if s["start"] <= seg_start <= s["end"]:
                speaker = s["speaker"]
                break
        final_output.append({
            "start": seg_start,
            "end": seg_end,
            "speaker": speaker,
            "text": text
        })

    df = pd.DataFrame(final_output)
    txt_lines = [f"[{row['start']:.2f}s - {row['end']:.2f}s] {row['speaker']} : {row['text']}" for _, row in df.iterrows()]
    txt_output = "\n".join(txt_lines)

    txt_path = tempfile.mktemp(suffix=".txt")
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(txt_output)

    csv_path = tempfile.mktemp(suffix=".csv")
    df.to_csv(csv_path, index=False)

    return txt_output, csv_path, txt_path


# Gradio interface
gr.Interface(
    fn=transcribe_and_diarize,
    inputs=gr.Audio(type="filepath", label="Fichier audio MP3"),
    outputs=[
        gr.Textbox(label="Transcription avec locuteurs"),
        gr.File(label="Télécharger le CSV"),
        gr.File(label="Télécharger le TXT")
    ],
    title="Transcription + Diarisation (FR)",
    description="Charge un fichier MP3. Transcription FR + séparation des locuteurs + export CSV/TXT."
).launch()
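
# Deployment note (an assumption about the setup, not taken from the original repo):
# the Space would also need a requirements.txt listing gradio, faster-whisper,
# pyannote.audio, pydub and pandas, plus ffmpeg available on the system (e.g. via
# packages.txt) so pydub can decode MP3. pyannote/speaker-diarization-3.1 is a gated
# model, so HF_TOKEN must belong to an account that has accepted its access terms.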