Spaces:

sarahai
/

diar

Running

App Files Files Community

sarahai committed on Oct 7, 2025

Commit

d5bf736

verified ·

1 Parent(s): 3145737

Create app.py

Browse files

Files changed (1) hide show

app.py +132 -0

app.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import os
+import wget
+import torch
+import streamlit as st
+from omegaconf import OmegaConf
+import nemo.collections.asr as nemo_asr
+import json
+import soundfile as sf
+from torchaudio.transforms import Resample
+import shutil
+# --- 1. SETUP & CONFIGURATION ---
+# Speaker count the UI passes to diarize_audio.
+NUM_SPEAKERS = 2
+# Scratch directory under the current working directory; it holds the
+# downloaded config, the uploaded audio, the manifests and the diarizer output.
+TEMP_DIR = os.path.join(os.getcwd(), "temp_streamlit_output")
+os.makedirs(TEMP_DIR, exist_ok=True)
+@st.cache_resource
+def load_models():
+    """
+    Load the diarization config and the Silero VAD model once and cache them.
+
+    The NeMo telephonic inference YAML is downloaded into TEMP_DIR the first
+    time it is missing and read from disk afterwards. Because the function is
+    wrapped in ``st.cache_resource``, later calls get the very same objects
+    back, so any in-place change to the returned config is visible to every
+    later caller.
+
+    Returns:
+        tuple: ``(cfg, vad_model, get_speech_timestamps_func)`` -- the config
+        loaded with OmegaConf, the Silero VAD model, and the first helper of
+        the ``utils`` tuple returned by ``torch.hub.load``.
+    """
+    # Load the official NeMo config file
+    # NOTE(review): the URL follows the `main` branch, so the file can change
+    # or move upstream; consider pinning a release tag.
+    CONFIG_URL = "https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml"
+    CONFIG_PATH = os.path.join(TEMP_DIR, "diar_infer_telephonic.yaml")
+    if not os.path.exists(CONFIG_PATH):
+        # Name the output file explicitly so the file that is checked, the
+        # file that is downloaded and the file that is loaded are the same.
+        wget.download(CONFIG_URL, CONFIG_PATH)
+    cfg = OmegaConf.load(CONFIG_PATH)
+    # Load Silero VAD model
+    vad_model, utils = torch.hub.load(
+        repo_or_dir='snakers4/silero-vad',
+        model='silero_vad',
+        force_reload=False,
+        onnx=True,
+    )
+    get_speech_timestamps_func = utils[0]
+    return cfg, vad_model, get_speech_timestamps_func
+# --- 2. HELPER FUNCTIONS (from our previous script) ---
+def run_silero_vad(audio_path, vad_model, get_speech_timestamps_func):
+    """
+    Run voice-activity detection on an audio file at 16 kHz.
+
+    The file is read with soundfile, averaged along axis 1 when the data has
+    more than one dimension (presumably the channel axis), converted to a
+    float32 tensor and resampled to 16 kHz if its rate differs.
+
+    Args:
+        audio_path: Path of the audio file to analyse.
+        vad_model: Model object forwarded to ``get_speech_timestamps_func``.
+        get_speech_timestamps_func: Callable invoked as
+            ``func(waveform, vad_model, sampling_rate=16000)``.
+
+    Returns:
+        Whatever ``get_speech_timestamps_func`` returns.
+    """
+    target_rate = 16000
+    samples, source_rate = sf.read(audio_path)
+    if samples.ndim > 1:
+        samples = samples.mean(axis=1)
+    waveform = torch.tensor(samples, dtype=torch.float32)
+    if source_rate != target_rate:
+        to_target_rate = Resample(orig_freq=source_rate, new_freq=target_rate)
+        waveform = to_target_rate(waveform)
+    return get_speech_timestamps_func(waveform, vad_model, sampling_rate=target_rate)
+def write_vad_manifest(timestamps, audio_path, manifest_path, sampling_rate=16000):
+    """
+    Write speech segments to a JSON-lines manifest, one segment per line.
+
+    Args:
+        timestamps: Iterable of dicts with ``'start'`` and ``'end'`` given as
+            sample indices.
+        audio_path: Value stored under ``'audio_filepath'`` in every entry.
+        manifest_path: Destination file; it is overwritten if it exists.
+        sampling_rate: Sample rate the indices refer to. Defaults to 16000,
+            the rate run_silero_vad analyses audio at.
+
+    Raises:
+        ValueError: If ``sampling_rate`` is not positive.
+    """
+    if sampling_rate <= 0:
+        raise ValueError(f"sampling_rate must be positive, got {sampling_rate!r}")
+    rate = float(sampling_rate)
+    with open(manifest_path, 'w') as f:
+        for ts in timestamps:
+            # Convert sample indices to seconds.
+            entry = {
+                'audio_filepath': audio_path,
+                'offset': ts['start'] / rate,
+                'duration': (ts['end'] - ts['start']) / rate,
+            }
+            f.write(json.dumps(entry) + '\n')
+def format_rttm_labels(input_rttm, output_rttm):
+    """
+    Copy an RTTM file, rewriting ``speaker_<n>`` labels as ``SPEAKER_<nn>``.
+
+    The number is zero-padded to at least two digits (``speaker_3`` becomes
+    ``SPEAKER_03``, ``speaker_12`` becomes ``SPEAKER_12``). All other text is
+    copied unchanged.
+
+    Args:
+        input_rttm: Path of the RTTM file to read.
+        output_rttm: Path of the RTTM file to write; overwritten if present.
+    """
+    import re
+
+    # Match the whole number in one go. Replacing "speaker_1" before
+    # "speaker_10" would turn the latter into "SPEAKER_010".
+    label_pattern = re.compile(r"speaker_(\d+)")
+
+    def to_standard_label(match):
+        return f"SPEAKER_{int(match.group(1)):02d}"
+
+    with open(input_rttm, 'r') as infile, open(output_rttm, 'w') as outfile:
+        for line in infile:
+            outfile.write(label_pattern.sub(to_standard_label, line))
+# --- 3. MAIN DIARIZATION LOGIC ---
+def diarize_audio(audio_file_path, num_speakers, cfg, vad_model, get_speech_timestamps_func):
+    """
+    Diarize one audio file with NeMo's ClusteringDiarizer.
+
+    Manifests and diarizer output are written under TEMP_DIR. The RTTM that
+    the diarizer leaves in ``pred_rttms/<file_id>.rttm`` is passed through
+    format_rttm_labels and the path of that formatted copy is returned.
+
+    Args:
+        audio_file_path: Path of the audio file to diarize.
+        num_speakers: Number of speakers to force, or ``None`` to let the
+            clusterer estimate it.
+        cfg: Diarization config; it is copied, never modified in place.
+        vad_model: Silero VAD model passed to run_silero_vad.
+        get_speech_timestamps_func: Silero helper passed to run_silero_vad.
+
+    Returns:
+        str: Path of ``<file_id>_formatted.rttm`` inside TEMP_DIR.
+    """
+    import copy
+
+    # Work on a private copy: the UI passes in the object cached by
+    # st.cache_resource, which is shared between reruns and sessions.
+    cfg = copy.deepcopy(cfg)
+    # Modify the config with our parameters
+    cfg.diarizer.manifest_filepath = os.path.join(TEMP_DIR, "input_manifest.json")
+    cfg.diarizer.out_dir = TEMP_DIR
+    cfg.diarizer.speaker_embeddings.model_path = "titanet_large"
+    cfg.diarizer.msdd_model.model_path = None
+    # NeMo applies a fixed speaker count only when oracle_num_speakers is on
+    # and the count is given in the manifest entry; a `num_speakers` key under
+    # clustering.parameters is not one of its settings.
+    fixed_count = None if num_speakers is None else int(num_speakers)
+    cfg.diarizer.clustering.parameters.oracle_num_speakers = fixed_count is not None
+    # Prepare VAD output
+    vad_timestamps = run_silero_vad(audio_file_path, vad_model, get_speech_timestamps_func)
+    vad_manifest_path = os.path.join(TEMP_DIR, "vad_outputs.json")
+    write_vad_manifest(vad_timestamps, audio_file_path, vad_manifest_path)
+    # Prepare main manifest
+    # NOTE(review): 'vad_filepath' is not among the manifest keys documented
+    # for NeMo diarization, so the Silero output may go unused unless
+    # diarizer.vad.external_vad_manifest is pointed at it -- confirm.
+    meta = {
+        'audio_filepath': audio_file_path,
+        'offset': 0,
+        'duration': None,
+        'label': 'infer',
+        'text': '-',
+        'num_speakers': fixed_count,
+        'vad_filepath': vad_manifest_path,
+    }
+    with open(cfg.diarizer.manifest_filepath, "w") as f:
+        f.write(json.dumps(meta) + '\n')
+    # Initialize and run diarizer
+    diarizer = nemo_asr.models.ClusteringDiarizer(cfg=cfg)
+    diarizer.diarize()
+    # Format and return the path to the final RTTM
+    file_id = os.path.splitext(os.path.basename(audio_file_path))[0]
+    raw_rttm_path = os.path.join(TEMP_DIR, "pred_rttms", f"{file_id}.rttm")
+    final_rttm_path = os.path.join(TEMP_DIR, f"{file_id}_formatted.rttm")
+    format_rttm_labels(raw_rttm_path, final_rttm_path)
+    return final_rttm_path
+# --- 4. STREAMLIT UI ---
+st.set_page_config(layout="wide")
+st.title("🗣️ Speaker Diarization Tool")
+st.write("Upload an audio file (.wav, .mp3, .flac) and the model will determine who spoke when.")
+# Load models once
+cfg, vad_model, get_speech_timestamps_func = load_models()
+uploaded_file = st.file_uploader("Choose an audio file...", type=["wav", "mp3", "flac"])
+if uploaded_file is not None:
+    # Use the MIME type reported for the upload so mp3/flac files are not
+    # announced to the player as WAV.
+    st.audio(uploaded_file, format=uploaded_file.type or 'audio/wav')
+    if st.button("Diarize Audio"):
+        with st.spinner('Processing... This may take a moment.'):
+            # Save uploaded file to a temporary path. Only the final path
+            # component of the client-supplied name is used so the file
+            # cannot be written outside TEMP_DIR.
+            safe_name = os.path.basename(uploaded_file.name)
+            temp_audio_path = os.path.join(TEMP_DIR, safe_name)
+            with open(temp_audio_path, "wb") as f:
+                f.write(uploaded_file.getbuffer())
+            # Run diarization
+            final_rttm_path = diarize_audio(temp_audio_path, NUM_SPEAKERS, cfg, vad_model, get_speech_timestamps_func)
+            # Read and display the RTTM content
+            with open(final_rttm_path, 'r') as f:
+                rttm_content = f.read()
+            st.success("Diarization complete!")
+            st.text_area("RTTM Output", rttm_content, height=300)
+            # Add a download button
+            st.download_button(
+                label="Download RTTM file",
+                data=rttm_content,
+                file_name=os.path.basename(final_rttm_path),
+                mime='text/plain',
+            )