Spaces:
Sleeping
Sleeping
Upload 18 files
Browse files- .gitattributes +4 -0
- app.py +179 -0
- outputs/Screenshot 2026-02-03 034427.png +3 -0
- outputs/Screenshot 2026-02-03 051514.png +3 -0
- outputs/Screenshot 2026-02-03 061911.png +0 -0
- outputs/Screenshot 2026-02-03 154131.png +3 -0
- requirements.txt +0 -0
- scripts/audioconversion.ipynb +202 -0
- scripts/check.py +9 -0
- scripts/check_rttm.py +24 -0
- scripts/diarization.ipynb +0 -0
- scripts/diarization_visualization.py +64 -0
- scripts/make_splits.py +49 -0
- scripts/run.py +116 -0
- scripts/segmentation.py +73 -0
- scripts/test_model.py +47 -0
- scripts/test_protocol.py +56 -0
- scripts/visualize_segmentation.py +72 -0
- test/audio2.wav +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
outputs/Screenshot[[:space:]]2026-02-03[[:space:]]034427.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
outputs/Screenshot[[:space:]]2026-02-03[[:space:]]051514.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
outputs/Screenshot[[:space:]]2026-02-03[[:space:]]154131.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
test/audio2.wav filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
import torch
|
| 4 |
+
import numpy as np
|
| 5 |
+
import librosa
|
| 6 |
+
import soundfile as sf
|
| 7 |
+
import noisereduce as nr
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import matplotlib.pyplot as plt
|
| 10 |
+
from pyannote.audio import Model, Inference
|
| 11 |
+
from pyannote.audio.utils.signal import Binarize
|
| 12 |
+
from pyannote.core import SlidingWindowFeature, Annotation
|
| 13 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 14 |
+
from sklearn.metrics import silhouette_score
|
| 15 |
+
|
| 16 |
+
# --- 1. PYTORCH 2.6+ SECURITY FIX ---
# PyTorch >= 2.6 defaults torch.load(weights_only=True), which rejects the
# pickled Python objects inside pyannote/lightning checkpoints. Wrap
# torch.load so every call explicitly opts back into full unpickling.
# WARNING: weights_only=False executes arbitrary pickle code — only load
# checkpoints from trusted sources.
import torch.serialization

# FIX: Streamlit re-executes this script on every UI interaction; without
# this guard each rerun wrapped the previous wrapper, nesting forced_load
# one level deeper per rerun. Patch exactly once.
if not getattr(torch.load, "_forced_weights_only_patch", False):
    original_load = torch.load

    def forced_load(f, map_location=None, pickle_module=None, **kwargs):
        kwargs['weights_only'] = False
        return original_load(f, map_location=map_location, pickle_module=pickle_module, **kwargs)

    forced_load._forced_weights_only_patch = True
    torch.load = forced_load
# -------------------------------
|
| 24 |
+
|
| 25 |
+
# Page chrome: the wide layout suits the timeline plot and the results table.
st.set_page_config(page_title="Hindi-Bhojpuri Diarization Tool", layout="wide")

st.title("🎙️ Speaker Diarization with De-noising")
st.markdown("""
This tool uses a fine-tuned model to detect speakers.
The system automatically determines the number of speakers based on voice similarity.
""")

# --- SIDEBAR CONFIGURATION (UI CLEANUP) ---
st.sidebar.header("Configuration")
# Path of the fine-tuned segmentation checkpoint; editable so other
# training runs can be pointed at without code changes.
MODEL_PATH = st.sidebar.text_input("Model Checkpoint Path", "training_results/lightning_logs/version_2/checkpoints/epoch=4-step=2960.ckpt")
use_denoise = st.sidebar.checkbox("Enable De-noising", value=True)

st.sidebar.subheader("Advanced Settings")
# Forwarded to process_audio as its `sensitivity` argument (VAD threshold).
threshold = st.sidebar.slider("AI Sensitivity (VAD)", 0.5, 0.95, 0.80)
|
| 40 |
+
|
| 41 |
+
@st.cache_resource
def load_cached_model(path):
    """Return the pyannote Model at *path*, or None when the file is absent.

    Decorated with st.cache_resource so the checkpoint is deserialised once
    per session instead of on every Streamlit rerun.
    """
    model = Model.from_pretrained(path) if os.path.exists(path) else None
    return model
|
| 46 |
+
|
| 47 |
+
def process_audio(audio_path, model_path, denoise, sensitivity):
    """Run the full diarization chain on one uploaded file.

    Steps: load/resample -> PCM_16 conversion -> optional de-noising ->
    neural segmentation -> binarization -> embedding clustering with an
    automatically chosen speaker count.

    Parameters:
        audio_path (str): path of the uploaded .wav file.
        model_path (str): checkpoint path of the fine-tuned pyannote model.
        denoise (bool): apply spectral noise reduction before inference.
        sensitivity (float): VAD onset threshold from the sidebar slider
            (higher = stricter speech detection).

    Returns:
        tuple: (merged Annotation hypothesis, path of the audio actually
        analysed), or (None, None) when the checkpoint is missing.
    """
    # 1. Load Audio at the 16 kHz mono rate the segmentation model expects.
    y, sr = librosa.load(audio_path, sr=16000)

    # CONVERSION STEP: explicitly write a standard PCM_16 .wav at 16 kHz so
    # the model always receives a uniform format regardless of the upload.
    audio_input = "converted_audio.wav"
    sf.write(audio_input, y, sr, subtype='PCM_16')

    # 2. AGGRESSIVE DE-NOISING (prop_decrease=0.90 suppresses heavy
    #    background noise at the cost of some speech colouration).
    if denoise:
        with st.spinner("Step 1: Deep cleaning audio..."):
            y = nr.reduce_noise(y=y, sr=sr, prop_decrease=0.90, n_fft=2048)
            audio_input = "temp_denoised.wav"
            sf.write(audio_input, y, sr)
    # FIX: the non-denoise branch previously fed the *raw upload* back to the
    # model, bypassing the PCM_16 conversion written above; the converted
    # file is now always used.

    # 3. AI Inference
    with st.spinner("Step 2: AI Neural Analysis..."):
        model = load_cached_model(model_path)
        if model is None:
            return None, None

        inference = Inference(model, window="sliding", duration=2.0, step=0.5)
        seg_output = inference(audio_input)

        # Collapse any trailing axis so scores are 2-D (frames x classes).
        data = np.squeeze(seg_output.data)
        if len(data.shape) == 3:
            data = data[:, :, 0]
        clean_scores = SlidingWindowFeature(data, seg_output.sliding_window)

        # 4. BINARIZATION
        # FIX: the "AI Sensitivity (VAD)" slider value was accepted but never
        # used (onset was hard-coded to 0.85). It now drives the onset, with
        # the offset tracking 0.10 below it (mirroring the old 0.85/0.75
        # gap). min_duration_on=1.2 s ignores short noises/coughs/clicks
        # that previously inflated the speaker count.
        binarize = Binarize(
            onset=sensitivity,
            offset=max(sensitivity - 0.10, 0.0),
            min_duration_on=1.2,
            min_duration_off=0.5,
        )
        raw_hyp = binarize(clean_scores)

        # 5. FEATURE EXTRACTION: one mean score vector per detected segment,
        # used as a crude voiceprint for clustering.
        embeddings = []
        segments = []
        for segment, track, label in raw_hyp.itertracks(yield_label=True):
            feature_vector = np.mean(seg_output.crop(segment).data, axis=0).flatten()
            embeddings.append(feature_vector)
            segments.append(segment)

        final_hyp = Annotation()

        if len(embeddings) > 1:
            X = np.array(embeddings)

            # --- AUTO-DETECTION LOGIC ---
            # Choose the cluster count (2..5) with the best silhouette score;
            # fall back to 2 speakers when scoring is impossible.
            best_n = 2
            try:
                range_n = range(2, min(len(embeddings), 6))
                scores = []
                for n in range_n:
                    clusterer = AgglomerativeClustering(n_clusters=n, metric='euclidean', linkage='ward')
                    labels = clusterer.fit_predict(X)
                    scores.append(silhouette_score(X, labels))
                # FIX: with exactly 2 segments, range(2, 2) is empty and
                # np.argmax([]) would raise; previously only the bare except
                # masked this. Handle it explicitly.
                if scores:
                    best_n = range_n[int(np.argmax(scores))]
            except Exception:  # FIX: bare except also swallowed KeyboardInterrupt
                best_n = 2  # Safe default for OJT demo

            clusterer = AgglomerativeClustering(n_clusters=best_n, metric='euclidean', linkage='ward')
            final_labels = clusterer.fit_predict(X)

            for i, segment in enumerate(segments):
                final_hyp[segment] = f"Speaker {final_labels[i]}"

        elif len(embeddings) == 1:
            final_hyp[segments[0]] = "Speaker 0"

    # .support() is CRITICAL: it merges small gaps of the same speaker.
    return final_hyp.support(), audio_input
|
| 122 |
+
|
| 123 |
+
# --- MAIN UI ---
uploaded_file = st.file_uploader("Upload .wav file", type=["wav"])

if uploaded_file is not None:
    # Persist the upload so the loader/model can read it from disk.
    with open("temp_upload.wav", "wb") as f:
        f.write(uploaded_file.getbuffer())

    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Original Audio")
        st.audio("temp_upload.wav")

    if st.button("Start AI Analysis"):
        hyp, final_audio = process_audio("temp_upload.wav", MODEL_PATH, use_denoise, threshold)

        if hyp is None:
            st.error("Model not found!")
        else:
            with col2:
                if use_denoise:
                    st.subheader("Denoised Version")
                    st.audio(final_audio)

            st.divider()

            unique_speakers = sorted(hyp.labels())
            st.subheader(f"📊 Speaker Timeline ({len(unique_speakers)} Speakers Detected)")

            if len(unique_speakers) > 0:
                # One horizontal lane per speaker on a broken-bar timeline.
                fig, ax = plt.subplots(figsize=(12, len(unique_speakers) * 0.8 + 1.5))
                colors = plt.cm.get_cmap('tab10', len(unique_speakers))

                for row, speaker in enumerate(unique_speakers):
                    lane = [(seg.start, seg.duration) for seg in hyp.label_timeline(speaker)]
                    ax.broken_barh(lane, (row * 10 + 2, 6), facecolors=colors(row))

                ax.set_yticks([row * 10 + 5 for row in range(len(unique_speakers))])
                ax.set_yticklabels(unique_speakers)
                ax.set_xlabel("Time (seconds)")
                ax.grid(axis='x', linestyle='--', alpha=0.5)
                st.pyplot(fig)

                # Tabular view of every (segment, speaker) pair.
                timestamp_list = [
                    {
                        "Speaker ID": label,
                        "Start (s)": round(segment.start, 2),
                        "End (s)": round(segment.end, 2),
                        "Duration (s)": round(segment.duration, 2),
                    }
                    for segment, _track, label in hyp.itertracks(yield_label=True)
                ]

                df = pd.DataFrame(timestamp_list)
                st.dataframe(df, use_container_width=True)
                st.download_button("📩 Download CSV", df.to_csv(index=False).encode('utf-8'), "diarization.csv", "text/csv")
            else:
                st.warning("No speech detected.")
|
outputs/Screenshot 2026-02-03 034427.png
ADDED
|
Git LFS Details
|
outputs/Screenshot 2026-02-03 051514.png
ADDED
|
Git LFS Details
|
outputs/Screenshot 2026-02-03 061911.png
ADDED
|
outputs/Screenshot 2026-02-03 154131.png
ADDED
|
Git LFS Details
|
requirements.txt
CHANGED
|
Binary files a/requirements.txt and b/requirements.txt differ
|
|
|
scripts/audioconversion.ipynb
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "python3",
|
| 10 |
+
"display_name": "Python 3"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "python"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"source": [
|
| 20 |
+
"!pip install yt-dlp pydub ffmpeg-python\n",
|
| 21 |
+
"!apt-get install ffmpeg -y # For Colab, to make pydub work"
|
| 22 |
+
],
|
| 23 |
+
"metadata": {
|
| 24 |
+
"colab": {
|
| 25 |
+
"base_uri": "https://localhost:8080/"
|
| 26 |
+
},
|
| 27 |
+
"id": "QURofI_GiSTK",
|
| 28 |
+
"outputId": "7c46dcd6-d49c-4172-bf27-d9d408352cdf"
|
| 29 |
+
},
|
| 30 |
+
"execution_count": null,
|
| 31 |
+
"outputs": [
|
| 32 |
+
{
|
| 33 |
+
"output_type": "stream",
|
| 34 |
+
"name": "stdout",
|
| 35 |
+
"text": [
|
| 36 |
+
"Collecting yt-dlp\n",
|
| 37 |
+
" Downloading yt_dlp-2025.12.8-py3-none-any.whl.metadata (180 kB)\n",
|
| 38 |
+
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/180.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m180.3/180.3 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 39 |
+
"\u001b[?25hRequirement already satisfied: pydub in /usr/local/lib/python3.12/dist-packages (0.25.1)\n",
|
| 40 |
+
"Collecting ffmpeg-python\n",
|
| 41 |
+
" Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)\n",
|
| 42 |
+
"Requirement already satisfied: future in /usr/local/lib/python3.12/dist-packages (from ffmpeg-python) (1.0.0)\n",
|
| 43 |
+
"Downloading yt_dlp-2025.12.8-py3-none-any.whl (3.3 MB)\n",
|
| 44 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.3/3.3 MB\u001b[0m \u001b[31m55.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 45 |
+
"\u001b[?25hDownloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)\n",
|
| 46 |
+
"Installing collected packages: yt-dlp, ffmpeg-python\n",
|
| 47 |
+
"Successfully installed ffmpeg-python-0.2.0 yt-dlp-2025.12.8\n",
|
| 48 |
+
"Reading package lists... Done\n",
|
| 49 |
+
"Building dependency tree... Done\n",
|
| 50 |
+
"Reading state information... Done\n",
|
| 51 |
+
"ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).\n",
|
| 52 |
+
"0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.\n"
|
| 53 |
+
]
|
| 54 |
+
}
|
| 55 |
+
]
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"cell_type": "code",
|
| 59 |
+
"source": [
|
| 60 |
+
"import os\n",
|
| 61 |
+
"from pydub import AudioSegment\n",
|
| 62 |
+
"import yt_dlp\n",
|
| 63 |
+
"\n",
|
| 64 |
+
"# -------------------------------\n",
|
| 65 |
+
"# 1️⃣ YouTube video URL\n",
|
| 66 |
+
"# -------------------------------\n",
|
| 67 |
+
"url = \"https://youtu.be/uvOF0qn_r_0?si=-Zd4-p22-bjgEAWT\" # Replace VIDEO_ID with your YouTube link\n",
|
| 68 |
+
"\n",
|
| 69 |
+
"# -------------------------------\n",
|
| 70 |
+
"# 2️⃣ Download audio using yt-dlp\n",
|
| 71 |
+
"# -------------------------------\n",
|
| 72 |
+
"ydl_opts = {\n",
|
| 73 |
+
" 'format': 'bestaudio/best',\n",
|
| 74 |
+
" 'outtmpl': 'video_audio.%(ext)s',\n",
|
| 75 |
+
" 'postprocessors': [{\n",
|
| 76 |
+
" 'key': 'FFmpegExtractAudio',\n",
|
| 77 |
+
" 'preferredcodec': 'mp3', # download as mp3 first\n",
|
| 78 |
+
" 'preferredquality': '192',\n",
|
| 79 |
+
" }],\n",
|
| 80 |
+
"}\n",
|
| 81 |
+
"\n",
|
| 82 |
+
"print(\"Downloading audio from YouTube...\")\n",
|
| 83 |
+
"with yt_dlp.YoutubeDL(ydl_opts) as ydl:\n",
|
| 84 |
+
" ydl.download([url])\n",
|
| 85 |
+
"\n",
|
| 86 |
+
"# Find downloaded file\n",
|
| 87 |
+
"for file in os.listdir():\n",
|
| 88 |
+
" if file.startswith(\"video_audio\") and file.endswith(\".mp3\"):\n",
|
| 89 |
+
" audio_file = file\n",
|
| 90 |
+
" break\n",
|
| 91 |
+
"\n",
|
| 92 |
+
"print(\"Downloaded:\", audio_file)\n",
|
| 93 |
+
"\n",
|
| 94 |
+
"# -------------------------------\n",
|
| 95 |
+
"# 3️⃣ Convert audio to WAV (16kHz, mono)\n",
|
| 96 |
+
"# -------------------------------\n",
|
| 97 |
+
"audio = AudioSegment.from_file(audio_file)\n",
|
| 98 |
+
"audio = audio.set_channels(1) # mono\n",
|
| 99 |
+
"audio = audio.set_frame_rate(16000) # 16 kHz\n",
|
| 100 |
+
"wav_filename = \"znmd.wav\"\n",
|
| 101 |
+
"audio.export(wav_filename, format=\"wav\")\n",
|
| 102 |
+
"print(\"Converted to WAV:\", wav_filename)\n",
|
| 103 |
+
"\n",
|
| 104 |
+
"# -------------------------------\n",
|
| 105 |
+
"# 4️⃣ Split WAV into 1-minute chunks\n",
|
| 106 |
+
"# -------------------------------\n",
|
| 107 |
+
"chunk_length_ms = 60 * 1000 # 1 minute\n",
|
| 108 |
+
"for i, start in enumerate(range(0, len(audio), chunk_length_ms)):\n",
|
| 109 |
+
" chunk = audio[start:start+chunk_length_ms]\n",
|
| 110 |
+
" chunk_filename = f\"ZNMD_chunk_{i}.wav\"\n",
|
| 111 |
+
" chunk.export(chunk_filename, format=\"wav\")\n",
|
| 112 |
+
" print(\"Saved chunk:\", chunk_filename)\n",
|
| 113 |
+
"\n",
|
| 114 |
+
"print(\"All steps completed! Your audio is ready for diarization.\")\n"
|
| 115 |
+
],
|
| 116 |
+
"metadata": {
|
| 117 |
+
"id": "oPEdKqgLLUEw",
|
| 118 |
+
"colab": {
|
| 119 |
+
"base_uri": "https://localhost:8080/"
|
| 120 |
+
},
|
| 121 |
+
"outputId": "0c940970-dd15-40f4-9bfb-9fb18120d879"
|
| 122 |
+
},
|
| 123 |
+
"execution_count": null,
|
| 124 |
+
"outputs": [
|
| 125 |
+
{
|
| 126 |
+
"output_type": "stream",
|
| 127 |
+
"name": "stdout",
|
| 128 |
+
"text": [
|
| 129 |
+
"Downloading audio from YouTube...\n",
|
| 130 |
+
"[youtube] Extracting URL: https://youtu.be/uvOF0qn_r_0?si=-Zd4-p22-bjgEAWT\n",
|
| 131 |
+
"[youtube] uvOF0qn_r_0: Downloading webpage\n"
|
| 132 |
+
]
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"output_type": "stream",
|
| 136 |
+
"name": "stderr",
|
| 137 |
+
"text": [
|
| 138 |
+
"WARNING: [youtube] No supported JavaScript runtime could be found. Only deno is enabled by default; to use another runtime add --js-runtimes RUNTIME[:PATH] to your command/config. YouTube extraction without a JS runtime has been deprecated, and some formats may be missing. See https://github.com/yt-dlp/yt-dlp/wiki/EJS for details on installing one\n"
|
| 139 |
+
]
|
| 140 |
+
},
|
| 141 |
+
{
|
| 142 |
+
"output_type": "stream",
|
| 143 |
+
"name": "stdout",
|
| 144 |
+
"text": [
|
| 145 |
+
"[youtube] uvOF0qn_r_0: Downloading android sdkless player API JSON\n",
|
| 146 |
+
"[youtube] uvOF0qn_r_0: Downloading web safari player API JSON\n"
|
| 147 |
+
]
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"output_type": "stream",
|
| 151 |
+
"name": "stderr",
|
| 152 |
+
"text": [
|
| 153 |
+
"WARNING: [youtube] uvOF0qn_r_0: Some web_safari client https formats have been skipped as they are missing a url. YouTube is forcing SABR streaming for this client. See https://github.com/yt-dlp/yt-dlp/issues/12482 for more details\n"
|
| 154 |
+
]
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"output_type": "stream",
|
| 158 |
+
"name": "stdout",
|
| 159 |
+
"text": [
|
| 160 |
+
"[youtube] uvOF0qn_r_0: Downloading m3u8 information\n"
|
| 161 |
+
]
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"output_type": "stream",
|
| 165 |
+
"name": "stderr",
|
| 166 |
+
"text": [
|
| 167 |
+
"WARNING: [youtube] uvOF0qn_r_0: Some web client https formats have been skipped as they are missing a url. YouTube is forcing SABR streaming for this client. See https://github.com/yt-dlp/yt-dlp/issues/12482 for more details\n"
|
| 168 |
+
]
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"output_type": "stream",
|
| 172 |
+
"name": "stdout",
|
| 173 |
+
"text": [
|
| 174 |
+
"[info] uvOF0qn_r_0: Downloading 1 format(s): 251\n",
|
| 175 |
+
"[download] Destination: video_audio.webm\n",
|
| 176 |
+
"[download] 100% of 3.73MiB in 00:00:00 at 13.24MiB/s \n",
|
| 177 |
+
"[ExtractAudio] Destination: video_audio.mp3\n",
|
| 178 |
+
"Deleting original file video_audio.webm (pass -k to keep)\n",
|
| 179 |
+
"Downloaded: video_audio.mp3\n",
|
| 180 |
+
"Converted to WAV: znmd.wav\n",
|
| 181 |
+
"Saved chunk: ZNMD_chunk_0.wav\n",
|
| 182 |
+
"Saved chunk: ZNMD_chunk_1.wav\n",
|
| 183 |
+
"Saved chunk: ZNMD_chunk_2.wav\n",
|
| 184 |
+
"Saved chunk: ZNMD_chunk_3.wav\n",
|
| 185 |
+
"Saved chunk: ZNMD_chunk_4.wav\n",
|
| 186 |
+
"Saved chunk: ZNMD_chunk_5.wav\n",
|
| 187 |
+
"All steps completed! Your audio is ready for diarization.\n"
|
| 188 |
+
]
|
| 189 |
+
}
|
| 190 |
+
]
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"cell_type": "code",
|
| 194 |
+
"source": [],
|
| 195 |
+
"metadata": {
|
| 196 |
+
"id": "UXUHD4sTIjhe"
|
| 197 |
+
},
|
| 198 |
+
"execution_count": null,
|
| 199 |
+
"outputs": []
|
| 200 |
+
}
|
| 201 |
+
]
|
| 202 |
+
}
|
scripts/check.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment probe: checks only whether the installed pyannote.audio
# version exposes the Segmentation *task class* (used later for training).
# Segmentation is just a class name (a tool), not a model, not training.

try:
    from pyannote.audio.tasks import Segmentation
    print("Success! Segmentation task imported.")
except ImportError as e:
    print(f"Still failing: {e}")
|
scripts/check_rttm.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

# Sanity-check one RTTM annotation file: it must exist on disk, and the URI
# stored in column 2 of its first line must match the expected basename —
# a mismatch makes pyannote's protocol silently drop the annotation.
uri = "hindi_chunk_22"
rttm_path = f"dataset/rttm/{uri}.rttm"

print(f"Checking for RTTM at: {os.path.abspath(rttm_path)}")

if not os.path.exists(rttm_path):
    print("RTTM file NOT found at that path!")
else:
    print("RTTM file exists!")
    with open(rttm_path, 'r') as f:
        first_line = f.readline()
    print(f"First line of RTTM: {first_line.strip()}")

    parts = first_line.split()
    if len(parts) > 1:
        rttm_uri = parts[1]
        if rttm_uri == uri:
            print(f"URI Match: '{rttm_uri}' matches '{uri}'")
        else:
            print(f"URI MISMATCH: RTTM says '{rttm_uri}' but protocol expects '{uri}'")
|
scripts/diarization.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
scripts/diarization_visualization.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import torch
import matplotlib.pyplot as plt
from pyannote.metrics.diarization import DiarizationErrorRate

# THE ULTIMATE BYPASS (Fixes PyTorch 2.6 security errors)
# PyTorch >= 2.6 defaults torch.load(weights_only=True), which rejects the
# pickled objects inside pyannote checkpoints; force full unpickling.
# WARNING: weights_only=False executes arbitrary pickle code — trusted
# checkpoints only.
import torch.serialization
original_load = torch.load
def patched_load(*args, **kwargs):
    kwargs['weights_only'] = False
    return original_load(*args, **kwargs)
torch.load = patched_load

# IMPORTS
from pyannote.core import notebook
from pyannote.audio import Pipeline
from pyannote.database.util import load_rttm

# Clip to diarize and its hand-labelled reference annotation.
AUDIO_PATH = r"dataset/audio/clip_03.wav"
RTTM_PATH = r"dataset/rttm/clip_03.rttm"

# INITIALIZE PIPELINE
print("Initializing AI Pipeline...")
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="hf_token_here"  # Replace with your Hugging Face token
)

# --- RUN DIARIZATION ---
print("AI is analyzing the audio...")
prediction = pipeline(AUDIO_PATH)

# --- LOAD GROUND TRUTH ---
# load_rttm returns {uri: Annotation}; this file holds a single entry.
gt_dict = load_rttm(RTTM_PATH)
uri = list(gt_dict.keys())[0]
ground_truth = gt_dict[uri]

# --- FIXED: CALCULATE DER USING REPORT ---
metric = DiarizationErrorRate()
# We process the specific file to get a clean report
# NOTE(review): `notebook=True` is not a documented DiarizationErrorRate
# keyword — confirm it is accepted rather than raising a TypeError.
metric(ground_truth, prediction, notebook=True)
report = metric.report(display=True)

print("\n" + "="*50)
print("FINAL EVALUATION REPORT")
print(report)
print("="*50 + "\n")

## --- VISUALIZATION (UNCHANGED) ---
# Reference on top, model prediction below, sharing one time axis so
# speaker-boundary disagreements line up visually.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 8), sharex=True)

plt.sca(ax1)
notebook.plot_annotation(ground_truth, ax=ax1)
ax1.set_title("REFERENCE (Ground Truth)", fontsize=14, fontweight='bold')

plt.sca(ax2)
notebook.plot_annotation(prediction, ax=ax2)
ax2.set_title("HYPOTHESIS (Model Prediction)", fontsize=14, fontweight='bold')

plt.xlabel("Time (seconds)", fontsize=12)
plt.tight_layout()

print("Diarization complete! Displaying plot...")
plt.show()
|
scripts/make_splits.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Divide the audio dataset into train / dev / test URI lists for pyannote."""

import os
import random

# Path to audio folder
audio_dir = "dataset/audio"

# Every .wav basename (extension stripped) becomes a URI.
uris = [f.replace(".wav", "") for f in os.listdir(audio_dir) if f.endswith(".wav")]

# Safety check — the corpus is expected to contain exactly 89 clips.
if len(uris) != 89:
    print(f"Warning: expected 89 files, found {len(uris)}")

# Fixed seed keeps the shuffle (and therefore the splits) reproducible.
random.seed(42)
random.shuffle(uris)

# 71 train / 9 dev / 9 test out of 89 files.
train, dev, test = uris[:71], uris[71:80], uris[80:89]

os.makedirs("dataset/splits", exist_ok=True)


def write_split(name, data):
    """Write one URI per line to dataset/splits/<name>.txt."""
    with open(f"dataset/splits/{name}.txt", "w", encoding="utf-8") as f:
        f.writelines(uri + "\n" for uri in data)


for split_name, split_uris in (("train", train), ("dev", dev), ("test", test)):
    write_split(split_name, split_uris)

# Print summary
print("Dataset split completed:")
print(f" Train: {len(train)} files")
print(f" Dev : {len(dev)} files")
print(f" Test : {len(test)} files")
|
scripts/run.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
# MUST BE AT THE VERY TOP
|
| 3 |
+
os.environ["SPEECHBRAIN_LOCAL_STRATEGY"] = "copy"
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torchaudio
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from pyannote.audio import Model
|
| 9 |
+
from pyannote.audio.pipelines import SpeakerDiarization
|
| 10 |
+
from pyannote.database.util import load_rttm
|
| 11 |
+
from pyannote.metrics.diarization import DiarizationErrorRate, DiarizationPurity, DiarizationCoverage
|
| 12 |
+
|
| 13 |
+
# --- THE DEFINITIVE FIX FOR PYTORCH 2.6+ SECURITY ERRORS ---
# PyTorch >= 2.6 defaults torch.load(weights_only=True), which rejects the
# pickled objects inside pyannote/lightning checkpoints. Route every
# torch.load call through a wrapper that opts back into full unpickling.
# WARNING: weights_only=False executes arbitrary pickle code — only load
# checkpoints from trusted sources.
import torch.serialization

_real_torch_load = torch.load


def forced_load(f, map_location=None, pickle_module=None, **kwargs):
    kwargs['weights_only'] = False
    return _real_torch_load(f, map_location=map_location, pickle_module=pickle_module, **kwargs)


torch.load = forced_load
# -----------------------------------------------------------
|
| 21 |
+
|
| 22 |
+
# Configuration - Update these paths to match your project structure
CHECKPOINT_PATH = "training_results/lightning_logs/version_2/checkpoints/epoch=4-step=2960.ckpt"  # fine-tuned segmentation checkpoint
TEST_LIST_PATH = "dataset/splits/test.txt"  # one URI per line (held-out split)
AUDIO_DIR = "dataset/audio"  # contains <uri>.wav files
RTTM_DIR = "dataset/rttm"  # contains <uri>.rttm reference annotations
OUTPUT_CSV = "overall_model_performance.csv"  # per-file DER breakdown destination
|
| 28 |
+
|
| 29 |
+
def run_global_evaluation():
    """Evaluate the fine-tuned segmentation model on the whole test split.

    Runs the diarization pipeline over every URI listed in TEST_LIST_PATH,
    accumulates a global Diarization Error Rate against the reference RTTMs,
    prints a per-file report, and saves the breakdown to OUTPUT_CSV.
    """
    # 1. Load the fine-tuned model
    print(f"Loading fine-tuned model from: {CHECKPOINT_PATH}")
    seg_model = Model.from_pretrained(CHECKPOINT_PATH)

    # 2. Initialize the Diarization Pipeline
    print("Initializing Pipeline...")
    pipeline = SpeakerDiarization(
        segmentation=seg_model,
        embedding="speechbrain/spkrec-ecapa-voxceleb",
        clustering="AgglomerativeClustering",
    )

    # Balanced hyper-parameters for clips with diverse speaker counts.
    pipeline.instantiate({
        "segmentation": {
            "threshold": 0.58,        # high threshold to kill false alarms
            "min_duration_off": 0.2,  # prevents fragmented speaker "flickering"
        },
        "clustering": {
            "method": "centroid",
            "threshold": 0.62,        # encourages separating distinct voices
            "min_cluster_size": 1,
        },
    })

    # 3. A single metric instance accumulates across all files, so its final
    # value is the corpus-level DER rather than a per-file average.
    total_der_metric = DiarizationErrorRate()

    # 4. Load URIs (filename stems) from the test list; blank lines skipped.
    with open(TEST_LIST_PATH, 'r') as f:
        test_files = [line.strip().split()[0] for line in f if line.strip()]

    print(f"Found {len(test_files)} files in test set. Starting Batch Processing...")
    print("-" * 50)

    for uri in test_files:
        audio_path = os.path.join(AUDIO_DIR, f"{uri}.wav")
        rttm_path = os.path.join(RTTM_DIR, f"{uri}.rttm")

        if not os.path.exists(audio_path):
            print(f"Warning: Audio file not found for {uri}. Skipping.")
            continue

        # Load the reference annotation; skip files with unreadable RTTMs.
        try:
            reference = load_rttm(rttm_path)[uri]
        except Exception as e:
            print(f"Warning: Could not load RTTM for {uri}. Error: {e}")
            continue

        # Run diarization on the in-memory waveform.
        waveform, sample_rate = torchaudio.load(audio_path)
        test_file = {"waveform": waveform, "sample_rate": sample_rate, "uri": uri}

        # Speaker count is determined dynamically within [2, 7].
        hypothesis = pipeline(test_file, min_speakers=2, max_speakers=7)

        # Accumulate this file into the global metric.
        total_der_metric(reference, hypothesis, detailed=True)
        print(f"Done: {uri}")

    # 5. Final Calculations
    print("\n" + "="*50)
    print(" FINAL GLOBAL REPORT")
    print("="*50)

    # Detailed per-file table.
    report_df = total_der_metric.report(display=True)

    # abs(metric) yields the accumulated global DER.
    global_der = abs(total_der_metric)
    global_accuracy = max(0, (1 - global_der) * 100)

    print(f"\nOVERALL SYSTEM ACCURACY : {global_accuracy:.2f}%")
    print(f"GLOBAL DIARIZATION ERROR: {global_der * 100:.2f}%")
    print("="*50)

    # Save detailed report to CSV for documentation.
    report_df.to_csv(OUTPUT_CSV)
    print(f"Detailed file-by-file breakdown saved to: {OUTPUT_CSV}")


if __name__ == "__main__":
    run_global_evaluation()
|
scripts/segmentation.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import torch
|
| 5 |
+
import torchaudio
|
| 6 |
+
import torch.serialization
|
| 7 |
+
from pyannote.core import Segment, Timeline
|
| 8 |
+
|
| 9 |
+
# --- 1. MONKEY PATCH (Fixes PyTorch 2.6 Security Error) ---
# torch >= 2.6 defaults torch.load to weights_only=True, which rejects the
# pickled objects stored inside pyannote checkpoints. Wrap the stock loader
# so every call runs with weights_only=False.
# NOTE(review): this disables torch's unpickling safeguard — only load
# checkpoints from trusted sources.
original_load = torch.serialization.load

def forced_load(f, map_location=None, pickle_module=None, **kwargs):
    """Delegate to the original torch loader, forcing weights_only=False."""
    merged = {**kwargs, "weights_only": False}
    return original_load(f, map_location=map_location, pickle_module=pickle_module, **merged)

torch.load = forced_load
torch.serialization.load = forced_load
# ---------------------------------------------------------
|
| 17 |
+
|
| 18 |
+
# Model: the neural network; Segmentation: the training task; get_protocol: the dataset loader; pl.Trainer: the training engine.
|
| 19 |
+
|
| 20 |
+
from pyannote.audio import Model
|
| 21 |
+
from pyannote.audio.tasks import Segmentation
|
| 22 |
+
from pyannote.database import get_protocol, FileFinder
|
| 23 |
+
import pytorch_lightning as pl
|
| 24 |
+
|
| 25 |
+
# Tell pyannote.database where the corpus description lives.
# NOTE(review): relative path — resolved against the current working
# directory, so this script must be launched from the directory containing
# database.yml (test_protocol.py builds an absolute path instead).
os.environ["PYANNOTE_DATABASE_CONFIG"] = "database.yml"
|
| 26 |
+
|
| 27 |
+
def train_segmentation():
    """Fine-tune pyannote's segmentation model on the Hindi/Bhojpuri corpus.

    Transfer learning: start from the pretrained segmentation checkpoint
    (trained on other languages) and adapt it to the local protocol rather
    than training from scratch.
    """

    def get_annotated(file):
        """Return a Timeline spanning the whole recording.

        pyannote expects an 'annotated' region per file; here the full
        duration (total frames / sample rate) is treated as annotated.
        """
        meta = torchaudio.info(file["audio"])
        full_length = meta.num_frames / meta.sample_rate
        return Timeline([Segment(0, full_length)])

    # Preprocessors: resolve audio paths and supply the annotated region.
    preprocessors = {
        "audio": FileFinder(),
        "annotated": get_annotated,
    }

    # Load the dataset protocol declared in database.yml.
    print("Loading Hindi-Bhojpuri Protocol...")
    protocol = get_protocol(
        'HindiBhojpuri.SpeakerDiarization.Segmentation',
        preprocessors=preprocessors,
    )

    # Training task: 2-second chunks, small batches, no worker processes.
    seg_task = Segmentation(protocol, duration=2.0, batch_size=4, num_workers=0)

    # Start from the pretrained segmentation model and attach the task.
    print("Attempting to load model...")
    model = Model.from_pretrained("pyannote/segmentation-3.0")
    model.task = seg_task

    # CPU trainer, 5 epochs; logs/checkpoints go under training_results/.
    trainer = pl.Trainer(
        accelerator="cpu",
        max_epochs=5,
        default_root_dir="training_results",
    )

    print("--- Starting Fine-tuning ---")
    trainer.fit(model)
|
| 71 |
+
|
| 72 |
+
# Script entry point: kick off fine-tuning when run directly.
if __name__ == "__main__":
    train_segmentation()
|
scripts/test_model.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torchaudio
|
| 3 |
+
from pyannote.audio import Model
|
| 4 |
+
from pyannote.core import Annotation, Segment
|
| 5 |
+
|
| 6 |
+
# 1. PATHS
# Fine-tuned segmentation checkpoint produced by scripts/segmentation.py
# (pytorch-lightning writes it under training_results/lightning_logs/...).
CHECKPOINT_PATH = "training_results/lightning_logs/version_2/checkpoints/epoch=4-step=2960.ckpt"
# Audio clip used for this smoke test.
TEST_AUDIO = "dataset/audio/clip_07.wav"
|
| 9 |
+
|
| 10 |
+
def run_test():
    """Smoke-test the fine-tuned checkpoint on a single audio clip.

    Loads the model straight from the lightning checkpoint, runs one raw
    forward pass, then prints, per speaker slot, the span between the first
    and last frame whose activation exceeds 0.5.
    """
    print("Loading model directly...")
    model = Model.from_pretrained(CHECKPOINT_PATH)
    model.eval()  # inference mode

    # Load the clip; the model expects [batch, channels, samples].
    waveform, sample_rate = torchaudio.load(TEST_AUDIO)
    if waveform.ndim == 2:
        waveform = waveform.unsqueeze(0)

    print("Running raw inference...")
    with torch.no_grad():
        # scores: [batch, frames, speakers] — per-frame speaker activations.
        scores = model(waveform)

    print("\n--- Raw Model Detections ---")

    # Each output channel acts as a temporary speaker "slot"; threshold at
    # 0.5 and report only the first/last active frame to keep output clean.
    slot_count = scores.shape[-1]
    for slot in range(slot_count):
        active = (scores[0, :, slot] > 0.5).nonzero(as_tuple=True)[0]
        if len(active) == 0:
            continue
        # 0.016 s is an approximate frame shift — TODO confirm against the
        # model's actual output resolution.
        first_t = active[0].item() * 0.016
        last_t = active[-1].item() * 0.016
        print(f"Speaker Slot {slot}: Detected activity between {first_t:.2f}s and {last_t:.2f}s")
|
| 45 |
+
|
| 46 |
+
# Run the smoke test when executed directly.
if __name__ == "__main__":
    run_test()
|
scripts/test_protocol.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Sanity-check that pyannote can correctly read the dataset (audio + RTTM)
# described in database.yml.
# get_protocol asks pyannote for the dataset declared in database.yml;
# FileFinder resolves each file URI to its audio file on disk.
import os
from pyannote.database import get_protocol, FileFinder

# 1. Point pyannote at the database.yml that sits next to this script
# (absolute path, so the check works from any working directory).
current_dir = os.path.dirname(os.path.abspath(__file__))
config_path = os.path.join(current_dir, "database.yml")
os.environ["PYANNOTE_DATABASE_CONFIG"] = config_path

# 2. Preprocessor mapping each URI to its audio path.
preprocessors = {'audio': FileFinder()}

# 3. Load the protocol. On failure, exit with a NON-ZERO status so callers
# (CI, shell scripts) can detect the broken configuration — the previous
# bare exit() reported success (status 0) even when loading failed.
try:
    protocol = get_protocol(
        'HindiBhojpuri.SpeakerDiarization.Segmentation',
        preprocessors=preprocessors
    )
    print("Protocol loaded successfully!")
except Exception as e:
    print(f"Failed to load protocol: {e}")
    raise SystemExit(1)

# 4. Detailed data verification: inspect the first test file only.
for file in protocol.test():
    print("\n" + "="*30)
    print(f"FILE URI: {file['uri']}")
    print(f"AUDIO PATH: {file['audio']}")

    # The annotation carries the RTTM speaker segments.
    annotation = file['annotation']
    print(f"SEGMENTS FOUND: {len(annotation)}")

    print("-" * 30)
    print("START | END | SPEAKER")
    print("-" * 30)

    # Show at most 5 segments to keep the output readable.
    for i, (segment, track, label) in enumerate(annotation.itertracks(yield_label=True)):
        if i >= 5:
            print("... (and more)")
            break
        print(f"{segment.start:9.2f}s | {segment.end:9.2f}s | {label}")

    print("="*30)

    # Only check the first file for now.
    break


# If this runs cleanly: database.yml is correct, audio paths resolve,
# RTTM files load, speaker segments exist —
# segmentation training CAN start.
|
scripts/visualize_segmentation.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import torchaudio
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import numpy as np
|
| 6 |
+
from pyannote.audio import Model, Inference
|
| 7 |
+
from pyannote.audio.utils.signal import Binarize
|
| 8 |
+
from pyannote.database.util import load_rttm
|
| 9 |
+
from pyannote.core import notebook, SlidingWindowFeature, Annotation
|
| 10 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 11 |
+
|
| 12 |
+
# --- 1. PYTORCH 2.6+ SECURITY FIX ---
# torch >= 2.6 defaults torch.load to weights_only=True, which rejects the
# pickled objects inside pyannote checkpoints; force the old behaviour.
# NOTE(review): this bypasses torch's unpickling safeguard — only load
# trusted checkpoints.
import torch.serialization

original_load = torch.load

def forced_load(f, map_location=None, pickle_module=None, **kwargs):
    """Call the stock torch.load with weights_only forced to False."""
    options = {**kwargs, "weights_only": False}
    return original_load(f, map_location=map_location, pickle_module=pickle_module, **options)

torch.load = forced_load
# ------------------------------------
|
| 20 |
+
|
| 21 |
+
def visualize_audio_file(audio_path, rttm_path, checkpoint_path):
    """Plot ground-truth vs model speaker segmentation for one audio file.

    Runs sliding-window inference with the fine-tuned checkpoint, binarizes
    the activations, simplifies the speaker labels, then draws the reference
    RTTM annotation above the model's hypothesis.

    Args:
        audio_path: path to the .wav file to analyse.
        rttm_path: path to the reference RTTM whose URI matches the file stem.
        checkpoint_path: pytorch-lightning checkpoint of the segmentation model.
    """
    # Derive the URI from the file name. splitext strips only the final
    # extension; the previous .replace('.wav', '') would also mangle names
    # containing '.wav' in the middle.
    file_id = os.path.splitext(os.path.basename(audio_path))[0]
    print(f"--- Processing: {file_id} ---")

    # 1. Load Model & Run Inference (2 s windows, 0.5 s hop).
    model = Model.from_pretrained(checkpoint_path)
    inference = Inference(model, window="sliding", duration=2.0, step=0.5)
    seg_output = inference(audio_path)

    # 2. Reshape and Binarize. A high onset (0.8) discards low-confidence
    # background noise; the min durations smooth out fragmented segments.
    data = np.squeeze(seg_output.data)
    if len(data.shape) == 3: data = data[:, :, 0]

    binarize = Binarize(onset=0.8, offset=0.6, min_duration_on=0.4, min_duration_off=0.2)
    raw_hypothesis = binarize(SlidingWindowFeature(data, seg_output.sliding_window))

    # 3. Simplify labels so the plot stays readable: collapse the raw class
    # indices into at most 5 "Speaker_N" buckets.
    # NOTE(review): `label % 5` assumes the binarized labels are integers —
    # confirm against the pyannote version in use.
    print("Clustering segments to simplify speakers...")
    final_hypothesis = Annotation(uri=file_id)

    for segment, track, label in raw_hypothesis.itertracks(yield_label=True):
        final_hypothesis[segment, track] = f"Speaker_{label % 5}"

    # 4. Load Ground Truth (the RTTM is keyed by the file URI).
    reference = load_rttm(rttm_path)[file_id]

    # 5. Plot: reference on top, cleaned hypothesis below.
    print("Generating Clean Graph...")
    fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 8))

    notebook.plot_annotation(reference, ax=ax[0], time=True, legend=True)
    ax[0].set_title(f"GROUND TRUTH: {file_id}")

    notebook.plot_annotation(final_hypothesis, ax=ax[1], time=True, legend=True)
    ax[1].set_title(f"CLEANED AI HYPOTHESIS (Clustered & Filtered)")

    plt.tight_layout()
    plt.show()
|
| 65 |
+
|
| 66 |
+
# Entry point: visualize one hard-coded example clip.
if __name__ == "__main__":
    AUDIO_FILE = "dataset/audio/bhojpuri_chunk_20.wav"
    RTTM_FILE = "dataset/rttm/bhojpuri_chunk_20.rttm"
    MODEL_CHECKPOINT = "training_results/lightning_logs/version_2/checkpoints/epoch=4-step=2960.ckpt"

    if os.path.exists(AUDIO_FILE):
        visualize_audio_file(AUDIO_FILE, RTTM_FILE, MODEL_CHECKPOINT)
    else:
        # Previously the script exited silently when the clip was missing;
        # report the problem instead.
        print(f"Audio file not found: {AUDIO_FILE}")
|
test/audio2.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:daf78623168634fb66384ae3341f9bf5ab1e57fc4694c4e34b68f022a1527478
|
| 3 |
+
size 842157
|