File size: 5,362 Bytes
"""
Speaker detection using simple voice activity analysis.
No neural models needed - uses basic signal processing.
"""
import numpy as np
import soundfile as sf
import librosa
import os
def _extract_segment_features(audio_mono, sr, segment_duration=1.0, min_energy=0.001):
    """Compute voice features for every non-silent fixed-length segment.

    The signal is cut into consecutive, non-overlapping windows of
    ``segment_duration`` seconds; a trailing partial window is ignored.
    Windows whose mean squared amplitude is below ``min_energy`` are skipped.

    Returns:
        A list of dicts with the keys ``segment``, ``start_time``, ``energy``,
        ``f0_median``, ``f0_std`` and ``spectral_centroid``.
    """
    segment_samples = int(segment_duration * sr)
    n_segments = len(audio_mono) // segment_samples
    print(f" Splitting into {n_segments} segments of {segment_duration}s each")

    segments_data = []
    for seg_idx in range(n_segments):
        start = seg_idx * segment_samples
        segment = audio_mono[start:start + segment_samples]

        seg_energy = np.mean(segment**2)
        if seg_energy < min_energy:
            continue

        # Only the pitch track is needed; the voiced flag/probability outputs
        # of pyin are discarded.
        f0, _, _ = librosa.pyin(
            segment, fmin=70, fmax=400, sr=sr, frame_length=2048
        )
        f0_valid = f0[~np.isnan(f0)]
        if len(f0_valid) > 10:
            f0_median = np.median(f0_valid)
            f0_std = np.std(f0_valid)
        else:
            # Too few pitched frames for a trustworthy estimate; 0 marks
            # "no pitch" for the downstream steps.
            f0_median = 0
            f0_std = 0

        spectral_centroid = np.mean(
            librosa.feature.spectral_centroid(y=segment, sr=sr)[0]
        )
        segments_data.append(
            {
                "segment": seg_idx,
                "start_time": start / sr,
                "energy": seg_energy,
                "f0_median": f0_median,
                "f0_std": f0_std,
                "spectral_centroid": spectral_centroid,
            }
        )
    return segments_data


def _cluster_segments(segments_data, max_clusters=8):
    """Assign a 1-based cluster label to each segment via average linkage.

    Returns a plain list of ints, one per entry of ``segments_data``. Zero or
    one segment cannot be clustered hierarchically, so those cases are
    answered directly instead of being passed to SciPy.
    """
    n_segments = len(segments_data)
    if n_segments == 0:
        return []
    if n_segments == 1:
        return [1]

    from scipy.cluster.hierarchy import fcluster, linkage

    features = np.array(
        [
            [
                # Segments without a pitch estimate get a neutral 150 Hz.
                seg["f0_median"] if seg["f0_median"] > 0 else 150,
                seg["spectral_centroid"],
                np.log10(seg["energy"] + 1e-10) * 100,
            ]
            for seg in segments_data
        ],
        dtype=float,
    )
    # Rough rescaling so that no single feature dominates the distance.
    features[:, 0] = features[:, 0] / 300
    features[:, 1] = features[:, 1] / 5000
    # NOTE(review): log10(energy) * 100 is far below -2 for typical energies,
    # so after clipping this column is almost always exactly -2 and adds
    # nothing to the distance. Kept as-is to preserve existing clustering
    # results - confirm the intended scaling.
    features[:, 2] = np.clip(features[:, 2], -2, 2)

    tree = linkage(features, method="average")
    # Aim for roughly three segments per cluster, between 2 and max_clusters.
    n_clusters = max(min(max_clusters, n_segments // 3), 2)
    return fcluster(tree, n_clusters, criterion="maxclust").tolist()


def _classify_gender(avg_f0):
    """Map an average pitch in Hz (or None) to a coarse gender label."""
    if avg_f0 is None or not avg_f0 > 0:
        return "unknown"
    if avg_f0 < 140:
        return "male"
    if avg_f0 > 185:
        return "female"
    return "ambiguous"


def _summarize_clusters(segments_data, labels):
    """Build one summary dict per cluster, ordered by cluster id."""
    grouped = {}
    for seg, label in zip(segments_data, labels):
        grouped.setdefault(label, []).append(seg)

    summaries = []
    for cluster_id in sorted(grouped):
        cluster_segs = grouped[cluster_id]
        avg_energy = float(np.mean([s["energy"] for s in cluster_segs]))
        pitched = [s["f0_median"] for s in cluster_segs if s["f0_median"] > 0]
        # Avoid np.mean([]) (NaN plus a RuntimeWarning) when no segment in the
        # cluster has a pitch estimate.
        avg_f0 = float(np.mean(pitched)) if pitched else None
        summaries.append(
            {
                "cluster_id": cluster_id,
                "n_segments": len(cluster_segs),
                "avg_energy": avg_energy,
                "gender": _classify_gender(avg_f0),
                "distance": "near" if avg_energy > 0.03 else "far",
            }
        )
    return summaries


def analyze_speakers(audio_path: str, output_dir: str = None) -> dict:
    """
    Analyze audio to detect and count unique speakers.

    The audio is read with soundfile, averaged along axis 1 when it has more
    than one dimension, cut into one-second segments, and each non-silent
    segment is described by median pitch, spectral centroid and energy. The
    segments are then grouped with average-linkage hierarchical clustering;
    each cluster is reported as one speaker.

    Args:
        audio_path: Path of the audio file to analyze.
        output_dir: Optional directory (created if missing) that receives a
            ``speaker_analysis.txt`` report. Nothing is written when falsy.

    Returns:
        A dict with ``n_speakers`` (number of clusters), ``segments`` (the
        per-segment feature dicts) and ``cluster_labels`` (one int label per
        segment, in the same order). When fewer than two speech segments are
        found the result is still valid: zero segments give 0 speakers and
        empty lists, one segment gives a single speaker.
    """
    print(f"Loading audio: {audio_path}")
    audio, sr = sf.read(audio_path)
    audio_mono = audio.mean(axis=1) if audio.ndim > 1 else audio
    print(f"Audio: {len(audio_mono) / sr:.1f}s at {sr}Hz")

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print("\nAnalyzing speaker segments...")
    segments_data = _extract_segment_features(audio_mono, sr)
    print(f"Analyzed {len(segments_data)} speech segments")

    print("\nClustering segments by voice characteristics...")
    labels = _cluster_segments(segments_data)
    summaries = _summarize_clusters(segments_data, labels)
    unique_speakers = len(summaries)

    print("\nResults:")
    print(f" Total segments analyzed: {len(segments_data)}")
    print(f" Estimated unique speakers: {unique_speakers}")
    for info in summaries:
        print(
            f" Speaker {info['cluster_id']}: {info['n_segments']} segments, "
            f"{info['gender']}, {info['distance']} "
            f"(energy: {info['avg_energy']:.4f})"
        )

    if output_dir:
        report_path = os.path.join(output_dir, "speaker_analysis.txt")
        with open(report_path, "w", encoding="utf-8") as f:
            f.write(f"Estimated unique speakers: {unique_speakers}\n\n")
            # Uses the same gender label as the console output so the two
            # reports cannot disagree.
            for info in summaries:
                f.write(
                    f"Speaker {info['cluster_id']}: {info['n_segments']} "
                    f"segments, gender: {info['gender']}\n"
                )

    return {
        "n_speakers": unique_speakers,
        "segments": segments_data,
        "cluster_labels": labels,
    }
if __name__ == "__main__":
    import sys

    # Command line: [audio_file] [output_dir]; both positional arguments are
    # optional and fall back to the defaults below.
    cli_args = sys.argv[1:]
    audio_file = cli_args[0] if cli_args else "../data/mixture.wav"
    output = cli_args[1] if len(cli_args) >= 2 else "speaker_analysis_output"
    analyze_speakers(audio_file, output)
|