# midmid3 / midmid / sections.py
# markury's picture
# Initial commit
# d171350
"""Structural segmentation (intro, verse, chorus, etc.)."""
import numpy as np
import librosa
def detect_sections(
    audio_path: str,
    min_section_duration: float = 8.0,
) -> list[tuple[float, str]]:
    """Detect structural sections in an audio file.

    The audio is loaded as 22.05 kHz mono, its MFCC frames are split into
    contiguous segments with agglomerative clustering, boundaries that lie
    too close together are merged, and every remaining segment is labelled.

    Args:
        audio_path: Path of the audio file, handed to ``librosa.load``.
        min_section_duration: Minimum spacing, in seconds, between two
            section boundaries that are kept.

    Returns:
        A list of ``(start_time_seconds, label)`` tuples, one per section.
        The list is empty when the file contains no samples, and holds a
        single ``(0.0, "Intro")`` entry when the audio is too short to split.
    """
    y, sr = librosa.load(audio_path, sr=22050, mono=True)
    if len(y) == 0:
        # Nothing to analyse; feature extraction would fail on an empty signal.
        return []
    duration = len(y) / sr

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    n_frames = mfcc.shape[1]
    if n_frames < 2:
        # The clustering below always asks for at least two segments, which
        # is impossible with a single frame: report one section instead.
        return [(0.0, "Intro")]

    # Roughly one segment per 25 s of audio, but never fewer than two.
    k = max(2, min(n_frames - 1, int(duration / 25)))
    bounds = librosa.segment.agglomerative(mfcc, k=k)
    bound_times = librosa.frames_to_time(bounds, sr=sr)

    # Make sure the first section starts at the beginning of the audio.
    if len(bound_times) == 0 or bound_times[0] > 0.5:
        bound_times = np.concatenate([[0.0], bound_times])

    bound_times = _merge_short_segments(bound_times, duration, min_section_duration)
    labels = _assign_labels(y, sr, bound_times, duration)
    return list(zip(bound_times.tolist(), labels))
def _merge_short_segments(bounds, duration, min_dur):
    """Drop boundaries that would create sections shorter than ``min_dur``.

    Args:
        bounds: Section start times in seconds (assumed to be ascending).
        duration: Total length of the audio in seconds; it marks the end of
            the final section.
        min_dur: Minimum allowed section length in seconds.

    Returns:
        A NumPy array with the start times that were kept. The first
        boundary is always kept; an empty input yields an empty array.
    """
    if len(bounds) == 0:
        return np.array([], dtype=float)

    merged = [bounds[0]]
    for t in bounds[1:]:
        # Keep a boundary only if the section it closes is long enough.
        if t - merged[-1] >= min_dur:
            merged.append(t)

    # The final section runs from the last boundary to the end of the audio.
    # If it is too short, fold it into the preceding section.
    while len(merged) > 1 and duration - merged[-1] < min_dur:
        merged.pop()

    return np.array(merged)
def _assign_labels(y, sr, bound_times, duration):
    """Label each section, reusing a letter for sections that sound alike.

    Every section is summarised by the mean of its 13 MFCC coefficients.
    The first section is always ``"Intro"``. Each later section is compared
    (cosine similarity) with all earlier ones: if its closest match is a
    lettered section and the similarity exceeds 0.85 it inherits that
    letter, otherwise it receives the next unused letter.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y`` in Hz.
        bound_times: Section start times in seconds.
        duration: Unused; kept so the signature stays unchanged.

    Returns:
        A list of label strings, one per entry of ``bound_times``.
    """
    n = len(bound_times)
    if n == 0:
        return []
    if n == 1:
        return ["Intro"]

    segment_features = []
    for i in range(n):
        start_sample = int(bound_times[i] * sr)
        # The last section extends to the end of the signal.
        end_sample = int(bound_times[i + 1] * sr) if i + 1 < n else len(y)
        seg = y[start_sample:end_sample]
        if len(seg) < sr // 4:
            # Under a quarter of a second: use a zero vector, which
            # _cosine_sim scores as 0 against everything.
            segment_features.append(np.zeros(13))
        else:
            mfcc = librosa.feature.mfcc(y=seg, sr=sr, n_mfcc=13)
            segment_features.append(np.mean(mfcc, axis=1))

    labels = ["Intro"]
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    letter_idx = 0
    # Maps segment index -> letter. The intro (index 0) is deliberately
    # absent, so a section that resembles the intro still gets its own letter.
    assigned = {}
    for i in range(1, n):
        best_sim = -1
        best_j = -1
        for j in range(i):
            sim = _cosine_sim(segment_features[i], segment_features[j])
            if sim > best_sim:
                best_sim = sim
                best_j = j

        if best_sim > 0.85 and best_j in assigned:
            letter = assigned[best_j]
        else:
            # Letters wrap around after "Z".
            letter = letters[letter_idx % len(letters)]
            letter_idx += 1

        # Record the letter for every section, including those that reuse
        # one, so later repetitions matching this section share its letter.
        assigned[i] = letter
        labels.append(f"Section {letter}")
    return labels
def _cosine_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b`` as a float.

    If either vector has zero length (norm of 0) the result is ``0.0``.
    """
    magnitudes = (np.linalg.norm(a), np.linalg.norm(b))
    if any(m == 0 for m in magnitudes):
        return 0.0
    inner = np.dot(a, b)
    return float(inner / (magnitudes[0] * magnitudes[1]))