| """Structural segmentation (intro, verse, chorus, etc.).""" |
|
|
| import numpy as np |
| import librosa |
|
|
|
|
def detect_sections(
    audio_path: str,
    min_section_duration: float = 8.0,
) -> list[tuple[float, str]]:
    """Detect structural sections in an audio file.

    The file is loaded as 22050 Hz mono, described by 13 MFCCs per frame,
    and split into contiguous segments with agglomerative clustering.
    Boundaries that would produce a section shorter than
    ``min_section_duration`` are then dropped and each remaining section is
    labelled by timbral similarity.

    Args:
        audio_path: Path of the audio file to analyse.
        min_section_duration: Minimum section length in seconds.

    Returns:
        A list of ``(start_time_seconds, label)`` pairs in chronological
        order. Labels are ``"Intro"`` for the first section and
        ``"Section <letter>"`` for the rest. An empty list is returned when
        the file contains no samples.
    """
    # Use one hop length for both feature extraction and frame->time
    # conversion so the two cannot drift apart via library defaults.
    hop_length = 512

    y, sr = librosa.load(audio_path, sr=22050, mono=True)
    if len(y) == 0:
        # Nothing to analyse; feature extraction cannot run on empty audio.
        return []
    duration = len(y) / sr

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
    n_frames = mfcc.shape[1]

    if n_frames < 2:
        # Too short to cluster into the minimum of two segments: treat the
        # whole clip as a single section starting at 0.
        bound_times = np.array([0.0])
    else:
        # Aim for roughly one segment per 25 s of audio, but always at
        # least two and never more than the frames allow.
        k = max(2, min(n_frames - 1, int(duration / 25)))
        bounds = librosa.segment.agglomerative(mfcc, k=k)
        bound_times = librosa.frames_to_time(
            bounds, sr=sr, hop_length=hop_length
        )

        # Make sure the first section starts at the beginning of the audio.
        if len(bound_times) == 0 or bound_times[0] > 0.5:
            bound_times = np.concatenate([[0.0], bound_times])

    bound_times = _merge_short_segments(
        bound_times, duration, min_section_duration
    )
    labels = _assign_labels(y, sr, bound_times, duration)

    return list(zip(bound_times.tolist(), labels))
|
|
|
|
def _merge_short_segments(bounds, duration, min_dur):
    """Drop boundaries that would create a section shorter than ``min_dur``.

    Args:
        bounds: Ascending section start times in seconds.
        duration: Total length of the audio in seconds; it marks the end of
            the final section.
        min_dur: Minimum allowed section length in seconds.

    Returns:
        A NumPy array of the retained start times. The first boundary is
        always kept. An empty ``bounds`` yields an empty array.
    """
    if len(bounds) == 0:
        return np.array([])

    merged = [bounds[0]]
    for t in bounds[1:]:
        # Keep a boundary only if the section it closes is long enough.
        if t - merged[-1] >= min_dur:
            merged.append(t)

    # The final section runs from the last boundary to the end of the audio.
    # If it is too short, fold it into the previous section by dropping its
    # start boundary (the first boundary is never dropped).
    while len(merged) > 1 and duration - merged[-1] < min_dur:
        merged.pop()

    return np.array(merged)
|
|
|
|
def _assign_labels(y, sr, bound_times, duration):
    """Label each section by timbral similarity to earlier sections.

    Every section is summarised by the mean of its 13 MFCCs. The first
    section is always ``"Intro"``. Each later section is compared with all
    preceding ones using cosine similarity: if its best match scores above
    0.85 and that match carries a letter, the section reuses that letter;
    otherwise it receives the next unused letter. The intro never carries a
    letter, so a section whose best match is the intro starts a new letter.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y`` in Hz.
        bound_times: Section start times in seconds.
        duration: Total audio length in seconds (currently unused; kept for
            interface compatibility).

    Returns:
        A list of labels, one per entry in ``bound_times``.
    """
    n = len(bound_times)
    if n == 0:
        return []
    if n == 1:
        return ["Intro"]

    # One mean-MFCC vector per section.
    segment_features = []
    for i in range(n):
        start_sample = int(bound_times[i] * sr)
        # The last section extends to the end of the signal.
        end_sample = int(bound_times[i + 1] * sr) if i + 1 < n else len(y)
        seg = y[start_sample:end_sample]
        if len(seg) < sr // 4:
            # Under a quarter second of audio: use a zero vector, which
            # _cosine_sim scores as 0 against everything.
            segment_features.append(np.zeros(13))
        else:
            mfcc = librosa.feature.mfcc(y=seg, sr=sr, n_mfcc=13)
            segment_features.append(np.mean(mfcc, axis=1))

    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    letter_idx = 0
    labels = ["Intro"]
    # Section index -> letter. Section 0 (the intro) is deliberately absent.
    assigned = {}

    for i in range(1, n):
        # Find the most similar preceding section (first one wins on ties).
        best_sim = -1
        best_j = -1
        for j in range(i):
            sim = _cosine_sim(segment_features[i], segment_features[j])
            if sim > best_sim:
                best_sim = sim
                best_j = j

        if best_sim > 0.85 and best_j in assigned:
            letter = assigned[best_j]
        else:
            # Letters wrap around after "Z".
            letter = letters[letter_idx % len(letters)]
            letter_idx += 1

        # Record the letter right away, including for repeated sections, so
        # that later sections matching this one inherit the same letter.
        assigned[i] = letter
        labels.append(f"Section {letter}")

    return labels
|
|
|
|
def _cosine_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b`` as a float.

    If either vector has zero length the similarity is defined as 0.0
    instead of dividing by zero.
    """
    len_a = np.linalg.norm(a)
    len_b = np.linalg.norm(b)
    if len_a != 0 and len_b != 0:
        return float(np.dot(a, b) / (len_a * len_b))
    return 0.0
|
|