File size: 2,694 Bytes
d171350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""Structural segmentation (intro, verse, chorus, etc.)."""

import numpy as np
import librosa


def detect_sections(
    audio_path: str,
    min_section_duration: float = 8.0,
) -> list[tuple[float, str]]:
    """Split an audio file into labelled structural sections.

    The file is loaded as mono audio at 22050 Hz and described by 13 MFCCs.
    ``librosa.segment.agglomerative`` then proposes segment boundaries, which
    are converted to seconds, thinned by ``_merge_short_segments`` and named
    by ``_assign_labels``.

    Args:
        audio_path: Path passed to ``librosa.load``.
        min_section_duration: Minimum spacing, in seconds, forwarded to
            ``_merge_short_segments``.

    Returns:
        A list of ``(start_time_in_seconds, label)`` pairs, one per section.
    """
    signal, sample_rate = librosa.load(audio_path, sr=22050, mono=True)
    total_seconds = len(signal) / sample_rate

    mfcc_matrix = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
    frame_count = mfcc_matrix.shape[1]

    # Aim for one segment per 25 s of audio, capped at frame_count - 1 and
    # never fewer than two.
    wanted_segments = int(total_seconds / 25)
    n_segments = max(2, min(frame_count - 1, wanted_segments))

    boundary_frames = librosa.segment.agglomerative(mfcc_matrix, k=n_segments)
    starts = librosa.frames_to_time(boundary_frames, sr=sample_rate)

    # Make sure the first section begins at the start of the audio.
    needs_origin = len(starts) == 0 or starts[0] > 0.5
    if needs_origin:
        starts = np.concatenate([[0.0], starts])

    starts = _merge_short_segments(starts, total_seconds, min_section_duration)
    section_labels = _assign_labels(signal, sample_rate, starts, total_seconds)

    start_times = starts.tolist()
    return list(zip(start_times, section_labels))


def _merge_short_segments(bounds, duration, min_dur):
    merged = [bounds[0]]
    for t in bounds[1:]:
        if t - merged[-1] >= min_dur:
            merged.append(t)
    return np.array(merged)


def _assign_labels(y, sr, bound_times, duration):
    n = len(bound_times)
    if n == 0:
        return []
    if n == 1:
        return ["Intro"]

    segment_features = []
    for i in range(n):
        start_sample = int(bound_times[i] * sr)
        end_sample = int(bound_times[i + 1] * sr) if i + 1 < n else len(y)
        seg = y[start_sample:end_sample]
        if len(seg) < sr // 4:
            segment_features.append(np.zeros(13))
        else:
            mfcc = librosa.feature.mfcc(y=seg, sr=sr, n_mfcc=13)
            segment_features.append(np.mean(mfcc, axis=1))

    labels = ["Intro"]
    letter_idx = 0
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    assigned = {}

    for i in range(1, n):
        best_sim = -1
        best_j = -1
        for j in range(i):
            sim = _cosine_sim(segment_features[i], segment_features[j])
            if sim > best_sim:
                best_sim = sim
                best_j = j

        if best_sim > 0.85 and best_j in assigned:
            labels.append(f"Section {assigned[best_j]}")
        else:
            letter = letters[letter_idx % len(letters)]
            letter_idx += 1
            assigned[i] = letter
            labels.append(f"Section {letter}")

        if best_j not in assigned and best_j > 0:
            assigned[best_j] = labels[best_j].split()[-1] if " " in labels[best_j] else "A"

    return labels


def _cosine_sim(a, b):
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))