Spaces:
Sleeping
Sleeping
File size: 4,833 Bytes
b72f14c 27b0097 b72f14c 27b0097 b72f14c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | import numpy as np
import librosa
import scipy.signal as sps
def compute_spectral_analysis(y, sr, n_fft=4096):
    """Run an STFT-based spectral quality check (speech QC) on an audio signal.

    Args:
        y: Audio samples, handed straight to ``librosa.stft``.
            NOTE(review): the per-bin statistics below reduce over ``axis=1``,
            which presumes a mono (1-D) signal -- confirm against callers.
        sr: Sample rate in Hz; the frequency axis spans ``0 .. sr / 2``.
        n_fft: FFT size. The hop length is derived as ``n_fft // 4``.

    Returns:
        dict with the following entries:

        * ``S_db`` -- magnitude spectrogram in dB relative to its maximum.
        * ``freqs`` -- frequency (Hz) assigned to each spectrogram row.
        * ``hop_length``, ``n_fft`` -- the STFT parameters that were used.
        * ``rolloff_85pct``, ``rolloff_95pct`` -- frequency at which the
          cumulative per-bin energy reaches 85 % / 95 % of the total.
        * ``highest_freq_minus60db`` -- highest frequency whose envelope is
          above (peak - 60 dB); ``0.0`` when no bin qualifies.
        * ``energy_distribution`` -- percentage of energy per frequency band.
        * ``brick_wall_detected``, ``brick_wall_freq`` -- whether adjacent
          envelope bins drop by more than 20 dB, and the frequency of the
          first such drop (``None`` when there is none).
        * ``spectral_notches`` -- list of ``{"freq", "depth_db"}`` dicts.
        * ``spectral_centroid``, ``spectral_bandwidth``,
          ``spectral_flatness``, ``spectral_rolloff`` -- frame-averaged
          librosa descriptors.
        * ``hf_env`` -- 90th-percentile dB envelope over all bins.
        * ``lf_env`` -- the first 200 bins of that envelope (or all of it
          when it has 200 bins or fewer).
    """
    hop_length = n_fft // 4

    # --- STFT magnitude and dB spectrogram --------------------------------
    stft_matrix = librosa.stft(
        y, n_fft=n_fft, hop_length=hop_length, window="hann"
    )
    S = np.abs(stft_matrix)
    S_db = librosa.amplitude_to_db(S, ref=np.max)
    freqs = np.linspace(0, sr / 2, S.shape[0])
    last_bin = len(freqs) - 1

    # --- Per-bin energy: 90th percentile of power across frames -----------
    # The tiny offset keeps the total strictly positive so the percentage
    # computations below never divide by zero.
    energy = np.percentile(S ** 2, 90, axis=1) + 1e-20
    total_energy = float(np.sum(energy))
    cum_energy = np.cumsum(energy)

    def rolloff_freq(fraction):
        # First bin whose cumulative energy reaches the requested share,
        # clamped to the last valid bin.
        idx = np.searchsorted(cum_energy, fraction * total_energy)
        return float(freqs[min(idx, last_bin)])

    freq_at_85 = rolloff_freq(0.85)
    freq_at_95 = rolloff_freq(0.95)

    # --- dB envelope (90th percentile per bin) and highest active bin -----
    envelope_db = np.percentile(S_db, 90, axis=1)
    threshold_db = float(np.max(S_db)) - 60
    active_bins = np.nonzero(envelope_db > threshold_db)[0]
    if active_bins.size:
        highest_freq = float(freqs[active_bins[-1]])
    else:
        highest_freq = 0.0

    # --- Band energy distribution -----------------------------------------
    def band_share(low, high=None):
        # Percentage of total energy in [low, high); open-ended when
        # ``high`` is None.
        start = np.searchsorted(freqs, low)
        stop = None if high is None else np.searchsorted(freqs, high)
        return float(100 * np.sum(energy[start:stop]) / total_energy)

    energy_stats = {
        "below_100hz": band_share(0, 100),
        "100_500hz": band_share(100, 500),
        "500_2khz": band_share(500, 2000),
        "2k_8khz": band_share(2000, 8000),
        "8k_12khz": band_share(8000, 12000),
        "12k_16khz": band_share(12000, 16000),
        "above_16khz": band_share(16000),
    }

    # --- Brick-wall detection: >20 dB fall between neighbouring bins ------
    steep_drops = np.nonzero(np.diff(envelope_db) < -20)[0]
    brick_wall = bool(steep_drops.size)
    brick_freq = None
    if steep_drops.size:
        brick_freq = float(freqs[steep_drops[0]])

    # --- Notch detection on the median-smoothed envelope ------------------
    smooth = sps.medfilt(envelope_db, kernel_size=9)
    notches = []
    for idx in sps.argrelextrema(smooth, np.less)[0]:
        below = smooth[max(0, idx - 6):idx]
        above = smooth[idx + 1:idx + 7]
        below_peak = below.max() if below.size else -999
        above_peak = above.max() if above.size else -999
        depth = max(below_peak, above_peak) - smooth[idx]
        # Only keep minima at least 15 dB deep that sit above 100 Hz.
        if not (depth >= 15 and freqs[idx] > 100):
            continue
        notches.append({
            "freq": float(freqs[idx]),
            "depth_db": float(depth),
        })

    # --- Frame-averaged librosa descriptors -------------------------------
    def frame_mean(feature):
        return float(np.mean(feature))

    centroid = frame_mean(librosa.feature.spectral_centroid(S=S, sr=sr))
    bandwidth = frame_mean(librosa.feature.spectral_bandwidth(S=S, sr=sr))
    flatness = frame_mean(librosa.feature.spectral_flatness(S=S))
    rolloff = frame_mean(librosa.feature.spectral_rolloff(S=S, sr=sr))

    # Low-frequency view of the envelope: the first 200 bins, or the
    # envelope itself when it is not longer than that.
    lf_env = envelope_db
    if len(envelope_db) > 200:
        lf_env = envelope_db[:200]

    return {
        "S_db": S_db,
        "freqs": freqs,
        "hop_length": hop_length,
        "n_fft": n_fft,
        "rolloff_85pct": freq_at_85,
        "rolloff_95pct": freq_at_95,
        "highest_freq_minus60db": highest_freq,
        "energy_distribution": energy_stats,
        "brick_wall_detected": brick_wall,
        "brick_wall_freq": brick_freq,
        "spectral_notches": notches,
        "spectral_centroid": centroid,
        "spectral_bandwidth": bandwidth,
        "spectral_flatness": flatness,
        "spectral_rolloff": rolloff,
        "hf_env": envelope_db,
        "lf_env": lf_env,
    }
|