Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,538 +1,178 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import
|
| 3 |
-
import
|
| 4 |
-
|
| 5 |
-
from
|
| 6 |
-
from sklearn.
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
#
|
| 40 |
-
if
|
| 41 |
-
|
| 42 |
-
else
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
for j in range(n_mfcc):
|
| 88 |
-
feat[f"mfcc_{j+1}"] = float(np.mean(mfccs[j]))
|
| 89 |
-
except:
|
| 90 |
-
for j in range(n_mfcc):
|
| 91 |
-
feat[f"mfcc_{j+1}"] = 0.0
|
| 92 |
-
|
| 93 |
-
# Spectral features for quality assessment
|
| 94 |
-
try:
|
| 95 |
-
S = np.abs(librosa.stft(frame, n_fft=n_fft))
|
| 96 |
-
S_db = librosa.amplitude_to_db(S, ref=np.max)
|
| 97 |
-
freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
|
| 98 |
-
|
| 99 |
-
# Frequency bands for quality assessment
|
| 100 |
-
low_mask = freqs <= 500
|
| 101 |
-
mid_mask = (freqs > 500) & (freqs <= 4000) # Speech range
|
| 102 |
-
high_mask = freqs > 4000
|
| 103 |
-
|
| 104 |
-
feat["low_freq_energy"] = float(np.mean(S_db[low_mask])) if np.any(low_mask) else -80.0
|
| 105 |
-
feat["mid_freq_energy"] = float(np.mean(S_db[mid_mask])) if np.any(mid_mask) else -80.0
|
| 106 |
-
feat["high_freq_energy"] = float(np.mean(S_db[high_mask])) if np.any(high_mask) else -80.0
|
| 107 |
-
|
| 108 |
-
# Spectral rolloff (85%)
|
| 109 |
-
rolloff = np.mean(librosa.feature.spectral_rolloff(y=frame, sr=sr, roll_percent=0.85)[0])
|
| 110 |
-
feat["spectral_rolloff"] = float(rolloff)
|
| 111 |
-
|
| 112 |
-
# Spectral bandwidth
|
| 113 |
-
bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=frame, sr=sr)[0])
|
| 114 |
-
feat["spectral_bandwidth"] = float(bandwidth)
|
| 115 |
-
|
| 116 |
-
# Spectral flatness (noisiness)
|
| 117 |
-
flatness = np.mean(librosa.feature.spectral_flatness(y=frame)[0])
|
| 118 |
-
feat["spectral_flatness"] = float(flatness)
|
| 119 |
-
|
| 120 |
-
feat["spectrum"] = S_db
|
| 121 |
-
except:
|
| 122 |
-
feat["low_freq_energy"] = -80.0
|
| 123 |
-
feat["mid_freq_energy"] = -80.0
|
| 124 |
-
feat["high_freq_energy"] = -80.0
|
| 125 |
-
feat["spectral_rolloff"] = 0.0
|
| 126 |
-
feat["spectral_bandwidth"] = 0.0
|
| 127 |
-
feat["spectral_flatness"] = 0.0
|
| 128 |
-
feat["spectrum"] = np.zeros((n_fft // 2 + 1, 1))
|
| 129 |
-
|
| 130 |
-
features.append(feat)
|
| 131 |
-
|
| 132 |
-
if not features:
|
| 133 |
-
feat = {
|
| 134 |
-
"rms": 0.0, "spectral_centroid": 0.0, "zcr": 0.0,
|
| 135 |
-
"low_freq_energy": -80.0, "mid_freq_energy": -80.0, "high_freq_energy": -80.0,
|
| 136 |
-
"spectral_rolloff": 0.0, "spectral_bandwidth": 0.0, "spectral_flatness": 0.0,
|
| 137 |
-
"spectrum": np.zeros((n_fft // 2 + 1, 1))
|
| 138 |
-
}
|
| 139 |
-
for j in range(n_mfcc):
|
| 140 |
-
feat[f"mfcc_{j+1}"] = 0.0
|
| 141 |
-
features.append(feat)
|
| 142 |
-
|
| 143 |
-
return features
|
| 144 |
-
|
| 145 |
-
# ----------------------------
|
| 146 |
-
# Frame-wise Quality Metrics (0-1 scale)
|
| 147 |
-
# ----------------------------
|
| 148 |
-
|
| 149 |
-
def calculate_frame_quality_metrics(near_feats, far_feats):
|
| 150 |
-
"""Calculate multiple quality metrics between 0 and 1 for each frame"""
|
| 151 |
-
min_len = min(len(near_feats), len(far_feats))
|
| 152 |
-
if min_len == 0:
|
| 153 |
-
return pd.DataFrame({"frame_index": []})
|
| 154 |
-
|
| 155 |
-
results = {"frame_index": list(range(min_len))}
|
| 156 |
-
|
| 157 |
-
# Prepare feature vectors (excluding spectrum)
|
| 158 |
-
near_df = pd.DataFrame([f for f in near_feats[:min_len]])
|
| 159 |
-
far_df = pd.DataFrame([f for f in far_feats[:min_len]])
|
| 160 |
-
feature_cols = [col for col in near_df.columns if col != "spectrum"]
|
| 161 |
-
near_vec = near_df[feature_cols].values
|
| 162 |
-
far_vec = far_df[feature_cols].values
|
| 163 |
-
|
| 164 |
-
# 1. Spectral Similarity Score (0-1)
|
| 165 |
-
spectral_scores = []
|
| 166 |
-
for i in range(min_len):
|
| 167 |
-
try:
|
| 168 |
-
# Compare spectral distributions using cosine similarity
|
| 169 |
-
near_spectral = np.array([near_feats[i]["low_freq_energy"],
|
| 170 |
-
near_feats[i]["mid_freq_energy"],
|
| 171 |
-
near_feats[i]["high_freq_energy"]])
|
| 172 |
-
far_spectral = np.array([far_feats[i]["low_freq_energy"],
|
| 173 |
-
far_feats[i]["mid_freq_energy"],
|
| 174 |
-
far_feats[i]["high_freq_energy"]])
|
| 175 |
-
|
| 176 |
-
# Convert to positive values and normalize
|
| 177 |
-
near_spectral = near_spectral - near_spectral.min() + 1e-8
|
| 178 |
-
far_spectral = far_spectral - far_spectral.min() + 1e-8
|
| 179 |
-
near_spectral = near_spectral / near_spectral.sum()
|
| 180 |
-
far_spectral = far_spectral / far_spectral.sum()
|
| 181 |
-
|
| 182 |
-
# Use cosine similarity on spectral distribution
|
| 183 |
-
spec_sim = cosine_similarity([near_spectral], [far_spectral])[0][0]
|
| 184 |
-
spectral_scores.append(max(0, min(1, spec_sim)))
|
| 185 |
-
except:
|
| 186 |
-
spectral_scores.append(0.5)
|
| 187 |
-
results["spectral_similarity"] = spectral_scores
|
| 188 |
-
|
| 189 |
-
# 2. High-Frequency Preservation Score (0-1)
|
| 190 |
-
hf_scores = []
|
| 191 |
-
for i in range(min_len):
|
| 192 |
-
try:
|
| 193 |
-
near_hf = near_feats[i]["high_freq_energy"]
|
| 194 |
-
far_hf = far_feats[i]["high_freq_energy"]
|
| 195 |
-
|
| 196 |
-
# Normalize HF energy difference (assuming -80dB to 0dB range)
|
| 197 |
-
hf_diff = near_hf - far_hf
|
| 198 |
-
# Convert to 0-1 scale: 0dB difference = 1.0, 40dB loss = 0.0
|
| 199 |
-
hf_score = max(0, min(1, 1.0 - (max(0, hf_diff) / 40.0)))
|
| 200 |
-
hf_scores.append(hf_score)
|
| 201 |
-
except:
|
| 202 |
-
hf_scores.append(0.5)
|
| 203 |
-
results["high_freq_preservation"] = hf_scores
|
| 204 |
-
|
| 205 |
-
# 3. MFCC Structural Similarity (0-1)
|
| 206 |
-
mfcc_scores = []
|
| 207 |
-
for i in range(min_len):
|
| 208 |
try:
|
| 209 |
-
|
| 210 |
-
near_mfcc = np.array([near_feats[i][f"mfcc_{j+1}"] for j in range(13)])
|
| 211 |
-
far_mfcc = np.array([far_feats[i][f"mfcc_{j+1}"] for j in range(13)])
|
| 212 |
-
|
| 213 |
-
# Normalize and compute cosine similarity
|
| 214 |
-
near_mfcc_norm = (near_mfcc - near_mfcc.mean()) / (near_mfcc.std() + 1e-8)
|
| 215 |
-
far_mfcc_norm = (far_mfcc - far_mfcc.mean()) / (far_mfcc.std() + 1e-8)
|
| 216 |
-
|
| 217 |
-
mfcc_sim = cosine_similarity([near_mfcc_norm], [far_mfcc_norm])[0][0]
|
| 218 |
-
mfcc_scores.append(max(0, min(1, (mfcc_sim + 1) / 2))) # Convert -1:1 to 0:1
|
| 219 |
except:
|
| 220 |
-
|
| 221 |
-
results["mfcc_similarity"] = mfcc_scores
|
| 222 |
-
|
| 223 |
-
# 4. Temporal Consistency Score (RMS stability)
|
| 224 |
-
temporal_scores = []
|
| 225 |
-
for i in range(min_len):
|
| 226 |
try:
|
| 227 |
-
|
| 228 |
-
far_rms = far_feats[i]["rms"]
|
| 229 |
-
|
| 230 |
-
# Ratio of RMS energies (closer to 1 is better)
|
| 231 |
-
rms_ratio = min(near_rms, far_rms) / (max(near_rms, far_rms) + 1e-8)
|
| 232 |
-
temporal_scores.append(float(rms_ratio))
|
| 233 |
except:
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
# 5. Spectral Centroid Stability (0-1)
|
| 238 |
-
centroid_scores = []
|
| 239 |
-
for i in range(min_len):
|
| 240 |
try:
|
| 241 |
-
|
| 242 |
-
far_sc = far_feats[i]["spectral_centroid"]
|
| 243 |
-
|
| 244 |
-
# Ratio of spectral centroids
|
| 245 |
-
sc_ratio = min(near_sc, far_sc) / (max(near_sc, far_sc) + 1e-8)
|
| 246 |
-
centroid_scores.append(float(sc_ratio))
|
| 247 |
except:
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
weights = {
|
| 256 |
-
'
|
| 257 |
-
'high_freq_preservation': 0.30, # HF content preservation (most important)
|
| 258 |
-
'mfcc_similarity': 0.20, # Structural similarity
|
| 259 |
-
'temporal_consistency': 0.15, # Amplitude consistency
|
| 260 |
-
'spectral_centroid_stability': 0.10 # Spectral shape stability
|
| 261 |
}
|
| 262 |
-
|
| 263 |
-
total_score = 0
|
| 264 |
-
for metric, weight in weights.items():
|
| 265 |
-
total_score += results[metric][i] * weight
|
| 266 |
-
|
| 267 |
-
quality_scores.append(max(0, min(1, total_score)))
|
| 268 |
-
|
| 269 |
-
results["overall_quality"] = quality_scores
|
| 270 |
-
|
| 271 |
-
# 7. Quality Degradation Level
|
| 272 |
-
degradation_levels = []
|
| 273 |
-
for score in quality_scores:
|
| 274 |
-
if score >= 0.8:
|
| 275 |
-
degradation_levels.append("Excellent")
|
| 276 |
-
elif score >= 0.6:
|
| 277 |
-
degradation_levels.append("Good")
|
| 278 |
-
elif score >= 0.4:
|
| 279 |
-
degradation_levels.append("Moderate")
|
| 280 |
-
elif score >= 0.2:
|
| 281 |
-
degradation_levels.append("Poor")
|
| 282 |
-
else:
|
| 283 |
-
degradation_levels.append("Very Poor")
|
| 284 |
-
|
| 285 |
-
results["degradation_level"] = degradation_levels
|
| 286 |
-
|
| 287 |
-
return pd.DataFrame(results)
|
| 288 |
-
|
| 289 |
-
# ----------------------------
|
| 290 |
-
# Clustering and Visualization
|
| 291 |
-
# ----------------------------
|
| 292 |
-
|
| 293 |
-
def cluster_frames_custom(features_df, cluster_features, algo, n_clusters=5, eps=0.5):
|
| 294 |
-
if not cluster_features:
|
| 295 |
-
raise gr.Error("Please select at least one feature for clustering.")
|
| 296 |
-
|
| 297 |
-
if len(features_df) == 0:
|
| 298 |
-
features_df["cluster"] = []
|
| 299 |
-
return features_df
|
| 300 |
-
|
| 301 |
-
X = features_df[cluster_features].values
|
| 302 |
-
|
| 303 |
-
if algo == "KMeans":
|
| 304 |
-
n_clusters = min(n_clusters, len(X))
|
| 305 |
-
model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
|
| 306 |
-
labels = model.fit_predict(X)
|
| 307 |
-
elif algo == "Agglomerative":
|
| 308 |
-
n_clusters = min(n_clusters, len(X))
|
| 309 |
-
model = AgglomerativeClustering(n_clusters=n_clusters)
|
| 310 |
-
labels = model.fit_predict(X)
|
| 311 |
-
elif algo == "DBSCAN":
|
| 312 |
-
model = DBSCAN(eps=eps, min_samples=min(3, len(X)))
|
| 313 |
-
labels = model.fit_predict(X)
|
| 314 |
else:
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
)
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
def analyze_audio_pair(
|
| 356 |
-
near_file,
|
| 357 |
-
far_file,
|
| 358 |
-
frame_length_ms,
|
| 359 |
-
hop_length_ms,
|
| 360 |
-
window_type,
|
| 361 |
-
cluster_features,
|
| 362 |
-
clustering_algo,
|
| 363 |
-
n_clusters,
|
| 364 |
-
dbscan_eps
|
| 365 |
-
):
|
| 366 |
-
if not near_file or not far_file:
|
| 367 |
-
raise gr.Error("Upload both audio files.")
|
| 368 |
-
|
| 369 |
-
try:
|
| 370 |
-
y_near, sr_near = librosa.load(near_file.name, sr=None)
|
| 371 |
-
y_far, sr_far = librosa.load(far_file.name, sr=None)
|
| 372 |
-
except Exception as e:
|
| 373 |
-
raise gr.Error(f"Error loading audio files: {str(e)}")
|
| 374 |
-
|
| 375 |
-
if sr_near != sr_far:
|
| 376 |
-
y_far = librosa.resample(y_far, orig_sr=sr_far, target_sr=sr_near)
|
| 377 |
-
sr = sr_near
|
| 378 |
-
else:
|
| 379 |
-
sr = sr_near
|
| 380 |
-
|
| 381 |
-
frames_near, frame_length = segment_audio(y_near, sr, frame_length_ms, hop_length_ms, window_type)
|
| 382 |
-
frames_far, _ = segment_audio(y_far, sr, frame_length_ms, hop_length_ms, window_type)
|
| 383 |
-
|
| 384 |
-
near_feats = extract_features_with_spectrum(frames_near, sr)
|
| 385 |
-
far_feats = extract_features_with_spectrum(frames_far, sr)
|
| 386 |
-
|
| 387 |
-
# Calculate frame-wise quality metrics
|
| 388 |
-
comparison_df = calculate_frame_quality_metrics(near_feats, far_feats)
|
| 389 |
-
|
| 390 |
-
# Clustering (on near-field)
|
| 391 |
-
near_df = pd.DataFrame(near_feats)
|
| 392 |
-
near_df = near_df.drop(columns=["spectrum"], errors="ignore")
|
| 393 |
-
clustered_df = cluster_frames_custom(near_df, cluster_features, clustering_algo, n_clusters, dbscan_eps)
|
| 394 |
-
|
| 395 |
-
# Plots
|
| 396 |
-
plot_comparison = None
|
| 397 |
-
if len(comparison_df) > 0:
|
| 398 |
-
plot_comparison = px.line(
|
| 399 |
-
comparison_df,
|
| 400 |
-
x="frame_index",
|
| 401 |
-
y="overall_quality",
|
| 402 |
-
title="Overall Audio Quality Score Over Time (0-1 scale)",
|
| 403 |
-
labels={"overall_quality": "Quality Score", "frame_index": "Frame Index"}
|
| 404 |
-
)
|
| 405 |
-
plot_comparison.update_yaxes(range=[0, 1])
|
| 406 |
-
else:
|
| 407 |
-
plot_comparison = px.line(title="No comparison data available")
|
| 408 |
-
|
| 409 |
-
# Quality distribution plot
|
| 410 |
-
quality_dist_plot = None
|
| 411 |
-
if len(comparison_df) > 0:
|
| 412 |
-
quality_dist_plot = px.histogram(
|
| 413 |
-
comparison_df,
|
| 414 |
-
x="overall_quality",
|
| 415 |
-
title="Distribution of Audio Quality Scores",
|
| 416 |
-
nbins=20,
|
| 417 |
-
labels={"overall_quality": "Quality Score"}
|
| 418 |
-
)
|
| 419 |
-
quality_dist_plot.update_xaxes(range=[0, 1])
|
| 420 |
-
else:
|
| 421 |
-
quality_dist_plot = px.histogram(title="No quality data available")
|
| 422 |
-
|
| 423 |
-
# Scatter plot
|
| 424 |
-
plot_scatter = None
|
| 425 |
-
if len(cluster_features) >= 2 and len(clustered_df) > 0:
|
| 426 |
-
x_feat, y_feat = cluster_features[0], cluster_features[1]
|
| 427 |
-
if x_feat in clustered_df.columns and y_feat in clustered_df.columns:
|
| 428 |
-
plot_scatter = px.scatter(
|
| 429 |
-
clustered_df,
|
| 430 |
-
x=x_feat,
|
| 431 |
-
y=y_feat,
|
| 432 |
-
color="cluster",
|
| 433 |
-
title=f"Clustering: {x_feat} vs {y_feat}",
|
| 434 |
-
hover_data=["cluster"]
|
| 435 |
-
)
|
| 436 |
-
else:
|
| 437 |
-
plot_scatter = px.scatter(title="Selected features not available in data")
|
| 438 |
-
else:
|
| 439 |
-
plot_scatter = px.scatter(title="Select ≥2 features for scatter plot")
|
| 440 |
-
|
| 441 |
-
# Spectral difference heatmap
|
| 442 |
-
spec_heatmap = plot_spectral_difference(near_feats, far_feats, frame_idx=0)
|
| 443 |
-
|
| 444 |
-
return (
|
| 445 |
-
plot_comparison,
|
| 446 |
-
quality_dist_plot,
|
| 447 |
-
comparison_df,
|
| 448 |
-
plot_scatter,
|
| 449 |
-
clustered_df,
|
| 450 |
-
spec_heatmap
|
| 451 |
-
)
|
| 452 |
-
|
| 453 |
-
def export_results(comparison_df, clustered_df):
|
| 454 |
-
temp_dir = tempfile.mkdtemp()
|
| 455 |
-
comp_path = os.path.join(temp_dir, "frame_quality_scores.csv")
|
| 456 |
-
cluster_path = os.path.join(temp_dir, "clustered_frames.csv")
|
| 457 |
-
comparison_df.to_csv(comp_path, index=False)
|
| 458 |
-
clustered_df.to_csv(cluster_path, index=False)
|
| 459 |
-
return [comp_path, cluster_path]
|
| 460 |
-
|
| 461 |
-
# ----------------------------
|
| 462 |
-
# Gradio UI
|
| 463 |
-
# ----------------------------
|
| 464 |
-
|
| 465 |
-
dummy_features = ["rms", "spectral_centroid", "zcr", "spectral_rolloff",
|
| 466 |
-
"spectral_bandwidth", "spectral_flatness"] + \
|
| 467 |
-
[f"mfcc_{i}" for i in range(1,14)] + \
|
| 468 |
-
["low_freq_energy", "mid_freq_energy", "high_freq_energy"]
|
| 469 |
-
|
| 470 |
-
with gr.Blocks(title="Audio Quality Analyzer") as demo:
|
| 471 |
-
gr.Markdown("# 🎙️ Near vs Far Field Audio Quality Analyzer")
|
| 472 |
-
gr.Markdown("**Quantify audio degradation per frame (0-1 scale)** - Compare near-field vs far-field recording quality")
|
| 473 |
-
|
| 474 |
-
with gr.Row():
|
| 475 |
-
near_file = gr.File(label="Near-Field Audio (.wav)", file_types=[".wav"])
|
| 476 |
-
far_file = gr.File(label="Far-Field Audio (.wav)", file_types=[".wav"])
|
| 477 |
-
|
| 478 |
-
with gr.Accordion("⚙️ Frame Settings", open=True):
|
| 479 |
-
frame_length_ms = gr.Slider(10, 500, value=50, step=1, label="Frame Length (ms)")
|
| 480 |
-
hop_length_ms = gr.Slider(1, 250, value=25, step=1, label="Hop Length (ms)")
|
| 481 |
-
window_type = gr.Dropdown(["hann", "hamming", "rectangular"], value="hann", label="Window Type")
|
| 482 |
-
|
| 483 |
-
with gr.Accordion("🧩 Clustering Configuration", open=False):
|
| 484 |
-
cluster_features = gr.CheckboxGroup(
|
| 485 |
-
choices=dummy_features,
|
| 486 |
-
value=["rms", "spectral_centroid", "high_freq_energy"],
|
| 487 |
-
label="Features to Use for Clustering"
|
| 488 |
-
)
|
| 489 |
-
clustering_algo = gr.Radio(
|
| 490 |
-
["KMeans", "Agglomerative", "DBSCAN"],
|
| 491 |
-
value="KMeans",
|
| 492 |
-
label="Clustering Algorithm"
|
| 493 |
-
)
|
| 494 |
-
n_clusters = gr.Slider(2, 20, value=5, step=1, label="Number of Clusters (for KMeans/Agglomerative)")
|
| 495 |
-
dbscan_eps = gr.Slider(0.1, 2.0, value=0.5, step=0.1, label="DBSCAN eps (neighborhood radius)")
|
| 496 |
-
|
| 497 |
-
btn = gr.Button("🚀 Analyze Audio Quality")
|
| 498 |
-
|
| 499 |
-
with gr.Tabs():
|
| 500 |
-
with gr.Tab("📊 Quality Analysis"):
|
| 501 |
-
with gr.Row():
|
| 502 |
-
comp_plot = gr.Plot(label="Quality Over Time")
|
| 503 |
-
quality_dist_plot = gr.Plot(label="Quality Distribution")
|
| 504 |
-
comp_table = gr.Dataframe(label="Frame-wise Quality Scores")
|
| 505 |
-
|
| 506 |
-
with gr.Tab("🧩 Clustering"):
|
| 507 |
-
cluster_plot = gr.Plot()
|
| 508 |
-
cluster_table = gr.Dataframe()
|
| 509 |
-
|
| 510 |
-
with gr.Tab("🔍 Spectral Analysis"):
|
| 511 |
-
spec_heatmap = gr.Plot(label="Spectral Difference (Near - Far)")
|
| 512 |
-
|
| 513 |
-
with gr.Tab("📤 Export"):
|
| 514 |
-
gr.Markdown("### Download Analysis Results")
|
| 515 |
-
export_btn = gr.Button("💾 Download CSV Files")
|
| 516 |
-
export_files = gr.Files()
|
| 517 |
-
|
| 518 |
-
btn.click(
|
| 519 |
-
fn=analyze_audio_pair,
|
| 520 |
-
inputs=[
|
| 521 |
-
near_file, far_file,
|
| 522 |
-
frame_length_ms, hop_length_ms, window_type,
|
| 523 |
-
cluster_features,
|
| 524 |
-
clustering_algo,
|
| 525 |
-
n_clusters,
|
| 526 |
-
dbscan_eps
|
| 527 |
-
],
|
| 528 |
-
outputs=[comp_plot, quality_dist_plot, comp_table, cluster_plot, cluster_table, spec_heatmap]
|
| 529 |
-
)
|
| 530 |
-
|
| 531 |
-
export_btn.click(
|
| 532 |
-
fn=export_results,
|
| 533 |
-
inputs=[comp_table, cluster_table],
|
| 534 |
-
outputs=export_files
|
| 535 |
-
)
|
| 536 |
-
|
| 537 |
if __name__ == "__main__":
|
| 538 |
-
|
|
|
|
| 1 |
+
# app.py
|
| 2 |
import gradio as gr
|
| 3 |
+
import numpy as np, soundfile as sf
|
| 4 |
+
import librosa, scipy
|
| 5 |
+
from pesq import pesq
|
| 6 |
+
from pystoi import stoi
|
| 7 |
+
from sklearn.ensemble import RandomForestRegressor
|
| 8 |
+
import pyroomacoustics as pra
|
| 9 |
+
|
| 10 |
+
# ------------- utility fns -------------
|
| 11 |
+
def load_audio(path, sr=16000):
    """Read an audio file, downmix to mono, resample to *sr*, remove DC
    offset and peak-normalize.

    Returns a ``(samples, sr)`` tuple; samples are in [-1, 1] unless silent.
    """
    samples, native_sr = sf.read(path)
    # Downmix multichannel audio by averaging the channels.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    if native_sr != sr:
        samples = librosa.resample(samples, orig_sr=native_sr, target_sr=sr)
    # Remove DC offset, then normalize to unit peak (skip if silent).
    samples = samples - samples.mean()
    peak = np.max(np.abs(samples))
    if peak > 0:
        samples = samples / peak
    return samples, sr
|
| 21 |
+
|
| 22 |
+
def frame_audio(y, sr, win_ms=25, hop_ms=10):
    """Slice *y* into overlapping frames.

    Returns ``(frames, win_samples, hop_samples)`` where ``frames`` has one
    frame per row.
    """
    win_samples = int(win_ms * sr / 1000)
    hop_samples = int(hop_ms * sr / 1000)
    framed = librosa.util.frame(y, frame_length=win_samples, hop_length=hop_samples).T
    return framed, win_samples, hop_samples
|
| 27 |
+
|
| 28 |
+
def hf_energy_db(frame, sr, low=4000):
    """Mean STFT magnitude of *frame* above *low* Hz, in dB.

    Returns -120.0 dB when no FFT bin lies at or above *low*.
    """
    spectrum = np.abs(librosa.stft(frame, n_fft=1024, win_length=len(frame), center=False))
    bin_freqs = librosa.fft_frequencies(sr=sr, n_fft=1024)
    hi_bins = bin_freqs >= low
    if not np.any(hi_bins):
        return -120.0
    # Floor the mean magnitude at 1e-12 so log10 never sees zero.
    mean_mag = np.maximum(1e-12, np.mean(spectrum[hi_bins]))
    return float(20 * np.log10(mean_mag))
|
| 36 |
+
|
| 37 |
+
def frame_features(near_frame, far_frame, sr):
    """Extract per-frame comparison features for an optional near-field frame
    against a far-field frame.

    Args:
        near_frame: near-field samples for this frame, or None when no
            reference recording is available.
        far_frame: far-field samples for this frame (required).
        sr: sample rate in Hz.

    Returns:
        dict with mean-square energy, spectral centroid and high-frequency
        energy for both channels, plus the mean magnitude-squared coherence
        between them. Near-side values fall back to 0.0 / -120.0 dB when
        near_frame is None.
    """
    feats = {}
    # Mean-square energy (RMS proxy); neutral 0.0 when there is no reference.
    feats['rms_near'] = float(np.mean(near_frame**2)) if near_frame is not None else 0.0
    feats['rms_far'] = float(np.mean(far_frame**2))
    feats['centroid_near'] = float(np.mean(librosa.feature.spectral_centroid(y=near_frame, sr=sr))) if near_frame is not None else 0.0
    feats['centroid_far'] = float(np.mean(librosa.feature.spectral_centroid(y=far_frame, sr=sr)))
    feats['hi_near_db'] = hf_energy_db(near_frame, sr, low=4000) if near_frame is not None else -120.0
    feats['hi_far_db'] = hf_energy_db(far_frame, sr, low=4000)
    # Magnitude-squared coherence between the two channels. Only meaningful
    # with a near reference; previously the None case was handled by letting
    # scipy raise into a bare `except:` — now it is an explicit branch.
    feats['coherence_mean'] = 0.0
    if near_frame is not None:
        try:
            # Import the submodule explicitly: a plain `import scipy` does not
            # guarantee `scipy.signal` is available on older scipy versions.
            from scipy import signal as _signal
            _, Cxy = _signal.coherence(near_frame, far_frame, fs=sr,
                                       nperseg=min(len(near_frame), 256))
            feats['coherence_mean'] = float(np.mean(Cxy))
        except Exception:
            feats['coherence_mean'] = 0.0
    return feats
|
| 54 |
+
|
| 55 |
+
# quick DRR proxy using energy early vs late (simple heuristic)
|
| 56 |
+
def estimate_drr_from_pair(near, far, sr, early_ms=50):
    """Heuristic direct-to-reverberant ratio proxy in dB.

    Compares the energy in the first *early_ms* milliseconds of the near
    recording against the same window of the far recording. Returns 0.0 when
    either signal is shorter than the window or the far segment is silent.
    """
    early_n = int(early_ms * sr / 1000)
    # Not enough samples in either signal for the early window.
    if min(len(near), len(far)) < early_n:
        return 0.0
    energy_near = np.sum(near[:early_n] ** 2)
    energy_far = np.sum(far[:early_n] ** 2)
    # A (near-)silent far segment makes the ratio meaningless.
    if energy_far <= 1e-12:
        return 0.0
    ratio = (energy_near + 1e-12) / (energy_far + 1e-12)
    return float(10 * np.log10(ratio))
|
| 69 |
+
|
| 70 |
+
def normalize_metric(val, vmin, vmax):
    """Linearly map *val* from [vmin, vmax] onto [0, 1], clipping out-of-range
    values.

    Guards the degenerate case ``vmax <= vmin`` (previously a ZeroDivisionError
    / inf): values below the collapsed range map to 0.0, values at or above it
    to 1.0.
    """
    if vmax <= vmin:
        return 0.0 if val < vmax else 1.0
    return float(np.clip((val - vmin) / (vmax - vmin), 0, 1))
|
| 72 |
+
|
| 73 |
+
# ------------- scoring pipeline -------------
|
| 74 |
+
def score_pair(near_path, far_path):
    """Score far-field degradation against an optional near-field reference.

    With a near-field file, intrusive metrics (PESQ, STOI, an SI-SDR proxy)
    are combined with DRR / high-frequency / coherence heuristics; without
    one, a non-intrusive fallback uses frame heuristics only.

    Returns:
        dict with the overall percentage score, raw metric values and the
        indices of low-quality frames.
    """
    sr = 16000
    far, _ = load_audio(far_path, sr=sr)
    near = None
    if near_path:
        near, _ = load_audio(near_path, sr=sr)

    # ---- global intrusive metrics (only when a reference exists) ----
    pesq_score = None
    stoi_score = None
    sisdr = None
    drr = None  # initialized up front (was looked up via `'drr' in locals()`)
    if near is not None:
        L = min(len(near), len(far))  # align lengths for the intrusive metrics
        try:
            pesq_score = pesq(sr, near[:L], far[:L], 'wb')
        except Exception:
            # pesq is strict about sample rate / duration; treat failure as "unknown"
            pesq_score = None
        try:
            stoi_score = stoi(near[:L], far[:L], sr, extended=False)
        except Exception:
            stoi_score = None
        try:
            # quick SI-SDR via pyroomacoustics metrics
            sisdr = float(pra.metrics.sdr(near[:L], far[:L])[0])
        except Exception:
            sisdr = None

    # ---- frame-level features ----
    frames_far, win, hop = frame_audio(far, sr)
    frames_near = None
    if near is not None:
        frames_near, _, _ = frame_audio(near[:len(far)], sr)

    feats = []
    for i in range(len(frames_far)):
        nf = frames_near[i] if frames_near is not None and i < len(frames_near) else None
        feats.append(frame_features(nf, frames_far[i], sr))

    # ---- normalize metrics to [0, 1] ----
    # Assumed ranges (tune for your data): PESQ [1, 4.5], STOI [0, 1],
    # DRR [-20, 20] dB, high-frequency loss [-40, 0] dB.
    q_pesq = normalize_metric(pesq_score if pesq_score is not None else 2.5, 1.0, 4.5)
    q_stoi = normalize_metric(stoi_score if stoi_score is not None else 0.5, 0.0, 1.0)
    q_drr = 0.5  # neutral prior when no reference is available
    if near is not None:
        drr = estimate_drr_from_pair(near, far, sr)
        q_drr = normalize_metric(drr, -20, 20)
    # NOTE(review): without a near reference, hi_near_db is the -120 dB floor,
    # so hi_loss and q_hf are not meaningful in the non-intrusive path —
    # confirm whether they should be excluded from that branch.
    hi_loss = np.mean([f['hi_near_db'] - f['hi_far_db'] if 'hi_near_db' in f else 0.0 for f in feats])
    q_hf = normalize_metric(-hi_loss, -40, 0)  # smaller loss -> higher score

    # average coherence across frames
    q_coh = np.mean([f['coherence_mean'] for f in feats])

    # ---- weighted aggregate ----
    if near is not None:
        weights = {'pesq': 0.30, 'stoi': 0.20, 'drr': 0.20, 'hf': 0.10, 'coh': 0.20}
        score = (weights['pesq'] * q_pesq + weights['stoi'] * q_stoi
                 + weights['drr'] * q_drr + weights['hf'] * q_hf
                 + weights['coh'] * q_coh) / sum(weights.values())
    else:
        # non-intrusive fallback: coherence, high-freq content, centroid heuristics
        avg_centroid_far = np.mean([f['centroid_far'] for f in feats])
        q_centroid = normalize_metric(avg_centroid_far, 500, 3500)
        score = (0.4 * q_coh + 0.4 * q_hf + 0.2 * q_centroid)

    percent = float(score * 100)

    # ---- per-frame quality and problem-frame detection ----
    frame_scores = []
    for f in feats:
        # per-frame heuristic: combine coherence & high-frequency loss
        s = 0.6 * f['coherence_mean'] + 0.4 * normalize_metric(-(f['hi_near_db'] - f['hi_far_db']), -40, 0)
        frame_scores.append(float(s))
    problem_frames = [i for i, v in enumerate(frame_scores) if v < 0.5]

    return {
        "score_percent": percent,
        "pesq": pesq_score,
        "stoi": stoi_score,
        "sisdr": sisdr,   # was computed but silently dropped from the result
        "drr_db": drr,    # replaces the fragile `drr if 'drr' in locals()` lookup
        "avg_coherence": q_coh,
        "hi_loss_db": hi_loss,
        "problem_frames": problem_frames,
    }
|
| 161 |
+
|
| 162 |
+
# ------------- Gradio UI -------------
|
| 163 |
+
def analyze(near, far):
    """Run the scoring pipeline on the uploaded files and render an HTML report.

    *near* is optional; *far* is required.
    """
    report = score_pair(near.name if near else None, far.name)
    parts = [f"<h3>Far-field quality: {report['score_percent']:.1f}%</h3>", "<ul>"]
    parts.append(f"<li>PESQ: {report['pesq']}</li>")
    parts.append(f"<li>STOI: {report['stoi']}</li>")
    parts.append(f"<li>DRR (proxy, dB): {report['drr_db']}</li>")
    parts.append(f"<li>Avg coherence: {report['avg_coherence']:.3f}</li>")
    parts.append(f"<li>Avg high-freq loss (dB): {report['hi_loss_db']:.2f}</li>")
    parts.append(f"<li>Problem frames (indices): {report['problem_frames']}</li>")
    parts.append("</ul>")
    return "".join(parts)
|
| 175 |
+
|
| 176 |
+
# Build the Gradio interface. `outputs` must be a component *instance*:
# the original passed the `gr.HTML` class itself, which Gradio rejects.
iface = gr.Interface(
    fn=analyze,
    inputs=[gr.File(label="Near (optional)"), gr.File(label="Far")],
    outputs=gr.HTML(),
    title="Far-field degradation score",
)

if __name__ == "__main__":
    iface.launch()
|