AdityaK007 commited on
Commit
8f7ec5d
·
verified ·
1 Parent(s): dc8f6f5

Create app_works2.py

Browse files
Files changed (1) hide show
  1. app_works2.py +431 -0
app_works2.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ import numpy as np
4
+ import pandas as pd
5
+ from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ from scipy.spatial.distance import jensenshannon
8
+ from scipy import signal
9
+ from scipy.signal import get_window as scipy_get_window
10
+ import plotly.express as px
11
+ import plotly.graph_objects as go
12
+ import os
13
+ import tempfile
14
+
15
# ----------------------------
# 1. Signal Alignment & Preprocessing
# ----------------------------
def _peak_normalize(x):
    """Peak-normalize x into [-1, 1]; all-zero input is returned unchanged."""
    peak = np.max(np.abs(x)) if len(x) else 0.0
    return x / peak if peak > 0 else x


def align_signals(ref, target):
    """
    Align `target` (far-field) to `ref` (near-field) by compensating the
    time-of-arrival delay estimated via cross-correlation.

    Parameters
    ----------
    ref, target : 1-D numpy arrays of audio samples (same sample rate).

    Returns
    -------
    (aligned_ref, aligned_target) : both truncated to a common length so
    that sample i of one corresponds to sample i of the other.
    """
    # Peak-normalize copies so amplitude differences don't skew the
    # correlation peak; the *original* (un-normalized) signals are returned.
    ref_norm = _peak_normalize(ref)
    target_norm = _peak_normalize(target)

    # FFT-based cross-correlation (fast for long audio).
    correlation = signal.correlate(target_norm, ref_norm, mode='full', method='fft')
    lags = signal.correlation_lags(len(target_norm), len(ref_norm), mode='full')

    lag = lags[np.argmax(correlation)]

    print(f"Calculated Lag: {lag} samples")

    if lag > 0:
        # target lags ref by `lag` samples: drop target's leading samples.
        aligned_target = target[lag:]
        aligned_ref = ref
    else:
        # ref lags target (or lag == 0): drop ref's leading samples instead.
        aligned_target = target
        aligned_ref = ref[abs(lag):]

    # Truncate both to the shared overlap length.
    min_len = min(len(aligned_ref), len(aligned_target))
    return aligned_ref[:min_len], aligned_target[:min_len]
51
+
52
# ----------------------------
# 2. Segment Audio into Frames
# ----------------------------
def segment_audio(y, sr, frame_length_ms, hop_length_ms, window_type="hann"):
    """
    Slice `y` into overlapping, windowed frames.

    Parameters
    ----------
    y : 1-D numpy array of samples.
    sr : sample rate in Hz.
    frame_length_ms, hop_length_ms : framing parameters in milliseconds.
    window_type : scipy window name; "rectangular" maps to "boxcar".

    Returns
    -------
    frames : ndarray of shape (frame_length, n_frames); a zeroed single
             column if `y` is empty.
    frame_length : frame size in samples.
    """
    # Guard against degenerate parameters producing 0-length frames/hops.
    frame_length = max(1, int(frame_length_ms * sr / 1000))
    hop_length = max(1, int(hop_length_ms * sr / 1000))
    window = scipy_get_window(
        window_type if window_type != "rectangular" else "boxcar", frame_length
    )

    # Zero-pad the tail so the final partial frame is kept rather than dropped.
    # (The original code computed this padding but then iterated over the
    # unpadded signal, silently discarding the trailing samples.)
    y_padded = np.pad(y, (0, frame_length), mode='constant')

    frames = [
        y_padded[i:i + frame_length] * window
        for i in range(0, len(y), hop_length)
    ]

    if frames:
        frames = np.array(frames).T
    else:
        frames = np.zeros((frame_length, 1))
    return frames, frame_length
73
+
74
# ----------------------------
# 3. Feature Extraction
# ----------------------------
def extract_features_with_spectrum(frames, sr):
    """
    Compute per-frame acoustic features from a (frame_length, n_frames) array.

    Each frame yields a dict with scalar features (rms, zcr, spectral
    centroid, spectral flatness, 13 MFCC means, low/mid/high band energies
    in dB) plus the raw dB spectrogram under the "spectrum" key.

    Silent or too-short frames get all-zero scalars and a zero spectrum so
    no NaNs propagate downstream.
    """
    features = []
    n_mfcc = 13
    n_fft = min(2048, frames.shape[0])

    for i in range(frames.shape[1]):
        frame = frames[:, i]

        # Skip short/effectively-silent frames to prevent NaN from log/divide.
        # NOTE: keys are inserted in the same order as the voiced branch below
        # so the DataFrames built later have identical column order for both
        # signals (the original's differing order could misalign vector math
        # when only one signal's frame was silent).
        if len(frame) < n_fft or np.max(np.abs(frame)) < 1e-10:
            feat = {k: 0.0 for k in ["rms", "zcr", "spectral_centroid", "spectral_flatness"]}
            for j in range(n_mfcc):
                feat[f"mfcc_{j+1}"] = 0.0
            for k in ("low_freq_energy", "mid_freq_energy", "high_freq_energy"):
                feat[k] = 0.0
            feat["spectrum"] = np.zeros((n_fft // 2 + 1, 1))
            features.append(feat)
            continue

        feat = {}
        # Time-domain basics
        feat["rms"] = float(np.mean(librosa.feature.rms(y=frame)[0]))
        feat["zcr"] = float(np.mean(librosa.feature.zero_crossing_rate(frame)[0]))

        # Spectral centroid ("brightness"); narrowed from a bare except so
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            feat["spectral_centroid"] = float(np.mean(librosa.feature.spectral_centroid(y=frame, sr=sr)[0]))
        except Exception:
            feat["spectral_centroid"] = 0.0

        # Spectral flatness — reverb/noise metric (flatter == noisier).
        try:
            feat["spectral_flatness"] = float(np.mean(librosa.feature.spectral_flatness(y=frame)[0]))
        except Exception:
            feat["spectral_flatness"] = 0.0

        # MFCC means
        try:
            mfccs = librosa.feature.mfcc(y=frame, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft)
            for j in range(n_mfcc):
                feat[f"mfcc_{j+1}"] = float(np.mean(mfccs[j]))
        except Exception:
            for j in range(n_mfcc):
                feat[f"mfcc_{j+1}"] = 0.0

        # Band energies (dB, relative to the frame's own peak) + spectrogram.
        try:
            S = np.abs(librosa.stft(frame, n_fft=n_fft))
            S_db = librosa.amplitude_to_db(S, ref=np.max)
            freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

            low_mask = freqs <= 2000
            mid_mask = (freqs > 2000) & (freqs <= 4000)
            high_mask = freqs > 4000

            # -80 dB fallback when a band has no bins at this sample rate.
            feat["low_freq_energy"] = float(np.mean(S_db[low_mask])) if np.any(low_mask) else -80.0
            feat["mid_freq_energy"] = float(np.mean(S_db[mid_mask])) if np.any(mid_mask) else -80.0
            feat["high_freq_energy"] = float(np.mean(S_db[high_mask])) if np.any(high_mask) else -80.0
            feat["spectrum"] = S_db
        except Exception:
            feat["low_freq_energy"] = feat["mid_freq_energy"] = feat["high_freq_energy"] = -80.0
            feat["spectrum"] = np.zeros((n_fft // 2 + 1, 1))

        features.append(feat)

    return features
138
+
139
# ----------------------------
# 4. Frame Comparison Logic
# ----------------------------
def _cosine_sim(a, b):
    """Cosine similarity of two 1-D vectors; 0.0 if either has zero norm."""
    na, nb = np.linalg.norm(a), np.linalg.norm(b)
    if na == 0 or nb == 0:
        return 0.0
    return float(np.dot(a, b) / (na * nb))


def compare_frames_enhanced(near_feats, far_feats, metrics):
    """
    Build a per-frame comparison table between near- and far-field features.

    Parameters
    ----------
    near_feats, far_feats : lists of feature dicts from
        extract_features_with_spectrum (must share the same keys).
    metrics : iterable of metric names to include ("Euclidean Distance",
        "Cosine Similarity", "High-Freq Loss Ratio"). Flatness increase,
        spectral overlap and the combined score are always computed.

    Returns
    -------
    DataFrame with one row per frame (only "frame_index" if input is empty).
    """
    min_len = min(len(near_feats), len(far_feats))
    if min_len == 0:
        return pd.DataFrame({"frame_index": []})

    results = {"frame_index": list(range(min_len))}
    near_df = pd.DataFrame(near_feats[:min_len])
    far_df = pd.DataFrame(far_feats[:min_len])

    # Feature vectors: drop the high-dimensional spectrum column, then align
    # far's column order to near's so element-wise math compares like with like.
    drop_cols = ["spectrum"]
    near_num = near_df.drop(columns=drop_cols, errors="ignore")
    far_num = far_df.drop(columns=drop_cols, errors="ignore")
    far_num = far_num.reindex(columns=near_num.columns, fill_value=0.0)
    near_vec = near_num.values
    far_vec = far_num.values

    # Euclidean distance between full feature vectors
    if "Euclidean Distance" in metrics:
        results["euclidean_dist"] = np.linalg.norm(near_vec - far_vec, axis=1).tolist()

    # Cosine similarity between full feature vectors (plain numpy — no need
    # for sklearn's pairwise machinery on single vector pairs).
    if "Cosine Similarity" in metrics:
        results["cosine_similarity"] = [
            _cosine_sim(near_vec[i], far_vec[i]) for i in range(min_len)
        ]

    # High-frequency loss. Energies are in dB (negative), so the plain
    # difference is the loss: Near (-20 dB) - Far (-30 dB) = 10 dB lost.
    if "High-Freq Loss Ratio" in metrics:
        results["high_freq_loss_db"] = [
            float(near_feats[i]["high_freq_energy"] - far_feats[i]["high_freq_energy"])
            for i in range(min_len)
        ]

    # Spectral flatness increase (positive usually means more noise/reverb
    # in the far-field recording).
    results["flatness_increase"] = [
        far_feats[i]["spectral_flatness"] - near_feats[i]["spectral_flatness"]
        for i in range(min_len)
    ]

    # Spectral overlap: cosine over the flattened dB spectrogram.
    overlap_scores = []
    for i in range(min_len):
        near_spec = np.asarray(near_feats[i]["spectrum"]).flatten()
        far_spec = np.asarray(far_feats[i]["spectrum"]).flatten()
        overlap_scores.append(_cosine_sim(near_spec, far_spec))
    results["spectral_overlap"] = overlap_scores

    # Combined quality score (~0..1): higher overlap + higher cosine = better.
    combined = []
    for i in range(min_len):
        score = results["spectral_overlap"][i] * 0.5
        if "cosine_similarity" in results:
            score += results["cosine_similarity"][i] * 0.5
        combined.append(score)
    results["combined_match_score"] = combined

    return pd.DataFrame(results)
214
+
215
# ----------------------------
# 5. Clustering & Visualization
# ----------------------------
def cluster_frames_custom(features_df, cluster_features, algo, n_clusters=5, eps=0.5):
    """
    Cluster frames on the selected feature columns.

    Parameters
    ----------
    features_df : DataFrame of per-frame features.
    cluster_features : column names to cluster on (missing ones ignored).
    algo : "KMeans", "Agglomerative" or "DBSCAN" (anything else -> one cluster).
    n_clusters : cluster count for KMeans/Agglomerative (capped at n rows).
    eps : DBSCAN neighborhood radius.

    Returns
    -------
    A copy of `features_df` with a "cluster" label column appended; the
    input DataFrame is returned unchanged when no valid features are given.
    Fewer than 5 rows are all labeled -1 (too few for meaningful clusters).
    """
    if not cluster_features:
        return features_df

    # Keep only features actually present in the DataFrame.
    valid_features = [f for f in cluster_features if f in features_df.columns]
    if not valid_features:
        return features_df

    # Copy up front so the caller's DataFrame is never mutated.
    # (The original mutated the input in the len(X) < 5 branch.)
    features_df = features_df.copy()

    # Replace NaN/Inf defensively before handing to sklearn.
    X = np.nan_to_num(features_df[valid_features].values)

    if len(X) < 5:
        features_df["cluster"] = -1
        return features_df

    if algo == "KMeans":
        model = KMeans(n_clusters=min(n_clusters, len(X)), random_state=42, n_init=10)
        labels = model.fit_predict(X)
    elif algo == "Agglomerative":
        model = AgglomerativeClustering(n_clusters=min(n_clusters, len(X)))
        labels = model.fit_predict(X)
    elif algo == "DBSCAN":
        model = DBSCAN(eps=eps, min_samples=min(3, len(X)))
        labels = model.fit_predict(X)
    else:
        labels = np.zeros(len(X))

    features_df["cluster"] = labels
    return features_df
253
+
254
def plot_spectral_difference(near_feats, far_feats, frame_idx=0):
    """Heatmap of the per-bin dB difference (near - far) for one frame."""
    # Guard: nothing to plot without features from both signals.
    if not near_feats or not far_feats:
        empty = go.Figure()
        empty.update_layout(title="No data")
        return empty

    # Clamp the requested frame into the range both lists cover.
    safe_idx = min(frame_idx, len(near_feats) - 1, len(far_feats) - 1)

    spec_near = near_feats[safe_idx]["spectrum"]
    spec_far = far_feats[safe_idx]["spectrum"]

    # Compare only the frequency bins present in both spectra.
    n_bins = min(spec_near.shape[0], spec_far.shape[0])
    delta = spec_near[:n_bins] - spec_far[:n_bins]

    fig = go.Figure(data=go.Heatmap(z=delta, colorscale='RdBu', zmid=0))
    fig.update_layout(
        title=f"Spectral Difference (Frame {safe_idx}) [Near - Far]",
        yaxis_title="Frequency Bin",
        xaxis_title="Time (within frame)",
        height=350,
    )
    return fig
274
+
275
# ----------------------------
# 6. Main Analysis Logic
# ----------------------------
def analyze_audio_pair(
    near_file, far_file,
    frame_length_ms, hop_length_ms, window_type,
    comparison_metrics, cluster_features, clustering_algo, n_clusters, dbscan_eps
):
    """
    End-to-end pipeline: load -> normalize -> align -> segment -> extract
    features -> compare -> cluster -> build plots.

    Returns the six objects bound to the UI outputs:
    (comparison line plot, comparison DataFrame, cluster scatter plot,
     clustered DataFrame, spectral-difference heatmap, quality-overlay plot).

    Raises gr.Error for missing or unloadable audio files.
    """
    if not near_file or not far_file:
        raise gr.Error("Please upload both audio files.")

    # Gradio may hand us a tempfile wrapper (with .name) or a plain path string.
    near_path = getattr(near_file, "name", near_file)
    far_path = getattr(far_file, "name", far_file)

    # 1. Load audio; the near-field's native sample rate is the reference.
    # (bare excepts narrowed to Exception so Ctrl-C isn't swallowed)
    try:
        y_near, sr_near = librosa.load(near_path, sr=None)
    except Exception:
        raise gr.Error("Failed to load Near Field audio.")

    # Force-resample the far-field recording onto the near-field's rate.
    try:
        y_far, sr_far = librosa.load(far_path, sr=sr_near)
    except Exception:
        raise gr.Error("Failed to load Far Field audio.")

    # 2. Normalize and time-align (critical before any frame-wise comparison).
    y_near = librosa.util.normalize(y_near)
    y_far = librosa.util.normalize(y_far)

    gr.Info("Aligning signals (calculating time delay)...")
    y_near, y_far = align_signals(y_near, y_far)

    # 3. Segment both signals with identical framing parameters.
    frames_near, _ = segment_audio(y_near, sr_near, frame_length_ms, hop_length_ms, window_type)
    frames_far, _ = segment_audio(y_far, sr_near, frame_length_ms, hop_length_ms, window_type)

    # 4. Per-frame features
    gr.Info("Extracting features...")
    near_feats = extract_features_with_spectrum(frames_near, sr_near)
    far_feats = extract_features_with_spectrum(frames_far, sr_near)

    # 5. Frame-by-frame comparison metrics
    comparison_df = compare_frames_enhanced(near_feats, far_feats, comparison_metrics)

    # 6. Cluster on the near-field features (the cleaner signal) to group
    # frames into phoneme-like classes.
    near_df = pd.DataFrame(near_feats).drop(columns=["spectrum"], errors="ignore")
    clustered_df = cluster_frames_custom(near_df, cluster_features, clustering_algo, n_clusters, dbscan_eps)

    # 7. Visuals
    metric_cols = [c for c in comparison_df.columns if c != "frame_index"]
    if metric_cols:
        plot_comparison = px.line(comparison_df, x="frame_index", y=metric_cols,
                                  title="Frame-by-Frame Comparison Metrics")
    else:
        plot_comparison = px.line(title="No metrics selected")

    if len(cluster_features) >= 2:
        x_f, y_f = cluster_features[0], cluster_features[1]
        plot_scatter = px.scatter(clustered_df, x=x_f, y=y_f, color="cluster",
                                  title=f"Clustering Analysis (Near Field): {x_f} vs {y_f}")
    else:
        plot_scatter = px.scatter(title="Select at least 2 features to visualize clusters")

    # Inspect a frame from the middle of the recording (usually active speech).
    spec_heatmap = plot_spectral_difference(near_feats, far_feats, frame_idx=int(len(near_feats) / 2))

    # Overlay match quality on the clustering result. Assigned by index;
    # lengths agree because both signals were truncated to a common length
    # during alignment and framed identically.
    clustered_df["match_quality"] = comparison_df["combined_match_score"]

    if len(cluster_features) > 0:
        overlay_fig = px.scatter(clustered_df, x=cluster_features[0], y="match_quality",
                                 color="cluster",
                                 title=f"Cluster vs. Match Quality ({cluster_features[0]})")
    else:
        overlay_fig = px.scatter(title="Not enough data for overlay")

    return plot_comparison, comparison_df, plot_scatter, clustered_df, spec_heatmap, overlay_fig
351
+
352
def export_results(comparison_df, clustered_df):
    """Write both result tables to CSV files in a fresh temp dir; return the paths."""
    out_dir = tempfile.mkdtemp()
    paths = []
    # Fixed filenames so downloads are recognizable.
    for df, fname in ((comparison_df, "frame_comparisons.csv"),
                      (clustered_df, "clustered_frames.csv")):
        path = os.path.join(out_dir, fname)
        df.to_csv(path, index=False)
        paths.append(path)
    return paths
359
+
360
# ----------------------------
# 7. Gradio UI
# ----------------------------
# Feature names offered for clustering: the scalar features plus the 13 MFCCs
# (mfcc_1..mfcc_13, matching n_mfcc in extract_features_with_spectrum).
feature_list = ["rms", "spectral_centroid", "zcr", "spectral_flatness",
                "low_freq_energy", "mid_freq_energy", "high_freq_energy"] + \
               [f"mfcc_{i}" for i in range(1, 14)]

with gr.Blocks(title="Corrected Near vs Far Field Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎙️ Corrected Near vs Far Field Analyzer
    **Now includes:** Automatic Time Alignment (Cross-Correlation), Normalization, and Reverb Detection.
    """)

    # Two side-by-side upload slots: near field is the reference signal.
    with gr.Row():
        with gr.Column():
            near_file = gr.File(label="Near-Field Audio (Reference)", file_types=[".wav", ".mp3"])
        with gr.Column():
            far_file = gr.File(label="Far-Field Audio (Target)", file_types=[".wav", ".mp3"])

    # Framing parameters feeding segment_audio.
    with gr.Accordion("⚙️ Analysis Settings", open=False):
        with gr.Row():
            frame_length_ms = gr.Slider(10, 200, value=30, step=5, label="Frame Length (ms)")
            hop_length_ms = gr.Slider(5, 100, value=15, step=5, label="Hop Length (ms)")
            window_type = gr.Dropdown(["hann", "hamming", "rectangular"], value="hann", label="Window Type")

    # Metric selection (compare_frames_enhanced) and clustering controls
    # (cluster_frames_custom).
    with gr.Accordion("📊 Metrics & Clustering", open=False):
        comparison_metrics = gr.CheckboxGroup(
            choices=["Euclidean Distance", "Cosine Similarity", "High-Freq Loss Ratio"],
            value=["Cosine Similarity", "High-Freq Loss Ratio"],
            label="Comparison Metrics"
        )
        cluster_features = gr.CheckboxGroup(
            choices=feature_list,
            value=["spectral_centroid", "spectral_flatness", "high_freq_energy"],
            label="Features for Clustering (Select >= 2)"
        )
        with gr.Row():
            clustering_algo = gr.Dropdown(["KMeans", "Agglomerative", "DBSCAN"], value="KMeans", label="Algorithm")
            n_clusters = gr.Slider(2, 10, value=4, step=1, label="Num Clusters")
            # Only used when Algorithm == "DBSCAN".
            dbscan_eps = gr.Slider(0.1, 5.0, value=0.5, label="DBSCAN Epsilon")

    btn = gr.Button("🚀 Align & Analyze", variant="primary")

    with gr.Tabs():
        with gr.Tab("📈 Time Series Comparison"):
            comp_plot = gr.Plot()
            # row_count (not the removed height kwarg) controls table sizing.
            comp_table = gr.Dataframe(row_count=10)
        with gr.Tab("🧩 Phoneme Clustering"):
            cluster_plot = gr.Plot()
            cluster_table = gr.Dataframe(row_count=10)
        with gr.Tab("🔍 Spectral Check"):
            gr.Markdown("Difference Heatmap (Near - Far). Blue = Near has more energy. Red = Far has more energy.")
            spec_heatmap = gr.Plot()
        with gr.Tab("🧭 Quality Overlay"):
            overlay_plot = gr.Plot()

        with gr.Tab("📤 Export"):
            export_btn = gr.Button("💾 Download Results")
            export_files = gr.Files()

    # Main pipeline: six outputs in the same order analyze_audio_pair returns.
    btn.click(fn=analyze_audio_pair,
              inputs=[near_file, far_file, frame_length_ms, hop_length_ms, window_type,
                      comparison_metrics, cluster_features, clustering_algo, n_clusters, dbscan_eps],
              outputs=[comp_plot, comp_table, cluster_plot, cluster_table, spec_heatmap, overlay_plot])

    # Export reads back the rendered tables, so it saves whatever is shown.
    export_btn.click(fn=export_results, inputs=[comp_table, cluster_table], outputs=export_files)

if __name__ == "__main__":
    demo.launch()