Spaces:

dayngerous
/

sampled

Sleeping

App Files Files Community

dayngerous committed 19 days ago

Commit

c2160cd

1 Parent(s): df731c1

Add spectrogram image input tab

Browse files

Files changed (1) hide show

app.py +152 -9

app.py CHANGED Viewed

@@ -501,6 +501,130 @@ def _plot_mels(
     return fig
 def preview_waveforms(track_audio, source_audio):
     if not track_audio or not source_audio:
         return None, None
@@ -595,13 +719,28 @@ with gr.Blocks(title="Sample Match Verifier") as demo:
     gr.Markdown("# Sample Match Verifier")
     gr.Markdown(
         "Upload a track and a possible source sample. "
-        "Waveforms appear immediately on upload. "
-        "Click **Verify match** to run the model and highlight sampled sections."
     )
-    with gr.Row():
-        track_audio = gr.Audio(label="Track / song audio", type="filepath", sources=["upload"])
-        source_audio = gr.Audio(label="Source sample audio", type="filepath", sources=["upload"])
     with gr.Accordion("Settings", open=False):
         checkpoint_path = gr.Textbox(label="Checkpoint path", value=DEFAULT_CHECKPOINT)
@@ -616,13 +755,11 @@ with gr.Blocks(title="Sample Match Verifier") as demo:
             stride_beats = gr.Slider(1, 16, value=4, step=1, label="Window stride, beats")
             max_windows = gr.Slider(4, 64, value=24, step=1, label="Max windows per upload")
-    run = gr.Button("Verify match", variant="primary")
     result = gr.Markdown()
     waveform_plot = gr.Plot(label="Waveforms")
     mel_plot = gr.Plot(label="Mel Spectrograms")
-    # Show waveforms as soon as both files are uploaded
     for audio_input in [track_audio, source_audio]:
         audio_input.change(
             preview_waveforms,
@@ -630,7 +767,7 @@ with gr.Blocks(title="Sample Match Verifier") as demo:
             outputs=[waveform_plot, mel_plot],
         )
-    run.click(
         verify,
         inputs=[
             track_audio,
@@ -647,6 +784,12 @@ with gr.Blocks(title="Sample Match Verifier") as demo:
         outputs=[result, waveform_plot, mel_plot],
     )
 if __name__ == "__main__":
     demo.queue(max_size=8).launch()

     return fig
+def _image_to_mel_tensor(image_path: str, args: dict) -> torch.Tensor:
+    """Reconstruct the model's input tensor from a saved BPM-normalized mel spectrogram PNG."""
+    from PIL import Image as PILImage
+    n_mels = int(args.get("n_mels", 128))
+    bars = int(args.get("bars", 4))
+    fixed_frames = bars * 4 * TARGET_FRAMES_PER_BEAT
+    img = PILImage.open(image_path).convert("RGB")
+    img = img.resize((fixed_frames, n_mels), PILImage.LANCZOS)
+    arr = np.array(img, dtype=np.float32) / 255.0  # [n_mels, fixed_frames, 3]
+    # Invert magma via luminance — monotone proxy for the original mel value
+    luminance = 0.2126 * arr[:, :, 0] + 0.7152 * arr[:, :, 1] + 0.0722 * arr[:, :, 2]
+    # PNG rows are top-to-bottom; origin="lower" means row 0 in data = bottom of image
+    luminance = luminance[::-1]  # flip so row 0 = lowest mel bin
+    mel = torch.from_numpy(luminance.T.copy()).float()  # [fixed_frames, n_mels]
+    mel = (mel - mel.mean()) / (mel.std() + 1e-6)
+    return mel.unsqueeze(0)  # [1, fixed_frames, n_mels]
+def _plot_spectrograms_with_mask(
+    track_img_path: str,
+    source_img_path: str,
+    track_beats: np.ndarray,
+    source_beats: np.ndarray,
+    score: float,
+    matched: bool,
+) -> plt.Figure:
+    from PIL import Image as PILImage
+    color = "#22c55e" if matched else "#f59e0b"
+    fig, axes = plt.subplots(2, 1, figsize=(12, 5))
+    fig.suptitle(f"Score: {score:.3f}", fontsize=12)
+    for ax, img_path, label, beats in [
+        (axes[0], track_img_path, "Track spectrogram", track_beats),
+        (axes[1], source_img_path, "Source spectrogram", source_beats),
+    ]:
+        img = np.array(PILImage.open(img_path).convert("RGB"))
+        W = img.shape[1]
+        ax.imshow(img, aspect="auto")
+        ax.set_title(label, loc="left", fontsize=10)
+        ax.set_xlabel("Time frame (BPM-normalized)")
+        ax.set_ylabel("Mel bin")
+        ax.tick_params(labelsize=7)
+        if beats is not None and beats.any():
+            n_beats = len(beats)
+            beat_w = W / n_beats
+            for i, active in enumerate(beats):
+                if active:
+                    ax.axvspan(i * beat_w, (i + 1) * beat_w, color=color, alpha=0.38, linewidth=0)
+        if not matched:
+            ax.text(0.5, 0.5, "No Match", transform=ax.transAxes,
+                    fontsize=18, color="white", ha="center", va="center", fontweight="bold",
+                    bbox=dict(boxstyle="round,pad=0.4", facecolor="#111827", alpha=0.65))
+    fig.tight_layout()
+    return fig
+def verify_spectrograms(
+    track_spec_path,
+    source_spec_path,
+    checkpoint_path,
+    match_threshold,
+    localization_threshold,
+):
+    if not track_spec_path or not source_spec_path:
+        raise gr.Error("Upload both spectrogram images before running verification.")
+    try:
+        loaded = _load_model(checkpoint_path or DEFAULT_CHECKPOINT)
+    except Exception as exc:
+        return f"Model could not be loaded: {exc}", None, None
+    model = loaded["model"]
+    args = loaded["args"]
+    device = loaded["device"]
+    track_mel = _image_to_mel_tensor(track_spec_path, args).unsqueeze(0).to(device)
+    source_mel = _image_to_mel_tensor(source_spec_path, args).unsqueeze(0).to(device)
+    with torch.inference_mode():
+        track_emb = model.encoder(track_mel)
+        source_emb = model.encoder(source_mel)
+        pair_feat = pair_summary_features(model.pair_mask_head(track_mel, source_mel))
+        combined = torch.cat(
+            [track_emb, source_emb, torch.abs(track_emb - source_emb), track_emb * source_emb, pair_feat],
+            dim=-1,
+        )
+        score = torch.softmax(model.head(combined), dim=-1)[0, 1].item()
+    matched = score >= float(match_threshold)
+    beats_per_window = int(args.get("bars", 4)) * 4
+    if loaded["pair_head_loaded"]:
+        with torch.inference_mode():
+            pair_probs = torch.sigmoid(model.pair_mask_head(track_mel, source_mel))[0].cpu().numpy()
+        track_beats, source_beats = _find_contiguous_beats(pair_probs, min_beats=2)
+        if not track_beats.any():
+            track_beats = np.ones(beats_per_window, dtype=bool)
+            source_beats = np.ones(beats_per_window, dtype=bool)
+    else:
+        track_beats = np.ones(beats_per_window, dtype=bool)
+        source_beats = np.ones(beats_per_window, dtype=bool)
+    spec_fig = _plot_spectrograms_with_mask(
+        track_spec_path, source_spec_path,
+        track_beats, source_beats,
+        score, matched,
+    )
+    verdict = "Likely match" if matched else "No match"
+    details = [
+        f"**{verdict}**",
+        f"Classifier score: `{score:.3f}` (threshold `{float(match_threshold):.2f}`).",
+        f"Model: `{args.get('backbone', 'ast')}` checkpoint epoch `{loaded['epoch']}` on `{device}`.",
+    ]
+    if not loaded["pair_head_loaded"]:
+        details.append("Checkpoint does not include a trained pairwise beat head.")
+    return "\n\n".join(details), None, spec_fig
 def preview_waveforms(track_audio, source_audio):
     if not track_audio or not source_audio:
         return None, None
     gr.Markdown("# Sample Match Verifier")
     gr.Markdown(
         "Upload a track and a possible source sample. "
+        "Use the **Audio** tab for raw audio files, or the **Spectrogram** tab to upload "
+        "pre-computed BPM-normalized mel spectrogram images. "
+        "Click **Verify match** to run the model."
     )
+    with gr.Tabs():
+        with gr.Tab("Audio"):
+            gr.Markdown("Waveforms appear immediately on upload.")
+            with gr.Row():
+                track_audio = gr.Audio(label="Track / song audio", type="filepath", sources=["upload"])
+                source_audio = gr.Audio(label="Source sample audio", type="filepath", sources=["upload"])
+            audio_run = gr.Button("Verify match", variant="primary")
+        with gr.Tab("Spectrogram"):
+            gr.Markdown(
+                "Upload BPM-normalized mel spectrogram images (e.g. from `make_test_spectrograms.py`). "
+                "Offset / duration / stride settings are ignored in this mode."
+            )
+            with gr.Row():
+                track_spec = gr.Image(label="Track spectrogram", type="filepath", sources=["upload"])
+                source_spec = gr.Image(label="Source spectrogram", type="filepath", sources=["upload"])
+            spec_run = gr.Button("Verify match", variant="primary")
     with gr.Accordion("Settings", open=False):
         checkpoint_path = gr.Textbox(label="Checkpoint path", value=DEFAULT_CHECKPOINT)
             stride_beats = gr.Slider(1, 16, value=4, step=1, label="Window stride, beats")
             max_windows = gr.Slider(4, 64, value=24, step=1, label="Max windows per upload")
     result = gr.Markdown()
     waveform_plot = gr.Plot(label="Waveforms")
     mel_plot = gr.Plot(label="Mel Spectrograms")
+    # Show waveforms as soon as both audio files are uploaded
     for audio_input in [track_audio, source_audio]:
         audio_input.change(
             preview_waveforms,
             outputs=[waveform_plot, mel_plot],
         )
+    audio_run.click(
         verify,
         inputs=[
             track_audio,
         outputs=[result, waveform_plot, mel_plot],
     )
+    spec_run.click(
+        verify_spectrograms,
+        inputs=[track_spec, source_spec, checkpoint_path, match_threshold, localization_threshold],
+        outputs=[result, waveform_plot, mel_plot],
+    )
 if __name__ == "__main__":
     demo.queue(max_size=8).launch()