Exgc commited on
Commit
9a6ee98
·
1 Parent(s): 285b2a6
Files changed (6) hide show
  1. app.py +172 -0
  2. exp/checkpoints/best_model.pt +1 -0
  3. exp/train-args.json +1 -0
  4. omnisep.py +752 -0
  5. requirements.txt +8 -0
  6. utils.py +348 -0
app.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import librosa
5
+ import pathlib
6
+ import scipy.io.wavfile
7
+ import os
8
+
9
+ from imagebind import data
10
+ from imagebind.models import imagebind_model
11
+ from imagebind.models.imagebind_model import ModalityType
12
+ import torch.nn.functional as F
13
+
14
+ import omnisep
15
+ import utils
16
+ device = "cuda" if torch.cuda.is_available() else "cpu"
17
+
18
+ # ========== Configuration & Model Loading ==========
19
+
20
def setup_models(checkpoint_path, train_args_path):
    """Construct the OmniSep separator and the ImageBind query encoder.

    Args:
        checkpoint_path: Path to the trained OmniSep state dict
            (saved from a ``DataParallel``-wrapped model).
        train_args_path: Path to the JSON file of training arguments.

    Returns:
        Tuple ``(separator, encoder, cfg)`` where both models are in eval
        mode on the global ``device`` and ``cfg`` is the loaded config dict.
    """
    cfg = utils.load_json(train_args_path)

    # Rebuild the separator with the exact hyper-parameters it was trained with.
    separator = omnisep.OmniSep(
        cfg['n_mix'],
        cfg['layers'],
        cfg['channels'],
        use_log_freq=cfg['log_freq'],
        use_weighted_loss=cfg['weighted_loss'],
        use_binary_mask=cfg['binary_mask'],
        emb_dim=cfg.get('emb_dim', 512),
    )
    # Checkpoint keys are prefixed with "module.", so wrap before loading.
    separator = torch.nn.DataParallel(separator)
    separator.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    separator.to(device)
    separator.eval()

    # ImageBind provides the text/image/audio query embeddings.
    encoder = imagebind_model.imagebind_huge(pretrained=True)
    encoder = torch.nn.DataParallel(encoder)
    encoder.to(device)
    encoder.eval()

    return separator, encoder, cfg
41
+
42
+ # ========== Audio Loading & Preprocessing ==========
43
+
44
def load_audio_and_spec(audio_file, audio_len, sample_rate, n_fft, hop_len, win_len):
    """Load a mono waveform, force it to ``audio_len`` samples, and compute its STFT.

    Short clips are looped until long enough; long clips are truncated.

    Returns:
        Tuple ``(mag, phase, n_samples)`` where ``mag`` and ``phase`` are
        ``(1, 1, F, T)`` float tensors and ``n_samples`` equals ``audio_len``.
    """
    wav, _ = librosa.load(audio_file, sr=sample_rate, mono=True)

    if len(wav) < audio_len:
        # Tile the clip enough times to cover the target length, then crop.
        repeats = audio_len // len(wav) + 1
        wav = np.tile(wav, (repeats,))[:audio_len]
    else:
        wav = wav[:audio_len]
    wav = np.clip(wav, -1, 1)

    stft = librosa.stft(wav, n_fft=n_fft, hop_length=hop_len, win_length=win_len)
    # Add batch and channel dimensions: (F, T) -> (1, 1, F, T).
    mag = torch.tensor(np.abs(stft))[None, None]
    phase = torch.tensor(np.angle(stft))[None, None]
    return mag, phase, wav.shape[0]
56
+
57
+ # ========== Embedding Construction ==========
58
+
59
def get_combined_embedding(imagebind_net, text=None, image=None, audio=None,
                           text_w=1.0, image_w=1.0, audio_w=1.0):
    """Fuse ImageBind embeddings from up to three query modalities.

    Each provided (truthy) query is embedded, the embeddings are combined as
    a weighted average, and the result is L2-normalized.

    Returns:
        The normalized combined embedding, or ``None`` if no query was given.
    """
    # (query, modality key, loader, weight) for each supported modality.
    queries = [
        (text, ModalityType.TEXT, data.load_and_transform_text, text_w),
        (image, ModalityType.VISION, data.load_and_transform_vision_data, image_w),
        (audio, ModalityType.AUDIO, data.load_and_transform_audio_data, audio_w),
    ]

    inputs = {mod: loader([q], device) for q, mod, loader, _ in queries if q}
    emb = imagebind_net(inputs)

    combined = None
    total_weight = 0
    for q, mod, _, w in queries:
        if not q:
            continue
        term = w * emb[mod]
        combined = term if combined is None else combined + term
        total_weight += w

    if total_weight > 0:
        combined = F.normalize(combined / total_weight)

    return combined
82
+
83
+ # ========== Waveform Recovery ==========
84
+
85
def recover_waveform(mag_mix, phase_mix, pred_mask, args):
    """Convert a predicted spectrogram mask back into a time-domain waveform.

    Args:
        mag_mix: Mixture magnitude spectrogram, shape (B, 1, F, T).
        phase_mix: Mixture phase spectrogram, shape (B, 1, F, T).
        pred_mask: Predicted separation mask from the model (B, 1, F', T);
            on the log-frequency grid when ``args['log_freq']`` is true.
        args: Config dict providing 'log_freq', 'n_fft', 'hop_len', 'win_len'.

    Returns:
        The separated waveform for the first batch item as a 1-D numpy array.
    """
    B = mag_mix.size(0)
    if args['log_freq']:
        # Unwarp the mask from the log-frequency grid back to linear bins.
        grid_unwarp = torch.from_numpy(
            utils.warpgrid(B, args['n_fft'] // 2 + 1, pred_mask.size(3), warp=False)
        ).to(pred_mask.device)
        pred_mask_linear = F.grid_sample(pred_mask, grid_unwarp, align_corners=True)
    else:
        # BUG FIX: the original used `pred_mask[0]`, which dropped the batch
        # dimension and made the `[0, 0]` indexing below pick the wrong axis.
        # Keep the tensor 4-D in both branches.
        pred_mask_linear = pred_mask

    # Move everything to NumPy on the CPU.
    mag_mix = mag_mix.detach().cpu().numpy()
    phase_mix = phase_mix.detach().cpu().numpy()
    pred_mask_linear = pred_mask_linear.detach().cpu().numpy()

    # Binarize the mask. (The original also thresholded the warped mask,
    # but that value was never used afterwards, so it is dropped here.)
    pred_mask_linear = (pred_mask_linear > 0.5).astype(np.float32)

    # Reconstruct the predicted source using the mixture's phase.
    pred_mag = mag_mix[0, 0] * pred_mask_linear[0, 0]
    pred_wav = utils.istft_reconstruction(
        pred_mag,
        phase_mix[0, 0],
        hop_len=args['hop_len'],
        win_len=args['win_len'],
    )

    return pred_wav
117
+
118
+ # ========== Gradio Interface ==========
119
+
120
def run_inference(input_audio, text_pos, audio_pos, image_pos, text_neg, audio_neg, image_neg,
                  text_w, image_w, audio_w, neg_w):
    """Run one OmniSep separation pass for the Gradio UI.

    Args:
        input_audio: Path to the mixed input audio file.
        text_pos / audio_pos / image_pos: Optional positive queries.
        text_neg / audio_neg / image_neg: Optional negative queries.
        text_w / image_w / audio_w: Positive-query modality weights.
        neg_w: Strength of the negative-query extrapolation.

    Returns:
        Path to the written separated WAV file.
    """
    # Cache the heavy models on the function object so repeated clicks do not
    # reload the checkpoint and ImageBind weights from disk every time.
    if not hasattr(run_inference, "_cache"):
        # NOTE(review): the commit adds the config at exp/train-args.json, but
        # this path points at exp/checkpoints/ — confirm which location is real.
        run_inference._cache = setup_models(
            "./exp/checkpoints/best_model.pt", "./exp/checkpoints/train-args.json"
        )
    model, imagebind_net, args = run_inference._cache

    # Use the training clip length from the config instead of a magic number.
    audio_len = args.get('audio_len', 65535)
    mag_mix, phase_mix, out_len = load_audio_and_spec(
        input_audio, audio_len,
        args['audio_rate'], args['n_fft'], args['hop_len'], args['win_len'])

    # Positive query embedding, optionally pushed away from a negative one.
    img_emb = get_combined_embedding(imagebind_net, text_pos, image_pos, audio_pos,
                                     text_w, image_w, audio_w)
    if any([text_neg, audio_neg, image_neg]):
        neg_emb = get_combined_embedding(imagebind_net, text_neg, image_neg, audio_neg,
                                         1.0, 1.0, 1.0)
        img_emb = (1 + neg_w) * img_emb - neg_w * neg_emb

    mag_mix = mag_mix.to(device)
    phase_mix = phase_mix.to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        pred_mask = model.module.infer(mag_mix, [img_emb])[0]
    pred_wav = recover_waveform(mag_mix, phase_mix, pred_mask, args)

    out_path = "/tmp/output.wav"
    scipy.io.wavfile.write(out_path, args['audio_rate'], pred_wav[:out_len])
    return out_path
140
+
141
# ---- Gradio front end -------------------------------------------------------
# Top-level script: builds the Blocks layout and wires the run button to
# `run_inference`. This executes at import time, so importing this module
# launches a publicly shared Gradio server (share=True).
with gr.Blocks(title="OmniSep UI") as iface:
    gr.Markdown("## 🎧 Upload Your Mixed Audio")
    # The mixture to separate; passed to run_inference as a file path.
    mixed_audio = gr.Audio(type="filepath", label="Mixed Input Audio")

    gr.Markdown("### ✅ Positive Query")
    # Describes the target source; any subset of the three modalities may be set.
    with gr.Row():
        pos_text = gr.Textbox(label="Text Query", placeholder="e.g. dog barking")
        pos_audio = gr.Audio(type="filepath", label="Audio Query")
        pos_image = gr.Image(type="filepath", label="Image Query")

    gr.Markdown("### ❌ Negative Query (Optional)")
    # Describes what to suppress; used only if at least one field is filled.
    with gr.Row():
        neg_text = gr.Textbox(label="Negative Text Query")
        neg_audio = gr.Audio(type="filepath", label="Negative Audio Query")
        neg_image = gr.Image(type="filepath", label="Negative Image Query")

    gr.Markdown("### 🎚️ Modality Weights")
    # Per-modality weights for the positive query, plus the strength of the
    # negative-embedding extrapolation used in run_inference.
    with gr.Row():
        text_weight = gr.Slider(0, 5, value=1.0, step=0.1, label="Text Weight")
        image_weight = gr.Slider(0, 5, value=1.0, step=0.1, label="Image Weight")
        audio_weight = gr.Slider(0, 5, value=1.0, step=0.1, label="Audio Weight")
        neg_weight = gr.Slider(0, 2, value=0.5, step=0.1, label="Negative Embedding Weight")

    # run_inference returns a file path, which this component plays back.
    output_audio = gr.Audio(type="filepath", label="Separated Output Audio")

    btn = gr.Button("Run OmniSep Inference")
    btn.click(fn=run_inference,
              inputs=[mixed_audio, pos_text, pos_audio, pos_image, neg_text, neg_audio, neg_image,
                      text_weight, image_weight, audio_weight, neg_weight],
              outputs=output_audio)

iface.launch(share=True)
exp/checkpoints/best_model.pt ADDED
@@ -0,0 +1 @@
 
 
1
+ /root/autodl-tmp/data/OmniSep/best_model.pt
exp/train-args.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"out_dir": "/root/autodl-tmp/OmniSep/omnisep/exp/vggsound/omnisep", "train_list": ["data/vggsound/test.csv"], "val_list": ["data/vggsound/test.csv"], "n_validation": null, "weights": null, "batch_size": 32, "drop_closest": null, "drop_closest_steps": 10000, "repeat": null, "frame_margin": null, "audio_only": false, "audio_len": 65535, "emb_dim": 1024, "audio_rate": 16000, "n_fft": 1024, "hop_len": 256, "win_len": 1024, "img_size": 224, "fps": 1, "train_mode": ["image", "text", "audio"], "n_mix": 2, "channels": 32, "layers": 7, "frames": 3, "stride_frames": 1, "binary_mask": true, "loss": "bce", "weighted_loss": true, "log_freq": true, "n_labels": null, "steps": 500000, "valid_steps": 10000, "lr": 0.001, "lr_warmup_steps": 5000, "lr_decay_steps": 100000, "lr_decay_multiplier": 0.1, "grad_norm_clip": 1.0, "pit_warmup_steps": 0, "seed": 1234, "gpus": 1, "workers": 20, "quiet": false, "is_feature": true, "is_neg": false, "feature_mode": "imagebind"}
omnisep.py ADDED
@@ -0,0 +1,752 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Define the models."""
2
+ import functools
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ import utils
9
+
10
+
11
def init_weights(net):
    """Initialize one layer's parameters; intended for use via ``module.apply``.

    Conv and Linear weights get small zero-mean Gaussians; BatchNorm weights
    are drawn around 1 and its biases are zeroed. Other layers are untouched.
    """
    layer_name = type(net).__name__
    if "Conv" in layer_name:
        net.weight.data.normal_(0.0, 0.001)
    elif "BatchNorm" in layer_name:
        net.weight.data.normal_(1.0, 0.02)
        net.bias.data.fill_(0)
    elif "Linear" in layer_name:
        net.weight.data.normal_(0.0, 0.0001)
20
+
21
+
22
class OmniSep(torch.nn.Module):
    """Query-conditioned audio separation model.

    A U-Net (``sound_net``) encodes the warped log-magnitude mixture
    spectrogram; a linear layer (``frame_net``) maps a query embedding
    (e.g. from ImageBind) to channel gates; ``synth_net`` combines the two
    into a per-source time-frequency mask.
    """

    def __init__(
        self,
        n_mix,
        layers=7,
        channels=32,
        use_log_freq=True,
        use_weighted_loss=True,
        use_binary_mask=True,
        emb_dim=512
    ):
        # n_mix: number of sources mixed together during training.
        # layers: U-Net depth; channels: U-Net output feature channels.
        # use_log_freq: warp spectrograms to a 256-bin log-frequency grid.
        # use_weighted_loss: weight BCE by mixture magnitude.
        # use_binary_mask: sigmoid-activated binary masks vs. ratio masks.
        # emb_dim: dimensionality of the incoming query embedding.
        super().__init__()
        self.n_mix = n_mix
        self.use_log_freq = use_log_freq
        self.use_weighted_loss = use_weighted_loss
        self.use_binary_mask = use_binary_mask

        # Create the neural net
        self.sound_net = UNet(in_dim=1, out_dim=channels, num_downs=layers)
        self.frame_net = nn.Linear(emb_dim, channels)
        self.synth_net = InnerProd(fc_dim=channels)

        # Initialize the weights
        self.sound_net.apply(init_weights)
        self.frame_net.apply(init_weights)
        self.synth_net.apply(init_weights)

    def forward(self, batch, img_emb, drop_closest=None):
        """Training pass: returns ``(loss, outputs)`` for a batch of mixtures.

        ``batch`` must contain ``mag_mix`` (B, 1, F, T) and ``mags`` (a list
        of N per-source magnitudes); ``img_emb`` is a list of N query
        embeddings. ``drop_closest`` > 0 drops that many most-similar query
        pairs from the batch; ``drop_closest == -1`` instead down-weights
        similar pairs in the loss.
        """
        N = self.n_mix
        mag_mix = batch["mag_mix"]
        mags = batch["mags"]

        # Pass through the frame net -> Bx1xC
        feat_frames_pre = [self.frame_net(img_emb[n]) for n in range(N)]
        feat_frames = [torch.sigmoid(feat) for feat in feat_frames_pre]

        # Compute similarities between the two query embeddings (only
        # meaningful for N == 2; reused by the drop_closest == -1 branch).
        if drop_closest is not None:
            assert N == 2, "N must be 2 when `drop_closest` is enabled."
            similarities = F.cosine_similarity(
                img_emb[0].detach(), img_emb[1].detach()
            )

        # Drop most similar pairs
        if drop_closest is not None and drop_closest > 0:
            # Sort the similarities
            sorted_indices = torch.argsort(similarities)

            # Keep only those with low similarities
            mag_mix = mag_mix[sorted_indices[:-drop_closest]]
            for n in range(N):
                mags[n] = mags[n][sorted_indices[:-drop_closest]]
                feat_frames[n] = feat_frames[n][sorted_indices[:-drop_closest]]
        # Avoid log(0) / division by zero downstream.
        mag_mix = mag_mix + 1e-10

        B = mag_mix.size(0)
        T = mag_mix.size(3)

        # Warp the spectrogram onto a 256-bin log-frequency grid.
        if self.use_log_freq:
            grid_warp = torch.from_numpy(
                utils.warpgrid(B, 256, T, warp=True)
            )
            grid_warp = grid_warp.to(mag_mix.device)
            mag_mix = F.grid_sample(mag_mix, grid_warp, align_corners=True)
            for n in range(N):
                mags[n] = F.grid_sample(mags[n], grid_warp, align_corners=True)
        # Calculate loss weighting coefficient (magnitude of input mixture)
        if self.use_weighted_loss:
            weight = torch.log1p(mag_mix)
            weight = torch.clamp(weight, 1e-3, 10)
        else:
            weight = torch.ones_like(mag_mix)

        # Down-weight similar pairs instead of dropping them.
        if drop_closest is not None and drop_closest == -1:
            # Desired weight as a function of similarity:
            #     sim    -1 <-> 0.5 <---------------> 1
            #     weight  1    1     2 x (1 - sim)    0
            w = F.relu(1 - 2 * F.relu(similarities - 0.5))
            weight *= w.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
        # Compute ground truth masks after warping!
        gt_masks = [None] * N
        for n in range(N):
            if self.use_binary_mask:
                # A bin belongs to source n if it dominates the mixture.
                gt_masks[n] = (mags[n] > 0.5 * mag_mix).float()
            else:
                # NOTE(review): `sum(mags[n])` sums over the first axis of a
                # single tensor; a ratio mask over sources would be
                # `sum(mags)` -- confirm before relying on this path.
                gt_masks[n] = mags[n] / sum(mags[n])
                gt_masks[n].clamp_(0.0, 1.0)

        # Compute log magnitude
        log_mag_mix = torch.log(mag_mix).detach()

        # Pass through the sound net -> BxCxHxW
        feat_sound = self.sound_net(log_mag_mix)
        # Pass through the synth net
        pred_masks = [
            self.synth_net(feat_frames[n], feat_sound) for n in range(N)
        ]

        # Activate with Sigmoid function if using binary mask
        if self.use_binary_mask:
            pred_masks = [torch.sigmoid(mask) for mask in pred_masks]

        # Mean weighted BCE over all N sources.
        loss = torch.mean(
            torch.stack(
                [
                    F.binary_cross_entropy(pred_masks[n], gt_masks[n], weight)
                    for n in range(N)
                ]
            )
        )
        return (
            loss,
            {
                "pred_masks": pred_masks,
                "gt_masks": gt_masks,
                "mag_mix": mag_mix,
                "mags": mags,
                "weight": weight,
            },
        )

    def infer(self, mag_mix, img_emb, n_mix=1):
        """Inference pass: predict masks for ``n_mix`` query embeddings.

        ``mag_mix`` is the (B, 1, F, T) mixture magnitude; ``img_emb`` is a
        list of at least ``n_mix`` query embeddings. Returns a list of
        predicted masks (on the log-frequency grid if ``use_log_freq``).
        """
        N = n_mix

        # Pass through the frame net -> Bx1xC
        feat_frames_pre = [self.frame_net(img_emb[n]) for n in range(N)]
        feat_frames = [torch.sigmoid(feat) for feat in feat_frames_pre]

        mag_mix = mag_mix + 1e-10

        B = mag_mix.size(0)
        T = mag_mix.size(3)

        # Warp the spectrogram
        if self.use_log_freq:
            grid_warp = torch.from_numpy(
                utils.warpgrid(B, 256, T, warp=True)
            ).to(mag_mix.device)
            mag_mix = F.grid_sample(mag_mix, grid_warp, align_corners=True)

        # Compute log magnitude
        log_mag_mix = torch.log(mag_mix).detach()

        # Pass through the sound net -> BxCxHxW
        feat_sound = self.sound_net(log_mag_mix)

        # Pass through the synth net
        pred_masks = [
            self.synth_net(feat_frames[n], feat_sound) for n in range(N)
        ]

        # Activate with Sigmoid function if using binary mask
        if self.use_binary_mask:
            pred_masks = [torch.sigmoid(mask) for mask in pred_masks]

        return pred_masks

    def infer2(self, batch, img_emb):
        """Evaluation pass: predict a single mask but also return the warped
        ground-truth masks and loss weights from ``batch`` for scoring."""
        N = self.n_mix
        mag_mix = batch["mag_mix"]
        mags = batch["mags"]

        # Pass through the frame net -> Bx1xC (first query only).
        feat_frames_pre = [self.frame_net(img_emb[0])]
        feat_frames = [torch.sigmoid(feat) for feat in feat_frames_pre]

        mag_mix = mag_mix + 1e-10

        B = mag_mix.size(0)
        T = mag_mix.size(3)

        # Warp the spectrogram
        if self.use_log_freq:
            grid_warp = torch.from_numpy(
                utils.warpgrid(B, 256, T, warp=True)
            ).to(mag_mix.device)
            mag_mix = F.grid_sample(mag_mix, grid_warp, align_corners=True)
            for n in range(N):
                mags[n] = F.grid_sample(mags[n], grid_warp, align_corners=True)

        # Calculate loss weighting coefficient (magnitude of input mixture)
        if self.use_weighted_loss:
            weight = torch.log1p(mag_mix)
            weight = torch.clamp(weight, 1e-3, 10)
        else:
            weight = torch.ones_like(mag_mix)

        # Compute ground truth masks after warping!
        gt_masks = [None] * N
        for n in range(N):
            if self.use_binary_mask:
                gt_masks[n] = (mags[n] > 0.5 * mag_mix).float()
            else:
                # NOTE(review): same `sum(mags[n])` concern as in forward().
                gt_masks[n] = mags[n] / sum(mags[n])
                gt_masks[n].clamp_(0.0, 1.0)

        # Compute log magnitude
        log_mag_mix = torch.log(mag_mix).detach()

        # Pass through the sound net -> BxCxHxW
        feat_sound = self.sound_net(log_mag_mix)

        # Pass through the synth net
        pred_masks = [self.synth_net(feat_frames[0], feat_sound)]

        # Activate with Sigmoid function if using binary mask
        if self.use_binary_mask:
            pred_masks = [torch.sigmoid(pred_masks[0])]

        return {
            "pred_masks": pred_masks,
            "gt_masks": gt_masks,
            "mag_mix": mag_mix,
            "mags": mags,
            "weight": weight,
        }

    def infer3(self, batch, img_emb):
        """Query-only inference: no ground truth needed; also reports the
        mean sigmoid activation of the predicted mask."""
        mag_mix = batch["mag_mix"]

        # Pass through the frame net -> Bx1xC (single query embedding).
        feat_frames_pre = [self.frame_net(img_emb)]
        feat_frames = [torch.sigmoid(feat) for feat in feat_frames_pre]

        mag_mix = mag_mix + 1e-10

        B = mag_mix.size(0)
        T = mag_mix.size(3)

        # Warp the spectrogram
        if self.use_log_freq:
            grid_warp = torch.from_numpy(
                utils.warpgrid(B, 256, T, warp=True)
            ).to(mag_mix.device)
            mag_mix = F.grid_sample(mag_mix, grid_warp, align_corners=True)

        # Calculate loss weighting coefficient (magnitude of input mixture)
        if self.use_weighted_loss:
            weight = torch.log1p(mag_mix)
            weight = torch.clamp(weight, 1e-3, 10)
        else:
            weight = torch.ones_like(mag_mix)

        # Compute log magnitude
        log_mag_mix = torch.log(mag_mix).detach()

        # Pass through the sound net -> BxCxHxW
        feat_sound = self.sound_net(log_mag_mix)

        # Pass through the synth net
        pred_masks = [self.synth_net(feat_frames[0], feat_sound)]

        # Get the input to the PIT stream
        # mean_feat_frames_pre = feat_frames_pre[0]
        # feat_pit_pre = [net(mean_feat_frames_pre) for net in self.pit_nets]
        # feat_pit = [torch.sigmoid(feat) for feat in feat_pit_pre]

        # Pass through the synth net for the PIT stream
        # pit_masks = [self.synth_net(feat, feat_sound) for feat in feat_pit]

        # Mean activation
        mean_act = torch.mean(torch.sigmoid(pred_masks[0]))
        # mean_pit_act = torch.mean(
        #     torch.sigmoid(pit_masks[0]) + torch.sigmoid(pit_masks[1])
        # )

        return {
            "pred_masks": pred_masks,
            # "pit_masks": pit_masks,
            "mag_mix": mag_mix,
            "weight": weight,
            "mean_act": mean_act,
            # "mean_pit_act": mean_pit_act,
        }
303
+
304
+
305
class ResnetDilated(nn.Module):
    """Wrap a ResNet backbone, replacing late-stage strides with dilation.

    With ``dilate_scale=8`` both layer3 and layer4 are converted; with 16
    only layer4 is. The classifier head (last two children) is dropped.
    """

    def __init__(self, orig_resnet, pool_type="maxpool", dilate_scale=16):
        super().__init__()
        self.pool_type = pool_type

        dilate_plan = {
            8: ((orig_resnet.layer3, 2), (orig_resnet.layer4, 4)),
            16: ((orig_resnet.layer4, 2),),
        }
        for layer, dilation in dilate_plan.get(dilate_scale, ()):
            layer.apply(functools.partial(self._nostride_dilate, dilate=dilation))

        # Keep everything except the final pooling/classifier modules.
        self.features = nn.Sequential(*list(orig_resnet.children())[:-2])

    def _nostride_dilate(self, m, dilate):
        """Turn a strided conv into a stride-1 dilated conv in place."""
        if "Conv" not in m.__class__.__name__:
            return
        if m.stride == (2, 2):
            # Strided conv: remove the stride, halve the dilation.
            m.stride = (1, 1)
            if m.kernel_size == (3, 3):
                m.dilation = (dilate // 2, dilate // 2)
                m.padding = (dilate // 2, dilate // 2)
        elif m.kernel_size == (3, 3):
            # Plain 3x3 conv: apply the full dilation.
            m.dilation = (dilate, dilate)
            m.padding = (dilate, dilate)

    def forward(self, x, pool=True):
        """Extract features; optionally global-pool to a (B, C) tensor."""
        x = self.features(x)
        if not pool:
            return x
        if self.pool_type == "avgpool":
            x = F.adaptive_avg_pool2d(x, 1)
        elif self.pool_type == "maxpool":
            x = F.adaptive_max_pool2d(x, 1)
        return x.view(x.size(0), x.size(1))
353
+
354
+
355
class UNetBlock(nn.Module):
    """A U-Net block that defines the submodule with skip connection.

    X ---------------------identity-------------------- X
      |-- downsampling --| submodule |-- upsampling --|

    Blocks are built inside-out: the innermost block has no submodule,
    middle blocks wrap the previously built block, and the outermost block
    takes the raw input channels and emits the final features without a
    skip concatenation.
    """

    def __init__(
        self,
        outer_nc,
        inner_input_nc,
        input_nc=None,
        submodule=None,
        outermost=False,
        innermost=False,
        use_dropout=False,
        inner_output_nc=None,
        noskip=False,
    ):
        # outer_nc: channels this block outputs (before skip concat).
        # inner_input_nc: channels produced by this block's downsampling conv.
        # input_nc: channels entering the block (defaults to outer_nc).
        # inner_output_nc: channels returned by the submodule; defaults to
        #   2*inner_input_nc because inner blocks concat their skip input.
        super().__init__()
        self.outermost = outermost
        self.noskip = noskip
        use_bias = False
        if input_nc is None:
            input_nc = outer_nc
        if innermost:
            # No submodule below, so no skip concat to double the channels.
            inner_output_nc = inner_input_nc
        elif inner_output_nc is None:
            inner_output_nc = 2 * inner_input_nc

        downrelu = nn.LeakyReLU(0.2, True)
        downnorm = nn.BatchNorm2d(inner_input_nc)
        uprelu = nn.ReLU(True)
        upnorm = nn.BatchNorm2d(outer_nc)
        # Upsampling is bilinear resize + 3x3 conv (not transposed conv).
        upsample = nn.Upsample(
            scale_factor=2, mode="bilinear", align_corners=True
        )

        if outermost:
            # Outermost: no pre-activation on the way down, no norm on the
            # way up; output is raw conv features.
            downconv = nn.Conv2d(
                input_nc,
                inner_input_nc,
                kernel_size=4,
                stride=2,
                padding=1,
                bias=use_bias,
            )
            upconv = nn.Conv2d(
                inner_output_nc, outer_nc, kernel_size=3, padding=1
            )

            down = [downconv]
            up = [uprelu, upsample, upconv]
            model = down + [submodule] + up
        elif innermost:
            # Innermost: bottleneck with no submodule in between.
            downconv = nn.Conv2d(
                input_nc,
                inner_input_nc,
                kernel_size=4,
                stride=2,
                padding=1,
                bias=use_bias,
            )
            upconv = nn.Conv2d(
                inner_output_nc,
                outer_nc,
                kernel_size=3,
                padding=1,
                bias=use_bias,
            )

            down = [downrelu, downconv]
            up = [uprelu, upsample, upconv, upnorm]
            model = down + up
        else:
            # Middle block: norm on both paths, wraps the given submodule.
            downconv = nn.Conv2d(
                input_nc,
                inner_input_nc,
                kernel_size=4,
                stride=2,
                padding=1,
                bias=use_bias,
            )
            upconv = nn.Conv2d(
                inner_output_nc,
                outer_nc,
                kernel_size=3,
                padding=1,
                bias=use_bias,
            )
            down = [downrelu, downconv, downnorm]
            up = [uprelu, upsample, upconv, upnorm]

            if use_dropout:
                model = down + [submodule] + up + [nn.Dropout(0.5)]
            else:
                model = down + [submodule] + up

        self.model = nn.Sequential(*model)

    def forward(self, x):
        # Outermost (or noskip) blocks return features directly; all others
        # concatenate the input as a skip connection along channels.
        if self.outermost or self.noskip:
            return self.model(x)
        else:
            return torch.cat([x, self.model(x)], 1)
461
+
462
+
463
class UNet(nn.Module):
    """A U-Net built by nesting :class:`UNetBlock` modules inside-out."""

    def __init__(
        self,
        in_dim=1,
        out_dim=64,
        num_downs=5,
        ngf=64,
        use_dropout=False,
    ):
        """Args:
            in_dim: Input channels.
            out_dim: Output feature channels.
            num_downs: Number of downsampling levels (>= 5).
            ngf: Base channel width.
            use_dropout: Enable dropout in the extra deep blocks.
        """
        super().__init__()

        # Innermost bottleneck block.
        block = UNetBlock(
            ngf * 8, ngf * 8, input_nc=None, submodule=None, innermost=True
        )
        # Extra constant-width levels for nets deeper than 5.
        for _ in range(num_downs - 5):
            block = UNetBlock(
                ngf * 8,
                ngf * 8,
                input_nc=None,
                submodule=block,
                use_dropout=use_dropout,
            )
        # Progressively widen toward the outside.
        for outer, inner in ((ngf * 4, ngf * 8), (ngf * 2, ngf * 4), (ngf, ngf * 2)):
            block = UNetBlock(outer, inner, input_nc=None, submodule=block)
        # Outermost block maps raw input channels to the output features.
        block = UNetBlock(
            out_dim,
            ngf,
            input_nc=in_dim,
            submodule=block,
            outermost=True,
        )

        self.bn0 = nn.BatchNorm2d(in_dim)
        self.unet_block = block

    def forward(self, x):
        """Normalize the input, then run the nested U-Net."""
        return self.unet_block(self.bn0(x))
512
+
513
+
514
class CondUNetBlock(nn.Module):
    """A U-Net block that defines the submodule with skip connection.

    X ---------------------identity-------------------- X
      |-- downsampling --| submodule |-- upsampling --|

    Unlike :class:`UNetBlock`, the forward pass threads a conditioning
    vector ``cond`` down to the innermost block, where it is broadcast
    spatially and concatenated onto the bottleneck features.
    """

    def __init__(
        self,
        outer_nc,
        inner_input_nc,
        input_nc=None,
        submodule=None,
        outermost=False,
        innermost=False,
        inner_output_nc=None,
        noskip=False,
        cond_nc=None,
    ):
        # cond_nc: width of the conditioning vector; required (and only
        # used) by the innermost block.
        super().__init__()
        self.outermost = outermost
        self.innermost = innermost
        self.noskip = noskip
        self.cond_nc = cond_nc
        self.submodule = submodule

        use_bias = False
        if input_nc is None:
            input_nc = outer_nc
        if innermost:
            assert cond_nc > 0
            # The bottleneck concatenates the conditioning channels.
            inner_output_nc = inner_input_nc + cond_nc
        elif inner_output_nc is None:
            inner_output_nc = 2 * inner_input_nc

        self.downnorm = nn.BatchNorm2d(inner_input_nc)
        self.uprelu = nn.ReLU(True)
        # Bilinear resize + 3x3 conv upsampling (not transposed conv).
        self.upsample = nn.Upsample(
            scale_factor=2, mode="bilinear", align_corners=True
        )

        if outermost:
            # Outermost: plain strided conv down, un-normalized conv up.
            self.downconv = nn.Conv2d(
                input_nc,
                inner_input_nc,
                kernel_size=4,
                stride=2,
                padding=1,
                bias=use_bias,
            )
            self.upconv = nn.Conv2d(
                inner_output_nc, outer_nc, kernel_size=3, padding=1
            )

        elif innermost:
            # Innermost: pre-activation down, normalized up; accepts the
            # widened (features + cond) channel count on the way up.
            self.downrelu = nn.LeakyReLU(0.2, True)
            self.downconv = nn.Conv2d(
                input_nc,
                inner_input_nc,
                kernel_size=4,
                stride=2,
                padding=1,
                bias=use_bias,
            )
            self.upconv = nn.Conv2d(
                inner_output_nc,
                outer_nc,
                kernel_size=3,
                padding=1,
                bias=use_bias,
            )
            self.upnorm = nn.BatchNorm2d(outer_nc)

        else:
            # Middle block: norm on both paths, wraps the given submodule.
            self.downrelu = nn.LeakyReLU(0.2, True)
            self.downconv = nn.Conv2d(
                input_nc,
                inner_input_nc,
                kernel_size=4,
                stride=2,
                padding=1,
                bias=use_bias,
            )
            self.upconv = nn.Conv2d(
                inner_output_nc,
                outer_nc,
                kernel_size=3,
                padding=1,
                bias=use_bias,
            )
            self.upnorm = nn.BatchNorm2d(outer_nc)

    def forward(self, x, cond):
        # ``cond`` is forwarded unchanged through every level and consumed
        # only by the innermost block.
        if self.outermost:
            x_ = self.downconv(x)
            x_ = self.submodule(x_, cond)
            x_ = self.upconv(self.upsample(self.uprelu(x_)))

        elif self.innermost:
            x_ = self.downconv(self.downrelu(x))

            # Broadcast the conditioning vector over the spatial grid and
            # concatenate it onto the bottleneck features.
            B, _, H, W = x_.size()
            cond_ = cond.unsqueeze(-1).unsqueeze(-1) * torch.ones(
                (B, self.cond_nc, H, W), device=x_.device
            )
            x_ = torch.concat((x_, cond_), 1)

            x_ = self.upnorm(self.upconv(self.upsample(self.uprelu(x_))))

        else:
            x_ = self.downnorm(self.downconv(self.downrelu(x)))
            x_ = self.submodule(x_, cond)
            x_ = self.upnorm(self.upconv(self.upsample(self.uprelu(x_))))

        # Outermost (or noskip) blocks return features directly; all others
        # concatenate the input as a skip connection along channels.
        if self.outermost or self.noskip:
            return x_
        else:
            return torch.cat([x, x_], 1)
633
+
634
+
635
class CondUNet(nn.Module):
    """A conditional U-Net: like :class:`UNet`, but every block forwards a
    conditioning vector that is injected at the bottleneck."""

    def __init__(
        self,
        in_dim=1,
        out_dim=64,
        cond_dim=32,
        num_downs=5,
        ngf=64,
        use_dropout=False,
    ):
        """Args:
            in_dim: Input channels.
            out_dim: Output feature channels.
            cond_dim: Width of the conditioning vector.
            num_downs: Number of downsampling levels (>= 5).
            ngf: Base channel width.
            use_dropout: Unused here; kept for signature parity with UNet.
        """
        super().__init__()

        # Innermost block receives the conditioning channels.
        block = CondUNetBlock(
            ngf * 8,
            ngf * 8,
            input_nc=None,
            submodule=None,
            innermost=True,
            cond_nc=cond_dim,
        )
        # Extra constant-width levels for nets deeper than 5.
        for _ in range(num_downs - 5):
            block = CondUNetBlock(
                ngf * 8, ngf * 8, input_nc=None, submodule=block
            )
        # Progressively widen toward the outside.
        for outer, inner in ((ngf * 4, ngf * 8), (ngf * 2, ngf * 4), (ngf, ngf * 2)):
            block = CondUNetBlock(outer, inner, input_nc=None, submodule=block)
        # Outermost block maps raw input channels to the output features.
        block = CondUNetBlock(
            out_dim,
            ngf,
            input_nc=in_dim,
            submodule=block,
            outermost=True,
        )

        self.bn0 = nn.BatchNorm2d(in_dim)
        self.unet_block = block

    def forward(self, x, cond):
        """Normalize the input, then run the conditioned U-Net."""
        return self.unet_block(self.bn0(x), cond)
686
+
687
+
688
class InnerProd(nn.Module):
    """Scaled inner product between a conditioning vector and a feature map.

    Produces a single-channel map ``z[b, 0, h, w] = <img[b] * scale,
    sound[b, :, h, w]> + bias`` with a learnable per-channel scale.
    """

    def __init__(self, fc_dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(fc_dim))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, feat_img, feat_sound):
        """Project (B, C) image features onto (B, C, H, W) sound features."""
        B, C = feat_sound.size(0), feat_sound.size(1)
        spatial = feat_sound.size()[2:]
        query = feat_img.view(B, 1, C) * self.scale
        out = torch.bmm(query, feat_sound.view(B, C, -1))
        return out.view(B, 1, *spatial) + self.bias

    def forward_nosum(self, feat_img, feat_sound):
        """Channel-wise gating without summing over channels -> (B, C, H, W)."""
        B, C, _, _ = feat_sound.size()
        gate = (feat_img.view(B, C) * self.scale).view(B, C, 1, 1)
        return gate * feat_sound + self.bias

    # inference purposes
    def forward_pixelwise(self, feats_img, feat_sound):
        """All-pairs products between image pixels and sound bins
        -> (B, HI, WI, HS, WS)."""
        B, C, HI, WI = feats_img.size()
        _, _, HS, WS = feat_sound.size()
        pixels = feats_img.view(B, C, HI * WI).transpose(1, 2) * self.scale
        out = torch.bmm(pixels, feat_sound.view(B, C, HS * WS))
        return out.view(B, HI, WI, HS, WS) + self.bias
723
+
724
+
725
class Bias(nn.Module):
    """Inner-product synthesizer with only a learnable scalar bias
    (no per-channel scale, unlike :class:`InnerProd`)."""

    def __init__(self):
        super().__init__()
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, feat_img, feat_sound):
        """Project (B, C) image features onto (B, C, H, W) sound features."""
        B, C, H, W = feat_sound.size()
        query = feat_img.view(B, 1, C)
        out = torch.bmm(query, feat_sound.view(B, C, H * W))
        return out.view(B, 1, H, W) + self.bias

    def forward_nosum(self, feat_img, feat_sound):
        """Channel-wise product without summing -> (B, C, H, W)."""
        B, C, _, _ = feat_sound.size()
        return feat_img.view(B, C, 1, 1) * feat_sound + self.bias

    # inference purposes
    def forward_pixelwise(self, feats_img, feat_sound):
        """All-pairs products between image pixels and sound bins
        -> (B, HI, WI, HS, WS)."""
        B, C, HI, WI = feats_img.size()
        _, _, HS, WS = feat_sound.size()
        pixels = feats_img.view(B, C, HI * WI).transpose(1, 2)
        out = torch.bmm(pixels, feat_sound.view(B, C, HS * WS))
        return out.view(B, HI, WI, HS, WS) + self.bias
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ librosa==0.9.2
2
+ numba==0.56.2
3
+ mir_eval==0.7
4
+ opencv-python
5
+ museval==0.4.0
6
+ pydub
7
+ gradio
8
+ imagebind @ git+https://github.com/facebookresearch/ImageBind.git
utils.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility functions."""
2
+ import contextlib
3
+ import csv
4
+ import json
5
+ import os
6
+ import pathlib
7
+ import subprocess as sp
8
+ import warnings
9
+ from threading import Timer
10
+
11
+ import cv2
12
+ import librosa
13
+ import numpy as np
14
+
15
+
16
def save_args(filename, args):
    """Serialize parsed command-line arguments to a JSON file.

    Path values and the path lists under ``train_list``/``val_list`` are
    converted to strings first, since JSON cannot encode Path objects.
    """
    serializable = {}
    for key, value in vars(args).items():
        if isinstance(value, pathlib.Path):
            serializable[key] = str(value)
        elif key in ("train_list", "val_list"):
            serializable[key] = [str(v) for v in value]
        else:
            serializable[key] = value
    save_json(filename, serializable)
27
+
28
+
29
def inverse_dict(d):
    """Return a dictionary mapping each value of *d* back to its key."""
    return dict(zip(d.values(), d.keys()))
32
+
33
+
34
def save_txt(filename, data):
    """Save a list to a TXT file, one item per line."""
    with open(filename, "w", encoding="utf8") as f:
        f.writelines(f"{item}\n" for item in data)
39
+
40
+
41
def load_txt(filename):
    """Load a TXT file as a list of whitespace-stripped lines."""
    with open(filename, encoding="utf8") as f:
        lines = f.readlines()
    return [line.strip() for line in lines]
45
+
46
+
47
def save_json(filename, data):
    """Write *data* to *filename* as JSON."""
    with open(filename, "w", encoding="utf8") as f:
        f.write(json.dumps(data))
51
+
52
+
53
def load_json(filename):
    """Read and parse a JSON file."""
    with open(filename, encoding="utf8") as f:
        content = f.read()
    return json.loads(content)
57
+
58
+
59
def save_csv(filename, data, fmt="%d", header=""):
    """Write *data* to a CSV file via ``numpy.savetxt``.

    ``comments=""`` keeps the header line from being prefixed with '#'.
    """
    np.savetxt(
        filename,
        data,
        fmt=fmt,
        delimiter=",",
        header=header,
        comments="",
    )
64
+
65
+
66
def load_csv(filename, skiprows=1):
    """Load integer data from a CSV file.

    The first *skiprows* rows (a header, by default) are ignored.
    """
    return np.loadtxt(
        filename,
        dtype=int,
        delimiter=",",
        skiprows=skiprows,
    )
69
+
70
+
71
def load_csv_text(filename, headerless=True):
    """Read a CSV file into a list of rows.

    With ``headerless=True`` each row is a plain list of strings;
    otherwise the first row is treated as a header and each following row
    becomes a dict keyed by the header fields.
    """
    with open(filename) as f:
        if headerless:
            return list(csv.reader(f))
        reader = csv.DictReader(f)
        rows = []
        for row in reader:
            rows.append({field: row[field] for field in reader.fieldnames})
        return rows
81
+
82
+
83
def ignore_exceptions(func):
    """Decorator that ignores all errors and warnings.

    The wrapped function returns ``None`` whenever the underlying call
    raises, and all warnings emitted during the call are suppressed.
    """
    import functools

    # functools.wraps preserves the wrapped function's name/docstring,
    # which the original decorator discarded.
    @functools.wraps(func)
    def inner(*args, **kwargs):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                return func(*args, **kwargs)
            except Exception:
                return None

    return inner
95
+
96
+
97
def suppress_outputs(func):
    """Decorator that suppresses writing to stdout and stderr.

    The original implementation opened ``os.devnull`` without ever
    closing it, leaking one file handle per call; a ``with`` block now
    closes the sink when the call returns or raises.
    """

    def inner(*args, **kwargs):
        with open(os.devnull, "w") as devnull:
            with contextlib.redirect_stdout(devnull):
                with contextlib.redirect_stderr(devnull):
                    return func(*args, **kwargs)

    return inner
107
+
108
+
109
def resolve_paths(func):
    """Decorator that expands '~' and resolves every Path attribute of
    the parsed-arguments object returned by *func*."""

    def inner(*args, **kwargs):
        parsed = func(*args, **kwargs)
        for key, value in vars(parsed).items():
            if isinstance(value, pathlib.Path):
                setattr(parsed, key, value.expanduser().resolve())
        return parsed

    return inner
122
+
123
+
124
def warpgrid(bs, HO, WO, warp=True):
    """Build a (bs, HO, WO, 2) float32 sampling grid in [-1, 1] coords.

    The x axis is linear; the y axis is stretched exponentially when
    ``warp=True`` and mapped back logarithmically when ``warp=False``.
    """
    xs = np.linspace(-1, 1, WO)
    ys = np.linspace(-1, 1, HO)
    xv, yv = np.meshgrid(xs, ys)
    if warp:
        # Exponential stretch of the y axis (maps -1 -> -1 and 1 -> 1).
        yw = (np.power(21, (yv + 1) / 2) - 11) / 10
    else:
        # Logarithmic inverse of the stretch above.
        yw = np.log(yv * 10 + 11) / np.log(21) * 2 - 1
    grid = np.empty((bs, HO, WO, 2), dtype=np.float32)
    grid[..., 0] = xv  # broadcast over the batch dimension
    grid[..., 1] = yw
    return grid
139
+
140
+
141
class AverageMeter(object):
    """Computes and stores the average and current value."""

    def __init__(self):
        self.initialized = False
        self.val = None    # most recent value (numpy array)
        self.avg = None    # running weighted average
        self.sum = None    # weighted sum of values
        self.count = None  # total weight seen so far

    def initialize(self, val, weight):
        """Seed the meter with its first observation."""
        self.val = val
        self.avg = val
        self.sum = val * weight
        self.count = weight
        self.initialized = True

    def update(self, val, weight=1):
        """Record *val* with the given *weight*."""
        val = np.asarray(val)
        if self.initialized:
            self.add(val, weight)
        else:
            self.initialize(val, weight)

    def add(self, val, weight):
        """Fold *val* into the running statistics."""
        self.val = val
        self.sum = self.sum + val * weight
        self.count = self.count + weight
        self.avg = self.sum / self.count

    def value(self):
        """Latest value as a Python scalar/list (0.0 before any update)."""
        return 0.0 if self.val is None else self.val.tolist()

    def average(self):
        """Running average as a Python scalar/list (0.0 if empty)."""
        return 0.0 if self.avg is None else self.avg.tolist()
183
+
184
def recover_rgb(img):
    """Undo ImageNet-style normalization on a CHW tensor and return an
    HWC uint8 image. The input tensor is modified in place."""
    means = [0.485, 0.456, 0.406]
    stds = [0.229, 0.224, 0.225]
    for channel, mean, std in zip(img, means, stds):
        channel.mul_(std).add_(mean)
    return (img.numpy().transpose((1, 2, 0)) * 255).astype(np.uint8)
189
+
190
+
191
def recover_rgb_clip(img):
    """Undo CLIP-style normalization on a CHW tensor and return an HWC
    uint8 image. The input tensor is modified in place."""
    means = [0.48145466, 0.4578275, 0.40821073]
    stds = [0.26862954, 0.26130258, 0.27577711]
    for channel, mean, std in zip(img, means, stds):
        channel.mul_(std).add_(mean)
    return (img.numpy().transpose((1, 2, 0)) * 255).astype(np.uint8)
200
+
201
+
202
def magnitude2heatmap(mag, log=True, scale=200.0):
    """Map a magnitude array to an RGB heatmap image (uint8, HWC).

    NOTE(review): when ``log`` is False, ``mag *= scale`` mutates the
    caller's array in place — confirm callers do not reuse the input.
    """
    if log:
        # Compress dynamic range; +1 keeps log10 defined at zero.
        mag = np.log10(mag + 1.0)
    mag *= scale
    mag[mag > 255] = 255  # clip into the uint8 range
    mag = mag.astype(np.uint8)
    # mag_color = cv2.applyColorMap(mag, cv2.COLORMAP_JET)
    mag_color = cv2.applyColorMap(mag, cv2.COLORMAP_INFERNO)
    mag_color = mag_color[:, :, ::-1]  # OpenCV BGR -> RGB
    return mag_color
212
+
213
+
214
def istft_reconstruction(mag, phase, hop_len, win_len):
    """Reconstruct a waveform from magnitude and phase spectrograms.

    Returns a float32 waveform clipped to [-1, 1].
    """
    # np.complex (an alias of builtin complex) was removed in NumPy 1.24;
    # np.complex128 is the dtype it resolved to.
    spec = mag.astype(np.complex128) * np.exp(1j * phase)
    wav = librosa.istft(spec, hop_length=hop_len, win_length=win_len)
    return np.clip(wav, -1.0, 1.0).astype(np.float32)
218
+
219
+
220
class VideoWriter:
    """ Combine numpy frames into video using ffmpeg

    Arguments:
        filename: name of the output video
        fps: frame per second
        shape: shape of video frame (height, width)

    Properties:
        add_frame(frame):
            add a frame to the video
        add_frames(frames):
            add multiple frames to the video
        release():
            release writing pipe

    """

    def __init__(self, filename, fps, shape):
        self.file = filename
        self.fps = fps
        self.shape = shape

        # video codec, chosen from the output file extension
        ext = filename.split(".")[-1]
        if ext == "mp4":
            self.vcodec = "h264"
        else:
            # fixed typo in the original error message ("supoorted")
            raise RuntimeError("Video codec not supported.")

        # video writing pipe: raw RGB frames in, encoded video out
        cmd = [
            "ffmpeg",
            "-y",  # overwrite existing file
            "-f",
            "rawvideo",  # file format
            "-s",
            "{}x{}".format(shape[1], shape[0]),  # size of one frame
            "-pix_fmt",
            "rgb24",  # 3 channels
            "-r",
            str(self.fps),  # frames per second
            "-i",
            "-",  # input comes from a pipe
            "-an",  # not to expect any audio
            "-vcodec",
            self.vcodec,  # video codec
            "-pix_fmt",
            "yuv420p",  # output video in yuv420p
            self.file,
        ]

        self.pipe = sp.Popen(
            cmd, stdin=sp.PIPE, stderr=sp.PIPE, bufsize=10 ** 9
        )

    def release(self):
        """Close ffmpeg's stdin so it can finalize the output file."""
        self.pipe.stdin.close()

    def add_frame(self, frame):
        """Write one (H, W, 3) RGB frame to the encoder pipe."""
        assert len(frame.shape) == 3
        assert frame.shape[0] == self.shape[0]
        assert frame.shape[1] == self.shape[1]
        try:
            # tostring() was deprecated and removed from numpy;
            # tobytes() is the byte-identical replacement.
            self.pipe.stdin.write(frame.tobytes())
        except Exception:  # e.g. broken pipe — surface ffmpeg's error
            _, ffmpeg_error = self.pipe.communicate()
            print(ffmpeg_error)

    def add_frames(self, frames):
        """Write an iterable of frames in order."""
        for frame in frames:
            self.add_frame(frame)
292
+
293
+
294
def kill_proc(proc):
    """Kill *proc* and report that it exceeded its time budget."""
    proc.kill()
    print("Process running overtime! Killed.")
297
+
298
+
299
def run_proc_timeout(proc, timeout_sec):
    """Wait for *proc*, killing it if it runs past *timeout_sec* seconds."""
    watchdog = Timer(timeout_sec, kill_proc, [proc])
    try:
        watchdog.start()
        proc.communicate()
    finally:
        # Always disarm the watchdog, even if communicate() raises.
        watchdog.cancel()
307
+
308
+
309
def combine_video_audio(src_video, src_audio, dst_video, verbose=False):
    """Mux *src_video* and *src_audio* into *dst_video* via ffmpeg.

    The video stream is copied, the audio stream is re-encoded to AAC.
    Failures are reported on stdout instead of raising.
    """
    try:
        cmd = [
            "ffmpeg", "-y", "-loglevel", "quiet",
            "-i", src_video,
            "-i", src_audio,
            "-c:v", "copy",
            "-c:a", "aac",
            "-strict", "experimental",
            dst_video,
        ]
        proc = sp.Popen(cmd)
        # Give ffmpeg at most 10 seconds before it is killed.
        run_proc_timeout(proc, 10.0)

        if verbose:
            print("Processed:{}".format(dst_video))
    except Exception as e:
        print("Error:[{}] {}".format(dst_video, e))
335
+
336
+
337
# save video to the disk using ffmpeg
def save_video(path, tensor, fps=25):
    """Write a (L, H, W, C) numpy frame array to *path* as a video."""
    assert tensor.ndim == 4, "video should be in 4D numpy array"
    _, height, width, _ = tensor.shape
    writer = VideoWriter(path, fps=fps, shape=[height, width])
    # Iterating a 4D array yields one (H, W, C) frame per step.
    writer.add_frames(tensor)
    writer.release()
345
+
346
+
347
+ def save_audio(path, audio_numpy, sr):
348
+ librosa.output.write_wav(path, audio_numpy, sr)