Unify duration scale: use latent token count for both training and inference
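This commit moves all duration bookkeeping onto a single scale: latent token counts. Predictions made in seconds are converted exactly once via `latent_token_rate`, and sequence lengths are obtained by summing per-token counts rather than re-rounding seconds at each call site. A minimal sketch of the convention (`latent_token_rate`, the number of latent tokens per second, is taken from the code; the helper itself is illustrative):

import torch

def to_latent_tokens(duration_sec: torch.Tensor, latent_token_rate: float) -> torch.Tensor:
    # Seconds -> latent tokens: round once, at the boundary.
    return torch.round(duration_sec * latent_token_rate)

# Local (per-token) durations are already token counts, so a sequence
# length is just their integer sum, which accumulates no rounding error:
#     latent_length = local_latent_duration.sum(1)

Previously the local path converted back to seconds (`local_pred.sum(1) / self.latent_token_rate`) while call sites rounded seconds to tokens on their own, so training (sum of local durations) and inference (predicted global duration) could disagree.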
models/flow_matching.py  CHANGED  (+45 -25)
@@ -9,6 +9,7 @@ import copy
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.nn import init
 
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers import FlowMatchEulerDiscreteScheduler
@@ -475,6 +476,7 @@ class DurationAdapterMixin:
         pred = torch.exp(pred) * mask
         pred = torch.ceil(pred) - self.offset
         pred *= self.frame_resolution
+        pred = torch.round(pred * self.latent_token_rate)
         return pred
 
     def prepare_global_duration(
@@ -489,11 +491,10 @@ class DurationAdapterMixin:
             local_pred: predicted latent length
         """
         global_pred = torch.exp(global_pred) - self.offset
-        result = global_pred
+        result = torch.round(global_pred * self.latent_token_rate)
         # avoid error accumulation for each frame
         if use_local:
-            pred_from_local = local_pred
-            pred_from_local = pred_from_local.sum(1) / self.latent_token_rate
+            pred_from_local = local_pred.sum(1)
             result[is_time_aligned] = pred_from_local[is_time_aligned]
 
         return result
@@ -503,20 +504,18 @@ class DurationAdapterMixin:
         x: torch.Tensor,
         content_mask: torch.Tensor,
         local_duration: torch.Tensor,
-        global_duration: torch.Tensor
+        global_duration: torch.Tensor,
     ):
-
-        if
-            latent_length =
-
-        )
-        else:
-            latent_length = n_latents.sum(1)
+        training = getattr(self, 'training', False)
+        if not training:  # inference mode
+            latent_length = global_duration
+        else:  # training mode
+            latent_length = local_duration.sum(1)
         latent_mask = create_mask_from_length(latent_length).to(
             content_mask.device
         )
         attn_mask = content_mask.unsqueeze(-1) * latent_mask.unsqueeze(1)
-        align_path = create_alignment_path(
+        align_path = create_alignment_path(local_duration, attn_mask)
         expanded_x = torch.matmul(align_path.transpose(1, 2).to(x.dtype), x)
         return expanded_x, latent_mask
 
@@ -665,14 +664,12 @@ class CrossAttentionAudioFlowMatching(
         )
 
         # prepare global duration
-        global_duration = self.prepare_global_duration(
+        latent_length = self.prepare_global_duration(
             global_duration_pred,
             local_duration_pred,
             is_time_aligned,
             use_local=False
         )
-        # TODO: manually set duration for SE and AudioSR
-        latent_length = torch.round(global_duration * self.latent_token_rate)
         task_mask = torch.as_tensor([t in SAME_LENGTH_TASKS for t in task])
         latent_length[task_mask] = content[task_mask].size(1)
         latent_mask = create_mask_from_length(latent_length).to(device)
@@ -735,7 +732,8 @@ class DummyContentAudioFlowMatching(CrossAttentionAudioFlowMatching):
         duration_offset: float = 1.0,
         cfg_drop_ratio: float = 0.2,
         sample_strategy: str = 'normal',
-        num_train_steps: int = 1000
+        num_train_steps: int = 1000,
+        task_weights: dict | None = None,
     ):
 
         super().__init__(
@@ -758,6 +756,7 @@ class DummyContentAudioFlowMatching(CrossAttentionAudioFlowMatching):
         )
         self.dummy_nta_embed = nn.Parameter(torch.zeros(content_dim))
         self.dummy_ta_embed = nn.Parameter(torch.zeros(content_dim))
+        self.task_weights = task_weights
 
     def get_backbone_input(
         self, target_length: int, content: torch.Tensor,
@@ -808,7 +807,12 @@ class DummyContentAudioFlowMatching(CrossAttentionAudioFlowMatching):
         **kwargs
     ):
         device = self.dummy_param.device
-        loss_reduce = self.training or (loss_reduce and not self.training)
+        if self.training:
+            if self.task_weights:
+                loss_reduce = False
+            else:
+                loss_reduce = True
+        # loss_reduce = self.training or (loss_reduce and not self.training)
 
         self.autoencoder.eval()
         with torch.no_grad():
@@ -859,10 +863,12 @@ class DummyContentAudioFlowMatching(CrossAttentionAudioFlowMatching):
             duration = F.pad(
                 duration, (0, content_mask.size(1) - duration.size(1))
             )
+            local_latent_duration = torch.round(duration * self.latent_token_rate)
             time_aligned_content, _ = self.expand_by_duration(
                 x=content[:, :trunc_ta_length],
                 content_mask=ta_content_mask,
-                local_duration=duration,
+                local_duration=local_latent_duration,
+                global_duration=latent_mask.sum(1),
             )
 
         # --------------------------------------------------------------------
@@ -899,6 +905,16 @@ class DummyContentAudioFlowMatching(CrossAttentionAudioFlowMatching):
         target = target.transpose(1, self.autoencoder.time_dim)
         diff_loss = F.mse_loss(pred, target, reduction="none")
         diff_loss = loss_with_mask(diff_loss, latent_mask, reduce=loss_reduce)
+
+        if self.training and self.task_weights:
+            loss_weights = torch.tensor([self.task_weights[t] for t in task],
+                                        device=device)
+            diff_loss = (diff_loss * loss_weights).sum() / loss_weights.sum()
+            local_duration_loss = (local_duration_loss *
+                                   loss_weights).sum() / loss_weights.sum()
+            global_duration_loss = (global_duration_loss *
+                                    loss_weights).sum() / loss_weights.sum()
+
         return {
             "diff_loss": diff_loss,
             "local_duration_loss": local_duration_loss,
@@ -939,17 +955,21 @@ class DummyContentAudioFlowMatching(CrossAttentionAudioFlowMatching):
         trunc_ta_length = content.size(1)
 
         # prepare local duration
-        local_duration = self.prepare_local_duration(
+        local_latent_duration = self.prepare_local_duration(
            local_duration_pred, content_mask
        )
-        local_duration = local_duration[:, :trunc_ta_length]
+        local_latent_duration = local_latent_duration[:, :trunc_ta_length]
         # use ground truth duration
         if use_gt_duration and "duration" in kwargs:
-            local_duration = torch.as_tensor(kwargs["duration"]).to(device)
+            local_latent_duration = torch.as_tensor(kwargs["duration"]
+                                                    ).to(device)
+            local_latent_duration = torch.round(
+                local_latent_duration * self.latent_token_rate
+            )
 
         # prepare global duration
-        global_duration = self.prepare_global_duration(
-            global_duration_pred,
+        global_latent_duration = self.prepare_global_duration(
+            global_duration_pred, local_latent_duration, is_time_aligned
         )
 
         # --------------------------------------------------------------------
@@ -958,8 +978,8 @@ class DummyContentAudioFlowMatching(CrossAttentionAudioFlowMatching):
         time_aligned_content, latent_mask = self.expand_by_duration(
             x=content[:, :trunc_ta_length],
             content_mask=content_mask[:, :trunc_ta_length],
-            local_duration=local_duration,
-            global_duration=global_duration,
+            local_duration=local_latent_duration,
+            global_duration=global_latent_duration,
         )
 
         context, context_mask, time_aligned_content = self.get_backbone_input(
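For context on `expand_by_duration`: it builds a hard alignment path from the per-token latent durations and expands content with a matrix product (`align_path.transpose(1, 2) @ x`). The diff only shows the call `create_alignment_path(local_duration, attn_mask)`; the sketch below is a hypothetical implementation of the usual cumulative-sum construction of such a monotonic path, not the repo's actual code:

import torch
import torch.nn.functional as F

def create_alignment_path(duration: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor:
    # duration:  (B, T_content) integer latent-token counts per content token
    # attn_mask: (B, T_content, T_latent) validity mask
    cum = torch.cumsum(duration, dim=1)                      # (B, T_content)
    grid = torch.arange(attn_mask.size(2), device=duration.device)
    # path[b, i, j] = 1 while j < cum[b, i] ...
    path = (grid[None, None, :] < cum[:, :, None]).to(attn_mask.dtype)
    # ... then keep only the band for token i by subtracting token i-1's rows
    path = path - F.pad(path, (0, 0, 1, 0))[:, :-1]
    return path * attn_mask

With this construction, `matmul(path.transpose(1, 2), x)` repeats each content vector exactly `duration[i]` times, which is why the latent length and `local_duration.sum(1)` must agree on the same token scale; that is the invariant this commit enforces.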
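The second change in this commit is optional per-task loss weighting. When `task_weights` is set, `loss_with_mask(..., reduce=False)` is expected to return one loss per batch item, and each loss term is then reduced as a weighted mean over the batch. A standalone sketch of that reduction (the weighted-mean arithmetic is from the diff; the helper name and example weights are hypothetical):

import torch

def weighted_batch_mean(per_sample_loss: torch.Tensor,
                        tasks: list[str],
                        task_weights: dict[str, float]) -> torch.Tensor:
    # per_sample_loss: (B,) losses, masking already applied per item
    w = torch.tensor([task_weights[t] for t in tasks],
                     device=per_sample_loss.device)
    return (per_sample_loss * w).sum() / w.sum()

# Hypothetical usage, e.g. down-weighting an over-represented task:
#     loss = weighted_batch_mean(diff_loss, task, {"tts": 1.0, "se": 0.3})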