Spaces:

ASLP-lab
/

YingMusic-Singer-Plus

Running on Zero

YingMusic-Singer-Plus / src /YingMusicSinger /melody /SmoothMelody.py

64ec292 25 days ago

4.98 kB

	import torch
	import torch.nn as nn


	class MIDIFuzzDisturb(nn.Module):
	    """Applies fuzzing perturbations to MIDI latent representations.

	    The raw MIDI teacher model output preserves good prosody but causes
	    pronunciation interference. This module mitigates that by applying
	    blur, temporal dropout, and noise to the melody latent.

	    Args:
	        dim: Expected size of the last input dimension; checked in ``forward``.
	        drop_prob: For ``drop_type="random"``, a float threshold: a frame is
	            kept only when a uniform sample in [0, 1) is greater than it
	            (0 disables dropping). For ``drop_type="equal_space"``, a
	            ``[drop, keep]`` pair of frame counts, e.g. ``[1, 1]`` means
	            1 frame dropped, then 1 frame kept, repeating.
	        noise_scale: Multiplier applied to standard-normal noise that is added
	            to the output (0 disables noise).
	        blur_kernel: Odd window size of the average pooling applied along the
	            time axis (0 disables blurring).
	        drop_type: Either ``"random"`` or ``"equal_space"``.

	    Raises:
	        ValueError: If ``drop_type`` is not one of the known modes.
	        AssertionError: If ``drop_prob`` or ``drop_type`` is None, or if a
	            non-zero ``blur_kernel`` is not odd.
	    """

	    def __init__(
	        self, dim=128, drop_prob=0.3, noise_scale=0.1, blur_kernel=3, drop_type="random"
	    ):
	        super().__init__()
	        # Each perturbation stays None when disabled; forward() skips it then.
	        self.blur = None
	        self.drop_prob = None
	        self.noise_scale = None
	        self.dim = dim
	        self.drop_type = drop_type

	        assert drop_prob is not None
	        assert drop_type is not None
	        if drop_type == "random":
	            # drop_prob is a float; 0 leaves dropping disabled
	            if drop_prob != 0:
	                self.drop_prob = drop_prob
	        elif drop_type == "equal_space":
	            # drop_prob is a [drop, keep] list, e.g., [1, 1] means 1 frame drop, 1 frame keep
	            self.drop_prob = drop_prob
	        else:
	            raise ValueError(f"Unknown drop_type: {drop_type}")

	        if noise_scale != 0:
	            self.noise_scale = noise_scale
	        if blur_kernel != 0:
	            assert blur_kernel % 2 == 1, f"blur_kernel {blur_kernel} must be odd"
	            # stride=1 with padding=kernel//2 keeps the sequence length unchanged.
	            self.blur = nn.AvgPool1d(
	                kernel_size=blur_kernel, stride=1, padding=blur_kernel // 2
	            )

	    def _create_equal_space_mask(self, batch_size, seq_len, device):
	        """Create an equally-spaced mask cycling [drop, keep] frames.

	        Args:
	            batch_size: Size of the leading (batch) dimension of the result.
	            seq_len: Number of frames the mask must cover.
	            device: Device on which the mask is allocated.

	        Returns:
	            A ``[batch_size, seq_len, 1]`` tensor of zeros (drop) and ones
	            (keep), starting with the dropped frames. The batch dimension is
	            an expanded view, so every batch item shares the same pattern.

	        Raises:
	            ValueError: If the drop and keep counts do not sum to a positive
	                cycle length.
	        """
	        drop_frames, keep_frames = self.drop_prob
	        cycle_len = drop_frames + keep_frames
	        if cycle_len <= 0:
	            # Without this guard the ceil-division below fails with an
	            # uninformative ZeroDivisionError.
	            raise ValueError(
	                "equal_space drop_prob must contain a positive number of "
	                f"frames per cycle, got {self.drop_prob}"
	            )

	        # Pattern: first drop_frames are 0 (drop), next keep_frames are 1 (keep)
	        pattern = torch.cat(
	            [
	                torch.zeros(drop_frames, device=device),
	                torch.ones(keep_frames, device=device),
	            ]
	        )

	        # Repeat pattern (ceil division) to cover the full sequence length
	        num_repeats = (seq_len + cycle_len - 1) // cycle_len
	        mask = pattern.repeat(num_repeats)[:seq_len]  # [T]

	        # Expand to [B, T, 1]
	        mask = mask.view(1, seq_len, 1).expand(batch_size, -1, -1)

	        return mask

	    def forward(self, x):
	        """Apply sigmoid, then the enabled blur, frame dropping and noise.

	        The perturbations run on every call; this method does not consult
	        ``self.training``.

	        Args:
	            x: ``[B, T, D]`` pre-sigmoid logits with ``D == self.dim``.

	        Returns:
	            Tensor with the same shape and dtype as ``x``.

	        Raises:
	            AssertionError: If the last dimension of ``x`` differs from ``self.dim``.
	            ValueError: If ``self.drop_type`` is not a known mode, or the
	                equal-space pattern has a non-positive cycle length.
	        """
	        # x: [B, T, D=128], pre-sigmoid logits
	        x = torch.sigmoid(x)

	        assert x.shape[-1] == self.dim, (
	            f"MIDIFuzzDisturb: expected dim={self.dim}, got {x.shape[-1]}"
	        )

	        if self.blur is not None:
	            # AvgPool1d pools over the last axis, so move time there and back.
	            x = self.blur(x.transpose(1, 2)).transpose(1, 2)

	        if self.drop_prob:
	            if self.drop_type == "random":
	                # One keep/drop decision per frame, shared across all D channels.
	                time_mask = (
	                    torch.rand(x.shape[0], x.shape[1], 1, device=x.device)
	                    > self.drop_prob
	                )
	            elif self.drop_type == "equal_space":
	                time_mask = self._create_equal_space_mask(
	                    x.shape[0], x.shape[1], x.device
	                )
	            else:
	                raise ValueError(f"Unknown drop_type: {self.drop_type}")
	            # Cast the mask to x's dtype: a float32 mask would promote
	            # float16/bfloat16 activations to float32.
	            x = x * time_mask.to(x.dtype)

	        if self.noise_scale:
	            x = x + torch.randn_like(x) * self.noise_scale

	        return x


	class MIDIDigitalEmbedding(nn.Module):
	    """Looks up learned embeddings for quantized continuous MIDI values.

	    Each value is multiplied by ``mark_distinguish_scale``, rounded to the
	    nearest integer, clamped into the embedding table's index range, and
	    used as the row index of an ``nn.Embedding``.
	    """

	    def __init__(self, embed_dim=128, num_classes=128, mark_distinguish_scale=2):
	        super().__init__()

	        # Two extra classes are reserved on top of the [0, 127] input range
	        # for special tokens.
	        total_classes = num_classes + 2
	        table_rows = total_classes * mark_distinguish_scale

	        self.num_classes = total_classes
	        self.mark_distinguish_scale = mark_distinguish_scale
	        self.embedding_input_num_class = table_rows
	        self.embedding = nn.Embedding(table_rows, embed_dim)

	    def midi_to_class(self, midi_values):
	        """Quantize continuous MIDI values into embedding row indices.

	        Args:
	            midi_values: [B, T] continuous MIDI values, roughly in [0, 127]

	        Returns:
	            class_indices: [B, T] integer (long) indices, each within
	            ``[0, embedding_input_num_class - 1]``
	        """
	        highest_index = self.embedding_input_num_class - 1
	        # Scale, then snap to the nearest step,
	        # e.g., with scale=2: 0->0, 0.3->1, 0.5->1, 0.8->2, 1.0->2, ...
	        scaled = midi_values * self.mark_distinguish_scale
	        # Out-of-range steps are pinned to the first / last table row.
	        return scaled.round().long().clamp(min=0, max=highest_index)

	    def forward(self, midi_values):
	        """
	        Args:
	            midi_values: [B, T] continuous MIDI values

	        Returns:
	            embeddings: [B, T, embed_dim] embedding vectors
	        """
	        return self.embedding(self.midi_to_class(midi_values))