"""ACE-Step Custom Edition: preprocess labeled samples into per-sample tensor files for training."""
import os
from typing import List, Tuple
import torch
from loguru import logger
from .models import AudioSample
from .preprocess_audio import load_audio_stereo
from .preprocess_context import build_context_latents
from .preprocess_encoder import run_encoder
from .preprocess_lyrics import encode_lyrics
from .preprocess_manifest import save_manifest
from .preprocess_text import build_text_prompt, encode_text
from .preprocess_utils import select_genre_indices
from .preprocess_vae import vae_encode
from acestep.debug_utils import (
debug_log_for,
debug_log_verbose_for,
debug_start_verbose_for,
debug_end_verbose_for,
)
class PreprocessMixin:
"""Preprocess labeled samples to tensor files."""
def preprocess_to_tensors(
self,
dit_handler,
output_dir: str,
max_duration: float = 240.0,
progress_callback=None,
) -> Tuple[List[str], str]:
"""Preprocess all labeled samples to tensor files for efficient training."""
debug_log_for("dataset", f"preprocess_to_tensors: output_dir='{output_dir}', max_duration={max_duration}")
if not self.samples:
return [], "❌ No samples to preprocess"
labeled_samples = [s for s in self.samples if s.labeled]
if not labeled_samples:
return [], "❌ No labeled samples to preprocess"
if dit_handler is None or dit_handler.model is None:
return [], "❌ Model not initialized. Please initialize the service first."
os.makedirs(output_dir, exist_ok=True)
output_paths: List[str] = []
success_count = 0
fail_count = 0
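        # Pull the model components and runtime settings from the initialized DiT handler.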
model = dit_handler.model
vae = dit_handler.vae
text_encoder = dit_handler.text_encoder
text_tokenizer = dit_handler.text_tokenizer
silence_latent = dit_handler.silence_latent
device = dit_handler.device
dtype = dit_handler.dtype
target_sample_rate = 48000
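        # Decide which samples carry genre information in their training prompt, per self.metadata.genre_ratio.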
genre_indices = select_genre_indices(labeled_samples, self.metadata.genre_ratio)
debug_log_verbose_for("dataset", f"selected genre indices: count={len(genre_indices)}")
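        # Process each labeled sample independently; failures are logged and counted without aborting the run.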
for i, sample in enumerate(labeled_samples):
try:
debug_log_verbose_for("dataset", f"sample[{i}] id={sample.id} file={sample.filename}")
if progress_callback:
progress_callback(f"Preprocessing {i+1}/{len(labeled_samples)}: {sample.filename}")
use_genre = i in genre_indices
t0 = debug_start_verbose_for("dataset", f"load_audio_stereo[{i}]")
audio, _ = load_audio_stereo(sample.audio_path, target_sample_rate, max_duration)
debug_end_verbose_for("dataset", f"load_audio_stereo[{i}]", t0)
debug_log_verbose_for("dataset", f"audio shape={tuple(audio.shape)} dtype={audio.dtype}")
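                # Add a batch dimension and move the waveform onto the compute device and the VAE's dtype before encoding.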
audio = audio.unsqueeze(0).to(device).to(vae.dtype)
debug_log_verbose_for(
"dataset",
f"vae device={next(vae.parameters()).device} vae dtype={vae.dtype} "
f"audio device={audio.device} audio dtype={audio.dtype}",
)
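                # Encode the waveform into latents with gradients disabled; latent_length drives the attention mask and context latents.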
with torch.no_grad():
t0 = debug_start_verbose_for("dataset", f"vae_encode[{i}]")
target_latents = vae_encode(vae, audio, dtype)
debug_end_verbose_for("dataset", f"vae_encode[{i}]", t0)
latent_length = target_latents.shape[1]
attention_mask = torch.ones(1, latent_length, device=device, dtype=dtype)
debug_log_verbose_for(
"dataset",
f"target_latents shape={tuple(target_latents.shape)} latent_length={latent_length}",
)
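                # Build the training caption and the text-encoder prompt from the sample's labels (tag position and optional genre come from dataset metadata).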
caption = sample.get_training_prompt(self.metadata.tag_position, use_genre=use_genre)
text_prompt = build_text_prompt(sample, self.metadata.tag_position, use_genre)
if i == 0:
logger.info(f"\n{'='*70}")
logger.info("πŸ” [DEBUG] DiT TEXT ENCODER INPUT (Training Preprocess)")
logger.info(f"{'='*70}")
logger.info(f"text_prompt:\n{text_prompt}")
logger.info(f"{'='*70}\n")
t0 = debug_start_verbose_for("dataset", f"encode_text[{i}]")
text_hidden_states, text_attention_mask = encode_text(
text_encoder, text_tokenizer, text_prompt, device, dtype
)
debug_end_verbose_for("dataset", f"encode_text[{i}]", t0)
debug_log_verbose_for(
"dataset",
f"text_hidden_states shape={tuple(text_hidden_states.shape)} "
f"text_attention_mask shape={tuple(text_attention_mask.shape)}",
)
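                # Lyrics go through the same text encoder; instrumental tracks fall back to an "[Instrumental]" marker.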
lyrics = sample.lyrics if sample.lyrics else "[Instrumental]"
t0 = debug_start_verbose_for("dataset", f"encode_lyrics[{i}]")
lyric_hidden_states, lyric_attention_mask = encode_lyrics(
text_encoder, text_tokenizer, lyrics, device, dtype
)
debug_end_verbose_for("dataset", f"encode_lyrics[{i}]", t0)
debug_log_verbose_for(
"dataset",
f"lyric_hidden_states shape={tuple(lyric_hidden_states.shape)} "
f"lyric_attention_mask shape={tuple(lyric_attention_mask.shape)}",
)
t0 = debug_start_verbose_for("dataset", f"run_encoder[{i}]")
# Ensure DiT encoder runs on the active residency device (GPU when loaded via
# offload context). This prevents flash-attn CPU backend crashes.
with dit_handler._load_model_context("model"):
model_device = next(model.parameters()).device
model_dtype = next(model.parameters()).dtype
if text_hidden_states.device != model_device:
text_hidden_states = text_hidden_states.to(model_device)
if text_attention_mask.device != model_device:
text_attention_mask = text_attention_mask.to(model_device)
if lyric_hidden_states.device != model_device:
lyric_hidden_states = lyric_hidden_states.to(model_device)
if lyric_attention_mask.device != model_device:
lyric_attention_mask = lyric_attention_mask.to(model_device)
if text_hidden_states.dtype != model_dtype:
text_hidden_states = text_hidden_states.to(model_dtype)
if lyric_hidden_states.dtype != model_dtype:
lyric_hidden_states = lyric_hidden_states.to(model_dtype)
encoder_hidden_states, encoder_attention_mask = run_encoder(
model,
text_hidden_states=text_hidden_states,
text_attention_mask=text_attention_mask,
lyric_hidden_states=lyric_hidden_states,
lyric_attention_mask=lyric_attention_mask,
device=model_device,
dtype=model_dtype,
)
debug_end_verbose_for("dataset", f"run_encoder[{i}]", t0)
debug_log_verbose_for(
"dataset",
f"encoder_hidden_states shape={tuple(encoder_hidden_states.shape)} "
f"encoder_attention_mask shape={tuple(encoder_attention_mask.shape)}",
)
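                # Context latents are built from the silence latent and sized to match the target latent length.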
t0 = debug_start_verbose_for("dataset", f"build_context_latents[{i}]")
context_latents = build_context_latents(silence_latent, latent_length, device, dtype)
debug_end_verbose_for("dataset", f"build_context_latents[{i}]", t0)
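                # Drop the batch dimension, move tensors to CPU, and bundle them with human-readable metadata for saving.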
output_data = {
"target_latents": target_latents.squeeze(0).cpu(),
"attention_mask": attention_mask.squeeze(0).cpu(),
"encoder_hidden_states": encoder_hidden_states.squeeze(0).cpu(),
"encoder_attention_mask": encoder_attention_mask.squeeze(0).cpu(),
"context_latents": context_latents.squeeze(0).cpu(),
"metadata": {
"audio_path": sample.audio_path,
"filename": sample.filename,
"caption": caption,
"lyrics": lyrics,
"duration": sample.duration,
"bpm": sample.bpm,
"keyscale": sample.keyscale,
"timesignature": sample.timesignature,
"language": sample.language,
"is_instrumental": sample.is_instrumental,
},
}
output_path = os.path.join(output_dir, f"{sample.id}.pt")
t0 = debug_start_verbose_for("dataset", f"torch.save[{i}]")
torch.save(output_data, output_path)
debug_end_verbose_for("dataset", f"torch.save[{i}]", t0)
output_paths.append(output_path)
success_count += 1
except Exception as e:
logger.exception(f"Error preprocessing {sample.filename}")
fail_count += 1
if progress_callback:
progress_callback(f"❌ Failed: {sample.filename}: {str(e)}")
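        # Write the manifest (dataset metadata plus the list of tensor paths) alongside the per-sample .pt files.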
t0 = debug_start_verbose_for("dataset", "save_manifest")
save_manifest(output_dir, self.metadata, output_paths)
debug_end_verbose_for("dataset", "save_manifest", t0)
status = f"βœ… Preprocessed {success_count}/{len(labeled_samples)} samples to {output_dir}"
if fail_count > 0:
status += f" ({fail_count} failed)"
return output_paths, status