Spaces:

3outeille
/

tts-batch-builder

Paused

App Files Files Community

tts-batch-builder / eval.py

3outeille HF Staff

Sync eval.py

3862b8f verified about 1 month ago

Raw

History Blame Contribute Delete

19.9 kB

	import argparse
	import os
	import re
	import math
	import torch
	from torch.nn.attention import sdpa_kernel, SDPBackend
	from transformers import (
	AutoConfig,
	AutoModelForTextToWaveform,
	AutoModelForTDT,
	AutoModelForSpeechSeq2Seq,
	AutoProcessor,
	CompileConfig,
	)
	from transformers.audio_utils import load_audio
	import evaluate
	from tqdm import tqdm
	import random
	import numpy as np
	import pandas as pd
	from datasets import load_dataset

	OUTPUT_PATH = "results.csv"


	def upsert(record: dict, file_path: str = OUTPUT_PATH):
	    """
	    Insert or update a record in the results table stored at ``file_path``.

	    Uses 'model_id' as the unique key: if a row with the same 'model_id' already
	    exists it is updated in place, otherwise the record is appended as a new row.
	    The table is read from and written back to ``file_path`` (CSV when the path
	    ends with ".csv", Excel otherwise).

	    Args:
	        record: Flat mapping of column name -> value. Must contain 'model_id'.
	        file_path: Path of the table to create or update.

	    Returns:
	        The full DataFrame as written to disk.

	    Raises:
	        AssertionError: If 'model_id' is missing from ``record``.
	    """
	    assert "model_id" in record, "'model_id' key is required in the record dict."

	    is_csv = file_path.endswith(".csv")

	    if os.path.exists(file_path):
	        df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
	    else:
	        df = pd.DataFrame()

	    new_row = pd.DataFrame([record])

	    if not df.empty and "model_id" in df.columns and record["model_id"] in df["model_id"].values:
	        # Update existing row
	        df.set_index("model_id", inplace=True)
	        new_row.set_index("model_id", inplace=True)
	        # DataFrame.update only touches columns that already exist, so metrics that
	        # appear for the first time must be added up front or they would be dropped.
	        for column in new_row.columns:
	            if column not in df.columns:
	                df[column] = None
	        df.update(new_row)  # overwrites only the non-null fields present in new_row
	        # Freshly added columns start out as object dtype; restore numeric dtypes.
	        df = df.infer_objects()
	        df.reset_index(inplace=True)
	        print(f"Updated model_id='{record['model_id']}'")
	    else:
	        # Append new row
	        df = pd.concat([df, new_row], ignore_index=True)
	        print(f"Inserted model_id='{record['model_id']}'")

	    if is_csv:
	        df.to_csv(file_path, index=False)
	    else:
	        df.to_excel(file_path, index=False)

	    return df


	# Word-error-rate metric, loaded once at import time; main() uses it to score the
	# ASR transcripts of the generated audio against the input texts.
	wer_metric = evaluate.load("wer")
	# Process-wide PyTorch setting for float32 matmul precision ('high' trades some
	# precision for speed on supported hardware).
	torch.set_float32_matmul_precision('high')

	# Bytes per element for each dtype that infer_batch_size() knows how to budget for.
	# NOTE(review): torch.int4 is only defined in recent PyTorch releases — confirm the
	# minimum supported version, otherwise this lookup fails at import time.
	DTYPE_BYTES = {
	torch.float32: 4,
	torch.float16: 2,
	torch.bfloat16: 2,
	torch.int8: 1,
	torch.int4: 0.5,
	}

	def get_free_gpu_memory_bytes(device: int = 0) -> int:
	    """
	    Return the number of free bytes reported for the given CUDA device.

	    The device is synchronized first so that pending work is accounted for.

	    Raises:
	        RuntimeError: If CUDA is not available.
	    """
	    if torch.cuda.is_available():
	        torch.cuda.synchronize(device)
	        # mem_get_info returns (free, total); only the free part is needed here.
	        return torch.cuda.mem_get_info(device)[0]
	    raise RuntimeError("No CUDA device found.")

	def infer_batch_size(
	    model: torch.nn.Module,
	    longest_input_length: int,
	    dtype: torch.dtype = torch.bfloat16,
	    usable_fraction: float = 0.70,
	) -> int:
	    """
	    Estimate a safe batch size for generation.

	    Budget: ``usable_fraction`` (70% by default) of total GPU memory for
	    everything (weights + activations + batch). The remaining 30% is left
	    untouched as a buffer for KV cache growth, CUDA kernels, and other overhead.

	    Per-sample activation cost is estimated as ~2 bytes * params^0.6 * seq_len,
	    an empirically-derived heuristic that's dtype-agnostic and arch-agnostic.

	    Args:
	        model: Any nn.Module (HF, custom, etc.)
	        longest_input_length: max(len(input_ids)) across your dataset. Must be > 0.
	        dtype: Dtype the model is loaded in. A string such as "bfloat16" is also
	            accepted and resolved as an attribute of ``torch``.
	        usable_fraction: Fraction of total VRAM to budget for everything.
	            Default 0.70 — leaves 30% for cache and other overhead.

	    Returns:
	        Recommended batch size (>= 1).

	    Raises:
	        ValueError: If ``dtype`` is not listed in DTYPE_BYTES or
	            ``longest_input_length`` is not positive.
	        RuntimeError: If no CUDA device is available, or the model weights alone
	            exceed the usable budget.
	    """
	    if isinstance(dtype, str):
	        dtype = getattr(torch, dtype)
	    bpe = DTYPE_BYTES.get(dtype)
	    if bpe is None:
	        raise ValueError(f"Unsupported dtype: {dtype}")
	    # A zero-length input would make the per-sample estimate 0 and the division below blow up.
	    if longest_input_length <= 0:
	        raise ValueError(f"longest_input_length must be positive, got {longest_input_length}")

	    # --- Total VRAM budget ---
	    free_bytes = get_free_gpu_memory_bytes(0)
	    _, total_bytes = torch.cuda.mem_get_info(0)
	    budget_bytes = total_bytes * usable_fraction

	    print(f"[gpu] Total VRAM : {total_bytes / 1e9:.2f} GB")
	    print(f"[gpu] Free VRAM : {free_bytes / 1e9:.2f} GB")
	    print(f"[gpu] Usable budget : {budget_bytes / 1e9:.2f} GB ({usable_fraction*100:.0f}% of total)")

	    # --- Model weights ---
	    num_params = sum(p.numel() for p in model.parameters())
	    model_bytes = num_params * bpe
	    print(f"[model] Params : {num_params / 1e9:.3f}B")
	    print(f"[model] Weight memory : {model_bytes / 1e9:.2f} GB (dtype={dtype})")

	    remaining_bytes = budget_bytes - model_bytes
	    if remaining_bytes <= 0:
	        raise RuntimeError(
	            f"Model weights alone ({model_bytes/1e9:.2f} GB) exceed the usable budget "
	            f"({budget_bytes/1e9:.2f} GB). Try a smaller model or lower usable_fraction."
	        )

	    # --- Per-sample activation cost (arch-agnostic heuristic) ---
	    # Activations ≈ f(params, seq_len). Empirically, hidden states + attention
	    # buffers scale roughly as params^0.6 per token across diverse architectures.
	    # Multiply by 2 bytes as a base unit (independent of weight dtype, since
	    # activations are often kept in fp16/bf16 regardless).
	    bytes_per_token = 2 * (num_params ** 0.6)
	    bytes_per_sample = bytes_per_token * longest_input_length

	    print(f"[input] longest_input_length : {longest_input_length} tokens")
	    print(f"[input] Est. bytes/sample : {bytes_per_sample / 1e6:.1f} MB")

	    # --- Batch size ---
	    batch_size = max(1, math.floor(remaining_bytes / bytes_per_sample))
	    print(f"\n✅ Recommended batch_size : {batch_size}")
	    return batch_size



	def main(args):
	    """
	    Run the TTS benchmark end to end and return the headline metrics.

	    Loads the model and processor for ``args.model_id``, synthesizes audio for the
	    dataset texts (timed with CUDA events), saves the audio under
	    ``results_<model_id>/``, transcribes the saved files with the
	    ``nvidia/parakeet-tdt-0.6b-v3`` ASR model and scores the transcripts against
	    the input texts.

	    Args:
	        args: Parsed command-line namespace. Reads ``model_id``, ``revision``,
	            ``dtype``, ``device``, ``attn_implementation``, ``max_new_tokens``,
	            ``torch_compile``, ``compile_fullgraph``, ``warmup_steps``,
	            ``batch_size``, ``dataset_path``, ``split``, ``text_column`` and
	            ``max_eval_samples``. ``warmup_steps`` may be overwritten when
	            ``torch_compile`` is enabled.

	    Returns:
	        Dict with "model_id", "WER" (percentage, rounded to 2 decimals) and "RTFX"
	        (seconds of generated audio per second of generation time, rounded to 2
	        decimals).

	    Raises:
	        ValueError: If ``max_new_tokens`` is set but the model cannot generate.
	    """
	    # Set seed due to randomness in some models (e.g. VibeVoice's acoustic tokenizer sampling)
	    seed = 42
	    random.seed(seed)
	    np.random.seed(seed)
	    torch.manual_seed(seed)
	    torch.cuda.manual_seed_all(seed)
	    torch.backends.cudnn.deterministic = True

	    torch_dtype = getattr(torch, args.dtype)
	    config = AutoConfig.from_pretrained(args.model_id, revision=args.revision)
	    model_cls = AutoModelForSpeechSeq2Seq if "dia" in config.model_type else AutoModelForTextToWaveform
	    # Load the weights from the same revision as the config and the processor.
	    model = model_cls.from_pretrained(
	        args.model_id,
	        revision=args.revision,
	        dtype=torch_dtype,
	        device_map=args.device,
	        attn_implementation=args.attn_implementation,
	    )

	    num_params = sum(p.numel() for p in model.parameters()) / 1e9
	    print(f"Model size: {num_params:.2f}B parameters")
	    processor_kwargs = {"device_map": args.device} if "higgs" in config.model_type else {}
	    processor = AutoProcessor.from_pretrained(args.model_id, revision=args.revision, **processor_kwargs)

	    # Set generate arguments (always bound, so later lookups never hit an undefined name)
	    gen_kwargs = {}
	    if model.can_generate():
	        gen_kwargs["max_new_tokens"] = args.max_new_tokens
	        if "higgs" not in config.model_type:
	            # NOTE(review): min == max forces a fixed generation length — confirm this is
	            # intended for the scored runs and not only for warm-up.
	            gen_kwargs["min_new_tokens"] = args.max_new_tokens

	        if "csm" in config.model_type:
	            gen_kwargs["output_audio"] = True
	    elif args.max_new_tokens:
	        raise ValueError("`max_new_tokens` should only be set for auto-regressive models, but got a non-generative model.")

	    if args.torch_compile is not None:
	        if model.can_generate():
	            gen_kwargs["compile_config"] = CompileConfig(mode=args.torch_compile, fullgraph=args.compile_fullgraph)
	            # enable static k/v cache for autoregressive models
	            model.generation_config.cache_implementation = "static"
	        else:
	            model = torch.compile(model, mode=args.torch_compile, fullgraph=args.compile_fullgraph)

	        # Ensure warm-up runs when using torch.compile
	        if args.warmup_steps is None or args.warmup_steps < 1:
	            print("`--torch_compile` is enabled; forcing `--warmup_steps=10` to trigger compilation before timed runs.")
	            args.warmup_steps = 10

	    output_dir = f"results_{args.model_id}"

	    def benchmark(batch, text_column):
	        """Synthesize one minibatch, time it, save the audio and return the augmented batch."""
	        # Work on copies: padding/templating below must not mutate the dataset column,
	        # and the raw texts are needed later as WER references.
	        source_texts = list(batch[text_column])
	        texts_to_generate = list(source_texts)
	        minibatch_size = len(source_texts)
	        # Fall back to 16 kHz when the processor exposes no feature extractor / sampling rate.
	        feature_extractor = getattr(processor, "feature_extractor", None)
	        sampling_rate = getattr(feature_extractor, "sampling_rate", 16_000)

	        # START TIMING
	        torch.cuda.synchronize(device=args.device)
	        start_event = torch.cuda.Event(enable_timing=True)
	        end_event = torch.cuda.Event(enable_timing=True)
	        start_event.record()

	        # 1. Pre-Processing
	        # 1.1 Pad texts to max batch size if using torch compile to prevent re-compilations
	        padding_size = None
	        if minibatch_size != args.batch_size and args.torch_compile is not None:
	            padding_size = args.batch_size - minibatch_size
	            texts_to_generate.extend(texts_to_generate[-1] for _ in range(padding_size))

	        # Apply jinja template if processor has smth saved
	        if getattr(processor, "chat_template", None) is not None:
	            # CSM uses speaker ID and not role in conv
	            if "csm" in config.model_type:
	                texts_to_generate = [
	                    processor.apply_chat_template(
	                        [
	                            {
	                                "role": "0",
	                                "content": [{"type": "text", "text": text}],
	                            }
	                        ],
	                        tokenize=False,
	                        add_generation_prompt=True,
	                        return_dict=False,
	                    )
	                    for text in texts_to_generate
	                ]
	            elif "higgs" in config.model_type:
	                inputs = processor.apply_chat_template(
	                    [[{
	                        "role": "system",
	                        "content": [
	                            {
	                                "type": "text",
	                                "text": "Generate audio following instruction."
	                            }
	                        ],
	                    },
	                    {
	                        "role": "user",
	                        "content": text,
	                    }] for text in texts_to_generate],
	                    tokenize=True,
	                    add_generation_prompt=True,
	                    return_dict=True,
	                    return_tensors="pt",
	                    sampling_rate=24000,
	                )
	            else:
	                texts_to_generate = [
	                    processor.apply_chat_template(
	                        [
	                            {
	                                "role": "user",
	                                "content": text,
	                            }
	                        ],
	                        tokenize=False,
	                        add_generation_prompt=True,
	                        return_dict=False,
	                    )
	                    for text in texts_to_generate
	                ]

	        # Higgs inputs were already tokenized by the chat template above.
	        if "higgs" not in config.model_type:
	            inputs = processor(text=texts_to_generate, return_tensors="pt")
	        inputs = inputs.to(args.device)

	        # 2. Model Inference
	        if args.torch_compile is not None:
	            sdpa_backends = [SDPBackend.MATH]
	        else:
	            sdpa_backends = [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH]
	        with sdpa_kernel(sdpa_backends):
	            # Both mappings must be unpacked into keyword arguments.
	            pred_waveform = model.generate(**inputs, **gen_kwargs)

	        # 3. Post-processing
	        # 3.1 Strip padded samples from predictions
	        if padding_size is not None:
	            pred_waveform = pred_waveform[:-padding_size, ...]

	        # 3.2 Decode model output into audio where the processor requires it
	        if config.model_type == 'dia':
	            prompt_len = processor.get_audio_prompt_len(inputs["decoder_attention_mask"])
	            outputs = processor.batch_decode(pred_waveform, audio_prompt_len=prompt_len)
	        elif "higgs" in config.model_type:
	            outputs = processor.batch_decode(pred_waveform)
	        else:
	            outputs = pred_waveform

	        # END TIMING
	        end_event.record()
	        torch.cuda.synchronize(device=args.device)
	        runtime = start_event.elapsed_time(end_event) / 1000.0

	        # normalize by minibatch size since we want the per-sample time
	        batch["generation_time_s"] = minibatch_size * [runtime / minibatch_size]

	        gen_paths = []
	        audio_length_s = []
	        os.makedirs(output_dir, exist_ok=True)

	        for audio, i in zip(outputs, batch['id']):
	            wav_path = f"{output_dir}/output_{i}.wav"
	            try:
	                processor.save_audio(audio, saving_path=wav_path)
	                gen_paths.append(wav_path)
	                audio_length_s.append(len(audio) / sampling_rate)
	            except Exception as exc:
	                # Prob the processor is not yet standard; keep going, but surface the cause
	                # instead of hiding it.
	                print(f"Could not save audio for sample {i}: {exc!r}")

	        batch["predictions"] = gen_paths
	        # WER references are the raw dataset texts, not the templated/padded prompts.
	        batch["input_text"] = [sample.lower() for sample in source_texts]
	        batch["audio_length_s"] = audio_length_s
	        return batch

	    dataset = load_dataset(args.dataset_path, split=args.split)
	    dataset = dataset.add_column("id", list(range(len(dataset))))  # Pass id for easier reference when batch mapping

	    # Infer batch size from model param count and longest input text in the dataset.
	    def add_input_token_length(batch, text_column):
	        """Add an "input_token_length" column with the tokenized length of each text."""
	        input_ids = processor.tokenizer(batch[text_column], return_tensors=None, padding=False, truncation=False).input_ids
	        batch["input_token_length"] = [len(ids) for ids in input_ids]
	        return batch

	    # TODO:
	    # dataset = dataset.map(
	    # add_input_token_length, batch_size=args.batch_size, batched=True, fn_kwargs={"text_column": args.text_column}
	    # )
	    # longest_input_length = max(dataset["input_token_length"])
	    # inferred_bs = infer_batch_size(
	    # model=model,
	    # longest_input_length=longest_input_length,
	    # dtype=args.dtype,
	    # )

	    # Skip the warm-up pass entirely when no warm-up steps were requested.
	    if args.warmup_steps is not None and args.warmup_steps > 0:
	        num_warmup_samples = args.warmup_steps * args.batch_size
	        warmup_dataset = dataset.select(range(min(num_warmup_samples, len(dataset))))
	        warmup_dataset = iter(warmup_dataset.map(benchmark, batch_size=args.batch_size, batched=True, fn_kwargs={"text_column": args.text_column}))

	        for _ in tqdm(warmup_dataset, desc="Warming up..."):
	            continue

	    if args.max_eval_samples is not None and args.max_eval_samples > 0:
	        print(f"Subsampling dataset to first {args.max_eval_samples} samples!")
	        dataset = dataset.select(range(min(args.max_eval_samples, len(dataset))))

	    dataset = dataset.map(
	        benchmark, batch_size=args.batch_size, batched=True, fn_kwargs={"text_column": args.text_column}
	    )

	    asr_processor = AutoProcessor.from_pretrained("nvidia/parakeet-tdt-0.6b-v3")
	    asr_model = AutoModelForTDT.from_pretrained("nvidia/parakeet-tdt-0.6b-v3", device_map="auto")

	    def transcribe_audio(batch) -> dict:
	        """
	        Takes in minibatch of audio paths and adds the lower-cased transcription of
	        each file as "asr_outputs". Each audio is transcribed with ParakeetTDT model.
	        """
	        sr_rate = asr_processor.feature_extractor.sampling_rate
	        speech_samples = [load_audio(path, sampling_rate=sr_rate) for path in batch["predictions"]]
	        inputs = asr_processor(speech_samples, sampling_rate=sr_rate).to(asr_model.device, dtype=asr_model.dtype)
	        outputs = asr_model.generate(**inputs, return_dict_in_generate=False)
	        # NOTE(review): with return_dict_in_generate=False the result is presumably the
	        # sequences themselves rather than an object wrapping them — accept both shapes.
	        sequences = getattr(outputs, "sequences", outputs)
	        transcripts = asr_processor.batch_decode(sequences, skip_special_tokens=True)
	        batch["asr_outputs"] = [sample.lower() for sample in transcripts]
	        return batch

	    dataset = dataset.map(transcribe_audio, batch_size=args.batch_size, batched=True)

	    all_results = {
	        "audio_length_s": [],
	        "generation_time_s": [],
	        "predictions": [],
	        "input_text": [],
	        "asr_outputs": [],
	    }
	    result_iter = iter(dataset)
	    for result in tqdm(result_iter, desc="Samples..."):
	        for key in all_results:
	            all_results[key].append(result[key])

	    # Write manifest results (WER and RTFX)
	    # Filtering of empty references is handled inside write_manifest.
	    # manifest_path = data_utils.write_manifest(
	    # all_results["references"],
	    # all_results["predictions"],
	    # args.model_id,
	    # args.dataset_path,
	    # args.dataset,
	    # args.split,
	    # audio_length=all_results["audio_length_s"],
	    # transcription_time=all_results["transcription_time_s"],
	    # audio_filepaths=all_results["audio_filepath"],
	    # )
	    # print("Results saved at path:", os.path.abspath(manifest_path))

	    wer = wer_metric.compute(
	        references=all_results["input_text"], predictions=all_results["asr_outputs"]
	    )
	    wer = round(100 * wer, 2)
	    # RTFx: seconds of audio produced per second spent generating.
	    rtfx = round(sum(all_results["audio_length_s"]) / sum(all_results["generation_time_s"]), 2)
	    print("WER:", wer, "%", "RTFx:", rtfx)
	    return {"model_id": args.model_id, "WER": wer, "RTFX": rtfx}


	if __name__ == "__main__":
	    # Command-line entry point: parse the options, run the benchmark and store the
	    # resulting metrics in the results table via upsert().
	    parser = argparse.ArgumentParser()

	    parser.add_argument(
	        "--model_id",
	        type=str,
	        required=True,
	        help="Model identifier. Should be loadable with 🤗 Transformers",
	    )
	    parser.add_argument(
	        "--dataset_path",
	        type=str,
	        default="hf-audio/tts_leaderboard",
	        help="Dataset path. By default, it is `hf-audio/tts_leaderboard`",
	    )
	    parser.add_argument(
	        "--split",
	        type=str,
	        default="train",
	        help="Split of the dataset. E.g. `'validation`' for the dev split, or `'test'` for the test split.",
	    )
	    parser.add_argument(
	        "--text_column",
	        type=str,
	        default="text",
	        help="Name of the column corresponding to the text that has to be generated in dataset with the given `split`.",
	    )
	    parser.add_argument(
	        "--device",
	        type=str,
	        default="cuda",
	        help="The device to run the pipeline on. `auto` for auto-inferring, 'cpu' for CPU, 'cuda' for the GPU (default).",
	    )
	    parser.add_argument(
	        "--batch_size",
	        type=int,
	        default=32,
	        help="Number of samples to go through each streamed batch.",
	    )
	    parser.add_argument(
	        "--max_eval_samples",
	        type=int,
	        default=None,
	        help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.",
	    )
	    parser.add_argument(
	        "--max_new_tokens",
	        type=int,
	        default=8192,
	        help="Maximum number of tokens to generate (for auto-regressive models).",
	    )
	    parser.add_argument(
	        "--torch_compile",
	        type=str,
	        default=None,
	        help="Mode for torch compiling model forward pass. Can be either 'default', 'reduce-overhead', 'max-autotune' or 'max-autotune-no-cudagraphs'.",
	    )
	    parser.add_argument(
	        "--compile_fullgraph",
	        action="store_true",
	        help="Whether to do full graph compilation.",
	    )
	    parser.add_argument(
	        "--dtype",
	        type=str,
	        default="bfloat16",
	        help="The dtype to use for model loading and inference. E.g. 'bfloat16', 'float16', 'float32'.",
	    )
	    parser.add_argument(
	        "--attn_implementation",
	        type=str,
	        default="sdpa",
	        help="Attention implementation to use for model loading (e.g. 'sdpa', 'eager', 'flash_attention_2').",
	    )
	    parser.add_argument(
	        "--warmup_steps",
	        type=int,
	        default=0,
	        help="Number of warm-up steps to run before launching the timed runs.",
	    )
	    parser.add_argument(
	        "--revision",
	        type=str,
	        default=None,
	        help="Model revision to use (e.g. 'refs/pr/11' for a PR branch). Defaults to the main branch.",
	    )
	    args = parser.parse_args()

	    # Banner around the run header (the separator expression was garbled in the source).
	    print("*" * 100)
	    print(f"Evaluating {args.model_id} on {args.dataset_path} / {args.split}")
	    print("*" * 100)

	    output_dict = main(args)
	    upsert(output_dict)