NeMo_Canary / scripts /speech_llm /estimate_token_bins.py

Upload folder using huggingface_hub

b386992 verified 7 months ago

13.1 kB

	# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# flake8: noqa
	# pylint: disable=C0115
	# pylint: disable=C0116
	# pylint: disable=C0301

	import argparse
	import ast
	import math
	from functools import partial
	from itertools import islice
	from pathlib import Path
	from typing import Callable, Iterable

	import numpy as np
	import pandas as pd
	from lhotse.cut import Cut
	from omegaconf import OmegaConf

	import nemo.collections.speechlm2.data.salm_dataset # noqa
	from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper
	from nemo.collections.common.data.lhotse.cutset import read_cutset_from_config
	from nemo.collections.common.data.lhotse.dataloader import LhotseDataLoadingConfig, tokenize, tokenize_with_prompt
	from nemo.collections.common.data.lhotse.sampling import (
	MultimodalFixedBucketBatchSizeConstraint2D,
	MultimodalSamplingConstraint,
	TokenCountFilter,
	TokenPerTokenFilter,
	)
	from nemo.collections.common.prompts.formatter import PromptFormatter
	from nemo.collections.common.tokenizers import AggregateTokenizer, AutoTokenizer, SentencePieceTokenizer


	def parse_args():
	    """Build the command-line interface of the token-bin estimator and parse ``sys.argv``.

	    Returns:
	        argparse.Namespace with the attributes ``input``, ``tokenizer``, ``langs``,
	        ``buckets``, ``sub_buckets``, ``num_examples``, ``min_tokens``, ``max_tokens``,
	        ``max_tpt``, ``quiet``, ``prompt_format``, ``prompt`` and ``measure_total_length``.
	    """

	    def str2bool(value: str) -> bool:
	        # ``type=bool`` converts every non-empty string (including "False") to True,
	        # so boolean options need an explicit parser to be switchable off.
	        lowered = value.strip().lower()
	        if lowered in ("1", "true", "t", "yes", "y", "on"):
	            return True
	        if lowered in ("", "0", "false", "f", "no", "n", "off"):
	            return False
	        raise argparse.ArgumentTypeError(f"invalid boolean value: {value!r}")

	    parser = argparse.ArgumentParser(
	        description="Estimate token bins for Lhotse dynamic bucketing using a sample of the input dataset. "
	        "The dataset is read either from one or more manifest files and supports data weighting. "
	        "Unlike estimate_duration_bins.py, this script is intended for text data only. "
	        "It supports 2D bucketing. ",
	        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
	    )
	    parser.add_argument(
	        "input",
	        help='Path to a data input configuration YAML file. '
	        'This is the only type of input specification supported for text data.',
	    )
	    parser.add_argument(
	        "-t",
	        "--tokenizer",
	        nargs="+",
	        required=True,
	        help="Path to one or more SPE tokenizers. More than one means we'll use AggregateTokenizer and --langs argument must also be used. When provided, we'll estimate a 2D distribution for input and output sequence lengths.",
	    )
	    parser.add_argument(
	        "-a", "--langs", nargs="+", help="Language names for each of AggregateTokenizer sub-tokenizers."
	    )
	    parser.add_argument(
	        "-b",
	        "--buckets",
	        type=int,
	        default=30,
	        help="The desired number of buckets (dim0 => covers input sequence length / audio duration).",
	    )
	    parser.add_argument(
	        "-s",
	        "--sub-buckets",
	        type=int,
	        default=None,
	        help="The desired number of sub-buckets (dim1 => covers output sequence length / num_tokens). "
	        "If not provided, we'll only perform 1D bucketing. ",
	    )
	    parser.add_argument(
	        "-n",
	        "--num_examples",
	        type=int,
	        default=-1,
	        help="The number of examples (utterances) to estimate the bins. -1 means use all data "
	        "(be careful: it could be iterated over infinitely).",
	    )
	    parser.add_argument(
	        "-l",
	        "--min_tokens",
	        type=float,
	        default=-float("inf"),
	        help="If specified, we'll filter out examples with less tokens than this number.",
	    )
	    parser.add_argument(
	        "-u",
	        "--max_tokens",
	        type=float,
	        default=float("inf"),
	        help="If specified, we'll filter out examples with more tokens than this number.",
	    )
	    parser.add_argument(
	        "--max_tpt",
	        type=float,
	        default=float("inf"),
	        help="If specified, we'll filter out examples with more output tokens per input token than this. ",
	    )
	    # ``nargs="?"`` + ``const=True`` lets the option be used as a bare flag (``-q``)
	    # while still accepting an explicit value (``-q True`` / ``-q False``).
	    parser.add_argument(
	        "-q",
	        "--quiet",
	        type=str2bool,
	        nargs="?",
	        const=True,
	        default=False,
	        help="When specified, only print the estimated duration bins.",
	    )
	    parser.add_argument(
	        "-f",
	        "--prompt-format",
	        type=str,
	        help="When specified, we'll use a prompt formatter in addition to the tokenizer for the purpose of estimating token count bins. "
	        "This is useful for accurate 2D bucket estimation with models such as EncDecMultiTaskModel (Canary-1B), "
	        "or any model where the label sequence consists of a user prompt and a model's response.",
	    )
	    parser.add_argument(
	        "-p",
	        "--prompt",
	        type=str,
	        help="Prompt slots provided as a Python list of dicts. It is used together with --prompt-format option. "
	        "For example, with Canary-1B you may use: [{'role':'user','slots':{'source_lang':'en','target_lang':'en','task':'asr','pnc':'yes'}}]",
	    )
	    parser.add_argument(
	        "-m",
	        "--measure-total-length",
	        type=str2bool,
	        nargs="?",
	        const=True,
	        default=False,
	        help="When specified, we'll measure the total length (context+answer, i.e. input_ids) instead of context-only length. Total length is more suitable for decoder-only models while context-only length is more suitable for encoder-decoder models.",
	    )
	    return parser.parse_args()


	def estimate_token_buckets(
	    cuts: Iterable[Cut],
	    num_buckets: int,
	    num_subbuckets: int | None,
	    quiet: bool,
	) -> list[tuple[float, float]]:
	    """
	    This function is based on lhotse.dataset.sampling.dynamic_bucketing.estimate_duration_buckets.
	    It extends it to a 2D bucketing case.

	    Args:
	        cuts: examples to measure; each one is passed to the sampling constraint's
	            ``measure_length``.
	        num_buckets: number of buckets along the input-token dimension; must be > 1.
	        num_subbuckets: number of sub-buckets along the output-token dimension.
	            ``None`` selects 1D bucketing.
	        quiet: when False, the token-count (and, in 2D, tokens-per-token) distribution
	            summaries are printed.

	    Returns:
	        For 1D bucketing, a list of input-token bin boundaries. For 2D bucketing, a list
	        of ``(max_input_tokens, max_output_tokens)`` pairs, where the last pair emitted for
	        each input bucket is capped by the largest observed output/input token ratio.

	    Raises:
	        ValueError: if ``cuts`` yields no examples.
	    """
	    assert num_buckets > 1
	    is_2d = num_subbuckets is not None

	    if is_2d:
	        constraint = MultimodalFixedBucketBatchSizeConstraint2D([(0.0, 0.0)], [0], measure_total_length=False)
	    else:
	        constraint = MultimodalSamplingConstraint(measure_total_length=True)

	    # Gather the input/output token count statistics for the dataset.
	    num_input_tokens = []
	    num_output_tokens = []
	    for c in cuts:
	        ans = constraint.measure_length(c)
	        if is_2d:
	            itoks, otoks = ans
	            num_input_tokens.append(itoks)
	            num_output_tokens.append(otoks)
	        else:
	            num_input_tokens.append(ans)
	    if not num_input_tokens:
	        # Fail with a clear message instead of an IndexError further below.
	        raise ValueError("Cannot estimate token bins: no examples were left after reading/filtering the input.")
	    num_input_tokens = np.array(num_input_tokens, dtype=np.int32)
	    if is_2d:
	        num_output_tokens = np.array(num_output_tokens, dtype=np.int32)
	        # Sort the (input, output) pairs jointly so both arrays stay aligned,
	        # ordered by input length first.
	        joint = np.rec.fromarrays([num_input_tokens, num_output_tokens])
	        joint.sort()
	        num_input_tokens = joint.f0
	        num_output_tokens = joint.f1
	    else:
	        num_input_tokens.sort()

	    # We are building buckets with an equal total number of input tokens
	    # (empirically leads to more even bucket exhaustion over time).
	    # We need to determine how many tokens to allocate per bucket.
	    size_per_bucket = num_input_tokens.sum() / num_buckets

	    if not quiet:
	        print("Duration distribution:")
	        print(pd.Series(num_input_tokens).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]))
	    max_input_tokens = num_input_tokens[-1]

	    if is_2d:
	        # NOTE: examples with zero input tokens produce inf/nan ratios here;
	        # filter them out upstream (e.g. with --min_tokens) if they can occur.
	        tpt = num_output_tokens / num_input_tokens
	        if not quiet:
	            print("Output tokens per input token distribution:")
	            print(pd.Series(tpt).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]))
	        max_tpt = tpt.max()
	        del tpt

	    bins = []
	    # Index of the first example belonging to the bucket currently being filled.
	    bucket_start = 0
	    tot = 0.0

	    def _estimate_output_token_buckets(max_bucket_duration, start_idx, end_idx):
	        # Since this is 2D bucketing, apply the same bin creation logic
	        # for the second dimension (output token count) as for the first dimension
	        # (input token count). That means we aim to have each sub-bucket contain
	        # roughly the same number of output tokens.
	        # Note that this estimation is biased towards more padding if you have
	        # a lot of zero-token examples (e.g. non-speech).
	        # ``np.sort`` returns a sorted copy, so the shared array is not mutated.
	        num_tokens_bucket = np.sort(num_output_tokens[start_idx:end_idx])
	        tokens_per_subbucket = num_tokens_bucket.sum() / num_subbuckets
	        tot_toks = 0
	        # Iterate over token counts, and whenever we hit tokens_per_subbucket, create a new 2D bucket bin.
	        for num_toks in num_tokens_bucket:
	            # Threshold hit: we are creating a new (max_input_tokens, max_output_tokens) bin.
	            if tot_toks > tokens_per_subbucket:
	                bins.append((max_bucket_duration, num_toks))
	                tot_toks = 0
	            tot_toks += num_toks
	        # Close the bucket with a bin sized for the largest observed output/input ratio.
	        bins.append((max_bucket_duration, math.ceil(max_bucket_duration * max_tpt)))

	    # Iterate over data, and whenever we hit size_per_bucket, create a new bucket bin.
	    for binidx, size in enumerate(num_input_tokens):
	        if tot > size_per_bucket:
	            # Threshold hit: we are creating a new input-length bin (multiplied by number of sub-bins).
	            if is_2d:
	                _estimate_output_token_buckets(size, bucket_start, binidx)
	                # The next bucket starts where this one ended.
	                bucket_start = binidx
	            else:
	                bins.append(size)
	            tot = 0.0
	        tot += size

	    # Estimate an extra 2D bin set for the global max input length, covering every
	    # example not yet assigned to a bucket (including the very last one).
	    # In the 1D case no extra bin is appended.
	    if is_2d:
	        _estimate_output_token_buckets(max_input_tokens, bucket_start, len(num_input_tokens))

	    return bins


	def load_tokenizer(paths: list[str], langs: list[str] = None) -> TokenizerWrapper:
	    """Create the tokenizer described by ``paths`` and wrap it in a ``TokenizerWrapper``.

	    Args:
	        paths: one or more tokenizer locations. A single entry that exists on disk is
	            loaded with ``SentencePieceTokenizer``; a single entry that does not exist
	            is handed to ``AutoTokenizer`` (assumed to be a Hugging Face name).
	            Several entries are combined into an ``AggregateTokenizer``.
	        langs: language name for each entry of ``paths``; required when more than one
	            path is given.

	    Returns:
	        The constructed tokenizer wrapped in ``TokenizerWrapper``.
	    """
	    if len(paths) != 1:
	        # Multi-tokenizer setup: every tokenizer needs a matching language tag.
	        assert langs is not None and len(paths) == len(
	            langs
	        ), f"Cannot create AggregateTokenizer; each tokenizer must have assigned a language via --langs option (we got --tokenizers={paths} and --langs={langs})"
	        per_language = {}
	        for lang, model_path in zip(langs, paths):
	            per_language[lang] = SentencePieceTokenizer(model_path)
	        return TokenizerWrapper(AggregateTokenizer(per_language))

	    (location,) = paths
	    if not Path(location).exists():
	        # Assume it's HF name
	        return TokenizerWrapper(AutoTokenizer(location, use_fast=True))
	    return TokenizerWrapper(SentencePieceTokenizer(location))


	def apply_tokenizer(cut, tokenizer=None, prompt: PromptFormatter = None):
	    """Tokenize ``cut``, preferring the prompt-aware path when a prompt formatter is given.

	    With ``prompt`` set, ``tokenize_with_prompt`` is used; otherwise, with only
	    ``tokenizer`` set, ``tokenize`` is used. When neither is provided the example is
	    returned unchanged.
	    """
	    if prompt is not None:
	        return tokenize_with_prompt(cut, tokenizer, prompt)
	    if tokenizer is None:
	        # Nothing to tokenize with: pass the example through untouched.
	        return cut
	    return tokenize(cut, tokenizer)


	class RejectionsCounter:
	    """Wrap a filter predicate and count how many examples it sees and rejects.

	    The instance is callable and returns the predicate's own result, so it can be
	    used anywhere the bare predicate could be.
	    """

	    def __init__(self, predicate: Callable, message: str):
	        """
	        Args:
	            predicate: callable applied to every example; a falsy result counts as a rejection.
	            message: label printed in front of the statistics by ``print_report``.
	        """
	        self.predicate = predicate
	        self.message = message
	        self.total = 0  # number of examples evaluated so far
	        self.rejected = 0  # number of examples for which the predicate was falsy

	    def __call__(self, example) -> bool:
	        """Evaluate the predicate on ``example``, update the counters and return its result."""
	        ans = self.predicate(example)
	        self.total += 1
	        if not ans:
	            self.rejected += 1
	        return ans

	    def print_report(self) -> None:
	        """Print the rejection statistics; prints nothing when no example was rejected."""
	        if self.rejected:
	            print(f"{self.message} | Rejected {self.rejected}/{self.total} examples.")


	def main():
	    """Script entry point: read the input config, tokenize and filter examples, print token bins.

	    In quiet mode only the bracketed bin list is printed. Otherwise the filter rejection
	    reports and the suggested config options are printed as well.
	    """
	    args = parse_args()
	    verbose = not args.quiet

	    if verbose:
	        pd.set_option('display.float_format', lambda x: '%.2f' % x)

	    # Build the tokenizer and, when requested, the prompt formatter on top of it.
	    tokenizer, prompt = None, None
	    if args.tokenizer is not None:
	        tokenizer = load_tokenizer(args.tokenizer, args.langs)
	        if args.prompt_format is not None:
	            prompt_defaults = ast.literal_eval(args.prompt) if args.prompt is not None else None
	            formatter_cls = PromptFormatter.resolve(args.prompt_format)
	            prompt = formatter_cls(tokenizer._tokenizer, defaults=prompt_defaults)

	    assert args.input.endswith(".yaml")
	    base_cfg = OmegaConf.structured(LhotseDataLoadingConfig)
	    overrides = OmegaConf.from_dotlist([f"input_cfg={args.input}", "force_finite=True", "metadata_only=True"])
	    config = OmegaConf.merge(base_cfg, overrides)

	    # Data pipeline: read -> tokenize -> (optional prefetch) -> filter -> (optional) truncate.
	    cuts, _ = read_cutset_from_config(config)
	    tokenize_fn = partial(apply_tokenizer, tokenizer=tokenizer, prompt=prompt)
	    cuts = cuts.map(tokenize_fn, apply_fn=None)
	    if hasattr(cuts, "prefetch"):
	        cuts = cuts.prefetch()  # to be released in lhotse 1.27

	    token_filter = RejectionsCounter(
	        TokenCountFilter(args.min_tokens, args.max_tokens, args.measure_total_length), "Token count filtering"
	    )
	    cuts = cuts.filter(token_filter)
	    tpt_filter = RejectionsCounter(TokenPerTokenFilter(-1, args.max_tpt), "Output tokens per input token filtering")
	    cuts = cuts.filter(tpt_filter)

	    # A non-positive --num_examples means "use every example".
	    limit = args.num_examples
	    if limit > 0:
	        cuts = islice(cuts, limit)

	    token_bins = estimate_token_buckets(
	        cuts,
	        num_buckets=args.buckets,
	        num_subbuckets=args.sub_buckets,
	        quiet=args.quiet,
	    )

	    # Render the bins as a bracketed list: pairs for 2D bucketing, plain ints for 1D.
	    if args.sub_buckets is None:
	        rendered = ','.join(f"{b:d}" for b in token_bins)
	    else:
	        rendered = ','.join(f"[{b:d},{sb:d}]" for b, sb in token_bins)
	    token_bins = "[" + rendered + "]"

	    if not verbose:
	        print(token_bins)
	        return

	    token_filter.print_report()
	    tpt_filter.print_report()
	    print("Use the following options in your config:")
	    print(f"\tnum_buckets={args.buckets}")
	    print(f"\tbucket_duration_bins={token_bins}")


	# Run the estimator only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	    main()