| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | from random import Random |
| |
|
| | import click |
| | from lhotse import CutSet |
| |
|
| | from nemo.collections.common.data.lhotse.text_adapters import ( |
| | NeMoMultimodalConversationJsonlAdapter, |
| | NeMoMultimodalConversationTarWriter, |
| | ) |
| |
|
| |
|
@click.command()
@click.argument("manifest", type=click.Path())
@click.argument("output_dir", type=click.Path())
@click.option("-n", "--shard_size", type=int, default=100, help="Number of conversations per shard.")
@click.option("--shuffle/--no-shuffle", default=False, help="Shuffle conversations.")
@click.option("-s", "--seed", type=int, default=42, help="Random seed.")
def export(manifest: str, output_dir: str, shard_size: int, shuffle: bool, seed: int) -> None:
    """Export the conversations listed in MANIFEST as tar shards in OUTPUT_DIR.

    Conversations are written in the order they are read from MANIFEST unless
    --shuffle is given, in which case they are shuffled using --seed.
    \f

    Everything after the form feed above is hidden from ``--help`` by click.

    Args:
        manifest: Path handed to ``NeMoMultimodalConversationJsonlAdapter``.
        output_dir: Destination handed to ``NeMoMultimodalConversationTarWriter``.
        shard_size: Forwarded to the writer as ``shard_size``.
        shuffle: When true, wrap the input in ``CutSet(...).shuffle(...)``.
        seed: Seed for the ``random.Random`` instance that drives the shuffle.
    """
    # Items held in the shuffle buffer at any one time.
    shuffle_buffer_size = 50000

    # Set up the input before opening the writer, so a failure while
    # constructing the reader cannot happen with the writer already open.
    # NOTE(review): "<dummy>" looks like a placeholder audio locator tag that
    # does not matter for export — confirm against the adapter.
    source = NeMoMultimodalConversationJsonlAdapter(manifest, audio_locator_tag="<dummy>")
    if shuffle:
        # A dedicated Random(seed) keeps the order reproducible across runs
        # without touching the global random state.
        source = CutSet(source).shuffle(buffer_size=shuffle_buffer_size, rng=Random(seed))

    with NeMoMultimodalConversationTarWriter(output_dir, shard_size=shard_size) as writer:
        for item in source:
            writer.write(item)
| |
|
| |
|
# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    export()
| |
|