"""
Subset a gzip-compressed JSONL training manifest to a target total duration
(e.g., 70 hours).
"""
| |
|
import argparse
import gzip
import json
from pathlib import Path
| |
|
| |
|
def subset_cuts(input_path, output_path, target_hours):
    """
    Create a subset of cuts whose total duration does not exceed a target.

    Cuts are read in file order from a gzip-compressed JSON-lines manifest and
    copied verbatim to the output. Copying stops at the first cut that would
    push the running total past ``target_hours``; cuts after that one are not
    considered, even if they are short enough to fit.

    Args:
        input_path: Path of the gzip-compressed JSONL manifest to read. Every
            non-blank line must be a JSON object with a numeric ``duration``
            field (compared against the target in seconds).
        output_path: Path of the gzip-compressed JSONL manifest to write.
            Missing parent directories are created.
        target_hours: Upper bound on the subset's total duration, in hours.

    Raises:
        ValueError: If ``input_path`` and ``output_path`` are the same file.
        KeyError: If a cut has no ``duration`` field.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Opening the output for writing truncates it immediately, which would
    # destroy the input before a single cut is read.
    if Path(input_path).resolve() == Path(output_path).resolve():
        raise ValueError('input_path and output_path must be different files')

    target_duration = target_hours * 3600  # hours -> seconds

    total_duration = 0
    count = 0

    # Make sure the destination folder exists so the write cannot fail on a
    # missing directory.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Pin UTF-8 instead of relying on the platform's default text encoding,
    # which would mis-decode non-ASCII manifest text on non-UTF-8 locales.
    with gzip.open(input_path, 'rt', encoding='utf-8') as fin, \
            gzip.open(output_path, 'wt', encoding='utf-8') as fout:

        for line in fin:
            if not line.strip():
                continue

            cut = json.loads(line)
            cut_duration = cut['duration']

            # Stop as soon as the next cut would overshoot the budget.
            if total_duration + cut_duration > target_duration:
                break

            # Write the original line untouched to preserve its formatting.
            fout.write(line)
            total_duration += cut_duration
            count += 1

            # Periodic progress report for large manifests.
            if count % 10000 == 0:
                print(f'Processed {count:,} cuts, {total_duration/3600:.2f} hours')

    print('\nSubset created:')
    print(f' Total cuts: {count:,}')
    print(f' Total duration: {total_duration/3600:.2f} hours')
    print(f' Output: {output_path}')
| |
|
| |
|
def main(argv=None):
    """
    Command-line entry point: parse options and build the subset manifest.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read the process command line.
    """
    parser = argparse.ArgumentParser(description='Subset training data')
    parser.add_argument(
        '--input',
        type=str,
        default='data/manifests/cuts_train_ihm.jsonl.gz',
        help='Input manifest file'
    )
    parser.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output manifest file '
             '(default: data/manifests/cuts_train_<hours>h.jsonl.gz)'
    )
    parser.add_argument(
        '--hours',
        type=float,
        default=70.0,
        help='Target duration in hours'
    )

    args = parser.parse_args(argv)

    # "not > 0" also rejects NaN, which would otherwise disable the limit.
    if not args.hours > 0:
        parser.error('--hours must be a positive number')

    # Derive the default file name from the requested duration so the name
    # always matches the content; 70.0 yields the historical
    # 'cuts_train_70h.jsonl.gz'.
    output = args.output
    if output is None:
        output = f'data/manifests/cuts_train_{args.hours:g}h.jsonl.gz'

    print(f'Creating {args.hours}h subset from {args.input}')
    subset_cuts(args.input, output, args.hours)
| |
|
| |
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
| |
|