#!/usr/bin/env python3
"""
Subset training data to a specific duration (e.g., 70 hours).

Reads a gzip-compressed JSONL manifest of cuts (one JSON object per line,
each with a ``duration`` field in seconds) and copies the leading cuts into a
new gzip-compressed manifest until the requested number of hours is reached.
"""

import gzip
import json
import argparse
from pathlib import Path


def subset_cuts(input_path, output_path, target_hours):
    """
    Create a subset of cuts with target duration.

    Cuts are copied in file order. Copying stops at the first cut that would
    push the accumulated duration past the target, so the subset is always a
    prefix of the input and never exceeds ``target_hours``.

    Args:
        input_path: Path to the gzip-compressed JSONL input manifest.
        output_path: Path of the gzip-compressed JSONL manifest to write.
            Missing parent directories are created.
        target_hours: Maximum total duration of the subset, in hours.

    Returns:
        A ``(count, total_duration)`` tuple: the number of cuts written and
        their summed duration in seconds.

    Raises:
        ValueError: If ``input_path`` and ``output_path`` refer to the same
            file (opening the output would truncate the input).
        KeyError: If a cut has no ``'duration'`` field.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    src = Path(input_path)
    dst = Path(output_path)

    # Opening the output in write mode truncates it immediately; refuse to
    # destroy the input manifest when both paths point at the same file.
    if src.resolve() == dst.resolve():
        raise ValueError(
            f'Input and output must be different files: {input_path}'
        )

    dst.parent.mkdir(parents=True, exist_ok=True)

    target_duration = target_hours * 3600  # Convert to seconds
    total_duration = 0
    count = 0

    # Manifests are JSON, so read/write them as UTF-8 regardless of locale.
    with gzip.open(src, 'rt', encoding='utf-8') as fin, \
            gzip.open(dst, 'wt', encoding='utf-8') as fout:
        for line in fin:
            if not line.strip():
                continue

            cut = json.loads(line)
            cut_duration = cut['duration']

            # Stop at the first cut that would exceed the target.
            if total_duration + cut_duration > target_duration:
                break

            # Write the original line verbatim so the cut is not re-serialized.
            fout.write(line)
            total_duration += cut_duration
            count += 1

            # Print progress every 10k cuts
            if count % 10000 == 0:
                print(f'Processed {count:,} cuts, {total_duration/3600:.2f} hours')

    print('\nSubset created:')
    print(f' Total cuts: {count:,}')
    print(f' Total duration: {total_duration/3600:.2f} hours')
    print(f' Output: {output_path}')

    return count, total_duration


def main(argv=None):
    """
    Parse command-line options and create the subset manifest.

    Args:
        argv: Optional list of argument strings; when ``None`` the arguments
            are taken from ``sys.argv`` by argparse.
    """
    parser = argparse.ArgumentParser(description='Subset training data')
    parser.add_argument(
        '--input',
        type=str,
        default='data/manifests/cuts_train_ihm.jsonl.gz',
        help='Input manifest file'
    )
    parser.add_argument(
        '--output',
        type=str,
        default='data/manifests/cuts_train_70h.jsonl.gz',
        help='Output manifest file'
    )
    parser.add_argument(
        '--hours',
        type=float,
        default=70.0,
        help='Target duration in hours'
    )
    args = parser.parse_args(argv)

    print(f'Creating {args.hours}h subset from {args.input}')
    subset_cuts(args.input, args.output, args.hours)


if __name__ == '__main__':
    main()