odg123's picture
Upload icefall experiment results and logs
d596074 verified
#!/usr/bin/env python3
"""
Subset training data to a specific duration (e.g., 70 hours).
"""
import gzip
import json
import argparse
from pathlib import Path
def subset_cuts(input_path, output_path, target_hours):
    """
    Create a subset of cuts with target duration.

    Cuts are copied from the input manifest in their original order. Copying
    stops at the first cut whose duration would push the running total past
    the target, so the result is always a prefix of the input (later, shorter
    cuts are not considered). Blank lines are skipped. Progress and a final
    summary are printed to stdout.

    Args:
        input_path: Gzip-compressed JSONL manifest to read. Every non-blank
            line must be a JSON object with a numeric 'duration' key, which
            is compared against the target expressed in seconds.
        output_path: Gzip-compressed JSONL manifest to write. Missing parent
            directories are created.
        target_hours: Upper bound on the subset's total duration, in hours.

    Raises:
        ValueError: If input_path and output_path refer to the same file.
    """
    # Opening the output in write mode truncates it immediately, which would
    # destroy the input before a single line has been read.
    if Path(input_path).resolve() == Path(output_path).resolve():
        raise ValueError(
            f'Input and output must be different files: {input_path}'
        )

    # Fail-safe for fresh checkouts where the manifest directory is missing.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    target_duration = target_hours * 3600  # Convert to seconds
    total_duration = 0
    count = 0

    # Explicit UTF-8 so that non-ASCII text in the manifest round-trips
    # regardless of the platform's locale encoding.
    with gzip.open(input_path, 'rt', encoding='utf-8') as fin, \
            gzip.open(output_path, 'wt', encoding='utf-8') as fout:
        for line in fin:
            if not line.strip():
                continue

            cut = json.loads(line)
            cut_duration = cut['duration']

            # Stop at the first cut that would exceed the target
            if total_duration + cut_duration > target_duration:
                break

            # Write the original line verbatim (no re-serialization)
            fout.write(line)
            total_duration += cut_duration
            count += 1

            # Print progress every 10k cuts
            if count % 10000 == 0:
                print(f'Processed {count:,} cuts, {total_duration/3600:.2f} hours')

    print('\nSubset created:')
    print(f' Total cuts: {count:,}')
    print(f' Total duration: {total_duration/3600:.2f} hours')
    print(f' Output: {output_path}')
def main():
    """Command-line entry point: read the options and build the subset."""
    # (flag, value type, default, help text) for every supported option.
    option_table = (
        (
            '--input',
            str,
            'data/manifests/cuts_train_ihm.jsonl.gz',
            'Input manifest file',
        ),
        (
            '--output',
            str,
            'data/manifests/cuts_train_70h.jsonl.gz',
            'Output manifest file',
        ),
        (
            '--hours',
            float,
            70.0,
            'Target duration in hours',
        ),
    )

    parser = argparse.ArgumentParser(description='Subset training data')
    for flag, value_type, fallback, blurb in option_table:
        parser.add_argument(flag, type=value_type, default=fallback, help=blurb)

    args = parser.parse_args()

    print(f'Creating {args.hours}h subset from {args.input}')
    subset_cuts(args.input, args.output, args.hours)


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()