| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import argparse |
| import logging |
| import os |
| from pathlib import Path |
| from typing import Optional |
|
|
| import torch |
| from lhotse import CutSet |
| from lhotse.recipes.utils import read_manifests_if_cached |
|
|
| from icefall.utils import str2bool |
|
|
| |
| |
| |
| |
# Limit PyTorch to a single intra-op and a single inter-op thread so this
# data-prep script does not oversubscribe CPUs (it is typically run as many
# parallel shell jobs).  The inter-op limit must be set before any parallel
# work starts, so both calls happen at import time.
torch.set_num_interop_threads(1)
torch.set_num_threads(1)
|
|
|
|
def get_args():
    """Parse command-line options for this script.

    Returns:
        argparse.Namespace with a single attribute ``dataset`` (str or
        None), naming the dataset parts to process.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--dataset",
        type=str,
        help="""Dataset parts to compute fbank. If None, we will use all""",
    )
    return arg_parser.parse_args()
|
|
|
|
def process_wav_librispeech(
    dataset: Optional[str] = None,
):
    """Build Lhotse cut manifests (no features) for LibriSpeech.

    Reads cached recording/supervision manifests from ``data/manifests``
    and writes one cut manifest per dataset part to ``data/wav``.
    Parts whose output file already exists are skipped.

    Args:
      dataset:
        Whitespace-separated dataset part names (e.g.
        ``"dev-clean test-clean"``).  If None, all standard LibriSpeech
        parts are processed.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/wav")
    # Fix: the output directory was never created, so to_file() below
    # failed on a fresh checkout.  Create it up front.
    output_dir.mkdir(parents=True, exist_ok=True)

    if dataset is None:
        dataset_parts = (
            "dev-clean",
            "dev-other",
            "test-clean",
            "test-other",
            "train-clean-100",
            "train-clean-360",
            "train-other-500",
        )
    else:
        # split() without arguments tolerates repeated/leading/trailing
        # whitespace; split(" ", -1) produced empty-string parts there,
        # which then broke the manifest lookup below.
        dataset_parts = dataset.split()

    prefix = "librispeech"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        prefix=prefix,
        suffix=suffix,
    )
    assert manifests is not None

    # Every requested part must have been found on disk.
    assert len(manifests) == len(dataset_parts), (
        len(manifests),
        len(dataset_parts),
        list(manifests.keys()),
        dataset_parts,
    )

    for partition, m in manifests.items():
        cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
        if (output_dir / cuts_filename).is_file():
            logging.info(f"{partition} already exists - skipping.")
            continue
        logging.info(f"Processing {partition}")
        # Cuts only pair recordings with supervisions here; no feature
        # extraction is performed by this script.
        cut_set = CutSet.from_manifests(
            recordings=m["recordings"],
            supervisions=m["supervisions"],
        )
        cut_set.to_file(output_dir / cuts_filename)
|
|
|
|
if __name__ == "__main__":
    # Include source file and line number in every log record.
    log_format = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)

    cli_args = get_args()
    logging.info(vars(cli_args))
    process_wav_librispeech(dataset=cli_args.dataset)
|
|