Upload folder using huggingface_hub

838f737 verified 6 months ago

6.31 kB

	import os
	from pathlib import Path
	from typing import Tuple, Union

	from torch import Tensor
	from torch.utils.data import Dataset
	from torchaudio._internal import download_url_to_file
	from torchaudio.datasets.utils import _extract_tar, _load_waveform

	# Default subset name and the top-level directory name used inside the archives.
	URL = "train-clean-100"
	FOLDER_IN_ARCHIVE = "LibriSpeech"

	# Sample rate returned alongside every utterance's metadata.
	SAMPLE_RATE = 16000

	# Subset names that the dataset class accepts for its ``url`` argument.
	_DATA_SUBSETS = [
	    "dev-clean",
	    "dev-other",
	    "test-clean",
	    "test-other",
	    "train-clean-100",
	    "train-clean-360",
	    "train-other-500",
	]

	# Expected checksum of each archive, keyed by the archive's full download URL.
	# The URL is assembled from the subset name so the table stays compact.
	_CHECKSUMS = {
	    f"http://www.openslr.org/resources/12/{subset}.tar.gz": digest
	    for subset, digest in (
	        ("dev-clean", "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"),
	        ("dev-other", "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"),
	        ("test-clean", "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"),
	        ("test-other", "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"),
	        ("train-clean-100", "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"),
	        ("train-clean-360", "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"),
	        ("train-other-500", "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"),
	    )
	}


	def _download_librispeech(root, url):
	    """Download the archive for one subset if it is missing, then unpack it.

	    Args:
	        root (str): Directory in which the ``.tar.gz`` archive is stored.
	        url (str): Subset name (e.g. ``"train-clean-100"``); the archive file name
	            is this value with ``.tar.gz`` appended.
	    """
	    remote_root = "http://www.openslr.org/resources/12/"
	    archive_name = url + ".tar.gz"

	    local_archive = os.path.join(root, archive_name)
	    remote_archive = os.path.join(remote_root, archive_name)

	    already_downloaded = os.path.isfile(local_archive)
	    if not already_downloaded:
	        # A URL without a known checksum is downloaded with ``hash_prefix=None``.
	        expected_hash = _CHECKSUMS.get(remote_archive)
	        download_url_to_file(remote_archive, local_archive, hash_prefix=expected_hash)

	    # Extraction runs on every call, whether or not a download just happened.
	    _extract_tar(local_archive)


	def _get_librispeech_metadata(
	    fileid: str, root: str, folder: str, ext_audio: str, ext_txt: str
	) -> Tuple[str, int, str, int, int, int]:
	    """Look up the audio path and transcript for a single utterance.

	    Args:
	        fileid: Utterance identifier of the form ``"<speaker>-<chapter>-<utterance>"``.
	        root: Directory that contains ``folder``.
	        folder: Subset directory, relative to ``root``.
	        ext_audio: Audio file extension, including the leading dot.
	        ext_txt: Transcript file extension, including the leading dot.

	    Returns:
	        Tuple of (audio path relative to ``root``, sample rate, transcript,
	        speaker ID, chapter ID, utterance ID).

	    Raises:
	        ValueError: If ``fileid`` does not consist of exactly three ``-``-separated
	            parts, or one of the parts is not an integer.
	        FileNotFoundError: If the chapter's transcript file does not exist or
	            contains no line for ``fileid``.
	    """
	    speaker_id, chapter_id, utterance_id = fileid.split("-")

	    # The audio path is returned relative to ``root`` (it starts at ``folder``).
	    fileid_audio = f"{speaker_id}-{chapter_id}-{utterance_id}"
	    filepath = os.path.join(folder, speaker_id, chapter_id, f"{fileid_audio}{ext_audio}")

	    # One transcript file per chapter; each line is "<fileid> <transcript>".
	    file_text = os.path.join(root, folder, speaker_id, chapter_id, f"{speaker_id}-{chapter_id}{ext_txt}")
	    # Explicit encoding so decoding does not depend on the platform default.
	    with open(file_text, encoding="utf-8") as ft:
	        for line in ft:
	            # ``partition`` tolerates blank lines and id-only lines, which would
	            # make a two-way unpack of ``split(" ", 1)`` raise ValueError before
	            # the wanted utterance is reached.
	            fileid_text, _, transcript = line.strip().partition(" ")
	            if fileid_audio == fileid_text:
	                break
	        else:
	            # Transcript line not found for this utterance
	            raise FileNotFoundError(f"Translation not found for {fileid_audio}")

	    return (
	        filepath,
	        SAMPLE_RATE,
	        transcript,
	        int(speaker_id),
	        int(chapter_id),
	        int(utterance_id),
	    )


	class LIBRISPEECH(Dataset):
	    """LibriSpeech :cite:`7178964` dataset.

	    Args:
	        root (str or Path): Path to the directory where the dataset is found or downloaded.
	        url (str, optional): The URL to download the dataset from,
	            or the type of the dataset to download.
	            Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``,
	            ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and
	            ``"train-other-500"``. (default: ``"train-clean-100"``)
	        folder_in_archive (str, optional):
	            The top-level directory of the dataset. (default: ``"LibriSpeech"``)
	        download (bool, optional):
	            Whether to download the dataset if it is not found at root path. (default: ``False``).

	    Raises:
	        ValueError: If ``url`` is not one of the allowed subset names.
	        RuntimeError: If the subset directory does not exist and ``download`` is ``False``.
	    """

	    _ext_txt = ".trans.txt"
	    _ext_audio = ".flac"

	    def __init__(
	        self,
	        root: Union[str, Path],
	        url: str = URL,
	        folder_in_archive: str = FOLDER_IN_ARCHIVE,
	        download: bool = False,
	    ) -> None:
	        self._url = url
	        if url not in _DATA_SUBSETS:
	            raise ValueError(f"Invalid url '{url}' given; please provide one of {_DATA_SUBSETS}.")

	        root = os.fspath(root)
	        # ``_archive`` is the extracted top-level directory; ``_path`` is the
	        # directory of the selected subset inside it.
	        self._archive = os.path.join(root, folder_in_archive)
	        self._path = os.path.join(root, folder_in_archive, url)

	        if not os.path.isdir(self._path):
	            if download:
	                _download_librispeech(root, url)
	            else:
	                raise RuntimeError(
	                    f"Dataset not found at {self._path}. Please set `download=True` to download the dataset."
	                )

	        # Audio files live at ``<speaker>/<chapter>/<file>`` below the subset
	        # directory, so the pattern must descend exactly two levels. A pattern
	        # starting with "/" is non-relative and is rejected by ``Path.glob``.
	        self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*/*/*" + self._ext_audio))

	    def get_metadata(self, n: int) -> Tuple[str, int, str, int, int, int]:
	        """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
	        but otherwise returns the same fields as :py:func:`__getitem__`.

	        Args:
	            n (int): The index of the sample to be loaded

	        Returns:
	            Tuple of the following items;

	            str:
	                Path to audio
	            int:
	                Sample rate
	            str:
	                Transcript
	            int:
	                Speaker ID
	            int:
	                Chapter ID
	            int:
	                Utterance ID
	        """
	        fileid = self._walker[n]
	        return _get_librispeech_metadata(fileid, self._archive, self._url, self._ext_audio, self._ext_txt)

	    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
	        """Load the n-th sample from the dataset.

	        Args:
	            n (int): The index of the sample to be loaded

	        Returns:
	            Tuple of the following items;

	            Tensor:
	                Waveform
	            int:
	                Sample rate
	            str:
	                Transcript
	            int:
	                Speaker ID
	            int:
	                Chapter ID
	            int:
	                Utterance ID
	        """
	        metadata = self.get_metadata(n)
	        # metadata[0] is the audio path relative to the archive directory and
	        # metadata[1] the sample rate; both are handed to ``_load_waveform``.
	        waveform = _load_waveform(self._archive, metadata[0], metadata[1])
	        # Swap the path for the loaded waveform, keep the remaining fields.
	        return (waveform,) + metadata[1:]

	    def __len__(self) -> int:
	        """Return the number of audio files found for the selected subset."""
	        return len(self._walker)