| | |
| |
|
| | |
| | |
| |
|
| | """Utility functions.""" |
| |
|
| | import fnmatch |
| | import logging |
| | import os |
| | import sys |
| |
|
| | import h5py |
| | import numpy as np |
| |
|
| |
|
| | def find_files(root_dir, query="*.wav", include_root_dir=True): |
| | """Find files recursively. |
| | |
| | Args: |
| | root_dir (str): Root root_dir to find. |
| | query (str): Query to find. |
| | include_root_dir (bool): If False, root_dir name is not included. |
| | |
| | Returns: |
| | list: List of found filenames. |
| | |
| | """ |
| | files = [] |
| | for root, dirnames, filenames in os.walk(root_dir, followlinks=True): |
| | for filename in fnmatch.filter(filenames, query): |
| | files.append(os.path.join(root, filename)) |
| | if not include_root_dir: |
| | files = [file_.replace(root_dir + "/", "") for file_ in files] |
| |
|
| | return files |
| |
|
| |
|
| | def read_hdf5(hdf5_name, hdf5_path): |
| | """Read hdf5 dataset. |
| | |
| | Args: |
| | hdf5_name (str): Filename of hdf5 file. |
| | hdf5_path (str): Dataset name in hdf5 file. |
| | |
| | Return: |
| | any: Dataset values. |
| | |
| | """ |
| | if not os.path.exists(hdf5_name): |
| | logging.error(f"There is no such a hdf5 file ({hdf5_name}).") |
| | sys.exit(1) |
| |
|
| | hdf5_file = h5py.File(hdf5_name, "r") |
| |
|
| | if hdf5_path not in hdf5_file: |
| | logging.error(f"There is no such a data in hdf5 file. ({hdf5_path})") |
| | sys.exit(1) |
| |
|
| | hdf5_data = hdf5_file[hdf5_path][()] |
| | hdf5_file.close() |
| |
|
| | return hdf5_data |
| |
|
| |
|
| | def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True): |
| | """Write dataset to hdf5. |
| | |
| | Args: |
| | hdf5_name (str): Hdf5 dataset filename. |
| | hdf5_path (str): Dataset path in hdf5. |
| | write_data (ndarray): Data to write. |
| | is_overwrite (bool): Whether to overwrite dataset. |
| | |
| | """ |
| | |
| | write_data = np.array(write_data) |
| |
|
| | |
| | folder_name, _ = os.path.split(hdf5_name) |
| | if not os.path.exists(folder_name) and len(folder_name) != 0: |
| | os.makedirs(folder_name) |
| |
|
| | |
| | if os.path.exists(hdf5_name): |
| | |
| | hdf5_file = h5py.File(hdf5_name, "r+") |
| | |
| | if hdf5_path in hdf5_file: |
| | if is_overwrite: |
| | logging.warning("Dataset in hdf5 file already exists. " |
| | "recreate dataset in hdf5.") |
| | hdf5_file.__delitem__(hdf5_path) |
| | else: |
| | logging.error("Dataset in hdf5 file already exists. " |
| | "if you want to overwrite, please set is_overwrite = True.") |
| | hdf5_file.close() |
| | sys.exit(1) |
| | else: |
| | |
| | hdf5_file = h5py.File(hdf5_name, "w") |
| |
|
| | |
| | hdf5_file.create_dataset(hdf5_path, data=write_data) |
| | hdf5_file.flush() |
| | hdf5_file.close() |
| |
|
| |
|
| | class HDF5ScpLoader(object): |
| | """Loader class for a fests.scp file of hdf5 file. |
| | |
| | Examples: |
| | key1 /some/path/a.h5:feats |
| | key2 /some/path/b.h5:feats |
| | key3 /some/path/c.h5:feats |
| | key4 /some/path/d.h5:feats |
| | ... |
| | >>> loader = HDF5ScpLoader("hdf5.scp") |
| | >>> array = loader["key1"] |
| | |
| | key1 /some/path/a.h5 |
| | key2 /some/path/b.h5 |
| | key3 /some/path/c.h5 |
| | key4 /some/path/d.h5 |
| | ... |
| | >>> loader = HDF5ScpLoader("hdf5.scp", "feats") |
| | >>> array = loader["key1"] |
| | |
| | """ |
| |
|
| | def __init__(self, feats_scp, default_hdf5_path="feats"): |
| | """Initialize HDF5 scp loader. |
| | |
| | Args: |
| | feats_scp (str): Kaldi-style feats.scp file with hdf5 format. |
| | default_hdf5_path (str): Path in hdf5 file. If the scp contain the info, not used. |
| | |
| | """ |
| | self.default_hdf5_path = default_hdf5_path |
| | with open(feats_scp) as f: |
| | lines = [line.replace("\n", "") for line in f.readlines()] |
| | self.data = {} |
| | for line in lines: |
| | key, value = line.split() |
| | self.data[key] = value |
| |
|
| | def get_path(self, key): |
| | """Get hdf5 file path for a given key.""" |
| | return self.data[key] |
| |
|
| | def __getitem__(self, key): |
| | """Get ndarray for a given key.""" |
| | p = self.data[key] |
| | if ":" in p: |
| | return read_hdf5(*p.split(":")) |
| | else: |
| | return read_hdf5(p, self.default_hdf5_path) |
| |
|
| | def __len__(self): |
| | """Return the length of the scp file.""" |
| | return len(self.data) |
| |
|
| | def __iter__(self): |
| | """Return the iterator of the scp file.""" |
| | return iter(self.data) |
| |
|
| | def keys(self): |
| | """Return the keys of the scp file.""" |
| | return self.data.keys() |
| |
|