jirong
/

DeLVM

Model card Files Files and versions

DeLVM / InternLM /internlm /data /dataset.py

jirong's picture

Upload folder using huggingface_hub

ee3e701 verified about 1 year ago

history blame contribute delete

1.7 kB

	import os
	from typing import Dict

	from torch.utils.data import ConcatDataset

	from internlm.data.single_dataset import JsonlDataset


	def get_dataset_dict(folder, split="valid") -> Dict:
	    """
	    Return a dictionary of Datasets built from the data files under a folder.

	    The folder is walked recursively (symbolic links are followed). Inside
	    every directory, each file whose name ends with ".bin" and contains
	    `split` is opened as a `JsonlDataset`; the datasets of one directory are
	    combined, in sorted file-name order, into a single `ConcatDataset`.

	    Args:
	        folder (str): The path to the folder containing data files.
	        split (str): Substring a file name must contain to be selected,
	            default is "valid".

	    Returns:
	        A dictionary mapping the base name of each directory that holds at
	        least one matching file to the `ConcatDataset` of those files.
	        Directories without matching files are left out. If two visited
	        directories share a base name, the one visited later replaces the
	        earlier entry.

	    Raises:
	        AssertionError: If the given folder does not exist.

	    Example:
	        If the given folder is as follows,
	        - data
	            - zhihu
	                - xxx.bin
	                - valid.bin
	            - baike
	                - xxx.bin
	                - valid.bin

	        The returned dictionary will be,
	        {
	            'zhihu': Dataset,
	            'baike': Dataset
	        }
	    """

	    # Raised explicitly instead of through an `assert` statement so that the
	    # documented check is still performed when Python runs with `-O`.
	    if not os.path.exists(folder):
	        raise AssertionError(f"folder `{folder}` not exists")

	    data_dict = {}

	    for root, dirs, files in os.walk(folder, followlinks=True):
	        # Sorting in place fixes the order in which os.walk descends, so the
	        # result order is reproducible and newly added data whose name
	        # starts with "z" is ranked last.
	        dirs.sort()

	        # Sorted file names keep the concatenation order consistent.
	        datasets = [
	            JsonlDataset(os.path.join(root, fn))
	            for fn in sorted(files)
	            if fn.endswith(".bin") and split in fn
	        ]
	        if not datasets:
	            continue

	        # normpath drops a trailing path separator; without it a call such as
	        # get_dataset_dict("data/") would store the top-level files under ''.
	        key = os.path.basename(os.path.normpath(root))
	        data_dict[key] = ConcatDataset(datasets=datasets)

	    return data_dict