"""Merge every ``.dataset`` directory found under ``home_path`` into one dataset.

When run as a script, each matching dataset is loaded from disk, all of them
are concatenated in sorted path order, and the result is saved to
``output_path``.
"""
import os
from pathlib import Path

import datasets
from datasets import Dataset
from loguru import logger


def get_filename() -> str:
    """Return the log file name for this script.

    The name is the script's base name truncated at its first dot, followed
    by ``.log`` (e.g. ``merge.py`` -> ``merge.log``).
    """
    current_file_name = os.path.basename(__file__)
    log_name = "{}.log"
    # Split on the first dot (not just the extension) to keep the historical
    # log name for scripts whose file name contains several dots.
    return log_name.format(current_file_name.split('.')[0])


def set_log() -> None:
    """Register a loguru file sink named after this script under ``../log/``."""
    filename = get_filename()
    # NOTE: the path is relative, so it resolves against the current working
    # directory of the process, not against this file's location.
    logger.add(f'../log/{filename}')


# Logging is configured at import time; callers importing this module rely on
# that side effect, so it stays at module level.
set_log()

# Directory scanned for input ``*.dataset`` entries.
home_path = "/scDifformer/data/240412_test/output"
# Directory and base name of the merged output dataset.
output_directory = '/scDifformer/data/240412_test/output_directory_run'
output_prefix = 'geneformer_run'
output_path = (Path(output_directory) / output_prefix).with_suffix(".dataset")

if __name__ == '__main__':
    # os.listdir() yields entries in arbitrary order; sort so the row order of
    # the merged dataset is reproducible from run to run.
    files = sorted(
        os.path.join(home_path, f)
        for f in os.listdir(home_path)
        if f.endswith('.dataset')
    )
    if not files:
        # Fail with an explicit message instead of letting the concatenation
        # of an empty list raise a less helpful error further down.
        raise FileNotFoundError(
            f"no '.dataset' entries found in {home_path}"
        )

    datas = []
    # 1-based progress counter: the first dataset is logged as 1.
    for i, file in enumerate(files, start=1):
        logger.info(i)
        data = Dataset.load_from_disk(file)
        logger.info(data)
        datas.append(data)

    combined_data = datasets.concatenate_datasets(datas)
    # NOTE(review): the very large shard limit presumably exists to avoid
    # splitting the output into multiple shards -- confirm this is intended.
    combined_data.save_to_disk(output_path, max_shard_size='5000000MB')