| import os | |
| from pathlib import Path | |
| import datasets | |
| from datasets import Dataset | |
| from loguru import logger | |
def get_filename() -> str:
    """Derive the log-file name from this script's own file name.

    Returns the script's stem (text before the first dot) with a
    ``.log`` suffix, e.g. ``my_script.py`` -> ``my_script.log``.
    """
    # Keep the original "first dot" stem logic (not splitext), so a name
    # like "a.b.py" yields "a.log" exactly as before.
    stem = os.path.basename(__file__).split('.')[0]
    return f"{stem}.log"
def set_log() -> None:
    """Register a loguru file sink named after this script.

    The log file is written to ``../log/<script-stem>.log``, resolved
    relative to the current working directory.
    """
    filename = get_filename()
    # Bug fix: `filename` was computed but never interpolated — the sink
    # path was a literal f-string with no placeholder.
    logger.add(f'../log/{filename}')
# Configure the loguru file sink at import time so every subsequent
# log call in this module is captured.
set_log()

# Directory containing the per-shard *.dataset outputs to be merged.
home_path = "/scDifformer/data/240412_test/output"
# Destination directory and file-name prefix for the merged dataset.
output_directory = '/scDifformer/data/240412_test/output_directory_run'
output_prefix = 'geneformer_run'
# Final on-disk path: <output_directory>/<output_prefix>.dataset
output_path = (Path(output_directory) / output_prefix).with_suffix(".dataset")
if __name__ == '__main__':
    # Collect every *.dataset entry produced by earlier pipeline steps.
    files = [os.path.join(home_path, f) for f in os.listdir(home_path) if f.endswith('.dataset')]
    datas = []
    # enumerate(start=1) fixes the original off-by-one: the manual counter
    # was incremented before logging, so the first file was reported as 2.
    for i, file in enumerate(files, start=1):
        logger.info(i)
        data = Dataset.load_from_disk(file)
        logger.info(data)
        datas.append(data)
    # Merge all shards into a single dataset and persist it to disk.
    # NOTE(review): a very large max_shard_size keeps the output in one
    # shard — presumably intentional; confirm downstream expectations.
    combine_data = datasets.concatenate_datasets(datas)
    combine_data.save_to_disk(output_path, max_shard_size='5000000MB')