scDIFFormer / scDifformer_data_process /token_data_combine.py
allenxiao's picture
Upload 17 files
8081b08 verified
import os
from pathlib import Path
import datasets
from datasets import Dataset
from loguru import logger
def get_filename() -> str:
    """Return the log file name that corresponds to this script.

    The base name of ``__file__`` is cut at its first ``.`` and the
    remaining leading part is given a ``.log`` extension.
    """
    script_name = os.path.basename(__file__)
    # partition() yields the text before the first dot, which is the same
    # value as split('.')[0].
    stem, _, _ = script_name.partition('.')
    return "{}.log".format(stem)
def set_log() -> None:
    """Register a loguru sink that writes to ``../log/<script name>.log``.

    The sink path is relative, so it is resolved against the current
    working directory of the process.
    """
    log_target = '../log/' + get_filename()
    logger.add(log_target)
# Attach the log file sink as soon as the module is loaded.
set_log()

# Directory that is scanned for the "*.dataset" inputs to combine.
home_path = "/scDifformer/data/240412_test/output"

# The combined dataset is written to <output_directory>/<output_prefix>.dataset
output_directory = '/scDifformer/data/240412_test/output_directory_run'
output_prefix = 'geneformer_run'
output_path = Path(output_directory, output_prefix).with_suffix(".dataset")
if __name__ == '__main__':
    # Combine every "*.dataset" entry found in home_path into a single
    # dataset and save it to output_path.

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the combined dataset reproducible from run to run.
    files = sorted(
        os.path.join(home_path, f)
        for f in os.listdir(home_path)
        if f.endswith('.dataset')
    )
    if not files:
        # Fail early and name the directory, rather than handing an empty
        # list to concatenate_datasets.
        raise ValueError(f"No '.dataset' entries found in {home_path}")

    datas = []
    # Count from 1 so the logged number is the position of the shard being
    # loaded (the previous counter logged 2 for the first shard).
    for i, file in enumerate(files, start=1):
        logger.info(i)
        data = Dataset.load_from_disk(file)
        logger.info(data)
        datas.append(data)

    combined_data = datasets.concatenate_datasets(datas)
    combined_data.save_to_disk(output_path, max_shard_size='5000000MB')