File size: 1,050 Bytes
8081b08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import os
from pathlib import Path

import datasets
from datasets import Dataset
from loguru import logger


def get_filename() -> str:
    """Return the log file name derived from this script's own file name.

    The name is the part of the script's base name that precedes the first
    ``'.'``, followed by ``.log`` (for example ``merge.py`` -> ``merge.log``).
    """
    # Everything before the first dot of the base name is used as the stem.
    stem, _, _ = os.path.basename(__file__).partition('.')
    return "{}.log".format(stem)


def set_log() -> None:
    """Add a loguru sink that writes to ``../log/<name>``.

    ``<name>`` is whatever ``get_filename()`` returns for this script.
    NOTE(review): the sink path is relative, so it presumably resolves
    against the process's working directory rather than this file's
    location -- confirm against how the script is launched.
    """
    logger.add(f'../log/{get_filename()}')


# Register the log file sink as soon as the module is loaded, so it is in
# place before the script body below starts logging.
set_log()

# Directory that the ``__main__`` block scans for ``*.dataset`` entries.
home_path = "/scDifformer/data/240412_test/output"
# Directory and base name that together form the merged dataset's location.
output_directory = '/scDifformer/data/240412_test/output_directory_run'
output_prefix = 'geneformer_run'
# Resolves to ``<output_directory>/<output_prefix>.dataset``.
output_path = (Path(output_directory) / output_prefix).with_suffix(".dataset")

if __name__ == '__main__':
    # Merge every ``*.dataset`` entry found directly under ``home_path`` into
    # one dataset and save the result to ``output_path``.
    #
    # ``os.listdir`` returns names in arbitrary order; sorting makes the row
    # order of the merged dataset reproducible from run to run.
    files = sorted(
        os.path.join(home_path, name)
        for name in os.listdir(home_path)
        if name.endswith('.dataset')
    )
    if not files:
        # Fail early with a message that names the directory, rather than
        # handing an empty list to ``concatenate_datasets``.
        raise ValueError(f"no '.dataset' entries found in {home_path}")

    datas = []
    # Count from 1 so the logged number is the position of the file being
    # loaded (the previous hand-rolled counter logged 2 for the first file).
    for index, file in enumerate(files, start=1):
        logger.info(index)
        data = Dataset.load_from_disk(file)
        logger.info(data)
        datas.append(data)

    combined_data = datasets.concatenate_datasets(datas)
    # NOTE(review): the very large shard limit presumably exists to keep the
    # output from being split into multiple shards -- confirm this is intended.
    combined_data.save_to_disk(output_path, max_shard_size='5000000MB')