File size: 1,587 Bytes
ab6c03c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 | from multiprocessing import Pool
import copy
import argparse
from process_pretrain_data import Process
# filenames = ['xaa', 'xab', 'xac', 'xad', 'xae', 'xaf', 'xag', 'xah', 'xai', 'xaj', 'xak', 'xal', 'xam', 'xan', 'xao', 'xap', 'xaq', 'xar', 'xas', 'xat', 'xau', 'xav', 'xaw']
# filenames = ['xaa', 'xab']
def main():
    """Run ``Process`` on the GRCh38 chromosome FASTA files 1-22 in parallel.

    Parses the command-line options, then submits one ``Process`` call per
    chromosome to a 22-worker pool. Every call receives its own deep copy of
    the parsed arguments in which ``file_path`` and ``output_path`` are
    replaced by the hard-coded per-chromosome paths below, so values passed
    for ``--file_path`` / ``--output_path`` on the command line are not used.

    Raises:
        SystemExit: if one or more jobs raised an exception. All jobs are
            allowed to finish first; the message lists every failed
            chromosome together with its exception.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--sampling_rate",
        default=1.0,
        type=float,
        help="We will sample sampling_rate*total_length*2/512 times",
    )
    parser.add_argument(
        "--kmer",
        default=1,
        type=int,
        help="K-mer",
    )
    parser.add_argument(
        "--length",
        default=10000,
        type=int,
        help="Length of the sampled sequence",
    )
    parser.add_argument(
        "--file_path",
        default=None,
        type=str,
        help="The path of the file to be processed",
    )
    parser.add_argument(
        "--output_path",
        default="/home/zhihan/dna/data/split/",
        type=str,
        help="The path where the processed output is written",
    )
    args = parser.parse_args()

    # multiprocess: one job per chromosome, one worker per job
    failures = []
    with Pool(22) as pool:
        pending = []
        for i in range(1, 23):
            # Each job gets a private copy so the per-chromosome paths do not
            # leak between submissions.
            arg_new = copy.deepcopy(args)
            arg_new.file_path = "/root/data/genome/" + "GRCh38.chr" + str(i) + ".fa"
            arg_new.output_path = "/root/data/sub_001_6140/" + "GRCh38.chr" + str(i) + ".fa"
            # Keep the AsyncResult: without it, an exception raised inside the
            # worker would be dropped silently.
            pending.append((i, pool.apply_async(Process, args=(arg_new,))))
        pool.close()
        pool.join()

        # All jobs are done here, so get() returns (or re-raises) immediately.
        for i, result in pending:
            try:
                result.get()
            except Exception as exc:  # top-level boundary: report every failure
                failures.append("chr{}: {!r}".format(i, exc))

    if failures:
        raise SystemExit("Process failed for:\n" + "\n".join(failures))
# Run the driver only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
|