# File size: 1,811 Bytes
# ab6c03c
import argparse
import os
import csv
import numpy as np
from process_pretrain_data import get_kmer_sentence
def Process(args):
    """Turn a labelled CSV of sequences into ``dev.tsv`` and ``label.npy``.

    The first row of ``args.file_path`` is skipped as a header. For every
    remaining row, field index 6 is read as the integer label and field
    index 8 as the sequence. ``args.slide`` sub-sequences of at most 500
    characters are sliced from the sequence at evenly spaced offsets, each
    one is converted with ``get_kmer_sentence`` and written to ``dev.tsv``
    together with the row's label. One label per input row is stored in
    ``label.npy`` (float64, matching the array the original code built).

    Args:
        args: Namespace providing ``file_path`` (input CSV), ``output_path``
            (output directory, or a falsy value to use ``<input dir>/<kmer>``),
            ``kmer`` (passed to ``get_kmer_sentence``) and ``slide`` (number
            of windows taken per sequence).

    Raises:
        IndexError: If a non-empty row has fewer than nine fields.
        ValueError: If a label cannot be parsed as an integer.
    """
    window = 500  # length of every slice taken from a sequence

    # Offsets are spread from 0 up to ``window``. A single window has no
    # spacing to compute, so it simply starts at 0 (this used to divide by
    # zero). Non-positive ``slide`` still yields no windows, as before.
    step = window // (args.slide - 1) if args.slide > 1 else 0
    offsets = [step * i for i in range(args.slide)]

    if args.output_path:
        out_dir = args.output_path
    else:
        # os.path.dirname keeps a bare filename relative ("" -> "<kmer>")
        # instead of resolving to "/<kmer>/" at the filesystem root.
        out_dir = os.path.join(os.path.dirname(args.file_path), str(args.kmer))
    os.makedirs(out_dir, exist_ok=True)

    labels = []
    # newline="" is what the csv module expects on both ends; without it the
    # writer emits "\r\r\n" row endings on Windows.
    with open(args.file_path, "r", encoding="utf-8-sig", newline="") as src, \
            open(os.path.join(out_dir, "dev.tsv"), "wt", newline="") as dst:
        reader = csv.reader(src, delimiter=",", quotechar=None)
        next(reader, None)  # drop the header row

        tsv_w = csv.writer(dst, delimiter="\t")
        # NOTE(review): "setence" is misspelled but is emitted at runtime;
        # left untouched in case downstream readers rely on it.
        tsv_w.writerow(["setence", "label"])

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one).
                continue
            label = row[6]
            labels.append(int(label))
            sequence = row[8]
            for offset in offsets:
                sub_sequence = sequence[offset:offset + window]
                sub_sentence = get_kmer_sentence(sub_sequence, kmer=args.kmer)
                tsv_w.writerow([sub_sentence, label])

    # Build the array once instead of np.append per row (which copies the
    # whole array every time); float64 keeps the saved dtype unchanged.
    np.save(os.path.join(out_dir, "label.npy"),
            np.array(labels, dtype=np.float64))
def main(argv=None):
    """Parse the command-line options and hand them to ``Process``.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads the arguments from ``sys.argv``.

    Raises:
        SystemExit: Raised by argparse on invalid usage, including a missing
            ``--file_path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--kmer",
        default=1,
        type=int,
        help="K-mer",
    )
    parser.add_argument(
        "--file_path",
        # Without a path the run used to fail later inside open(None) with a
        # TypeError; let argparse report the missing option instead.
        required=True,
        type=str,
        help="The path of the file to be processed",
    )
    parser.add_argument(
        "--output_path",
        default=None,
        type=str,
        help="The path of the processed data",
    )
    parser.add_argument(
        "--slide",
        default=11,
        type=int,
        help="How many 500s to use for the predictes result of 1000",
    )
    args = parser.parse_args(argv)
    Process(args)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|