| import argparse |
| import os |
| import csv |
| import numpy as np |
| from process_pretrain_data import get_kmer_sentence |
|
|
|
|
|
|
|
|
def Process(args):
    """Cut every sequence of a CSV file into overlapping k-mer windows.

    The file at ``args.file_path`` is read as comma-separated text with
    quoting disabled; its first row is skipped as a header. For each
    remaining row, column index 6 is parsed as an integer label and column
    index 8 is used as the sequence. ``args.slide`` windows of up to 500
    characters are sliced from the sequence at evenly spaced start offsets
    between 0 and 500, converted with ``get_kmer_sentence`` and written,
    together with the row's label, to ``dev.tsv``. One label per input row
    is additionally saved to ``label.npy`` as a float64 array.

    Output goes to ``args.output_path`` when it is set; otherwise to a
    sub-directory named after ``args.kmer`` next to the input file. The
    directory is created if it does not exist.

    Args:
        args: Namespace providing ``file_path``, ``output_path``, ``kmer``
            and ``slide``.

    Raises:
        OSError: If the input cannot be read or the outputs cannot be written.
        ValueError: If a label cell is not an integer literal.
        IndexError: If a row has fewer than nine columns.
    """
    window = 500

    # A single window has no stride; guarding here avoids the division by
    # zero that ``500 / (slide - 1)`` would hit for ``--slide 1``.
    step = window // (args.slide - 1) if args.slide > 1 else 0
    offsets = [step * i for i in range(args.slide)]

    # utf-8-sig drops a leading BOM so the header cell is not polluted.
    with open(args.file_path, "r", encoding="utf-8-sig", newline="") as src:
        rows = list(csv.reader(src, delimiter=",", quotechar=None))[1:]

    if args.output_path:
        root_path = args.output_path
    else:
        # dirname/join keeps a bare filename ("data.csv") relative instead of
        # resolving to "/<kmer>/" at the filesystem root.
        root_path = os.path.join(os.path.dirname(args.file_path), str(args.kmer))
    os.makedirs(root_path, exist_ok=True)

    labels = []
    # newline="" lets the csv writer control line endings on every platform.
    with open(os.path.join(root_path, "dev.tsv"), "wt", newline="") as dst:
        tsv_w = csv.writer(dst, delimiter="\t")
        # NOTE(review): "setence" looks like a typo for "sentence"; it is left
        # unchanged because downstream readers may depend on the exact header.
        tsv_w.writerow(["setence", "label"])

        for row in rows:
            label = row[6]
            labels.append(int(label))
            sequence = row[8]

            for start in offsets:
                sub_sentence = get_kmer_sentence(
                    sequence[start:start + window], kmer=args.kmer
                )
                tsv_w.writerow([sub_sentence, label])

    # float64 matches what the previous np.append-based accumulation produced.
    np.save(os.path.join(root_path, "label.npy"), np.array(labels, dtype=np.float64))
|
|
|
|
|
|
def main(argv=None):
    """Parse command-line options and hand them to ``Process``.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the behaviour when the
            file is run as a script.

    Raises:
        SystemExit: Raised by argparse on invalid or missing arguments
            (``--file_path`` is mandatory) and for ``--help``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--kmer",
        default=1,
        type=int,
        help="K-mer",
    )
    # Required: without it the run would only fail later, inside open().
    parser.add_argument(
        "--file_path",
        required=True,
        type=str,
        help="The path of the file to be processed",
    )
    parser.add_argument(
        "--output_path",
        default=None,
        type=str,
        help="The path of the processed data",
    )
    parser.add_argument(
        "--slide",
        default=11,
        type=int,
        help="How many 500s to use for the predictes result of 1000",
    )
    args = parser.parse_args(argv)

    Process(args)
|
|
| |
|
|
|
|
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()
|
|