import argparse
import collections
import itertools
import json
import os
import random
from os.path import join

import pandas as pd

# NOTE: DNATokenizer, myTokenizer, and MotifTokenizer are not part of stock
# Hugging Face transformers; they are assumed to come from this project's
# customized fork. The imports below exist to back MODEL_CLASSES and are kept
# for parity with the companion training scripts.
from transformers import (
    BertConfig,
    BertForMaskedLM,
    BertTokenizer,
    CamembertConfig,
    CamembertForMaskedLM,
    CamembertTokenizer,
    DistilBertConfig,
    DistilBertForMaskedLM,
    DistilBertTokenizer,
    DNATokenizer,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    MotifTokenizer,
    myTokenizer,
    OpenAIGPTConfig,
    OpenAIGPTLMHeadModel,
    OpenAIGPTTokenizer,
    RobertaConfig,
    RobertaForMaskedLM,
    RobertaTokenizer,
)

# Model registry and mask-offset tables; neither is used below. They are
# presumably kept for parity with the companion pretraining scripts.
MODEL_CLASSES = {
    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "dna": (BertConfig, BertForMaskedLM, DNATokenizer),
    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
    "myBert": (BertConfig, BertForMaskedLM, myTokenizer),
    "motifBert": (BertConfig, BertForMaskedLM, MotifTokenizer),
}

MASK_LIST = {
    "3mer_stride1": [-1, 1],
    "3mer_stride3": [0],
    "6mer_stride1": [-2, -1, 1, 2, 3],
    "6mer_stride6": [0],
    "motif": [0],
}
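# A note on MASK_LIST (interpretation, based on how overlapping k-mers are
# usually masked): the values appear to be the relative offsets masked along
# with a sampled token during MLM pretraining. With overlapping 6-mers at
# stride 1, masking token i alone would leave its bases visible in the
# neighboring k-mers, hence "6mer_stride1" also masks offsets
# [-2, -1, 1, 2, 3]; non-overlapping schemes and motif tokens mask only [0].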

os.environ['VOCAB_PATH'] = '/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/tokenizer_v4/hg38/vocab_dedup.txt'
os.environ['VOCAB_NAME'] = 'vocab_dedup.txt'
os.environ['POSITIONAL_EMBEDDINGS_SIZE'] = '512'
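# The environment variables above are assumed to be read by the custom
# tokenizer classes (e.g., MotifTokenizer) when loading their vocabulary;
# the absolute VOCAB_PATH is site-specific and will need adjusting elsewhere.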


def tokenize_seq(seg, maxlen, motif_hardcoded, motif_wildcarded, motif_variations, k3, k1):
    """Greedily tokenize a DNA segment by longest match.

    Candidate substrings are tried from length maxlen down to 1; the first hit
    in the hardcoded motifs, wildcarded motifs, motif variations, 3-mers, or
    single bases wins. motif_hardcoded, k3, and k1 are membership containers
    (sets); motif_wildcarded and motif_variations map a sequence to its list
    of candidate tokens. `coordinates` and `names` are returned for interface
    compatibility but are not populated here.
    """
    i = 0
    tokens = []
    coordinates = []
    names = []

    while i < len(seg):
        t = []
        for l in range(maxlen, 0, -1):
            piece = seg[i:i + l]
            if piece in motif_hardcoded:
                t = [piece]
            elif piece in motif_wildcarded:
                t = motif_wildcarded[piece]
            elif piece in motif_variations:
                t = motif_variations[piece]
            elif piece in k3:
                t = [piece]
            elif piece in k1:
                t = [piece]

            if t:
                # Several motif tokens can share one sequence (wildcards /
                # variations); sample one of them uniformly at random.
                tokens.append(random.choice(t) if len(t) > 1 else t[0])
                i = i + l
                break
        else:
            # Nothing matched at any length (e.g., an unexpected character);
            # advance one base so the loop cannot stall.
            i += 1

    return tokens, coordinates, names
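# Minimal usage sketch (illustrative values only): longest match wins, so the
# hardcoded 6-mer "GGGCGG" is emitted as one token and the remainder falls
# back to the 3-mer / single-base vocabularies.
#
#   tokens, _, _ = tokenize_seq(
#       "GGGCGGAAT", 12, {"GGGCGG"}, {}, {}, {"AAT"}, {"A", "T", "C", "G", "N"}
#   )
#   # tokens == ["GGGCGG", "AAT"]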


def main(args):

    # Hardcoded motifs: one literal sequence per line. Only membership tests
    # are needed (tokenize_seq already tries the longest candidates first), so
    # load them into a set; testing `seq in dataframe` would check column
    # labels rather than values and would never match.
    motif_hardcoded = set(
        pd.read_csv(join(args.tokenizer_dir, 'motifs_hardcode.txt'),
                    header=None, names=['column'])['column']
    )

    # Wildcarded motifs: each line maps a sequence to one of possibly several
    # motif tokens; collect every token per sequence.
    motif_wildcarded = collections.defaultdict(list)
    with open(join(args.tokenizer_dir, "motifs_wildcard.txt"), "r") as file:
        for line in file:
            seq, operations = line.strip().split(maxsplit=1)
            motif_wildcarded[seq].append(operations)

    # Motif variations: same file layout as the wildcard file.
    motif_variations = collections.defaultdict(list)
    with open(join(args.tokenizer_dir, "motifs_variations.txt"), "r") as file:
        for line in file:
            seq, operations = line.strip().split(maxsplit=1)
            motif_variations[seq].append(operations)

    # Fallback vocabularies: single bases (plus N) and all 3-mers over ACGT.
    k1 = {'A', 'T', 'C', 'G', 'N'}
    k3 = {''.join(p) for p in itertools.product('ATCG', repeat=3)}

    # Segment -> motif-name lookup table; loaded but not used below.
    lookup_table = {}
    with open(join(args.tokenizer_dir, "motifs_dedup.txt"), "r") as file:
        for line in file:
            segment, name = line.strip().split(maxsplit=1)
            lookup_table[segment] = name

    for folder in os.listdir(args.data_dir):
        if folder.startswith('.'):
            continue
        for f in ['test', 'dev', 'train']:
            data = join(args.data_dir, folder, 'split', f + '.csv')
            print('process file: ' + data)

            if not os.path.exists(data):
                print(f"File {data} does not exist, skipping...")
                continue

            df = pd.read_csv(data, sep='\t')
            print('Processing ' + folder + ' ' + f)

            if args.only_positive:
                # Keep only the positive-label sequences.
                rows = [df['sequence'][i] for i in range(len(df['sequence']))
                        if df['label'][i] == 1]
                out_name = f + '_token_v4_only_POS.json'
            else:
                rows = [df['sequence'][i] for i in range(len(df['sequence']))]
                out_name = f + '_token_v4.json'

            df_tokenized = []
            for seg in rows:
                t, _, _ = tokenize_seq(seg, 12, motif_hardcoded,
                                       motif_wildcarded, motif_variations, k3, k1)
                df_tokenized.append(t)

            # One space-joined token string per sequence, dumped as a JSON list.
            df_ = [" ".join(line) for line in df_tokenized]
            f_ = join(args.data_dir, folder, 'split', out_name)
            with open(f_, 'w') as file:
                json.dump(df_, file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tokenizer_dir", type=str, required=True,
                        help="Directory containing the motifs_*.txt vocabulary files.")
    parser.add_argument("--data_dir", type=str, required=True,
                        help="Root directory of dataset folders with split/{train,dev,test}.csv.")
    parser.add_argument("--only_positive", action="store_true",
                        help="Tokenize only rows with label == 1.")

    args = parser.parse_args()
    main(args)
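# Example invocation (script and path names are illustrative):
#   python tokenize_motifs.py \
#       --tokenizer_dir /path/to/tokenizer_v4/hg38 \
#       --data_dir /path/to/datasets \
#       --only_positive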