| import numpy as np |
| import pandas as pd |
| from os.path import join |
|
|
| import argparse |
| import glob |
| import logging |
| import os |
| import pickle |
| import random |
| import re |
| import shutil |
| from typing import Dict, List, Tuple |
| from copy import deepcopy |
| from multiprocessing import Pool |
| import sys |
| import importlib |
| from pathlib import Path |
|
|
| import numpy as np |
| import torch |
| from torch.nn.utils.rnn import pad_sequence |
| from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler |
| from torch.utils.data.distributed import DistributedSampler |
| from tqdm import tqdm, trange |
| import collections |
| import itertools |
| import json |
|
|
| from transformers import ( |
| WEIGHTS_NAME, |
| AdamW, |
| BertConfig, |
| BertForMaskedLM, |
| BertTokenizer, |
| DNATokenizer, |
| myTokenizer, |
| MotifTokenizer, |
| CamembertConfig, |
| CamembertForMaskedLM, |
| CamembertTokenizer, |
| DistilBertConfig, |
| DistilBertForMaskedLM, |
| DistilBertTokenizer, |
| GPT2Config, |
| GPT2LMHeadModel, |
| GPT2Tokenizer, |
| OpenAIGPTConfig, |
| OpenAIGPTLMHeadModel, |
| OpenAIGPTTokenizer, |
| PreTrainedModel, |
| PreTrainedTokenizer, |
| RobertaConfig, |
| RobertaForMaskedLM, |
| RobertaTokenizer, |
| get_linear_schedule_with_warmup, |
| ) |
|
|
|
|
# Registry mapping a model-type key to its (config class, model class,
# tokenizer class) triple.  Every BERT-style entry shares BertConfig and
# BertForMaskedLM and differs only in the tokenizer class.
MODEL_CLASSES = {
    model_type: (config_cls, model_cls, tokenizer_cls)
    for model_type, config_cls, model_cls, tokenizer_cls in (
        ("gpt2", GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
        ("openai-gpt", OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
        ("dna", BertConfig, BertForMaskedLM, DNATokenizer),
        ("bert", BertConfig, BertForMaskedLM, BertTokenizer),
        ("roberta", RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
        ("distilbert", DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
        ("camembert", CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
        ("myBert", BertConfig, BertForMaskedLM, myTokenizer),
        ("motifBert", BertConfig, BertForMaskedLM, MotifTokenizer),
    )
}
|
|
# Relative token offsets per tokenization scheme.
# NOTE(review): presumably the neighbouring positions masked together with a
# selected token during masked-LM training -- not used in this file, confirm
# against the training code.
MASK_LIST = {
    "3mer_stride1": [-1, 1],
    "3mer_stride3": [0],
    "6mer_stride1": [-2, -1, 1, 2, 3],
    "6mer_stride6": [0],
    "motif": [0],
}

# Process-wide settings, applied at import time.
# NOTE(review): presumably consumed by the customised tokenizer classes --
# verify against the tokenizer implementation.
os.environ.update(
    {
        'VOCAB_PATH': '/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/tokenizer_v5.1/hg38_NOOP/vocab_dedup.txt',
        'VOCAB_NAME': 'vocab_dedup.txt',
        'POSITIONAL_EMBEDDINGS_SIZE': '512',
    }
)
|
|
class TrieNode:
    """One node of a character trie.

    Attribute names are part of the on-disk format: pickled tries restore
    these exact attributes, so they must not be renamed.
    """

    def __init__(self):
        # Payloads attached by ``Trie.insert`` when a word ends at this node.
        self.features = list()
        # True once some inserted word terminates exactly here.
        self.is_end_of_word = False
        # Outgoing edges: next character -> child TrieNode.
        self.children = dict()
|
|
|
|
class Trie:
    """Character trie whose terminal nodes may carry a list of feature payloads."""

    def __init__(self):
        self.root = TrieNode()
        # Not read or written by any method of this class.
        self.lookup_table = {}

    def insert(self, word, features=None):
        """Add ``word`` to the trie, attaching ``features`` when it is truthy.

        Inserting the same word several times with features accumulates one
        payload per call on the terminal node.
        """
        node = self.root
        for symbol in word:
            child = node.children.get(symbol)
            if child is None:
                child = node.children[symbol] = TrieNode()
            node = child
        node.is_end_of_word = True
        if features:
            node.features.append(features)

    def print_trie(self, node=None, prefix="", level=0):
        """Print the subtree below ``node`` (the root by default), one edge per line.

        Each line is indented by ``level`` spaces; edges that end a word are
        tagged with " (end)".
        """
        start = self.root if node is None else node
        indent = " " * level
        for char, child_node in start.children.items():
            marker = " (end)" if child_node.is_end_of_word else ""
            print(indent + "'{}'{}".format(char, marker))
            self.print_trie(child_node, prefix + char, level + 1)

    def search(self, word):
        """Look up ``word``.

        Returns:
            ``False`` when the word was never inserted, the node's feature
            list when the word was inserted with at least one payload, and
            ``True`` when it was inserted without any payload.
        """
        node = self.root
        for symbol in word:
            node = node.children.get(symbol)
            if node is None:
                return False
        if not node.is_end_of_word:
            return False
        return node.features if len(node.features) > 0 else True
|
|
def load_trie_from_file(filename):
    """Unpickle and return the object stored in ``filename``.

    NOTE: ``pickle`` can execute arbitrary code while loading, so only point
    this at files from a trusted source.
    """
    with open(filename, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
|
|
def load_tokenizer5_1(tokenizer_dir='/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/tokenizer_v5.1/hg38_NOOP'):
    """Build the 'motifBert' tokenizer and extend it with wildcard tokens.

    Args:
        tokenizer_dir: Directory containing ``motifs_wildcard.txt``.  Defaults
            to the location that used to be hard-coded, so existing callers
            are unaffected.

    Returns:
        The tokenizer obtained from ``from_pretrained('motif')`` after adding
        the ``WC_POS_<i>_*_<base>`` tokens and the wildcard motif tokens read
        from ``motifs_wildcard.txt``.

    Raises:
        ValueError: if a non-blank line of ``motifs_wildcard.txt`` has fewer
            than two whitespace-separated fields.
    """
    tokenizer_class = MODEL_CLASSES['motifBert'][2]
    tokenizer = tokenizer_class.from_pretrained('motif', cache_dir=None)

    bases = ['A', 'T', 'C', 'G']

    # One wildcard token per (position, base).  Positions are outermost, which
    # keeps the token order of the original itertools.product enumeration.
    # NOTE(review): 12 presumably mirrors the maxlen passed to tokenize_seq.
    token_wc = [
        f"WC_POS_{position}_*_{base}"
        for position in range(12)
        for base in bases
    ]

    motif_wildcarded = []
    with open(os.path.join(tokenizer_dir, "motifs_wildcard.txt"), "r") as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank or trailing empty lines instead of crashing
                # on the tuple unpack below.
                continue
            _seq, operations = line.split(maxsplit=1)
            # Only the first field after the sequence is registered as a token.
            motif_wildcarded.append(operations.split()[0])

    tokenizer.add_tokens(token_wc + motif_wildcarded)
    return tokenizer
|
|
def tokenize(seg, i, maxlen, motif_hardcoded_trie, motif_wildcarded_trie, k3, k1, lookup_table):
    """Choose the best-scoring token that starts at position ``i`` of ``seg``.

    Candidate windows of length ``maxlen`` down to 4 are looked up in both
    tries:

    * a hard-coded motif scores its length;
    * a wildcarded motif scores ``length - exp(wildcards / length)``, where
      ``wildcards`` is the number of whitespace-separated fields following
      the sequence in the (randomly chosen) trie payload.

    The highest score wins; on ties the candidate found first (the longer
    window, hard-coded before wildcarded) is kept.  When no motif matches,
    the function falls back to a 3-mer from ``k3`` (score 3) or a single
    symbol from ``k1`` (score 1).

    Args:
        seg: Sequence being tokenized.
        i: Start position inside ``seg``.
        maxlen: Longest window to try; should equal the longest vocabulary
            word.
        motif_hardcoded_trie: Object with ``search(word)`` for exact motifs.
        motif_wildcarded_trie: Object with ``search(word)`` returning a list
            of payload strings for wildcarded motifs.
        k3: Collection of valid 3-mers.
        k1: Collection of valid single symbols.
        lookup_table: Maps an underlying sequence to a motif name.

    Returns:
        ``(token, name, score, next_pos)`` where ``name`` is ``'-'`` for
        sequences missing from ``lookup_table`` and ``next_pos`` is ``i`` plus
        the length of the token's underlying sequence.

    Raises:
        ValueError: if nothing in the tries, ``k3`` or ``k1`` matches at ``i``.
    """
    best_token = None
    best_score = -float('inf')

    # Never try a window longer than what is left of the sequence: the slice
    # would be silently truncated while the score still used the requested
    # length, over-rewarding short motifs at the end of ``seg``.
    longest = min(maxlen, len(seg) - i)

    for l in range(longest, 3, -1):
        segment = seg[i:i + l]

        if motif_hardcoded_trie.search(segment) and l > best_score:
            best_token, best_score = segment, l

        # Search once and reuse the result for both the test and the choice.
        wildcard_hits = motif_wildcarded_trie.search(segment)
        if wildcard_hits:
            candidate = random.choice(wildcard_hits)
            wildcards = len(candidate.split()) - 1
            score = l - np.exp(wildcards / l)
            if score > best_score:
                best_token, best_score = candidate, score

    if best_token is None:
        for l in range(3, 0, -1):
            segment = seg[i:i + l]

            if segment in k3:
                best_token, best_score = segment, 3
                break

            if segment in k1:
                best_token, best_score = segment, 1

    if best_token is None:
        raise ValueError(
            "cannot tokenize sequence at position {}: {!r} matches no motif, "
            "3-mer or single-symbol token".format(i, seg[i:i + 3])
        )

    # The first whitespace-separated field is the underlying sequence; any
    # following fields describe wildcard operations.
    underlying = best_token.split()[0]
    name = lookup_table.get(underlying, '-')
    next_pos = i + len(underlying)

    return best_token, name, best_score, next_pos
|
|
def tokenize_seq(seg, vocab_path, maxlen, motif_hardcoded_trie, motif_wildcarded_trie, k1, k3, lookup_table):
    """Tokenize a whole sequence greedily, with a one- or two-position look-ahead.

    At each position the best token is obtained from ``tokenize``.  If that
    token is longer than one character, the starts shifted by 1 and 2 are
    tried as well and the strictly highest-scoring start wins (the earlier
    start is kept on ties).  Positions skipped by a shifted start are emitted
    as individual elements of ``seg``.

    Args:
        seg: Sequence to tokenize.
        vocab_path: Accepted but not used by this function.
        maxlen: Longest window forwarded to ``tokenize``.
        motif_hardcoded_trie, motif_wildcarded_trie, k1, k3, lookup_table:
            Forwarded to ``tokenize``.

    Returns:
        ``(tokens, coordinates, names)``.  Only ``tokens`` is populated;
        ``coordinates`` and ``names`` are always returned as empty lists.
    """
    tokens, names, coordinates = [], [], []
    seq_len = len(seg)
    pos = 0

    while pos < seq_len:
        token, _name, score, resume_at = tokenize(
            seg, pos, maxlen, motif_hardcoded_trie, motif_wildcarded_trie, k3, k1, lookup_table
        )
        start = pos

        # Look-ahead is decided once, from the token found at ``pos``.
        if len(token) > 1:
            for shift in (1, 2):
                alt_start = pos + shift
                if alt_start >= seq_len:
                    continue
                alt_token, _alt_name, alt_score, alt_resume = tokenize(
                    seg, alt_start, maxlen, motif_hardcoded_trie, motif_wildcarded_trie, k3, k1, lookup_table
                )
                # Strict comparison: the current best survives a tie.
                if alt_score > score:
                    token, score = alt_token, alt_score
                    start, resume_at = alt_start, alt_resume

        # Emit whatever lies between ``pos`` and the chosen start one element
        # at a time, then the chosen token itself.
        tokens.extend(seg[pos:start])
        tokens.append(token)

        pos = resume_at

    return tokens, coordinates, names
|
|
|
|
def main():
    """Tokenize every split of every dataset under ``args.data_dir``.

    Reads the module-level ``args`` namespace (``tokenizer_dir``, ``data_dir``,
    ``only_positive``).  For each non-hidden folder and each of the splits
    test/dev/train, ``<folder>/split/<split>.csv`` (tab-separated, with a
    ``sequence`` column and, for ``--only_positive``, a ``label`` column) is
    tokenized and written next to it as a JSON list of space-joined token
    strings.  Missing split files are reported and skipped.
    """
    tokenizer_dir = args.tokenizer_dir
    data_root = args.data_dir

    motif_hardcoded_trie = load_trie_from_file(join(tokenizer_dir, 'motifs_hardcode_trie.pkl'))
    motif_wildcarded_trie = load_trie_from_file(join(tokenizer_dir, 'motifs_wildcard_trie.pkl'))

    # Fallback vocabularies: single symbols and every 3-mer over A/T/C/G.
    k1 = ['A', 'T', 'C', 'G', 'N']
    k3 = [''.join(triplet) for triplet in itertools.product(['A', 'T', 'C', 'G'], repeat=3)]

    # Underlying motif sequence -> motif name.
    lookup_table = {}
    with open(join(tokenizer_dir, "motifs_dedup.txt"), "r") as handle:
        for row in handle:
            motif, motif_name = row.strip().split(maxsplit=1)
            lookup_table[motif] = motif_name

    # The two modes differ only in row selection and output file suffix.
    if args.only_positive:
        suffix = '_token_v5_1_only_POS.json'
    else:
        suffix = '_token_v5_1.json'

    for folder in os.listdir(data_root):
        if folder.startswith('.'):
            continue

        split_dir = join(data_root, folder, 'split')
        for split in ('test', 'dev', 'train'):
            data = join(split_dir, split + '.csv')

            if not os.path.exists(data):
                print(f"File {data} does not exist, skipping...")
                continue

            df = pd.read_csv(data, sep='\t')
            print('Processing ' + folder + ' ' + split)

            if args.only_positive:
                sequences = [
                    sequence
                    for sequence, label in zip(df['sequence'], df['label'])
                    if label == 1
                ]
            else:
                sequences = list(df['sequence'])

            encoded = []
            for sequence in sequences:
                token_list, _, _ = tokenize_seq(
                    sequence, tokenizer_dir, 12,
                    motif_hardcoded_trie, motif_wildcarded_trie, k1, k3, lookup_table,
                )
                encoded.append(" ".join(token_list))

            with open(join(split_dir, split + suffix), 'w') as out:
                json.dump(encoded, out)
|
|
|
|
if __name__ == "__main__":
    # Script entry point.  ``args`` is intentionally left at module level
    # because ``main`` reads it as a global.
    parser = argparse.ArgumentParser()
    for required_option in ("--tokenizer_dir", "--data_dir"):
        parser.add_argument(required_option, type=str, required=True)
    parser.add_argument("--only_positive", action="store_true")
    args = parser.parse_args()

    main()
|
|