"""Tokenize sequences into motif / 3-mer / 1-mer tokens and dump them as JSON.

For every ``<data_dir>/<folder>/split/{test,dev,train}.csv`` file the
``sequence`` column is tokenized with a greedy, score-based matcher built on
two pickled tries (hard-coded motifs and wildcarded motifs), and the
space-joined tokens are written next to the input file.
"""
import argparse
import collections
import glob
import importlib
import itertools
import json
import logging
import os
import pickle
import random
import re
import shutil
import sys
from copy import deepcopy
from multiprocessing import Pool
from os.path import join
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertForMaskedLM,
    BertTokenizer,
    DNATokenizer,
    myTokenizer,
    MotifTokenizer,
    CamembertConfig,
    CamembertForMaskedLM,
    CamembertTokenizer,
    DistilBertConfig,
    DistilBertForMaskedLM,
    DistilBertTokenizer,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    OpenAIGPTConfig,
    OpenAIGPTLMHeadModel,
    OpenAIGPTTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    RobertaConfig,
    RobertaForMaskedLM,
    RobertaTokenizer,
    get_linear_schedule_with_warmup,
)

# model-type key -> (config class, model class, tokenizer class)
MODEL_CLASSES = {
    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "dna": (BertConfig, BertForMaskedLM, DNATokenizer),
    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
    "myBert": (BertConfig, BertForMaskedLM, myTokenizer),
    "motifBert": (BertConfig, BertForMaskedLM, MotifTokenizer),
}

MASK_LIST = {
    "3mer_stride1": [-1, 1],
    "3mer_stride3": [0],
    "6mer_stride1": [-2, -1, 1, 2, 3],
    "6mer_stride6": [0],
    "motif": [0],
}

# Directory holding the v5.1 tokenizer artefacts (vocab, wildcard motif list).
DEFAULT_TOKENIZER_DIR = (
    '/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/'
    'tokenizer_v5.1/hg38_NOOP'
)

# Longest window (in characters) that main() asks the motif matcher to try.
MAX_MOTIF_LEN = 12

# Setting environment variables
os.environ['VOCAB_PATH'] = DEFAULT_TOKENIZER_DIR + '/vocab_dedup.txt'
os.environ['VOCAB_NAME'] = 'vocab_dedup.txt'
os.environ['POSITIONAL_EMBEDDINGS_SIZE'] = '512'


class TrieNode:
    """One node of a character trie."""

    def __init__(self):
        # Next character -> child TrieNode.
        self.children = {}
        # True when the path from the root to this node spells a stored word.
        self.is_end_of_word = False
        # Feature payloads attached to the word ending here (may stay empty).
        self.features = []


class Trie:
    """Character trie that can attach feature payloads to stored words.

    NOTE: instances are loaded from pickle files elsewhere in this script, so
    the class and attribute names must stay stable.
    """

    def __init__(self):
        self.root = TrieNode()
        # Not used by any method of this class.
        self.lookup_table = {}

    def insert(self, word, features=None):
        """Store ``word``; when ``features`` is truthy, append it to the word's payloads."""
        current_node = self.root
        for char in word:
            if char not in current_node.children:
                current_node.children[char] = TrieNode()
            current_node = current_node.children[char]
        current_node.is_end_of_word = True
        if features:
            current_node.features.append(features)

    def print_trie(self, node=None, prefix="", level=0):
        """Print the subtree below ``node`` (default: the root), one character per line."""
        if node is None:
            node = self.root
        for char, child_node in node.children.items():
            print(" " * level + "'{}'{}".format(char, " (end)" if child_node.is_end_of_word else ""))
            self.print_trie(child_node, prefix + char, level + 1)

    def search(self, word):
        """Look up ``word``.

        Returns:
            ``False`` when the word is not stored; the (non-empty) list of
            feature payloads when the word has any; otherwise ``True``.
        """
        current_node = self.root
        for char in word:
            if char not in current_node.children:
                return False  # Word not found
            current_node = current_node.children[char]
        if current_node.is_end_of_word:
            if len(current_node.features) > 0:
                return current_node.features
            return True
        return False  # Word not found


def load_trie_from_file(filename):
    """Unpickle and return the object stored in ``filename``.

    SECURITY: ``pickle.load`` can execute arbitrary code; only point this at
    trie files you produced yourself.
    """
    with open(filename, 'rb') as file:
        return pickle.load(file)


def load_tokenizer5_1(tokenizer_dir=DEFAULT_TOKENIZER_DIR):
    """Build the 'motifBert' tokenizer and register the wildcard tokens.

    Args:
        tokenizer_dir: directory containing ``motifs_wildcard.txt``. Defaults
            to the directory that was previously hard-coded here.

    Returns:
        The tokenizer after ``add_tokens`` has been called with the
        ``WC_POS_<i>_*_<base>`` tokens plus the first operation word of every
        non-blank line in ``motifs_wildcard.txt``.
    """
    tokenizer_class = MODEL_CLASSES['motifBert'][2]
    tokenizer = tokenizer_class.from_pretrained('motif', cache_dir=None)

    bases = ['A', 'T', 'C', 'G']
    token_wc = [
        f"{operator}_POS_{i}_*_{char}"
        for operator, i, char in itertools.product(['WC'], range(12), bases)
    ]

    motif_wildcarded = []
    with open(os.path.join(tokenizer_dir, "motifs_wildcard.txt"), "r") as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line cannot be unpacked into (seq, operations).
                continue
            _seq, operations = line.split(maxsplit=1)  # Split only on the first space
            motif_wildcarded.append(operations.split()[0])

    tokenizer.add_tokens(token_wc + motif_wildcarded)
    return tokenizer


def tokenize(seg, i, maxlen, motif_hardcoded_trie, motif_wildcarded_trie, k3, k1, lookup_table):
    '''
    Pick the best-scoring token that starts at position ``i`` of ``seg``.

    Parameters:
        seg: a sequence chunk from the chromosome
        i: the start position at this segment
        maxlen: the longest distance considered to find motif, should be the
            longest word in vocabulary
        motif_hardcoded_trie: trie whose ``search`` is truthy for hard-coded motifs
        motif_wildcarded_trie: trie whose ``search`` returns candidate tokens
            (whitespace-separated strings; the first word is the motif, the
            remaining words are counted as wildcards)
        k3, k1: containers of the accepted 3-mers and 1-mers
        lookup_table: mapping from the token's first word to its name

    Returns:
        ``(token, name, score, next_pos)`` where ``name`` is ``'-'`` for
        tokens missing from ``lookup_table`` and ``next_pos`` is ``i`` plus
        the length of the token's first word.

    Raises:
        ValueError: if ``i`` is not a valid index into ``seg``.

    rule: hardcoded motif > wildcarded motif > motif + operation
    score design rule:
        reward length of underlying sequence (instead of the motif token,
        because it says how long a stretch these tokens can tokenize)
        penalize # of wildcards (how many positions have high uncertainty)
        penalize mutation operation
    '''
    if not 0 <= i < len(seg):
        raise ValueError(f"start position {i} is outside the sequence (length {len(seg)})")

    best_token = None
    best_score = -float('inf')

    # Never try windows longer than what is left of the sequence: a truncated
    # slice would otherwise be scored with the requested window length instead
    # of its real length, inflating scores near the end of the sequence.
    longest = min(maxlen, len(seg) - i)
    for length in range(longest, 3, -1):
        segment = seg[i:i + length]

        if motif_hardcoded_trie.search(segment):
            score = 1 * length
            if score > best_score:  # strict: the earlier (longer) match wins ties
                best_token, best_score = segment, score

        hits = motif_wildcarded_trie.search(segment)
        if hits:
            # search() returns True (not a list) when the node has no features;
            # fall back to the raw segment in that case.
            candidate = random.choice(hits) if isinstance(hits, list) else segment
            wildcards = len(candidate.split()) - 1  # the number of wildcards
            # the fewer wildcards, the lower the penalty
            score = 1 * length - np.exp(wildcards / length)
            if score > best_score:
                best_token, best_score = candidate, score

    # if cannot find motifs, tokenize with 3mer then 1mer
    if best_token is None:
        for length in range(3, 0, -1):
            segment = seg[i:i + length]
            if segment in k3:
                best_token, best_score = segment, 3
                break
            if segment in k1:
                best_token, best_score = segment, 1

    if best_token is None:
        # Character outside the 1-mer alphabet: emit it as-is so the caller
        # keeps advancing instead of failing on a None token.
        best_token, best_score = seg[i], 1

    words = best_token.split()
    key = words[0] if words else best_token
    name = lookup_table.get(key, '-')  # '-' represent the given name for non-motif tokens
    next_pos = i + len(key)
    return best_token, name, best_score, next_pos


def tokenize_seq(seg, vocab_path, maxlen, motif_hardcoded_trie, motif_wildcarded_trie, k1, k3, lookup_table):
    """Tokenize a whole sequence greedily, allowing a 1- or 2-character shift.

    At each position the best token is computed with ``tokenize``. When that
    token's string is longer than one character, the positions one and two
    characters to the right are also tried and the strictly best-scoring start
    wins; the characters skipped by a shift are emitted as single-character
    tokens.

    Args:
        seg: the sequence to tokenize.
        vocab_path: unused; kept so existing call sites keep working.
        maxlen: longest motif window passed on to ``tokenize``.
        motif_hardcoded_trie, motif_wildcarded_trie, k1, k3, lookup_table:
            forwarded to ``tokenize``.

    Returns:
        ``(tokens, coordinates, names)``. Only ``tokens`` is populated;
        ``coordinates`` and ``names`` are always returned as empty lists.
    """
    tokens = []
    names = []
    coordinates = []

    i = 0  # start position
    while i < len(seg):
        best_token, best_name, best_score, next_pos = tokenize(
            seg, i, maxlen, motif_hardcoded_trie, motif_wildcarded_trie, k3, k1, lookup_table)
        best_i = i

        # our token only has length 1, 3, >=5, no length at 2
        # Shifting right only makes sense when the current token is not a
        # 1-mer; otherwise it is the same as tokenizing from the next position.
        if len(best_token) > 1:
            for shift in (1, 2):
                i_shifted = i + shift
                if i_shifted >= len(seg):
                    break
                token_, name_, score_, next_pos_ = tokenize(
                    seg, i_shifted, maxlen, motif_hardcoded_trie, motif_wildcarded_trie,
                    k3, k1, lookup_table)
                if score_ > best_score:  # strict: the earlier start wins ties
                    best_token, best_name, best_i = token_, name_, i_shifted
                    next_pos, best_score = next_pos_, score_

        # Characters jumped over by the winning shift become 1-char tokens.
        tokens.extend(seg[i:best_i])
        tokens.append(best_token)
        i = next_pos

    return tokens, coordinates, names


def parse_args(argv=None):
    """Parse the command-line options (``argv`` defaults to ``sys.argv[1:]``)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--tokenizer_dir", type=str, required=True)
    parser.add_argument("--data_dir", type=str, required=True)
    parser.add_argument("--only_positive", action="store_true")
    return parser.parse_args(argv)


def main(args=None):
    """Tokenize every split file under ``args.data_dir`` and write JSON outputs.

    Args:
        args: namespace with ``tokenizer_dir``, ``data_dir`` and
            ``only_positive``. When omitted, the command line is parsed.

    For each non-hidden entry ``folder`` of ``data_dir`` and each split in
    (test, dev, train), reads ``<folder>/split/<split>.csv`` (tab-separated),
    tokenizes the ``sequence`` column (only rows with ``label == 1`` when
    ``only_positive`` is set) and dumps the space-joined tokens as a JSON list
    to ``<split>_token_v5_1.json`` / ``<split>_token_v5_1_only_POS.json``.
    """
    if args is None:
        args = parse_args()

    # load vocabs
    motif_hardcoded_trie = load_trie_from_file(join(args.tokenizer_dir, 'motifs_hardcode_trie.pkl'))
    motif_wildcarded_trie = load_trie_from_file(join(args.tokenizer_dir, 'motifs_wildcard_trie.pkl'))

    # Sets give O(1) membership tests inside the per-position tokenizer.
    k1 = {'A', 'T', 'C', 'G', 'N'}
    # 3-mer
    k3 = {''.join(term) for term in itertools.product(['A', 'T', 'C', 'G'], repeat=3)}

    lookup_table = {}
    with open(join(args.tokenizer_dir, "motifs_dedup.txt"), "r") as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            segment, name = line.split(maxsplit=1)  # Split only on the first space
            lookup_table[segment] = name

    suffix = '_token_v5_1_only_POS.json' if args.only_positive else '_token_v5_1.json'

    for folder in sorted(os.listdir(args.data_dir)):
        if folder.startswith('.'):
            continue
        for split in ('test', 'dev', 'train'):
            data = join(args.data_dir, folder, 'split', split + '.csv')
            if not os.path.exists(data):
                print(f"File {data} does not exist, skipping...")
                continue

            df = pd.read_csv(data, sep='\t')
            print('Processing ' + folder + ' ' + split)

            sequences = df['sequence']
            if args.only_positive:
                sequences = sequences[df['label'] == 1]

            tokenized_lines = []
            for seg in sequences:
                seq_tokens, _, _ = tokenize_seq(
                    seg, args.tokenizer_dir, MAX_MOTIF_LEN, motif_hardcoded_trie,
                    motif_wildcarded_trie, k1, k3, lookup_table)
                tokenized_lines.append(" ".join(seq_tokens))

            out_path = join(args.data_dir, folder, 'split', split + suffix)
            with open(out_path, 'w') as file:
                json.dump(tokenized_lines, file)


if __name__ == "__main__":
    main()