"""Tokenize DNA sequences with a motif-aware vocabulary (tokenizer v4).

For every <folder> under --data_dir, reads split/{train,dev,test}.csv,
greedily tokenizes each sequence against hardcoded motifs, wildcarded
motifs, motif variations, 3-mers, and single bases, and writes the
space-joined tokens as JSON.
"""
import argparse
import collections
import itertools
import json
import os
import random
from os.path import join

import pandas as pd

# NOTE: DNATokenizer, myTokenizer, and MotifTokenizer come from the project's
# modified transformers fork; they are not part of stock transformers.
from transformers import (
    BertConfig, BertForMaskedLM, BertTokenizer, DNATokenizer, myTokenizer,
    MotifTokenizer, CamembertConfig, CamembertForMaskedLM, CamembertTokenizer,
    DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
    GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
    OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
    RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
)

# MODEL_CLASSES and MASK_LIST are not referenced elsewhere in this script.
MODEL_CLASSES = {
    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "dna": (BertConfig, BertForMaskedLM, DNATokenizer),
    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
    "myBert": (BertConfig, BertForMaskedLM, myTokenizer),
    "motifBert": (BertConfig, BertForMaskedLM, MotifTokenizer),
}

MASK_LIST = {
    "3mer_stride1": [-1, 1],
    "3mer_stride3": [0],
    "6mer_stride1": [-2, -1, 1, 2, 3],
    "6mer_stride6": [0],
    "motif": [0],
}

# Environment variables read by the custom tokenizer classes.
os.environ['VOCAB_PATH'] = '/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/tokenizer_v4/hg38/vocab_dedup.txt'
os.environ['VOCAB_NAME'] = 'vocab_dedup.txt'
os.environ['POSITIONAL_EMBEDDINGS_SIZE'] = '512'


def tokenize_seq(seg, maxlen, motif_hardcoded, motif_wildcarded_sorted,
                 motif_variations_sorted, k3, k1):
    """Greedily tokenize `seg`, taking the longest vocabulary match at each position.

    Match priority: hardcoded motifs, wildcarded motifs, motif variations,
    3-mers, single bases. Returns (tokens, coordinates, names); the latter
    two are reserved for coordinate/motif-name bookkeeping and stay empty.
    """
    i = 0  # current position in the sequence
    tokens = []
    coordinates = []
    names = []
    while i < len(seg):
        t = []
        for l in range(maxlen, 0, -1):
            piece = seg[i:i + l]
            if piece in motif_hardcoded:
                t = [piece]
            elif piece in motif_wildcarded_sorted:
                t = motif_wildcarded_sorted[piece]
            elif piece in motif_variations_sorted:
                t = motif_variations_sorted[piece]
            elif piece in k3:
                t = [piece]
            elif piece in k1:
                t = [piece]
            if t:
                # Several tokenizations may exist for the same piece; pick one
                # at random. (Alternative: restrict the choice to candidates
                # with the fewest tokens.)
                tokens.append(random.choice(t) if len(t) > 1 else t[0])
                i = i + l
                break
        if not t:
            # Nothing in the vocabulary matched (e.g., an unexpected
            # character): emit it as-is and advance to avoid looping forever.
            tokens.append(seg[i])
            i += 1
    return tokens, coordinates, names
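
# A minimal, hypothetical sketch of tokenize_seq on toy inputs: the motif
# tables below are invented stand-ins for the files main() loads from
# --tokenizer_dir, and _demo_tokenize itself is not part of the pipeline.
def _demo_tokenize():
    k1 = ['A', 'T', 'C', 'G', 'N']
    k3 = [''.join(p) for p in itertools.product('ATCG', repeat=3)]
    hardcoded = {'GATTACA'}                # hypothetical exact motif
    wildcarded = {'TATAAT': ['TAT AAT']}   # hypothetical multi-token expansion
    tokens, _, _ = tokenize_seq('GATTACATATAATCG', 12, hardcoded,
                                wildcarded, {}, k3, k1)
    # Longest matches win: the motif, then its expansion, then single bases.
    print(tokens)  # ['GATTACA', 'TAT AAT', 'C', 'G']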

def main(args):
    # Hardcoded motifs: one exact sequence per line.
    motif_hardcoded = set(
        pd.read_csv(join(args.tokenizer_dir, 'motifs_hardcode.txt'),
                    header=None, names=['column'])['column']
    )

    # Wildcarded motifs: "<sequence> <tokenization>" per line; a sequence may
    # have several tokenizations. Keys sorted longest-first.
    motif_wildcarded = collections.defaultdict(list)
    with open(join(args.tokenizer_dir, "motifs_wildcard.txt"), "r") as file:
        for line in file:
            seq, operations = line.strip().split(maxsplit=1)  # split on the first space only
            motif_wildcarded[seq].append(operations)
    motif_wildcarded_sorted = {
        k: motif_wildcarded[k]
        for k in sorted(motif_wildcarded.keys(), key=len, reverse=True)
    }

    # Motif variations, same format as the wildcarded motifs.
    motif_variations = collections.defaultdict(list)
    with open(join(args.tokenizer_dir, "motifs_variations.txt"), "r") as file:
        for line in file:
            seq, operations = line.strip().split(maxsplit=1)
            motif_variations[seq].append(operations)
    motif_variations_sorted = {
        k: motif_variations[k]
        for k in sorted(motif_variations.keys(), key=len, reverse=True)
    }

    # Single bases and all 3-mers over A/T/C/G.
    k1 = ['A', 'T', 'C', 'G', 'N']
    k3 = [''.join(p) for p in itertools.product('ATCG', repeat=3)]

    # Motif sequence -> motif name; reserved for the name bookkeeping in
    # tokenize_seq and currently unused.
    lookup_table = {}
    with open(join(args.tokenizer_dir, "motifs_dedup.txt"), "r") as file:
        for line in file:
            segment, name = line.strip().split(maxsplit=1)
            lookup_table[segment] = name

    for folder in os.listdir(args.data_dir):
        if folder.startswith('.'):
            continue
        for split in ['test', 'dev', 'train']:
            data = join(args.data_dir, folder, 'split', split + '.csv')
            print('process file: ' + data)
            if not os.path.exists(data):
                print(f"File {data} does not exist, skipping...")
                continue
            df = pd.read_csv(data, sep='\t')
            print('Processing ' + folder + ' ' + split)
            df_tokenized = []
            if args.only_positive:
                # Tokenize only the positive-label sequences.
                for i in range(len(df['sequence'])):
                    if df['label'][i] == 1:
                        t, _, _ = tokenize_seq(
                            df['sequence'][i], 12, motif_hardcoded,
                            motif_wildcarded_sorted, motif_variations_sorted,
                            k3, k1)
                        df_tokenized.append(t)
                out_path = join(args.data_dir, folder, 'split',
                                split + '_token_v4_only_POS.json')
            else:
                for i in range(len(df['sequence'])):
                    t, _, _ = tokenize_seq(
                        df['sequence'][i], 12, motif_hardcoded,
                        motif_wildcarded_sorted, motif_variations_sorted,
                        k3, k1)
                    df_tokenized.append(t)
                out_path = join(args.data_dir, folder, 'split',
                                split + '_token_v4.json')
            with open(out_path, 'w') as out_file:
                json.dump([" ".join(tokens) for tokens in df_tokenized], out_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tokenizer_dir", type=str, required=True)
    parser.add_argument("--data_dir", type=str, required=True)
    parser.add_argument("--only_positive", action="store_true")
    main(parser.parse_args())
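
# Example invocation (script name and paths are hypothetical):
#   python tokenize_v4.py \
#       --tokenizer_dir /path/to/tokenizer_v4/hg38 \
#       --data_dir /path/to/datasets
# For each <data_dir>/<folder>/split/{train,dev,test}.csv this writes
# <split>_token_v4.json next to the input CSV, or
# <split>_token_v4_only_POS.json when --only_positive is set.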