"""Tokenize DNA sequence datasets with the DNABERT-2 tokenizer.

Walks a directory tree of the form <data_dir>/<folder>/<task>/{train,dev,test}.csv,
tokenizes each sequence, and writes the token ids as JSON next to each CSV.
"""
import argparse
import json
import logging
import os
from os.path import join

import pandas as pd
import transformers


def main(args):
    model_name_or_path = "zhihan1996/DNABERT-2-117M"
    cache_dir = "/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/apps/transformers_cache"
    # Note: the original passed `"cache_dir" == cache_dir` (a boolean comparison)
    # as a positional argument; the keyword form below is what was intended.
    # The DNABERT-2 README also passes trust_remote_code=True; add it if loading fails.
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_name_or_path,
        cache_dir=cache_dir,
    )

    for folder in os.listdir(args.data_dir):
        if folder.startswith("."):
            continue
        for f in os.listdir(join(args.data_dir, folder)):
            if f.startswith("."):
                continue
            for name in ["test", "dev", "train"]:
                data = join(args.data_dir, folder, f, name + ".csv")
                if not os.path.exists(data):
                    print(f"File {data} does not exist, skipping...")
                    continue
                df = pd.read_csv(data)
                print("Processing " + folder + " " + f)

                df_tokenized = []
                if args.only_positive:
                    # Keep only sequences labeled 1.
                    for i in range(len(df["sequence"])):
                        if df["label"][i] == 1:
                            output = tokenizer.encode_plus(df["sequence"][i], return_tensors="pt")
                            df_tokenized.append(output["input_ids"].cpu())
                    suffix = "_DNAbert2_only_POS.json"
                else:
                    for i in range(len(df["sequence"])):
                        output = tokenizer.encode_plus(df["sequence"][i], return_tensors="pt")
                        df_tokenized.append(output["input_ids"].cpu())
                    suffix = "_DNAbert2.json"

                # Serialize each tokenized sequence as a space-separated string of token ids.
                df_ = [" ".join(str(token.item()) for token in line.squeeze()) for line in df_tokenized]
                f_ = join(args.data_dir, folder, f, name + suffix)
                with open(f_, "w") as file:
                    logging.warning(f"Saving tokenized results to {f_}...")
                    json.dump(df_, file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, required=True)
    parser.add_argument("--only_positive", action="store_true")
    args = parser.parse_args()
    main(args)
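
# Example invocation (a sketch; the file name tokenize_dnabert2.py and the data
# paths below are hypothetical, not taken from the script itself):
#
#   python tokenize_dnabert2.py --data_dir /path/to/data
#   python tokenize_dnabert2.py --data_dir /path/to/data --only_positive
#
# Each run writes e.g. /path/to/data/<folder>/<task>/train_DNAbert2.json (or
# train_DNAbert2_only_POS.json with --only_positive): a JSON list in which each
# entry is one sequence's token ids joined by spaces. To recover integer ids:
#
#   ids = [[int(t) for t in s.split()] for s in json.load(open(path))]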