| import os |
| import sys |
| import numpy as np |
| import pandas as pd |
| from os.path import join |
|
|
| import argparse |
| import glob |
| import logging |
| import os |
| import pickle |
| import random |
| import re |
| import shutil |
| from typing import Dict, List, Tuple |
| from copy import deepcopy |
| from multiprocessing import Pool |
|
|
| import numpy as np |
| import torch |
| from torch.nn.utils.rnn import pad_sequence |
| from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler |
| from torch.utils.data.distributed import DistributedSampler |
| from tqdm import tqdm, trange |
| import collections |
| import itertools |
|
|
| from transformers import ( |
| WEIGHTS_NAME, |
| AdamW, |
| BertConfig, |
| BertForMaskedLM, |
| BertTokenizer, |
| CamembertConfig, |
| CamembertForMaskedLM, |
| CamembertTokenizer, |
| DistilBertConfig, |
| DistilBertForMaskedLM, |
| DistilBertTokenizer, |
| GPT2Config, |
| GPT2LMHeadModel, |
| GPT2Tokenizer, |
| OpenAIGPTConfig, |
| OpenAIGPTLMHeadModel, |
| OpenAIGPTTokenizer, |
| PreTrainedModel, |
| PreTrainedTokenizer, |
| RobertaConfig, |
| RobertaForMaskedLM, |
| RobertaTokenizer, |
| get_linear_schedule_with_warmup, |
| ) |
|
|
| import os |
| import csv |
| import copy |
| import json |
| import logging |
| from dataclasses import dataclass, field |
| from typing import Any, Optional, Dict, Sequence, Tuple, List, Union |
|
|
| import torch |
| import transformers |
| import sklearn |
| import numpy as np |
| from torch.utils.data import Dataset |
| import collections |
|
|
|
|
| try: |
| from torch.utils.tensorboard import SummaryWriter |
| except ImportError: |
| from tensorboardX import SummaryWriter |
|
|
def main():
    """Tokenize every CSV split found under ``args.data_dir`` with DNABERT-2.

    Reads the module-level ``args`` namespace (``data_dir``, ``only_positive``)
    that the ``__main__`` guard binds before calling this function.

    For each non-hidden ``<data_dir>/<folder>/<f>/`` entry and each split name
    in ``test``, ``dev``, ``train``, the file ``<split>.csv`` is loaded and
    every value of its ``sequence`` column is tokenized. When
    ``args.only_positive`` is set, only rows whose ``label`` equals 1 are kept.

    The result is written next to the CSV as a JSON list of strings, one per
    sequence, each string being the space-separated token ids. The file is
    named ``<split>_DNAbert2_only_POS.json`` in positive-only mode and
    ``<split>_DNAbert2.json`` otherwise. Missing split files are reported on
    stdout and skipped.
    """
    model_name_or_path = 'zhihan1996/DNABERT-2-117M'
    # NOTE(review): site-specific absolute path; confirm it exists on the
    # machine running this script.
    cache_dir = '/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/apps/transformers_cache'

    # `cache_dir` has to be a keyword argument. The previous
    # `"cache_dir" == cache_dir` was a comparison that evaluated to False and
    # was passed positionally, so the cache directory was never applied.
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_name_or_path,
        cache_dir=cache_dir,
    )

    # The output suffix depends only on the run mode, so pick it once.
    if args.only_positive:
        suffix = '_DNAbert2_only_POS.json'
    else:
        suffix = '_DNAbert2.json'

    for folder in os.listdir(args.data_dir):
        folder_path = join(args.data_dir, folder)
        # Skip hidden entries, and stray regular files that would otherwise
        # make the nested os.listdir() raise NotADirectoryError.
        if folder.startswith('.') or not os.path.isdir(folder_path):
            continue

        for f in os.listdir(folder_path):
            if f.startswith('.'):
                continue
            dataset_dir = join(folder_path, f)

            for name in ('test', 'dev', 'train'):
                data = join(dataset_dir, name + '.csv')

                if not os.path.exists(data):
                    print(f"File {data} does not exist, skipping...")
                    continue

                df = pd.read_csv(data)
                print('Processing ' + folder + ' ' + f)

                sequences = df['sequence']
                if args.only_positive:
                    # Keep only the rows labelled 1.
                    sequences = sequences[df['label'] == 1]

                df_ = []
                for seg in sequences:
                    output = tokenizer.encode_plus(seg, return_tensors="pt")
                    # Flatten rather than squeeze: squeeze() on a
                    # single-token result gives a 0-d tensor that cannot be
                    # iterated.
                    token_ids = output['input_ids'].reshape(-1).tolist()
                    df_.append(" ".join(str(token) for token in token_ids))

                f_ = join(dataset_dir, name + suffix)
                with open(f_, 'w') as file:
                    logging.warning("Saving tokenized results to %s...", f_)
                    json.dump(df_, file)
| |
|
|
if __name__ == "__main__":
    # Command-line interface:
    #   --data_dir       (required) root directory that main() walks
    #   --only_positive  (flag) restrict tokenization to rows labelled 1
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        required=True,
        type=str,
    )
    parser.add_argument(
        "--only_positive",
        action="store_true",
    )

    # main() takes no parameters and reads the module-level name `args`,
    # so it must be bound here before the call.
    args = parser.parse_args()
    main()
|
|