diff --git a/data_download.py b/data_download.py new file mode 100644 index 0000000000000000000000000000000000000000..d2fe14fb02e7bd1b3d50fcf444820ba380404749 --- /dev/null +++ b/data_download.py @@ -0,0 +1,23 @@ +from datasets import load_dataset +from pathlib import Path + +datasets = [ + "demo_human_or_worm", + "dummy_mouse_enhancers_ensembl", + "human_enhancers_ensembl", + "human_nontata_promoters", + "demo_coding_vs_intergenomic_seqs", + "drosophila_enhancers_stark", + "human_enhancers_cohn", + "human_ensembl_regulatory", + "human_ocr_ensembl", +] + +out_root = Path("hf_raw") +out_root.mkdir(parents=True, exist_ok=True) + +for name in datasets: + hf_id = f"katarinagresova/Genomic_Benchmarks_{name}" + ds = load_dataset(hf_id) # downloads to the HF cache + ds.save_to_disk(out_root / name) # optional: persist locally for reuse + print(f"downloaded {hf_id}") diff --git a/data_split.py b/data_split.py new file mode 100644 index 0000000000000000000000000000000000000000..9fd44708064bf90768d39a87b5536c1ad7901797 --- /dev/null +++ b/data_split.py @@ -0,0 +1,28 @@ +from datasets import load_from_disk, concatenate_datasets +from pathlib import Path +import pandas as pd + +src_root = Path("hf_raw") +dst_root = Path("ft_data") +seed = 42 + +for ds_dir in src_root.iterdir(): + if not ds_dir.is_dir(): + continue + ds = load_from_disk(ds_dir) + + # Combine all available splits, shuffle, then 80/10/10 + full = concatenate_datasets([ds[s] for s in ds.keys()]) + full = full.shuffle(seed=seed) + split1 = full.train_test_split(test_size=0.2, seed=seed) + train = split1["train"] + split2 = split1["test"].train_test_split(test_size=0.5, seed=seed) + dev, test = split2["train"], split2["test"] + + out = dst_root / ds_dir.name / "split" + out.mkdir(parents=True, exist_ok=True) + for name, subset in [("train", train), ("dev", dev), ("test", test)]: + subset.to_pandas()[["seq", "label"]].rename( + columns={"seq": "sequence", "label": "labels"} + ).to_csv(out / f"{name}.csv", sep="\t", index=False) + print(f"Wrote {out/f'{name}.csv'}") diff --git a/full_multi_base_2048_3e-5_log.txt b/full_multi_base_2048_3e-5_log.txt new file mode 100644 index 0000000000000000000000000000000000000000..a92d6e521180c0990af0805c22cd7c6ab63de39c --- /dev/null +++ b/full_multi_base_2048_3e-5_log.txt @@ -0,0 +1,1936 @@ +nohup: ignoring input +The provided data_path is ft_data +Output root: full_output_multi_epoch +Running base_2048 on drosophila_enhancers_stark, seed 42, lr 3e-5, output full_output_multi_epoch/drosophila_enhancers_stark/base_2048 +wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc +wandb: Currently logged in as: n5huang (n5huang-uc-san-diego) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. 
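A quick sanity check for the splits written by data_split.py above, a minimal sketch; note the files are tab-separated despite the .csv extension, and the label column is named `labels`:

```python
from pathlib import Path
import pandas as pd

# Reload the splits written by data_split.py and report sizes and class balance.
for split_dir in sorted(Path("ft_data").glob("*/split")):
    sizes = {}
    for name in ("train", "dev", "test"):
        # data_split.py writes tab-separated files despite the .csv extension
        df = pd.read_csv(split_dir / f"{name}.csv", sep="\t")
        sizes[name] = len(df)
        pos_frac = (df["labels"] == 1).mean()
        print(f"{split_dir.parent.name}/{name}: n={len(df)}, positive fraction={pos_frac:.3f}")
    total = sum(sizes.values())
    print("  ratios: " + ", ".join(f"{k}={v / total:.2f}" for k, v in sizes.items()))
```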
+ _torch_pytree._register_pytree_node( +wandb: Tracking run with wandb version 0.23.1 +wandb: Run data is saved locally in /root/NaN/dna-tokenizer/Finetune-GenomicBenchmarks/wandb/run-20260209_074339-c25agmr6 +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run honest-sky-401 +wandb: โญ๏ธ View project at https://wandb.ai/n5huang-uc-san-diego/genomic_bench +wandb: ๐Ÿš€ View run at https://wandb.ai/n5huang-uc-san-diego/genomic_bench/runs/c25agmr6 +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +WARNING:root:Perform single sequence classification... +Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /root/NaN/dna-tokenizer/pretrain/models/base_2048/checkpoint-100000 and are newly initialized: ['bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias', 'bert.pooler.dense.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +/root/miniconda3/envs/bpe_v2/lib/python3.9/site-packages/accelerate/accelerator.py:439: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + self.scaler = torch.cuda.amp.GradScaler(**kwargs) +Using auto half precision backend +***** Running training ***** + Num examples = 5,531 + Num Epochs = 5 + Instantaneous batch size per device = 128 + Total train batch size (w. parallel, distributed & accumulation) = 128 + Gradient Accumulation steps = 1 + Total optimization steps = 220 + Number of trainable parameters = 87,615,746 +Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" + 0%| | 0/220 [00:00&2 + exit 1 +fi + + + +echo "The provided data_path is $data_path" +echo "Output root: $output_root" + + + +for seed in 42; do + for idx in "${!MODELS[@]}"; do + model=${MODELS[$idx]} + tokenizer=${TOKENIZERS[$idx]} + model_name=${MODEL_NAMES[$idx]} + + + + # for data in demo_coding_vs_intergenomic_seqs human_nontata_promoters human_enhancers_cohn human_ocr_ensembl; do # length ~200 + # run_output_dir=${output_root}/${data}/${model_name} + # mkdir -p "${run_output_dir}" + # echo "Running ${model_name} on ${data}, seed ${seed}, lr ${lr}, output ${run_output_dir}" + # torchrun --nproc_per_node=1 \ + # --master_port=${MASTER_PORT:-29500} \ + # train.py \ + # --model_name_or_path ${model} \ + # --tokenizer_path ${tokenizer} \ + # --trust_remote_code True \ + # --data_path $data_path/$data/split \ + # --kmer -1 \ + # --run_name ${model_name}_hg38_BPE_${lr}_${data}_seed${seed} \ + # --model_max_length 100 \ + # --per_device_train_batch_size 128 \ + # --per_device_eval_batch_size 128 \ + # --gradient_accumulation_steps 1 \ + # --learning_rate ${lr} \ + # --num_train_epochs 3 \ + # --fp16 \ + # --save_steps 200 \ + # --output_dir ${run_output_dir} \ + # --evaluation_strategy steps \ + # --eval_steps 200 \ + # --warmup_steps 30 \ + # --logging_steps 100000 \ + # --overwrite_output_dir True \ + # --log_level info \ + # --seed ${seed} \ + # --find_unused_parameters False \ + # --project_name ${project_name} + # done + for data in drosophila_enhancers_stark dummy_mouse_enhancers_ensembl human_enhancers_ensembl; do + run_output_dir=${output_root}/${data}/${model_name} + mkdir -p "${run_output_dir}" + echo "Running ${model_name} on ${data}, seed ${seed}, lr ${lr}, output ${run_output_dir}" + torchrun --nproc_per_node=1 \ + --master_port=${MASTER_PORT:-29500} \ + train.py \ + 
--model_name_or_path ${model} \ + --tokenizer_path ${tokenizer} \ + --trust_remote_code True \ + --data_path $data_path/$data/split \ + --kmer -1 \ + --run_name ${model_name}_hg38_BPE_${lr}_${data}_seed${seed} \ + --model_max_length 512 \ + --per_device_train_batch_size 128 \ + --per_device_eval_batch_size 128 \ + --gradient_accumulation_steps 1 \ + --learning_rate ${lr} \ + --num_train_epochs 5 \ + --fp16 \ + --save_steps 200 \ + --output_dir ${run_output_dir} \ + --evaluation_strategy steps \ + --eval_steps 200 \ + --warmup_steps 30 \ + --logging_steps 100000 \ + --overwrite_output_dir True \ + --log_level info \ + --seed ${seed} \ + --find_unused_parameters False \ + --project_name ${project_name} + done + + # for data in demo_human_or_worm drosophila_enhancers_stark dummy_mouse_enhancers_ensembl human_enhancers_ensembl; do # length mostly 2000+ + # run_output_dir=${output_root}/${data}/${model_name} + # mkdir -p "${run_output_dir}" + # echo "Running ${model_name} on ${data}, seed ${seed}, lr ${lr}, output ${run_output_dir}" + # torchrun --nproc_per_node=1 \ + # --master_port=${MASTER_PORT:-29500} \ + # train.py \ + # --model_name_or_path ${model} \ + # --tokenizer_path ${tokenizer} \ + # --trust_remote_code True \ + # --data_path $data_path/$data/split \ + # --kmer -1 \ + # --run_name ${model_name}_hg38_BPE_${lr}_${data}_seed${seed} \ + # --model_max_length 512 \ + # --per_device_train_batch_size 128 \ + # --per_device_eval_batch_size 128 \ + # --gradient_accumulation_steps 1 \ + # --learning_rate ${lr} \ + # --num_train_epochs 5 \ + # --fp16 \ + # --save_steps 200 \ + # --output_dir ${run_output_dir} \ + # --evaluation_strategy steps \ + # --eval_steps 200 \ + # --warmup_steps 30 \ + # --logging_steps 100000 \ + # --overwrite_output_dir True \ + # --log_level info \ + # --seed ${seed} \ + # --find_unused_parameters False \ + # --project_name ${project_name} + # done + + + + # for data in human_ensembl_regulatory; do # length ~200-700 + # run_output_dir=${output_root}/${data}/${model_name} + # mkdir -p "${run_output_dir}" + # echo "Running ${model_name} on ${data}, seed ${seed}, lr ${lr}, output ${run_output_dir}" + # torchrun --nproc_per_node=1 \ + # --master_port=${MASTER_PORT:-29500} \ + # train.py \ + # --model_name_or_path ${model} \ + # --tokenizer_path ${tokenizer} \ + # --trust_remote_code True \ + # --data_path $data_path/$data/split \ + # --kmer -1 \ + # --run_name ${model_name}_hg38_BPE_${lr}_${data}_seed${seed} \ + # --model_max_length 250 \ + # --per_device_train_batch_size 128 \ + # --per_device_eval_batch_size 128 \ + # --gradient_accumulation_steps 1 \ + # --learning_rate ${lr} \ + # --num_train_epochs 8 \ + # --fp16 \ + # --save_steps 200 \ + # --output_dir ${run_output_dir} \ + # --evaluation_strategy steps \ + # --eval_steps 200 \ + # --warmup_steps 30 \ + # --logging_steps 100000 \ + # --overwrite_output_dir True \ + # --log_level info \ + # --seed ${seed} \ + # --find_unused_parameters False \ + # --project_name ${project_name} + # done + + done +done diff --git a/run_dnabert2_2048.sh b/run_dnabert2_2048.sh new file mode 100644 index 0000000000000000000000000000000000000000..cbbd31c6d17b09be5e8bec56b1fa849b49364770 --- /dev/null +++ b/run_dnabert2_2048.sh @@ -0,0 +1,115 @@ +#!/bin/bash + +data_path=$1 +lr=$2 +output_path=$3 +project_name=$4 +vocab=117M + +model=/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/DNAbert2/pretrain/models_2048/model_1/checkpoint-200000 
+tokenizer=/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/DNAbert2/hg38_2048/tokenizer.json + +echo "The provided data_path is $data_path" + +datasets=( + demo_human_or_worm + dummy_mouse_enhancers_ensembl + human_enhancers_ensembl + human_nontata_promoters + demo_coding_vs_intergenomic_seqs + drosophila_enhancers_stark + human_enhancers_cohn + human_ensembl_regulatory + human_ocr_ensembl +) + + +for seed in 42 +do + for data in demo_human_or_worm demo_coding_vs_intergenomic_seqs human_nontata_promoters # length all 200, 251 + do + python train.py \ + --model_name_or_path ${model} \ + --tokenizer_path ${tokenizer} \ + --trust_remote_code True \ + --data_path $data_path/$data/split \ + --kmer -1 \ + --run_name hg38_BPE_${lr}_${data}_seed${seed} \ + --model_max_length 100 \ + --per_device_train_batch_size 128 \ + --per_device_eval_batch_size 128 \ + --gradient_accumulation_steps 1 \ + --learning_rate ${lr} \ + --num_train_epochs 3 \ + --fp16 \ + --save_steps 200 \ + --output_dir ${output_path} \ + --evaluation_strategy steps \ + --eval_steps 200 \ + --warmup_steps 30 \ + --logging_steps 100000 \ + --overwrite_output_dir True \ + --log_level info \ + --seed ${seed} \ + --find_unused_parameters False \ + --project_name ${project_name} + done + + for data in drosophila_enhancers_stark dummy_mouse_enhancers_ensembl # length mostly 2000, 3000~4000 + do + python train.py \ + --model_name_or_path ${model} \ + --tokenizer_path ${tokenizer} \ + --trust_remote_code True \ + --data_path $data_path/$data/split \ + --kmer -1 \ + --run_name hg38_BPE_${lr}_${data}_seed${seed} \ + --model_max_length 512 \ + --per_device_train_batch_size 128 \ + --per_device_eval_batch_size 128 \ + --gradient_accumulation_steps 1 \ + --learning_rate ${lr} \ + --num_train_epochs 3 \ + --fp16 \ + --save_steps 200 \ + --output_dir ${output_path} \ + --evaluation_strategy steps \ + --eval_steps 200 \ + --warmup_steps 30 \ + --logging_steps 100000 \ + --overwrite_output_dir True \ + --log_level info \ + --seed ${seed} \ + --find_unused_parameters False \ + --project_name ${project_name} + done + + for data in human_enhancers_ensembl human_enhancers_cohn human_ensembl_regulatory human_ocr_ensembl # length usually 200~700 + do + python train.py \ + --model_name_or_path ${model} \ + --tokenizer_path ${tokenizer} \ + --trust_remote_code True \ + --data_path $data_path/$data/split \ + --kmer -1 \ + --run_name hg38_BPE_${lr}_${data}_seed${seed} \ + --model_max_length 250 \ + --per_device_train_batch_size 128 \ + --per_device_eval_batch_size 128 \ + --gradient_accumulation_steps 1 \ + --learning_rate ${lr} \ + --num_train_epochs 3 \ + --fp16 \ + --save_steps 200 \ + --output_dir ${output_path} \ + --evaluation_strategy steps \ + --eval_steps 200 \ + --warmup_steps 30 \ + --logging_steps 100000 \ + --overwrite_output_dir True \ + --log_level info \ + --seed ${seed} \ + --find_unused_parameters False \ + --project_name ${project_name} + done +done diff --git a/tokenization_dna.py b/tokenization_dna.py new file mode 100644 index 0000000000000000000000000000000000000000..e28717404d9ffded8e243dfc9e4dd5595c6aefce --- /dev/null +++ b/tokenization_dna.py @@ -0,0 +1,394 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
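The run scripts above pick `model_max_length` per dataset group (100 for the ~200 bp inputs, 512 for the 2-4 kb enhancer sets, 250 for the ~200-700 bp sets). A sketch for checking those budgets against the actual BPE token-length distribution; the tokenizer.json and split paths are examples:

```python
import numpy as np
import pandas as pd
from tokenizers import Tokenizer

# Example paths; substitute the tokenizer.json and split used in the runs above.
tok = Tokenizer.from_file("hg38_2048/tokenizer.json")
df = pd.read_csv("ft_data/human_enhancers_ensembl/split/train.csv", sep="\t")

lengths = [len(tok.encode(seq).ids) for seq in df["sequence"]]
for p in (50, 90, 95, 99):
    print(f"p{p}: {np.percentile(lengths, p):.0f} tokens")
# Pick model_max_length to cover roughly p95-p99 plus [CLS]/[SEP] overhead.
```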
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+
+import collections
+import logging
+import os
+import math
+import unicodedata
+
+
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": os.getenv("VOCAB_NAME")}
+
+PRETRAINED_VOCAB_FILES_MAP = {"vocab_file": {
+    'dna': os.getenv("VOCAB_PATH")
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {'dna': os.getenv("POSITIONAL_EMBEDDINGS_SIZE")}
+PRETRAINED_INIT_CONFIGURATION = {'dna': {"do_lower_case": False}}
+
+# k-mer size keyed by vocabulary size (4**k entries plus five special tokens),
+# matching the original DNABERT k-mer vocabularies; __init__ uses this to
+# infer k from the loaded vocab.
+VOCAB_KMER = {"69": 3, "261": 4, "1029": 5, "4101": 6}
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):
+        token = token.rstrip("\n")
+        vocab[token] = index
+    return vocab
+
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+class DNATokenizer(PreTrainedTokenizer):
+    r"""
+    Constructs a BertTokenizer.
+    :class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
+
+    Args:
+        vocab_file: Path to a one-wordpiece-per-line vocabulary file
+        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
+        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
+        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
+            minimum of this value (if specified) and the underlying BERT model's sequence length.
+        never_split: List of tokens which will never be split during tokenization. Only has an effect when
+            do_basic_tokenize=True
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=False,
+        never_split=None,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        **kwargs
+    ):
+        """Constructs a BertTokenizer.
+
+        Args:
+            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
+            **do_lower_case**: (`optional`) boolean (default True)
+                Whether to lower case the input
+                Only has an effect when do_basic_tokenize=True
+            **do_basic_tokenize**: (`optional`) boolean (default True)
+                Whether to do basic tokenization before wordpiece.
+            **never_split**: (`optional`) list of string
+                List of tokens which will never be split during tokenization.
+                Only has an effect when do_basic_tokenize=True
+            **tokenize_chinese_chars**: (`optional`) boolean (default True)
+                Whether to tokenize Chinese characters.
+ This should likely be deactivated for Japanese: + see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 + """ + super().__init__( + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + self.max_len_single_sentence = self.max_len - 2 # take into account special tokens + self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.vocab = load_vocab(vocab_file) + self.kmer = VOCAB_KMER[str(len(self.vocab))] + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars + ) + + @property + def vocab_size(self): + return len(self.vocab) + + def _tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + split_tokens.append(token) + # print(split_tokens) + return split_tokens + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. """ + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A BERT sequence has the following format: + single sequence: [CLS] X [SEP] + pair of sequences: [CLS] A [SEP] B [SEP] + """ + cls = [self.cls_token_id] + sep = [self.sep_token_id] + + if token_ids_1 is None: + if len(token_ids_0) < 510: + return cls + token_ids_0 + sep + else: + output = [] + num_pieces = int(len(token_ids_0)//510) + 1 + for i in range(num_pieces): + output.extend(cls + token_ids_0[510*i:min(len(token_ids_0), 510*(i+1))] + sep) + return output + + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + already_has_special_tokens: (default False) Set to True if the token list is already formated with + special tokens for the model + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
+ """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + if len(token_ids_0) < 510: + return [1] + ([0] * len(token_ids_0)) + [1] + else: + output = [] + num_pieces = int(len(token_ids_0)//510) + 1 + for i in range(num_pieces): + output.extend([1] + ([0] * (min(len(token_ids_0), 510*(i+1))-510*i)) + [1]) + return output + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. + A BERT sequence pair mask has the following format: + 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence + + if token_ids_1 is None, only returns the first portion of the mask (0's). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + if len(token_ids_0) < 510: + return len(cls + token_ids_0 + sep) * [0] + else: + num_pieces = int(len(token_ids_0)//510) + 1 + return (len(cls + token_ids_0 + sep) + 2*(num_pieces-1)) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary to a directory or file.""" + index = 0 + if os.path.isdir(vocab_path): + vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) + else: + vocab_file = vocab_path + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + "Saving vocabulary to {}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!".format(vocab_file) + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=False, never_split=None, tokenize_chinese_chars=True): + """ Constructs a BasicTokenizer. + + Args: + **do_lower_case**: Whether to lower case the input. + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. + Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) + List of token not to split. + **tokenize_chinese_chars**: (`optional`) boolean (default True) + Whether to tokenize Chinese characters. + This should likely be deactivated for Japanese: + see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 + """ + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = never_split + self.tokenize_chinese_chars = tokenize_chinese_chars + + def tokenize(self, text, never_split=None): + """ Basic Tokenization of a piece of text. + Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. + + Args: + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. + Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) + List of token not to split. 
+ """ + never_split = self.never_split + (never_split if never_split is not None else []) + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/tokenization_motif.py b/tokenization_motif.py new file mode 100644 index 0000000000000000000000000000000000000000..61a20a9f299a6bed526d44f98809d125025cec21 --- /dev/null +++ b/tokenization_motif.py @@ -0,0 +1,406 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + + +import collections +import logging +import os +import math +import unicodedata +import json + +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast + + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": os.getenv("VOCAB_NAME")} + +PRETRAINED_VOCAB_FILES_MAP = {"vocab_file": { + 'motif' : os.getenv("VOCAB_PATH") + } + } + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {'motif': os.getenv("POSITIONAL_EMBEDDINGS_SIZE")} +PRETRAINED_INIT_CONFIGURATION = {'motif': {"do_lower_case": False}} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class MotifTokenizer(PreTrainedTokenizer): + r""" + Constructs a BertTokenizer. + :class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the + minimum of this value (if specified) and the underlying BERT model's sequence length. + never_split: List of tokens which will never be split during tokenization. Only has an effect when + do_basic_tokenize=True + """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=False, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=False, + additional_special_tokens = None, + **kwargs + ): + """Constructs a BertTokenizer. + Args: + **vocab_file**: Path to a one-wordpiece-per-line vocabulary file + **do_lower_case**: (`optional`) boolean (default True) + Whether to lower case the input + Only has an effect when do_basic_tokenize=True + **do_basic_tokenize**: (`optional`) boolean (default True) + Whether to do basic tokenization before wordpiece. + **never_split**: (`optional`) list of string + List of tokens which will never be split during tokenization. + Only has an effect when do_basic_tokenize=True + **tokenize_chinese_chars**: (`optional`) boolean (default True) + Whether to tokenize Chinese characters. 
+ This should likely be deactivated for Japanese: + see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 + """ + super().__init__( + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + self.vocab = load_vocab(vocab_file) + self.max_len_single_sentence = self.model_max_length - 2 # take into account special tokens + self.max_len_sentences_pair = self.model_max_length - 3 # take into account special tokens + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self._additional_special_tokens = additional_special_tokens or [] + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars + ) + + def add_tokens(self, new_tokens): + """Method to add custom tokens to the tokenizer.""" + self._additional_special_tokens.extend(new_tokens) + self._additional_special_tokens = list(set(self._additional_special_tokens)) # Remove duplicates + print(f"Custom tokens added: {new_tokens}") + print(f"Updated additional_special_tokens: {self._additional_special_tokens}") + + @property + def all_special_tokens(self): + """ List all the special tokens ('', ''...) mapped to class attributes + (cls_token, unk_token...) and custom special tokens (additional_special_tokens). + """ + set_attr = self.special_tokens_map + all_toks = [] + + # Add standard special tokens + for attr_value in set_attr.values(): + all_toks += (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) + + # Add custom special tokens + all_toks += self._additional_special_tokens + + # Remove duplicates by converting to a set and back to a list + all_toks = list(set(all_toks)) + + return all_toks + + @property + def vocab_size(self): + return len(self.vocab) + + def _tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + split_tokens.append(token) + # print(split_tokens) + return split_tokens + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.vocab.get(token, self.vocab.get(self.unk_token)) + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. """ + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. 
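Both tokenizer classes wrap inputs longer than 509 content tokens into consecutive [CLS] ... [SEP] pieces rather than truncating. An illustrative trace of that arithmetic; the ids and special-token ids are placeholders, and the real method returns the pieces concatenated into one flat list:

```python
# Illustrative only: ids longer than 509 are wrapped into multiple
# [CLS] ... [SEP] pieces of at most 512 tokens each.
cls_id, sep_id = 2, 3          # placeholder special-token ids
ids = list(range(1200))        # pretend token ids for one long sequence

pieces = []
num_pieces = len(ids) // 510 + 1
for i in range(num_pieces):
    chunk = ids[510 * i : min(len(ids), 510 * (i + 1))]
    pieces.append([cls_id] + chunk + [sep_id])

print(num_pieces, [len(p) for p in pieces])  # 3 pieces: 512, 512, 182
```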
+ A BERT sequence has the following format: + single sequence: [CLS] X [SEP] + pair of sequences: [CLS] A [SEP] B [SEP] + """ + cls = [self.cls_token_id] + sep = [self.sep_token_id] + if token_ids_1 is None: + if len(token_ids_0) < 510: + return cls + token_ids_0 + sep + else: + output = [] + num_pieces = int(len(token_ids_0)//510) + 1 + for i in range(num_pieces): + output.extend(cls + token_ids_0[510*i:min(len(token_ids_0), 510*(i+1))] + sep) + return output + return cls + token_ids_0 + sep + token_ids_1 + sep + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + already_has_special_tokens: (default False) Set to True if the token list is already formated with + special tokens for the model + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + if len(token_ids_0) < 510: + return [1] + ([0] * len(token_ids_0)) + [1] + else: + output = [] + num_pieces = int(len(token_ids_0)//510) + 1 + for i in range(num_pieces): + output.extend([1] + ([0] * (min(len(token_ids_0), 510*(i+1))-510*i)) + [1]) + return output + return [1] + ([0] * len(token_ids_0)) + [1] + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. + A BERT sequence pair mask has the following format: + 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence + + if token_ids_1 is None, only returns the first portion of the mask (0's). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + if len(token_ids_0) < 510: + return len(cls + token_ids_0 + sep) * [0] + else: + num_pieces = int(len(token_ids_0)//510) + 1 + return (len(cls + token_ids_0 + sep) + 2*(num_pieces-1)) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + def save_vocabulary(self, vocab_path, filename_prefix=None): + """Save the tokenizer vocabulary to a directory or file.""" + index = 0 + if os.path.isdir(vocab_path): + vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) + else: + vocab_file = vocab_path + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + "Saving vocabulary to {}: vocabulary indices are not consecutive." 
+ " Please check that the vocabulary is not corrupted!".format(vocab_file) + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=False, never_split=None, tokenize_chinese_chars=True): + """ Constructs a BasicTokenizer. + + Args: + **do_lower_case**: Whether to lower case the input. + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. + Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) + List of token not to split. + **tokenize_chinese_chars**: (`optional`) boolean (default True) + Whether to tokenize Chinese characters. + This should likely be deactivated for Japanese: + see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 + """ + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = never_split + self.tokenize_chinese_chars = tokenize_chinese_chars + + def tokenize(self, text, never_split=None): + """ Basic Tokenization of a piece of text. + Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. + + Args: + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. + Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) + List of token not to split. + """ + never_split = self.never_split + (never_split if never_split is not None else []) + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
+ orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
+    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith("P"):
+        return True
+    return False
diff --git a/tokenize_dnabert2.py b/tokenize_dnabert2.py
new file mode 100644
index 0000000000000000000000000000000000000000..081fc3b6cda8c66524d39d7796d3dc2253f6832d
--- /dev/null
+++ b/tokenize_dnabert2.py
@@ -0,0 +1,136 @@
+import argparse
+import json
+import logging
+import os
+from os.path import join
+
+import pandas as pd
+import transformers
+
+
+def main():
+
+    model_name_or_path = 'zhihan1996/DNABERT-2-117M'
+    cache_dir = '/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/apps/transformers_cache'
+
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        model_name_or_path,
+        cache_dir=cache_dir,
+        trust_remote_code=True,
+    )
+
+    for folder in os.listdir(args.data_dir):
+        if not folder.startswith('.'):
+            for f in os.listdir(os.path.join(args.data_dir, folder)):
+                if not f.startswith('.'):
+
+                    for name in ['test', 'dev', 'train']:
+                        data = join(args.data_dir, folder, f, name + '.csv')
+
+                        if not os.path.exists(data):
+                            print(f"File {data} does not exist, skipping...")
+                            continue
+
+                        df = pd.read_csv(data)
+                        print('Processing ' + folder + ' ' + f)
+                        df_tokenized = []
+
+                        if args.only_positive:
+                            for i in range(len(df['sequence'])):
+                                if df['label'][i] == 1:
+                                    seg = df['sequence'][i]
+                                    output = tokenizer.encode_plus(seg, return_tensors="pt")
+                                    df_tokenized.append(output['input_ids'].cpu())
+
+                            df_ = [" ".join(str(token.item()) for token in line.squeeze()) for line in df_tokenized]
+                            f_ = join(args.data_dir, folder, f, name + '_DNAbert2_only_POS.json')
+                            with open(f_, 'w') as file:
+                                logging.warning(f"Saving tokenized results to {f_}...")
+                                json.dump(df_, file)
+
+                        else:
+                            for i in range(len(df['sequence'])):
+                                seg = df['sequence'][i]
+                                output = tokenizer.encode_plus(seg, return_tensors="pt")
+                                df_tokenized.append(output['input_ids'].cpu())
+
+                            df_ = [" ".join(str(token.item()) for token in line.squeeze()) for line in df_tokenized]
+                            f_ = join(args.data_dir, folder, f, name + '_DNAbert2.json')
+                            with open(f_, 'w') as file:
+                                logging.warning(f"Saving tokenized results to {f_}...")
+                                json.dump(df_, file)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--data_dir", type=str, required=True)
+    parser.add_argument("--only_positive", action="store_true")
+
+    args = parser.parse_args()
+
+    main()
diff --git a/tokenize_nt.py b/tokenize_nt.py
new file mode 100644
index 0000000000000000000000000000000000000000..82b8f4a89ee9d1779b0c38d2e498b9de74d7a56f
--- /dev/null
+++ b/tokenize_nt.py
@@ -0,0 +1,99 @@
+import argparse
+import json
+import logging
+import os
+import sys
+from os.path import join
+
+import pandas as pd
+
+module_path = "/storage1/fs1/yeli/Active/xiaoxiao.zhou/projects/foundation/nucleotide-transformer"
+if module_path not in sys.path:
+    sys.path.append(module_path)
+
+# import haiku as hk
+# import jax
+# import jax.numpy as jnp
+# from nucleotide_transformer.pretrained import get_pretrained_model
+
+from transformers import AutoTokenizer
+
+
+def main():
+
+    cache_dir = '/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/apps/transformers_cache'
+
+    # Only the tokenizer is needed here; the 500M masked-LM weights are not
+    # loaded, since this script never runs the model.
+    tokenizer = AutoTokenizer.from_pretrained(
+        "InstaDeepAI/nucleotide-transformer-500m-human-ref", cache_dir=cache_dir
+    )
+
+    for folder in os.listdir(args.data_dir):
+        if not folder.startswith('.'):
+            for f in os.listdir(os.path.join(args.data_dir, folder)):
+                if not f.startswith('.'):
+
+                    for name in ['test', 'dev', 'train']:
+                        data = join(args.data_dir, folder, f, name + '.csv')
+
+                        if not os.path.exists(data):
+                            print(f"File {data} does not exist, skipping...")
+                            continue
+
+                        df = pd.read_csv(data, sep='\t')
+                        print('Processing ' + folder + ' ' + f)
+                        df_tokenized = []
+
+                        if args.only_positive:
+                            for i in range(len(df['sequence'])):
+                                if df['label'][i] == 1:
+                                    seg = df['sequence'][i]
+                                    output = tokenizer.encode_plus(seg, return_tensors="pt")
+                                    df_tokenized.append(output['input_ids'].cpu())
+
+                            df_ = [" ".join(str(token.item()) for token in line.squeeze()) for line in df_tokenized]
+                            f_ = join(args.data_dir, folder, f, name + '_NT_only_POS.json')
+                            with open(f_, 'w') as file:
+                                logging.warning(f"Saving tokenized results to {f_}...")
+                                json.dump(df_, file)
+
+                        else:
+                            for i in range(len(df['sequence'])):
+                                seg = df['sequence'][i]
+                                output = tokenizer.encode_plus(seg, return_tensors="pt")
+                                df_tokenized.append(output['input_ids'].cpu())
+
+                            df_ = [" ".join(str(token.item()) for token in line.squeeze()) for line in df_tokenized]
+                            f_ = join(args.data_dir, folder, f, name + '_NT.json')
+                            with open(f_, 'w') as file:
+                                logging.warning(f"Saving tokenized results to {f_}...")
+                                json.dump(df_, file)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--data_dir", type=str, required=True)
+    parser.add_argument("--only_positive", action="store_true")
+
+    args = parser.parse_args()
+
+    main()
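The JSON files written by the two scripts above store each sequence as one space-joined string of input ids (including the special tokens). A round-trip sketch for loading one back; the file name is an example:

```python
import json
from transformers import AutoTokenizer

# Each entry in a *_DNAbert2.json / *_NT.json file is a space-joined id string.
tokenizer = AutoTokenizer.from_pretrained(
    "zhihan1996/DNABERT-2-117M", trust_remote_code=True
)

with open("train_DNAbert2.json") as fh:   # example path
    encoded = json.load(fh)

ids = [int(tok) for tok in encoded[0].split()]
print(tokenizer.convert_ids_to_tokens(ids))
```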
diff --git a/tokenize_v4.py b/tokenize_v4.py
new file mode 100644
index 0000000000000000000000000000000000000000..c116cf01a43700d1bd346813e830cdc245dd3a3c
--- /dev/null
+++ b/tokenize_v4.py
@@ -0,0 +1,210 @@
+import argparse
+import collections
+import itertools
+import json
+import os
+import random
+from os.path import join
+
+import pandas as pd
+
+# Vocabulary configuration for the env-var-driven tokenizer modules.
+os.environ['VOCAB_PATH'] = '/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/tokenizer_v4/hg38/vocab_dedup.txt'
+os.environ['VOCAB_NAME'] = 'vocab_dedup.txt'
+os.environ['POSITIONAL_EMBEDDINGS_SIZE'] = '512'
+
+
+def tokenize_seq(seg, vocabs, maxlen, motif_hardcoded_sorted, motif_wildcarded_sorted, motif_variations_sorted, k3, k1):
+
+    i = 0  # start position
+    tokens = []
+    coordinates = []
+    names = []
+
+    while i < len(seg):
+        t = []
+        for l in range(maxlen, 0, -1):
+            if seg[i:i+l] in motif_hardcoded_sorted:
+                t = [seg[i:i+l]]
+            elif seg[i:i+l] in motif_wildcarded_sorted:
+                t = motif_wildcarded_sorted[seg[i:i+l]]
+            elif seg[i:i+l] in motif_variations_sorted:
+                t = motif_variations_sorted[seg[i:i+l]]
+            elif seg[i:i+l] in k3:
+                t = [seg[i:i+l]]
+            elif seg[i:i+l] in k1:
+                t = [seg[i:i+l]]
+
+            if t:
+                if len(t) > 1:
+                    # min_length = min(len(item.split()) for item in t)
+                    # filtered_list = [item for item in data if len(item.split()) == min_length]
+                    # random_choice = random.choice(filtered_list)
+                    random_choice = random.choice(t)
+                    tokens.append(random_choice)
+                    # names.append(lookup_table[random_choice.split()[0]])
+                else:
+                    tokens.append(t[0])
+                    # names.append(lookup_table.get(t[0].split()[0], '-'))
+
+                # coordinate = chrmname + ':' + str(start_position + i) + '-' + str(min(start_position + i + l, start_position + len(seg)))
+                # coordinates.append(coordinate)
+                i = i + l
+                break
+        else:
+            # no match at any window length (e.g. an unexpected character);
+            # emit the raw character so the scan always advances
+            tokens.append(seg[i])
+            i += 1
+
+    return tokens, coordinates, names
+
+
+def main():
+
+    motif_hardcoded = pd.read_csv(join(args.tokenizer_dir, 'motifs_hardcode.txt'), header=None, names=['column'])
+    # tokenize_seq tests membership, so keep the motif strings in a set;
+    # testing `in` against the DataFrame itself would only check column labels
+    motif_hardcoded_sorted = set(motif_hardcoded['column'])
+
+    # uniq wildcarded motifs
+    motif_wildcarded = collections.defaultdict(list)
+    with open(join(args.tokenizer_dir, "motifs_wildcard.txt"), "r") as file:
+        for line in file:
+            seq, operations = line.strip().split(maxsplit=1)  # split only on the first space
+            motif_wildcarded[seq].append(operations)  # store in dictionary
+    motif_wildcarded_sorted = {k: motif_wildcarded[k] for k in sorted(motif_wildcarded.keys(), key=len, reverse=True)}
+
+    # uniq motif variations
+    motif_variations = collections.defaultdict(list)
+    with open(join(args.tokenizer_dir, "motifs_variations.txt"), "r") as file:
+        for line in file:
+            seq, operations = line.strip().split(maxsplit=1)  # split only on the first space
+            motif_variations[seq].append(operations)  # store in dictionary
+    motif_variations_sorted = {k: motif_variations[k] for k in sorted(motif_variations.keys(), key=len, reverse=True)}
+
+    k1 = ['A', 'T', 'C', 'G', 'N']
+    # 3-mers
+    combinations = list(itertools.product(['A', 'T', 'C', 'G'], repeat=3))
+    k3 = [''.join(term) for term in combinations]
+
+    lookup_table = {}
+    with open(join(args.tokenizer_dir, "motifs_dedup.txt"), "r") as file:
+        for line in file:
+            segment, name = line.strip().split(maxsplit=1)  # split only on the first space
+            lookup_table[segment] = name
+
+    for folder in os.listdir(args.data_dir):
+        if not folder.startswith('.'):
+            for f in ['test', 'dev', 'train']:
+                data = join(args.data_dir, folder, 'split', f + '.csv')
+                print('process file: ' + data)
+
+                if not os.path.exists(data):
+                    print(f"File {data} does not exist, skipping...")
+                    continue
+
+                df = pd.read_csv(data, sep='\t')
+                print('Processing ' + folder + ' ' + f)
+                df_tokenized = []
+
+                if args.only_positive:
+                    for i in range(len(df['sequence'])):
+                        # data_split.py writes the label column as 'labels'
+                        if df['labels'][i] == 1:
+                            seg = df['sequence'][i]
+                            t, _, _ = tokenize_seq(seg, args.tokenizer_dir, 12, motif_hardcoded_sorted, motif_wildcarded_sorted, motif_variations_sorted, k3, k1)
+                            df_tokenized.append(t)
+
+                    df_ = [" ".join(line) for line in df_tokenized]
+                    f_ = join(args.data_dir, folder, 'split', f + '_token_v4_only_POS.json')
+                    with open(f_, 'w') as file:
+                        # logging.warning(f"Saving tokenized results to {f_}...")
+                        json.dump(df_, file)
+
+                else:
+                    for i in range(len(df['sequence'])):
+                        seg = df['sequence'][i]
+                        t, _, _ = tokenize_seq(seg, args.tokenizer_dir, 12, motif_hardcoded_sorted, motif_wildcarded_sorted, motif_variations_sorted, k3, k1)
+                        df_tokenized.append(t)
+
+                    df_ = [" ".join(line) for line in df_tokenized]
+                    f_ = join(args.data_dir, folder, 'split', f + '_token_v4.json')
+                    with open(f_, 'w') as file:
+                        # logging.warning(f"Saving tokenized results to {f_}...")
+                        json.dump(df_, file)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--tokenizer_dir", type=str, required=True)
+    parser.add_argument("--data_dir", type=str, required=True)
+    parser.add_argument("--only_positive", action="store_true")
+
+    args = parser.parse_args()
+
+    main()
+
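tokenize_v4.py commits to the longest match at each position, which can shadow a motif that starts one or two bases later; that limitation motivates the offset search in tokenize_v5_1.py below. A self-contained toy version of the longest-match pass, with a made-up vocabulary:

```python
import itertools

motifs = {"GATTACA", "TATAAA"}  # made-up stand-ins for the hardcoded motif list
k3 = {"".join(p) for p in itertools.product("ATCG", repeat=3)}
k1 = set("ATCGN")

def greedy_tokenize(seq, maxlen=12):
    """Longest-match-first pass, mirroring tokenize_seq in tokenize_v4.py."""
    i, tokens = 0, []
    while i < len(seq):
        for l in range(min(maxlen, len(seq) - i), 0, -1):
            piece = seq[i : i + l]
            if piece in motifs or piece in k3 or piece in k1:
                tokens.append(piece)
                i += l
                break
    return tokens

print(greedy_tokenize("GATTACAGGTATAAAT"))
# ['GATTACA', 'GGT', 'ATA', 'AAT'] -- the TATAAA motif at offset 9 is shadowed
```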
+import argparse +import glob +import logging +import os +import pickle +import random +import re +import shutil +from typing import Dict, List, Tuple +from copy import deepcopy +from multiprocessing import Pool +import sys +import importlib +from pathlib import Path + +import numpy as np +import torch +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler +from torch.utils.data.distributed import DistributedSampler +from tqdm import tqdm, trange +import collections +import itertools +import json + +from transformers import ( + WEIGHTS_NAME, + AdamW, + BertConfig, + BertForMaskedLM, + BertTokenizer, + DNATokenizer, + myTokenizer, + MotifTokenizer, + CamembertConfig, + CamembertForMaskedLM, + CamembertTokenizer, + DistilBertConfig, + DistilBertForMaskedLM, + DistilBertTokenizer, + GPT2Config, + GPT2LMHeadModel, + GPT2Tokenizer, + OpenAIGPTConfig, + OpenAIGPTLMHeadModel, + OpenAIGPTTokenizer, + PreTrainedModel, + PreTrainedTokenizer, + RobertaConfig, + RobertaForMaskedLM, + RobertaTokenizer, + get_linear_schedule_with_warmup, +) + + +MODEL_CLASSES = { + "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer), + "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), + "dna": (BertConfig, BertForMaskedLM, DNATokenizer), + "bert": (BertConfig, BertForMaskedLM, BertTokenizer), + "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer), + "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer), + "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer), + "myBert": (BertConfig, BertForMaskedLM, myTokenizer), + "motifBert": (BertConfig, BertForMaskedLM, MotifTokenizer) +} + +MASK_LIST = { + "3mer_stride1": [-1, 1], + "3mer_stride3": [0], + "6mer_stride1": [-2, -1, 1, 2, 3], + "6mer_stride6": [0], + "motif": [0] +} + +# Setting environment variables +os.environ['VOCAB_PATH'] = '/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/tokenizer_v5.1/hg38_NOOP/vocab_dedup.txt' +os.environ['VOCAB_NAME'] = 'vocab_dedup.txt' +os.environ['POSITIONAL_EMBEDDINGS_SIZE'] = '512' + +class TrieNode: + def __init__(self): + self.children = {} + self.is_end_of_word = False + self.features = [] + + +class Trie: + def __init__(self): + self.root = TrieNode() + self.lookup_table = {} + def insert(self, word, features = None): + current_node = self.root + for char in word: + if char not in current_node.children: + current_node.children[char] = TrieNode() + current_node = current_node.children[char] + current_node.is_end_of_word = True + if features: + current_node.features.append(features) + def print_trie(self, node=None, prefix="", level=0): + if node is None: + node = self.root + for char, child_node in node.children.items(): + print(" " * level + "'{}'{}".format(char, " (end)" if child_node.is_end_of_word else "")) + self.print_trie(child_node, prefix + char, level + 1) + def search(self, word): + current_node = self.root + for char in word: + if char not in current_node.children: + return False # Word not found + current_node = current_node.children[char] + if current_node.is_end_of_word: + if len(current_node.features) > 0: + return current_node.features + else: + return True + return False # Word not found + +def load_trie_from_file(filename): + with open(filename, 'rb') as file: + return pickle.load(file) + +def load_tokenizer5_1(): + config_class, model_class, tokenizer_class = MODEL_CLASSES['motifBert'] + tokenizer = tokenizer_class.from_pretrained('motif', 
+def load_tokenizer5_1():
+    config_class, model_class, tokenizer_class = MODEL_CLASSES['motifBert']
+    tokenizer = tokenizer_class.from_pretrained('motif', cache_dir=None)
+
+    bases = ['A', 'T', 'C', 'G']
+
+    token_wc = [
+        f"{operator}_POS_{i}_*_{char}"
+        for operator, i, char in itertools.product(['WC'], range(12), bases)
+    ]
+
+    motif_wildcarded = []
+    with open(os.path.join('/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/tokenizer_v5.1/hg38_NOOP', "motifs_wildcard.txt"), "r") as file:
+        for line in file:
+            seq, operations = line.strip().split(maxsplit=1)  # split only on the first space
+            motif_wildcarded.append(operations.split()[0])  # keep the wildcarded motif token itself
+
+    tokenizer.add_tokens(token_wc + motif_wildcarded)
+    return tokenizer
+
+def tokenize(seg, i, maxlen, motif_hardcoded_trie, motif_wildcarded_trie, k3, k1, lookup_table):
+    '''
+    Parameters:
+        seg: a sequence chunk from the chromosome
+        i: the start position in this segment
+        maxlen: the longest span considered when matching a motif; should be the length of the longest word in the vocabulary
+
+    rule:
+        hardcoded motif > wildcarded motif > motif + operation
+
+    score design rule:
+        reward the length of the underlying sequence (rather than the motif token itself, since it measures how much sequence the token covers)
+        penalize the number of wildcards (positions with high uncertainty)
+        penalize mutation operations
+    '''
+
+    score = 0
+    t = []
+
+    best_token = None
+    best_score = -float('inf')
+
+    for l in range(maxlen, 3, -1):
+
+        segment = seg[i:i+l]
+
+        if motif_hardcoded_trie.search(segment):
+
+            t = [segment]
+            score = 1 * l
+            best_token, best_score = max([(best_token, best_score), (t, score)], key=lambda x: x[1])
+
+        if motif_wildcarded_trie.search(segment):
+
+            t = [random.choice(motif_wildcarded_trie.search(segment))]  # random.choice returns a bare string, so wrap it in a list
+            wd = len(t[0].split()) - 1  # the number of wildcards
+            score = 1 * l - np.exp(wd / l)  # fewer wildcards means a smaller penalty
+            best_token, best_score = max([(best_token, best_score), (t, score)], key=lambda x: x[1])
+
+    # if no motif is found, fall back to 3-mers, then 1-mers
+    if best_token is None:
+
+        for l in range(3, 0, -1):
+
+            segment = seg[i:i+l]
+
+            if segment in k3:
+                best_token = [segment]
+                best_score = 3
+                break
+
+            if segment in k1:
+                best_token = [segment]
+                best_score = 1
+
+    name = lookup_table.get(best_token[0].split()[0], '-')  # '-' is the name given to non-motif tokens
+    next_pos = i + len(best_token[0].split()[0])
+
+    return best_token[0], name, best_score, next_pos
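+
+# Worked example of the scoring rule (values illustrative): a hardcoded motif match of
+# length 8 scores 1 * 8 = 8, while a wildcarded match of the same length with 2
+# wildcards scores 8 - exp(2 / 8) ~= 6.72, so the exact motif wins when both match.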
+def tokenize_seq(seg, vocab_path, maxlen, motif_hardcoded_trie, motif_wildcarded_trie, k1, k3, lookup_table):
+
+    i = 0  # start position
+    tokens = []
+    names = []
+    coordinates = []
+
+    while i < len(seg):
+
+        best_token, best_name, best_score, next_pos = tokenize(seg, i, maxlen, motif_hardcoded_trie, motif_wildcarded_trie, k3, k1, lookup_table)
+        best_i = i
+
+        _curr_token = best_token
+        offsets = []
+
+        if len(_curr_token) > 1:  # tokens have length 1, 3, or >= 5, never 2
+            # Shifting right only makes sense when the current token is not a 1-mer;
+            # otherwise it is equivalent to starting tokenization from the next position.
+            offsets = [1, 2]
+
+        if offsets:
+            for shift in offsets:
+                i_shifted = i + shift
+                if i_shifted < len(seg):
+                    token_, name_, score_, next_pos_ = tokenize(seg, i_shifted, maxlen, motif_hardcoded_trie, motif_wildcarded_trie, k3, k1, lookup_table)
+                    best_token, best_name, best_i, next_pos, best_score = max([(best_token, best_name, best_i, next_pos, best_score), (token_, name_, i_shifted, next_pos_, score_)], key=lambda x: x[4])
+
+        for skip in range(best_i - i):
+            tokens.append(seg[i + skip])
+            # names.append('-')
+            # coordinates.append(chrmname + ':' + str(start_position + i + skip) + '-' + str(start_position + i + skip + 1))
+
+        # coordinate = chrmname + ':' + str(start_position + best_i) + '-' + str(min(start_position + next_pos, start_position + len(seg)))
+        tokens.append(best_token)
+        # names.append(best_name)
+        # coordinates.append(coordinate)
+
+        i = next_pos
+
+    return tokens, coordinates, names
+
+
+def main():
+
+    # load vocabs
+    motif_hardcoded_trie = load_trie_from_file(join(args.tokenizer_dir, 'motifs_hardcode_trie.pkl'))
+    motif_wildcarded_trie = load_trie_from_file(join(args.tokenizer_dir, 'motifs_wildcard_trie.pkl'))
+
+    k1 = ['A', 'T', 'C', 'G', 'N']
+    # all 3-mers
+    combinations = list(itertools.product(['A', 'T', 'C', 'G'], repeat=3))
+    k3 = [''.join(term) for term in combinations]
+
+    lookup_table = {}
+    with open(join(args.tokenizer_dir, "motifs_dedup.txt"), "r") as file:
+        for line in file:
+            segment, name = line.strip().split(maxsplit=1)  # split only on the first space
+            lookup_table[segment] = name
+
+    for folder in os.listdir(args.data_dir):
+        if not folder.startswith('.'):
+            for f in ['test', 'dev', 'train']:
+                data = join(args.data_dir, folder, 'split', f + '.csv')
+
+                if not os.path.exists(data):
+                    print(f"File {data} does not exist, skipping...")
+                    continue
+
+                df = pd.read_csv(data, sep='\t')
+                print('Processing ' + folder + ' ' + f)
+                df_tokenized = []
+
+                if args.only_positive:
+                    for i in range(len(df['sequence'])):
+                        if df['label'][i] == 1:
+                            seg = df['sequence'][i]
+                            t, _, _ = tokenize_seq(seg, args.tokenizer_dir, 12, motif_hardcoded_trie, motif_wildcarded_trie, k1, k3, lookup_table)
+                            df_tokenized.append(t)
+
+                    df_ = [" ".join(line) for line in df_tokenized]
+                    f_ = join(args.data_dir, folder, 'split', f + '_token_v5_1_only_POS.json')
+                    with open(f_, 'w') as file:
+                        # logging.warning(f"Saving tokenized results to {f_}...")
+                        json.dump(df_, file)
+
+                else:
+                    for i in range(len(df['sequence'])):
+                        seg = df['sequence'][i]
+                        t, _, _ = tokenize_seq(seg, args.tokenizer_dir, 12, motif_hardcoded_trie, motif_wildcarded_trie, k1, k3, lookup_table)
+                        df_tokenized.append(t)
+
+                    df_ = [" ".join(line) for line in df_tokenized]
+                    f_ = join(args.data_dir, folder, 'split', f + '_token_v5_1.json')
+                    with open(f_, 'w') as file:
+                        # logging.warning(f"Saving tokenized results to {f_}...")
+                        json.dump(df_, file)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--tokenizer_dir", type=str, required=True)
+    parser.add_argument("--data_dir", type=str, required=True)
+    parser.add_argument("--only_positive", action="store_true")
+    args = parser.parse_args()
+
+    main()
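+
+# Output contract (sketch): for each <split>.csv this writes <split>_token_v5_1.json
+# (or <split>_token_v5_1_only_POS.json with --only_positive), a JSON list of
+# space-joined token strings. train.py's load_customized_data() finds the default
+# variant by replacing ".csv" with "_token_v5_1.json" when run with
+# --customized_tokenizer token_v5_1.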
diff --git a/train.py b/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..4af65611876654410775343018c43b9dcd3fadc5
--- /dev/null
+++ b/train.py
@@ -0,0 +1,473 @@
+import wandb
+# Read the API key from the environment (WANDB_API_KEY) instead of committing raw
+# keys to the repo; wandb.login() falls back to the env var / netrc automatically.
+wandb.login()
+import os
+import sys
+# Stub out triton so transformers / flash-attn skip their optional triton imports.
+os.environ["DISABLE_TRITON"] = "1"
+sys.modules['triton'] = None
+sys.modules['flash_attn_triton'] = None
+
+
+import csv
+import copy
+import json
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Optional, Dict, Sequence, Tuple, List, Union
+
+import torch
+import transformers
+import sklearn
+import numpy as np
+from torch.utils.data import Dataset
+import importlib
+from pathlib import Path
+import itertools
+
+from transformers import BertConfig, BertForSequenceClassification
+
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    BertConfig,
+    BertForMaskedLM,
+    BertTokenizer,
+    CamembertConfig,
+    CamembertForMaskedLM,
+    CamembertTokenizer,
+    DistilBertConfig,
+    DistilBertForMaskedLM,
+    DistilBertTokenizer,
+    GPT2Config,
+    GPT2LMHeadModel,
+    GPT2Tokenizer,
+    OpenAIGPTConfig,
+    OpenAIGPTLMHeadModel,
+    OpenAIGPTTokenizer,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+    RobertaConfig,
+    RobertaForMaskedLM,
+    RobertaTokenizer,
+    get_linear_schedule_with_warmup,
+)
+
+from tokenization_motif import MotifTokenizer
+from tokenization_dna import DNATokenizer
+
+
+MODEL_CLASSES = {
+    "dna": (BertConfig, BertForMaskedLM, DNATokenizer),
+    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
+    "motifBert": (BertConfig, BertForMaskedLM, MotifTokenizer)
+}
+
+@dataclass
+class ModelArguments:
+    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
+    trust_remote_code: bool = field(default=False, metadata={"help": "whether to trust and execute custom code shipped with a model (custom architectures, tokenizers, or modeling files), whether local or from the Hub"})
+    use_lora: bool = field(default=False, metadata={"help": "whether to use LoRA"})
+    lora_r: int = field(default=8, metadata={"help": "hidden dimension for LoRA"})
+    lora_alpha: int = field(default=32, metadata={"help": "alpha for LoRA"})
+    lora_dropout: float = field(default=0.05, metadata={"help": "dropout rate for LoRA"})
+    lora_target_modules: str = field(default="query,value", metadata={"help": "where to apply LoRA"})
+    tokenizer_path: Optional[str] = field(default="facebook/opt-125m")
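+
+# Example flag combinations (illustrative): a plain BPE run uses --kmer -1 with
+# --tokenizer_path pointing at a tokenizer file; motif runs additionally pass
+# --customized_tokenizer token_v4 or token_v5_1; LoRA runs add --use_lora and,
+# optionally, --lora_r / --lora_alpha / --lora_target_modules query,value.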
+
+@dataclass
+class DataArguments:
+    data_path: str = field(default=None, metadata={"help": "Path to the training data."})
+    kmer: int = field(default=-1, metadata={"help": "k-mer for input sequence. -1 means not using k-mer."})
+    customized_tokenizer: Optional[str] = field(default=None)
+
+
+@dataclass
+class TrainingArguments(transformers.TrainingArguments):
+    vocab_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "Path to custom vocabulary file (overrides Hugging Face default)"}
+    )
+    cache_dir: Optional[str] = field(default=None)
+    run_name: str = field(default="run")
+    optim: str = field(default="adamw_torch")
+    model_max_length: int = field(default=512, metadata={"help": "Maximum sequence length."})
+    gradient_accumulation_steps: int = field(default=1)
+    per_device_train_batch_size: int = field(default=1)
+    per_device_eval_batch_size: int = field(default=1)
+    num_train_epochs: int = field(default=1)
+    fp16: bool = field(default=False)
+    logging_steps: int = field(default=100)
+    save_steps: int = field(default=100)
+    eval_steps: int = field(default=100)
+    evaluation_strategy: str = field(default="steps")
+    warmup_steps: int = field(default=50)
+    weight_decay: float = field(default=0.01)
+    learning_rate: float = field(default=1e-4)
+    save_total_limit: int = field(default=3)
+    load_best_model_at_end: bool = field(default=False)
+    output_dir: str = field(default="output")
+    find_unused_parameters: bool = field(default=False)
+    checkpointing: bool = field(default=False)
+    dataloader_pin_memory: bool = field(default=False)
+    eval_and_save_results: bool = field(default=True)
+    save_model: bool = field(default=False)
+    seed: int = field(default=42)
+    project_name: Optional[str] = field(default=None)
+
+
+def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
+    """Collects the state dict and dumps it to disk."""
+    state_dict = trainer.model.state_dict()
+    if trainer.args.should_save:
+        cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
+        del state_dict
+        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
+
+
+"""
+Get the complement of the original DNA sequence (the reverse-complement variant is commented out below).
+"""
+def get_alter_of_dna_sequence(sequence: str):
+    MAP = {"A": "T", "T": "A", "C": "G", "G": "C"}
+    # return "".join([MAP[c] for c in reversed(sequence)])  # reverse complement
+    return "".join([MAP[c] for c in sequence])
+
+"""
+Transform a DNA sequence into a k-mer string.
+"""
+def generate_kmer_str(sequence: str, k: int) -> str:
+    """Generate k-mer string from DNA sequence."""
+    return " ".join([sequence[i:i+k] for i in range(len(sequence) - k + 1)])
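+
+# e.g. generate_kmer_str("ATCGA", 3) -> "ATC TCG CGA" (stride-1 sliding window).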
+
+"""
+Load or generate the k-mer string for each DNA sequence. The generated k-mer strings are saved
+to the same directory as the original data, with the same name but a "_{k}mer" suffix.
+"""
+def load_or_generate_kmer(data_path: str, texts: List[str], k: int) -> List[str]:
+    """Load or generate k-mer string for each DNA sequence."""
+    kmer_path = data_path.replace(".csv", f"_{k}mer.json")
+    if os.path.exists(kmer_path):
+        logging.warning(f"Loading k-mer from {kmer_path}...")
+        with open(kmer_path, "r") as f:
+            kmer = json.load(f)
+    else:
+        logging.warning(f"Generating k-mer...")
+        kmer = [generate_kmer_str(text, k) for text in texts]
+        with open(kmer_path, "w") as f:
+            logging.warning(f"Saving k-mer to {kmer_path}...")
+            json.dump(kmer, f)
+
+    return kmer
+
+def load_customized_data(data_path: str, texts: List[str], customized_tokenizer: str) -> List[str]:
+    """Load pre-tokenized sequences produced by a customized tokenizer."""
+    customize_path = data_path.replace(".csv", f"_{customized_tokenizer}.json")
+    print(customize_path)
+    if os.path.exists(customize_path):
+        logging.warning(f"Loading data by customized tokenizer from {customize_path}...")
+        with open(customize_path, "r") as f:
+            data = json.load(f)
+    else:
+        raise FileNotFoundError(f"Pre-tokenized file not found: {customize_path}")
+
+    return data
+
+
+class SupervisedDataset(Dataset):
+    """Dataset for supervised fine-tuning."""
+
+    def __init__(self,
+                 data_path: str,
+                 tokenizer: transformers.PreTrainedTokenizer,
+                 kmer: int = -1,
+                 customized_tokenizer=None):
+
+        super(SupervisedDataset, self).__init__()
+
+        # load data from the disk
+        with open(data_path, "r") as f:
+            data = list(csv.reader(f, delimiter='\t'))[1:]
+        if len(data[0]) == 2:
+            # data is in the format of [text, label]
+            logging.warning("Perform single sequence classification...")
+            texts = [d[0] for d in data]
+            label_set = sorted(set(d[1] for d in data))  # get unique labels
+            label2id = {label: idx for idx, label in enumerate(label_set)}  # map labels to integers
+            labels = [label2id[d[1]] for d in data]
+        elif len(data[0]) == 3:
+            # data is in the format of [text1, text2, label]
+            logging.warning("Perform sequence-pair classification...")
+            texts = [[d[0], d[1]] for d in data]
+            label_set = sorted(set(d[2] for d in data))  # get unique labels
+            label2id = {label: idx for idx, label in enumerate(label_set)}  # map labels to integers
+            labels = [label2id[d[2]] for d in data]
+        elif len(data[0]) == 5:
+            logging.warning("Perform single sequence classification on Genomic Benchmarks...")
+            texts = [d[4] for d in data]
+            label_set = sorted(set(d[0] for d in data))  # get unique labels
+            label2id = {label: idx for idx, label in enumerate(label_set)}  # map labels to integers
+            labels = [label2id[d[0]] for d in data]
+        else:
+            raise ValueError("Data format not supported.")
+
+        if kmer != -1:
+            logging.warning(f"Using {kmer}-mer as input...")
+            texts = load_or_generate_kmer(data_path, texts, kmer)
+
+        elif kmer == -1 and customized_tokenizer:
+            logging.warning(f"Using {customized_tokenizer} as input...")
+            texts = load_customized_data(data_path, texts, customized_tokenizer)
+
+        output = tokenizer(
+            texts,
+            return_tensors="pt",
+            padding="longest",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+        )
+        # print(texts, output["input_ids"])
+
+        self.input_ids = output["input_ids"]
+        self.attention_mask = output["attention_mask"]
+        self.labels = labels
+        self.num_labels = len(set(labels))
+
+    def __len__(self):
+        return len(self.input_ids)
+
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        return dict(input_ids=self.input_ids[i], labels=self.labels[i])
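+
+# Label mapping sketch: labels are read as strings and indexed in sorted order, so a
+# two-class split file with labels "0"/"1" yields label2id = {"0": 0, "1": 1}; string
+# class names would be mapped alphabetically the same way.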
"""Collate examples for supervised fine-tuning.""" + + tokenizer: transformers.PreTrainedTokenizer + + def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels")) + input_ids = torch.nn.utils.rnn.pad_sequence( + input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id + ) + labels = torch.Tensor(labels).long() + return dict( + input_ids=input_ids, + labels=labels, + attention_mask=input_ids.ne(self.tokenizer.pad_token_id), + ) + +""" +Manually calculate the accuracy, f1, matthews_correlation, precision, recall with sklearn. +""" +def calculate_metric_with_sklearn(predictions: np.ndarray, labels: np.ndarray): + valid_mask = labels != -100 # Exclude padding tokens (assuming -100 is the padding token ID) + valid_predictions = predictions[valid_mask] + valid_labels = labels[valid_mask] + return { + "accuracy": sklearn.metrics.accuracy_score(valid_labels, valid_predictions), + "f1": sklearn.metrics.f1_score( + valid_labels, valid_predictions, average="macro", zero_division=0 + ), + "matthews_correlation": sklearn.metrics.matthews_corrcoef( + valid_labels, valid_predictions + ), + "precision": sklearn.metrics.precision_score( + valid_labels, valid_predictions, average="macro", zero_division=0 + ), + "recall": sklearn.metrics.recall_score( + valid_labels, valid_predictions, average="macro", zero_division=0 + ), + } + +# from: https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/13 +def preprocess_logits_for_metrics(logits:Union[torch.Tensor, Tuple[torch.Tensor, Any]], _): + if isinstance(logits, tuple): # Unpack logits if it's a tuple + logits = logits[0] + + if logits.ndim == 3: + # Reshape logits to 2D if needed + logits = logits.reshape(-1, logits.shape[-1]) + + return torch.argmax(logits, dim=-1) + + +""" +Compute metrics used for huggingface trainer. 
+
+"""
+Compute metrics used by the Hugging Face trainer.
+"""
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    return calculate_metric_with_sklearn(predictions, labels)
+
+def load_token_v5_1(tokenizer_kwargs):
+    tokenizer = MotifTokenizer(**tokenizer_kwargs)
+
+    bases = ['A', 'T', 'C', 'G']
+
+    token_wc = [
+        f"{operator}_POS_{i}_*_{char}"
+        for operator, i, char in itertools.product(['WC'], range(12), bases)
+    ]
+
+    motif_wildcarded = []
+    with open(os.path.join('/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/tokenizer_v5.1/hg38_NOOP', "motifs_wildcard.txt"), "r") as file:
+        for line in file:
+            seq, operations = line.strip().split(maxsplit=1)  # split only on the first space
+            motif_wildcarded.append(operations.split()[0])  # keep the wildcarded motif token itself
+
+    tokenizer.add_tokens(token_wc + motif_wildcarded)
+    return tokenizer
+
+def load_token_v4(tokenizer_kwargs):
+    tokenizer = MotifTokenizer(**tokenizer_kwargs)
+
+    bases = ['A', 'T', 'C', 'G']
+    token_del = [
+        f"{operator}_POS_{i}_{char}"
+        for operator, i, char in itertools.product(['DEL'], range(12), bases)
+    ]
+    token_rep = [
+        f"{operator}_POS_{i}_{char1}_{char2}"
+        for operator, i, char1, char2 in itertools.product(['SUB'], range(12), bases, bases)
+        if char1 != char2
+    ]
+
+    token_wc = [
+        f"{operator}_POS_{i}_*_{char}"
+        for operator, i, char in itertools.product(['WC'], range(12), bases)
+    ]
+
+    token_ins = [
+        f"{operator}_POS_{i}_{char}"
+        for operator, i, char in itertools.product(['INS'], range(13), bases)
+    ]
+
+    motif_wildcarded = []
+    with open(os.path.join('/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/tokenizer_v4/hg38', "motifs_wildcard.txt"), "r") as file:
+        for line in file:
+            seq, operations = line.strip().split(maxsplit=1)  # split only on the first space
+            motif_wildcarded.append(operations.split()[0])  # keep the wildcarded motif token itself
+
+    tokenizer.add_tokens(token_del + token_rep + token_wc + token_ins + motif_wildcarded)
+    return tokenizer
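+
+# The added vocabulary spells out edit/wildcard tokens, e.g. "WC_POS_3_*_A" (wildcard
+# at motif position 3 resolved to A); token_v4 additionally enumerates
+# "DEL_POS_<i>_<base>", "SUB_POS_<i>_<from>_<to>" (from != to), and
+# "INS_POS_<i>_<base>" with insertion positions 0..12.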
"train.csv"), + kmer=data_args.kmer, + customized_tokenizer=data_args.customized_tokenizer) + val_dataset = SupervisedDataset(tokenizer=tokenizer, + data_path=os.path.join(data_args.data_path, "dev.csv"), + kmer=data_args.kmer, + customized_tokenizer=data_args.customized_tokenizer) + test_dataset = SupervisedDataset(tokenizer=tokenizer, + data_path=os.path.join(data_args.data_path, "test.csv"), + kmer=data_args.kmer, + customized_tokenizer=data_args.customized_tokenizer) + data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) + + + config = transformers.AutoConfig.from_pretrained( + model_args.model_name_or_path, + num_labels = train_dataset.num_labels, + trust_remote_code=model_args.trust_remote_code + ) + + model = transformers.AutoModelForSequenceClassification.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + config=config, # pass the adjusted config + trust_remote_code=model_args.trust_remote_code + ).to("cuda") + + # configure LoRA + if model_args.use_lora: + lora_config = LoraConfig( + r=model_args.lora_r, + lora_alpha=model_args.lora_alpha, + target_modules=list(model_args.lora_target_modules.split(",")), + lora_dropout=model_args.lora_dropout, + bias="none", + task_type="SEQ_CLS", + inference_mode=False, + ) + model = get_peft_model(model, lora_config) + model.print_trainable_parameters() + + # define trainer + trainer = transformers.Trainer(model=model, + tokenizer=tokenizer, + args=training_args, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + compute_metrics=compute_metrics, + train_dataset=train_dataset, + eval_dataset=val_dataset, + data_collator=data_collator) + trainer.train() + + if training_args.save_model: + trainer.save_state() + safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) + + # get the evaluation results from trainer + if training_args.eval_and_save_results: + results_path = os.path.join(training_args.output_dir, "results", training_args.run_name) + results = trainer.evaluate(eval_dataset=test_dataset) + os.makedirs(results_path, exist_ok=True) + with open(os.path.join(results_path, "eval_results.json"), "w") as f: + json.dump(results, f) + + + + +if __name__ == "__main__": + + train() \ No newline at end of file