aduncan94 committed on
Commit
4fd272a
·
verified ·
1 Parent(s): 6eb4aa7

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +35 -5
  2. tokenizer_config.json +51 -9
  3. tokenizers.py +52 -29
special_tokens_map.json CHANGED
@@ -1,7 +1,37 @@
1
  {
2
- "bos_token": "@",
3
- "eos_token": "*",
4
- "mask_token": "#",
5
- "pad_token": "!",
6
- "sep_token": "<SEP>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  }
 
1
  {
2
+ "bos_token": {
3
+ "content": "@",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "*",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "mask_token": {
17
+ "content": "#",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "pad_token": {
24
+ "content": "!",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "sep_token": {
31
+ "content": "/",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
  }
tokenizer_config.json CHANGED
@@ -1,16 +1,58 @@
1
  {
2
- "tokenizer_class": "DNATokenizer",
3
- "bos_token": "@",
4
- "eos_token": "*",
5
- "mask_token": "#",
6
- "pad_token": "!",
7
- "sep_token": "<SEP>",
8
- "model_max_length": 2048,
9
- "clean_up_tokenization_spaces": true,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  "auto_map": {
11
  "AutoTokenizer": [
12
  "tokenizers.DNATokenizer",
13
  null
14
  ]
15
- }
 
 
 
 
 
 
 
 
16
  }
 
1
  {
2
+ "added_tokens_decoder": {
3
+ "6": {
4
+ "content": "*",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "7": {
12
+ "content": "#",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "8": {
20
+ "content": "@",
21
+ "lstrip": false,
22
+ "normalized": true,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "9": {
28
+ "content": "!",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "10": {
36
+ "content": "/",
37
+ "lstrip": false,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
  "auto_map": {
45
  "AutoTokenizer": [
46
  "tokenizers.DNATokenizer",
47
  null
48
  ]
49
+ },
50
+ "bos_token": "@",
51
+ "clean_up_tokenization_spaces": true,
52
+ "eos_token": "*",
53
+ "mask_token": "#",
54
+ "model_max_length": 2048,
55
+ "pad_token": "!",
56
+ "sep_token": "/",
57
+ "tokenizer_class": "DNATokenizer"
58
  }
tokenizers.py CHANGED
@@ -2,46 +2,51 @@ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
2
  from typing import List, Optional, Union
3
  import os
4
 
5
- # Special tokens
6
  MASK = "#"
7
- PAD = "!"
8
- BOS = "@"
9
- EOS = "*"
 
 
 
10
  SEP = "/"
11
-
12
- # Generatable alphabet (only what the model should predict)
13
- BASE_ALPHABET = "ACGT/{}" # 7 tokens
 
14
 
15
  class DNATokenizer(PreTrainedTokenizer):
16
 
17
  def __init__(
18
  self,
19
- dna_alphabet: str = BASE_ALPHABET,
20
  model_max_length: int = 2048,
21
- pad_token=PAD,
22
  mask_token=MASK,
23
- bos_token=BOS,
24
- eos_token=EOS,
 
 
25
  sep_token=SEP,
26
  **kwargs
27
  ):
28
- # Build vocabulary
29
- self.alphabet = list(dna_alphabet)
 
 
 
 
30
  self.a_to_i = {u: i for i, u in enumerate(self.alphabet)}
31
  self.i_to_a = {i: u for i, u in enumerate(self.alphabet)}
 
32
 
33
- # Wrap special tokens
34
- # def ensure_added(token):
35
- # if isinstance(token, AddedToken):
36
- # return token
37
- # return AddedToken(token, lstrip=False, rstrip=False)
38
-
39
- # bos_token = ensure_added(bos_token)
40
- # eos_token = ensure_added(eos_token)
41
- # sep_token = ensure_added(sep_token)
42
- # mask_token = ensure_added(mask_token)
43
- # pad_token = ensure_added(pad_token)
44
-
45
 
46
  super().__init__(
47
  pad_token=pad_token,
@@ -56,6 +61,10 @@ class DNATokenizer(PreTrainedTokenizer):
56
  @property
57
  def vocab_size(self):
58
  return len(self.alphabet)
 
 
 
 
59
 
60
  def get_vocab(self):
61
  return self.a_to_i
@@ -73,9 +82,10 @@ class DNATokenizer(PreTrainedTokenizer):
73
  return "".join(tokens)
74
 
75
  def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
 
76
  if token_ids_1 is not None:
77
  raise NotImplementedError("This tokenizer does not support two sequences")
78
- return token_ids_0
79
 
80
  def get_special_tokens_mask(
81
  self,
@@ -89,15 +99,28 @@ class DNATokenizer(PreTrainedTokenizer):
89
  token_ids_1=token_ids_1,
90
  already_has_special_tokens=True,
91
  )
92
- return [0] * len(token_ids_0)
 
 
 
 
 
93
 
94
  def create_token_type_ids_from_sequences(
95
  self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
96
  ) -> List[int]:
 
 
 
 
 
 
97
  if token_ids_1 is not None:
98
  raise NotImplementedError("This tokenizer does not support two sequences")
99
- return [0] * len(token_ids_0)
100
 
 
 
 
101
  def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
102
- # No separate vocab file needed
103
  return ()
 
2
  from typing import List, Optional, Union
3
  import os
4
 
 
5
  MASK = "#"
6
+ MSA_PAD = "!"
7
+ UL_ALPHABET_PLUS = "GATCN-*#@!/[]{}"
8
+ MSA_AAS = "GATCN-"
9
+ GAP = "-"
10
+ START = "@"
11
+ STOP = "*"
12
  SEP = "/"
13
+ END_AL = "]"
14
+ END_UL = "}"
15
+ START_AL = "["
16
+ START_UL = "{"
17
 
18
  class DNATokenizer(PreTrainedTokenizer):
19
 
20
  def __init__(
21
  self,
22
+ dna_alphabet: str = UL_ALPHABET_PLUS,
23
  model_max_length: int = 2048,
24
+ pad_token=MSA_PAD,
25
  mask_token=MASK,
26
+ all_aas=MSA_AAS,
27
+ gap_token=GAP,
28
+ bos_token=START,
29
+ eos_token=STOP,
30
  sep_token=SEP,
31
  **kwargs
32
  ):
33
+ """Character tokenizer for Hugging Face transformers.
34
+
35
+ model_max_length (int): Model maximum sequence length.
36
+ """
37
+ self.alphabet = list("".join(dna_alphabet))
38
+ self.all_aas = list("".join(all_aas))
39
  self.a_to_i = {u: i for i, u in enumerate(self.alphabet)}
40
  self.i_to_a = {i: u for i, u in enumerate(self.alphabet)}
41
+ self.gap_token = gap_token
42
 
43
+
44
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
45
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
46
+ sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
47
+ mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token
48
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
49
+ gap_token = AddedToken(gap_token, lstrip=False, rstrip=False) if isinstance(gap_token, str) else gap_token
 
 
 
 
 
50
 
51
  super().__init__(
52
  pad_token=pad_token,
 
61
  @property
62
  def vocab_size(self):
63
  return len(self.alphabet)
64
+
65
+ @property
66
+ def gap_token_id(self):
67
+ return self.convert_tokens_to_ids(self.gap_token)
68
 
69
  def get_vocab(self):
70
  return self.a_to_i
 
82
  return "".join(tokens)
83
 
84
  def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
85
+ result = token_ids_0
86
  if token_ids_1 is not None:
87
  raise NotImplementedError("This tokenizer does not support two sequences")
88
+ return result
89
 
90
  def get_special_tokens_mask(
91
  self,
 
99
  token_ids_1=token_ids_1,
100
  already_has_special_tokens=True,
101
  )
102
+
103
+ result = [0] * len(token_ids_0)
104
+ if token_ids_1 is not None:
105
+ raise NotImplementedError("This tokenizer does not support two sequences")
106
+
107
+ return result
108
 
109
  def create_token_type_ids_from_sequences(
110
  self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
111
  ) -> List[int]:
112
+ """
113
+ Identifies the type of token. 0 for the first sentence, 1 for the second sentence if it exists
114
+ """
115
+
116
+ result = len(token_ids_0) * [0]
117
+
118
  if token_ids_1 is not None:
119
  raise NotImplementedError("This tokenizer does not support two sequences")
120
+ return result
121
 
122
+ def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
123
+ super().save_pretrained(save_directory, **kwargs)
124
+
125
  def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
 
126
  return ()