jwengr committed on
Commit 06b1bd7 · verified · 1 Parent(s): 5467ee5

Upload folder using huggingface_hub

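For context, this commit message is the default emitted when a local checkpoint directory is pushed to the Hub in one shot with huggingface_hub's upload_folder. A minimal sketch of that call, with the local path and repo id assumed for illustration:

    from huggingface_hub import HfApi

    api = HfApi()
    # Pushes every file under folder_path as a single commit; large binaries
    # such as the .safetensors shards are stored via Git LFS automatically.
    api.upload_folder(
        folder_path="./hangul-gemma-deobfuscator",   # local checkpoint dir (assumed)
        repo_id="jwengr/hangul-gemma-deobfuscator",  # target repo (assumed)
        commit_message="Upload folder using huggingface_hub",  # the default message
    )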
config.json CHANGED
@@ -9,5 +9,5 @@
   "base_model_name": "unsloth/gemma-2-2b",
   "model_type": "hangul_gemma_deobfuscator",
   "torch_dtype": "float32",
-  "transformers_version": "4.48.0"
+  "transformers_version": "4.50.2"
 }
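Only the transformers_version stamp changes, i.e. the config was re-saved under transformers 4.50.2. That field records which library version serialized the checkpoint, and it can be inspected without loading any weights; a small sketch, repo id assumed for illustration:

    from transformers import AutoConfig

    # trust_remote_code is required because "hangul_gemma_deobfuscator" is a
    # custom model_type defined by this repo, not a built-in architecture.
    config = AutoConfig.from_pretrained(
        "jwengr/hangul-gemma-deobfuscator",  # repo id assumed
        trust_remote_code=True,
    )
    print(config.transformers_version)  # "4.50.2" after this commit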
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:569a3cf79ae397c384d4f08733761954f5f98deccebf8b2b3addc5bea26c1f93
+oid sha256:5b212d008b1aaaafbe0dd51710c7466d1836577073d01510ceb5ab7bb3d1b19f
 size 4992576696
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0981c0ef8e86962ff1d407bf8f318e70875e9f5926dfdfb83c4e3fd3dba801fc
+oid sha256:a11268a65cf0ae9cdd5dab5310c09f3342ec1540f598a52b4bf86a7d199a4039
 size 4983444480
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ed2baf69f3921f01bf52751aa0a74018d299d08a5c3b7be41dcd5250ecf80dc2
+oid sha256:8b04ff95754a801b9c575840cd3808a298fc0321ad1afa1cd09871d1fb310951
 size 1104312040
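All three shard diffs follow the same pattern: the Git LFS pointer's sha256 oid changes while the byte size stays identical, consistent with the weights being re-serialized at the same shapes and dtype. A downloaded shard can be checked against its pointer with a streaming sha256, for example:

    import hashlib

    def sha256_of(path, chunk_size=1 << 20):
        # Stream in 1 MiB chunks so multi-GB shards never sit in memory at once.
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                h.update(chunk)
        return h.hexdigest()

    # oid taken from the updated pointer for shard 1 above
    assert sha256_of("model-00001-of-00003.safetensors") == \
        "5b212d008b1aaaafbe0dd51710c7466d1836577073d01510ceb5ab7bb3d1b19f"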
modeling_hangul_gemma_deobfuscator.py CHANGED
@@ -2,6 +2,7 @@ import torch
 import torch.nn as nn
 
 from types import MethodType
+from typing import List, Optional, Tuple, Union
 from copy import deepcopy
 from transformers import PretrainedConfig, PreTrainedModel, AutoModelForCausalLM, AutoConfig
 from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
@@ -126,7 +127,7 @@ class HangulGemmaDeobfuscator(PreTrainedModel):
         pred_ids[token_type_ids==4] = torch.LongTensor(pred_char_ids).type_as(pred_ids)
         return pred_ids
 
-    def deobfuscate_by_syllable(self, sentence):
+    def _deobfuscate_by_syllable(self, sentence):
         sentences = [sentence]
         char_input_ids, char_attention_mask, char_token_type_ids = self.tokenizer.batch_encode_char(sentences)
         char_input_ids, char_attention_mask, char_token_type_ids = char_input_ids.to(self.device), char_attention_mask.to(self.device), char_token_type_ids.to(self.device)
@@ -137,7 +138,7 @@ class HangulGemmaDeobfuscator(PreTrainedModel):
         decoded = self.tokenizer.decode_char(pred_char_ids[0],char_token_type_ids[0])
         return decoded
 
-    def deobfuscate_hierarchical(self, sentence):
+    def _deobfuscate(self, sentence):
         sentences = [sentence]
         char_input_ids, char_attention_mask, char_token_type_ids = self.tokenizer.batch_encode_char(sentences)
         char_input_ids, char_attention_mask, char_token_type_ids = char_input_ids.to(self.device), char_attention_mask.to(self.device), char_token_type_ids.to(self.device)
@@ -154,18 +155,36 @@ class HangulGemmaDeobfuscator(PreTrainedModel):
         y_pred = [self.tokenizer.decode_jamo(pred_jamo_id, jamo_token_type_id) for pred_jamo_id, jamo_token_type_id in zip(pred_jamo_ids, jamo_token_type_ids.tolist())]
         return y_pred[0]
 
+    def deobfuscate(self, sentence, sentence_tokenizer=None):
+        if sentence_tokenizer is not None:
+            chunks_row = sentence_tokenizer.tokenize(sentence)
+            chunks_overlap_row = sentence_tokenizer.overlap(chunks_row)
+            chunks_indices = []
+            chunks_overlap = []
+            for start_idx, end_idx, chunk_overlap_row in chunks_overlap_row:
+                chunks_indices.append((start_idx, end_idx))
+                chunks_overlap.append(self._deobfuscate_hierarchical(chunk_overlap_row))
+
+            sentence_tokenizer.decode_overlap(row)
+        else:
+            return self._deobfuscate(sentence)
+
+
+
 
 def decoder_forward(
     self,
-    hidden_states,
-    position_embeddings,
-    attention_mask = None,
-    position_ids = None,
-    past_key_value = None,
-    output_attentions = False,
-    use_cache = False,
-    cache_position = None,
-):
+    hidden_states: torch.Tensor,
+    position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value=None,
+    output_attentions: Optional[bool] = False,
+    use_cache: Optional[bool] = False,
+    cache_position: Optional[torch.LongTensor] = None,
+    last_cache_position: int = 0,
+    **kwargs,
+) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
     if self.is_sliding and attention_mask is not None: # efficient SDPA and no padding
         attention_mask = torch.tril(torch.triu(attention_mask, diagonal=-self.sliding_window), diagonal=self.sliding_window)
 
@@ -183,6 +202,7 @@ def decoder_forward(
         output_attentions=output_attentions,
         use_cache=use_cache,
         cache_position=cache_position,
+        **kwargs,
    )
     hidden_states = self.post_attention_layernorm(hidden_states)
     hidden_states = residual + hidden_states
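After this change, deobfuscate is the public entry point: with no sentence_tokenizer it dispatches to _deobfuscate, while the chunked branch is evidently still in progress in this revision (it references an undefined row and returns nothing). A minimal usage sketch of the working path, assuming the checkpoint loads via remote code; the repo id is illustrative:

    from transformers import AutoModel

    model = AutoModel.from_pretrained(
        "jwengr/hangul-gemma-deobfuscator",  # repo id assumed
        trust_remote_code=True,
    )

    obfuscated_sentence = "..."  # placeholder: an obfuscated Hangul sentence
    # No sentence_tokenizer given, so the call falls through to _deobfuscate,
    # which encodes the sentence at character level and decodes in one pass.
    restored = model.deobfuscate(obfuscated_sentence)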
sentence_tokenizer/config.json CHANGED
@@ -3,14 +3,8 @@
     "SentenceTokenizer"
   ],
   "auto_map": {
-    "AutoConfig": [
-      "modeling_sentence_tokenizer.SentenceTokenizerConfig",
-      null
-    ],
-    "AutoModel": [
-      "modeling_sentence_tokenizer.SentenceTokenizer",
-      null
-    ]
+    "AutoConfig": "modeling_sentence_tokenizer.SentenceTokenizerConfig",
+    "AutoModel": "modeling_sentence_tokenizer.SentenceTokenizer"
   },
   "max_length": 64,
   "min_length": 32,