jwengr committed on
Commit 06b1bd7 · verified · 1 Parent(s): 5467ee5

Upload folder using huggingface_hub

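For context, this commit message is the default emitted when a local checkpoint directory is pushed to the Hub in one shot with huggingface_hub's upload_folder. A minimal sketch of that call, with the local path and repo id assumed for illustration:

    from huggingface_hub import HfApi

    api = HfApi()
    # Pushes every file under folder_path as a single commit; large binaries
    # such as the .safetensors shards are stored via Git LFS automatically.
    api.upload_folder(
        folder_path="./hangul-gemma-deobfuscator",   # local checkpoint dir (assumed)
        repo_id="jwengr/hangul-gemma-deobfuscator",  # target repo (assumed)
        commit_message="Upload folder using huggingface_hub",  # the default message
    )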
config.json CHANGED
@@ -9,5 +9,5 @@
   "base_model_name": "unsloth/gemma-2-2b",
   "model_type": "hangul_gemma_deobfuscator",
   "torch_dtype": "float32",
-  "transformers_version": "4.48.0"
+  "transformers_version": "4.50.2"
 }
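Only the transformers_version stamp changes, i.e. the config was re-saved under transformers 4.50.2. That field records which library version serialized the checkpoint, and it can be inspected without loading any weights; a small sketch, repo id assumed for illustration:

    from transformers import AutoConfig

    # trust_remote_code is required because "hangul_gemma_deobfuscator" is a
    # custom model_type defined by this repo, not a built-in architecture.
    config = AutoConfig.from_pretrained(
        "jwengr/hangul-gemma-deobfuscator",  # repo id assumed
        trust_remote_code=True,
    )
    print(config.transformers_version)  # "4.50.2" after this commit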
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:569a3cf79ae397c384d4f08733761954f5f98deccebf8b2b3addc5bea26c1f93
+oid sha256:5b212d008b1aaaafbe0dd51710c7466d1836577073d01510ceb5ab7bb3d1b19f
 size 4992576696
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0981c0ef8e86962ff1d407bf8f318e70875e9f5926dfdfb83c4e3fd3dba801fc
+oid sha256:a11268a65cf0ae9cdd5dab5310c09f3342ec1540f598a52b4bf86a7d199a4039
 size 4983444480
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ed2baf69f3921f01bf52751aa0a74018d299d08a5c3b7be41dcd5250ecf80dc2
+oid sha256:8b04ff95754a801b9c575840cd3808a298fc0321ad1afa1cd09871d1fb310951
 size 1104312040
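All three shard diffs follow the same pattern: the Git LFS pointer's sha256 oid changes while the byte size stays identical, consistent with the weights being re-serialized at the same shapes and dtype. A downloaded shard can be checked against its pointer with a streaming sha256, for example:

    import hashlib

    def sha256_of(path, chunk_size=1 << 20):
        # Stream in 1 MiB chunks so multi-GB shards never sit in memory at once.
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                h.update(chunk)
        return h.hexdigest()

    # oid taken from the updated pointer for shard 1 above
    assert sha256_of("model-00001-of-00003.safetensors") == \
        "5b212d008b1aaaafbe0dd51710c7466d1836577073d01510ceb5ab7bb3d1b19f"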
modeling_hangul_gemma_deobfuscator.py CHANGED
@@ -2,6 +2,7 @@ import torch
 import torch.nn as nn
 
 from types import MethodType
+from typing import List, Optional, Tuple, Union
 from copy import deepcopy
 from transformers import PretrainedConfig, PreTrainedModel, AutoModelForCausalLM, AutoConfig
 from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
@@ -126,7 +127,7 @@ class HangulGemmaDeobfuscator(PreTrainedModel):
         pred_ids[token_type_ids==4] = torch.LongTensor(pred_char_ids).type_as(pred_ids)
         return pred_ids
 
-    def deobfuscate_by_syllable(self, sentence):
+    def _deobfuscate_by_syllable(self, sentence):
         sentences = [sentence]
         char_input_ids, char_attention_mask, char_token_type_ids = self.tokenizer.batch_encode_char(sentences)
         char_input_ids, char_attention_mask, char_token_type_ids = char_input_ids.to(self.device), char_attention_mask.to(self.device), char_token_type_ids.to(self.device)
@@ -137,7 +138,7 @@ class HangulGemmaDeobfuscator(PreTrainedModel):
         decoded = self.tokenizer.decode_char(pred_char_ids[0],char_token_type_ids[0])
         return decoded
 
-    def deobfuscate_hierarchical(self, sentence):
+    def _deobfuscate(self, sentence):
         sentences = [sentence]
         char_input_ids, char_attention_mask, char_token_type_ids = self.tokenizer.batch_encode_char(sentences)
         char_input_ids, char_attention_mask, char_token_type_ids = char_input_ids.to(self.device), char_attention_mask.to(self.device), char_token_type_ids.to(self.device)
@@ -154,18 +155,36 @@ class HangulGemmaDeobfuscator(PreTrainedModel):
         y_pred = [self.tokenizer.decode_jamo(pred_jamo_id, jamo_token_type_id) for pred_jamo_id, jamo_token_type_id in zip(pred_jamo_ids, jamo_token_type_ids.tolist())]
         return y_pred[0]
 
+    def deobfuscate(self, sentence, sentence_tokenizer=None):
+        if sentence_tokenizer is not None:
+            chunks_row = sentence_tokenizer.tokenize(sentence)
+            chunks_overlap_row = sentence_tokenizer.overlap(chunks_row)
+            chunks_indices = []
+            chunks_overlap = []
+            for start_idx, end_idx, chunk_overlap_row in chunks_overlap_row:
+                chunks_indices.append((start_idx, end_idx))
+                chunks_overlap.append(self._deobfuscate_hierarchical(chunk_overlap_row))
+
+            sentence_tokenizer.decode_overlap(row)
+        else:
+            return self._deobfuscate(sentence)
+
+
+
 
 def decoder_forward(
     self,
-    hidden_states,
-    position_embeddings,
-    attention_mask = None,
-    position_ids = None,
-    past_key_value = None,
-    output_attentions = False,
-    use_cache = False,
-    cache_position = None,
-):
+    hidden_states: torch.Tensor,
+    position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value=None,
+    output_attentions: Optional[bool] = False,
+    use_cache: Optional[bool] = False,
+    cache_position: Optional[torch.LongTensor] = None,
+    last_cache_position: int = 0,
+    **kwargs,
+) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
     if self.is_sliding and attention_mask is not None: # efficient SDPA and no padding
         attention_mask = torch.tril(torch.triu(attention_mask, diagonal=-self.sliding_window), diagonal=self.sliding_window)
 
@@ -183,6 +202,7 @@ def decoder_forward(
         output_attentions=output_attentions,
         use_cache=use_cache,
         cache_position=cache_position,
+        **kwargs,
    )
     hidden_states = self.post_attention_layernorm(hidden_states)
     hidden_states = residual + hidden_states
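After this change, deobfuscate is the public entry point: with no sentence_tokenizer it dispatches to _deobfuscate, while the chunked branch is evidently still in progress in this revision (it references an undefined row and returns nothing). A minimal usage sketch of the working path, assuming the checkpoint loads via remote code; the repo id is illustrative:

    from transformers import AutoModel

    model = AutoModel.from_pretrained(
        "jwengr/hangul-gemma-deobfuscator",  # repo id assumed
        trust_remote_code=True,
    )

    obfuscated_sentence = "..."  # placeholder: an obfuscated Hangul sentence
    # No sentence_tokenizer given, so the call falls through to _deobfuscate,
    # which encodes the sentence at character level and decodes in one pass.
    restored = model.deobfuscate(obfuscated_sentence)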
sentence_tokenizer/config.json CHANGED
@@ -3,14 +3,8 @@
     "SentenceTokenizer"
   ],
   "auto_map": {
-    "AutoConfig": [
-      "modeling_sentence_tokenizer.SentenceTokenizerConfig",
-      null
-    ],
-    "AutoModel": [
-      "modeling_sentence_tokenizer.SentenceTokenizer",
-      null
-    ]
+    "AutoConfig": "modeling_sentence_tokenizer.SentenceTokenizerConfig",
+    "AutoModel": "modeling_sentence_tokenizer.SentenceTokenizer"
   },
   "max_length": 64,
   "min_length": 32,