Upload EMG model with MorPiece tokenizer
- README.md +60 -0
- config.json +22 -0
- generation_config.json +4 -0
- model_eMG_simplified.py +230 -0
- modeling_emg.py +319 -0
- pytorch_model.bin +3 -0
- requirements.txt +3 -0
- tokenizer.json +0 -0
- tokenizer_MorPiece.py +350 -0
- tokenizer_config.json +7 -0
README.md
ADDED
@@ -0,0 +1,60 @@
---
language: en
library_name: transformers
tags:
- emg
- morphology
- language-model
- causal-lm
- morpiece-tokenizer
license: apache-2.0
pipeline_tag: text-generation
---

# EMG Language Model

This is an EMG (Enhanced Morphological Generation) language model with a MorPiece tokenizer.

## Model Details

- **Model Type**: Causal Language Model
- **Architecture**: EMG with morphological awareness
- **Tokenizer**: MorPiece (morphology-aware tokenization)
- **Parameters**: 79.75M
- **Vocabulary Size**: 60001

## Usage

```python
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("your-username/your-model-name", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("your-username/your-model-name", trust_remote_code=True)

# Generate text
input_text = "The future of AI is"
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(**inputs, max_length=50)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
```

## Model Architecture

The EMG model uses morphological awareness for better language understanding and generation.
The MorPiece tokenizer provides morphology-aware tokenization that better handles word formation.

## Training

This model was trained on conversational data with morphological enhancement.

## Limitations

- This model is designed for research purposes
- May not perform optimally on downstream tasks without fine-tuning
- Requires `trust_remote_code=True` due to the custom architecture

## Citation

If you use this model, please cite the original EMG paper and implementation.
config.json
ADDED
@@ -0,0 +1,22 @@
{
  "architectures": [
    "EMGForCausalLM"
  ],
  "dropout": 0.01,
  "embedding_dim": 650,
  "hidden_dim": 650,
  "model_type": "emg",
  "num_layers": 1,
  "pad_token_id": 60004,
  "torch_dtype": "float32",
  "transformers_version": "4.52.3",
  "use_gradient_checkpointing": false,
  "use_layer_norm": true,
  "vocab_size": 60001,
  "auto_map": {
    "AutoConfig": "modeling_emg.EMGConfig",
    "AutoModel": "modeling_emg.EMGLanguageModel",
    "AutoModelForCausalLM": "modeling_emg.EMGForCausalLM",
    "AutoTokenizer": "modeling_emg.MorPieceTokenizer"
  }
}
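The `auto_map` block above is what lets the stock `Auto*` loaders resolve the custom classes from `modeling_emg.py` when `trust_remote_code=True` is passed. A minimal loading sketch (the repo id is a placeholder):

```python
from transformers import AutoConfig, AutoModelForCausalLM

# AutoConfig reads config.json, sees model_type "emg" plus the auto_map entry,
# and dynamically imports EMGConfig from the modeling_emg.py shipped in the repo.
config = AutoConfig.from_pretrained("your-username/your-model-name", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("your-username/your-model-name", trust_remote_code=True)
print(type(model).__name__)  # EMGForCausalLM
```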
generation_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "_from_model_config": true,
  "transformers_version": "4.52.3"
}
model_eMG_simplified.py
ADDED
@@ -0,0 +1,230 @@
import os

import torch
import torch.nn as nn
import torch.nn.functional as F  # needed by generate() below
import torch.utils.checkpoint
from transformers import PreTrainedModel, PretrainedConfig

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ===================== OPTIMIZED EMG MODEL =====================


class OptimizedEMGCell(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate=0.1, use_layer_norm=False):
        super(OptimizedEMGCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.use_layer_norm = use_layer_norm
        self.clamp_min = -1
        self.clamp_max = 1

        # Fused linear transformations for better efficiency
        self.input_transform_linear = nn.Linear(input_size, hidden_size * 2)
        self.hidden_transform_linear = nn.Linear(hidden_size, hidden_size * 2)

        # SIMPLIFIED: use standard dropout instead of variational dropout
        self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else None

        # Layer normalization for training stability
        if use_layer_norm:
            self.input_norm = nn.LayerNorm(hidden_size)
            self.hidden_norm = nn.LayerNorm(hidden_size)
            self.cell_norm = nn.LayerNorm(hidden_size)

        self.init_weights()

    def init_weights(self):
        for linear in [self.input_transform_linear, self.hidden_transform_linear]:
            # Use smaller initialization for RNN stability
            nn.init.uniform_(linear.weight, -0.1, 0.1)
            nn.init.zeros_(linear.bias)

    def forward(self, input, hidden):
        h_prev, c_prev = hidden

        # Project input and hidden states
        input_connections = self.input_transform_linear(input)
        hidden_connections = self.hidden_transform_linear(h_prev)

        # Split projections
        i_move, i_merge = torch.chunk(input_connections, 2, dim=-1)
        h_move, h_merge = torch.chunk(hidden_connections, 2, dim=-1)

        # EMG computation
        # merge_gate = torch.clamp(i_merge, self.clamp_min, self.clamp_max) * torch.sigmoid(torch.clamp(h_merge, self.clamp_min, self.clamp_max))
        merge_gate = torch.clamp(i_merge * torch.sigmoid(h_merge), self.clamp_min, self.clamp_max)
        move_gate = torch.clamp(torch.sigmoid(i_move) * h_move, self.clamp_min, self.clamp_max)

        if self.use_layer_norm:
            c_prev = self.cell_norm(c_prev)

        context_gate = torch.tanh(torch.clamp(c_prev + merge_gate, self.clamp_min, self.clamp_max))

        if self.use_layer_norm:
            context_gate = self.input_norm(context_gate)

        c_next = context_gate

        if self.use_layer_norm:
            c_next = self.hidden_norm(c_next)

        # Apply dropout to the output instead of complex variational dropout
        m_next = (1 - move_gate) * merge_gate + move_gate * c_next
        if self.dropout is not None:
            m_next = self.dropout(m_next)

        return m_next, c_next
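
# Reading of the cell above: the "merge" gate composes the current input with
# the previous hidden state, and the "move" gate weighs that composition
# against the updated cell state (m_next = (1 - move) * merge + move * c_next);
# clamping intermediate values to [-1, 1] keeps the recurrence numerically stable.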

class OptimizedEMG(nn.Module):
    """Enhanced EMG with gradient checkpointing and other optimizations"""

    def __init__(self, input_size, hidden_size, num_layers, dropout_rate=0.1,
                 use_gradient_checkpointing=False):
        super(OptimizedEMG, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.use_gradient_checkpointing = use_gradient_checkpointing

        # NOTE: use_layer_norm is not forwarded here, so the cells run with
        # their default use_layer_norm=False regardless of the config flag.
        self.cells = nn.ModuleList([
            OptimizedEMGCell(
                input_size if i == 0 else hidden_size,
                hidden_size,
                dropout_rate
            ) for i in range(num_layers)
        ])

    def forward(self, x, hidden=None):
        batch_size, seq_len, _ = x.size()

        if hidden is None:
            hidden = [(torch.zeros(batch_size, self.hidden_size, device=x.device),
                       torch.zeros(batch_size, self.hidden_size, device=x.device))
                      for _ in range(self.num_layers)]

        outputs = []

        for t in range(seq_len):
            layer_input = x[:, t, :]

            for layer_idx, cell in enumerate(self.cells):
                m_prev, c_prev = hidden[layer_idx]

                if self.use_gradient_checkpointing and self.training:
                    m_next, c_next = torch.utils.checkpoint.checkpoint(
                        cell, layer_input, (m_prev, c_prev), use_reentrant=False
                    )
                else:
                    m_next, c_next = cell(layer_input, (m_prev, c_prev))

                hidden[layer_idx] = (m_next, c_next)
                layer_input = m_next

            outputs.append(layer_input)

        output = torch.stack(outputs, dim=1)
        return output, hidden


# ===================== HUGGING FACE COMPATIBLE MODEL =====================

class EMGConfig(PretrainedConfig):
    """Configuration class for EMG model"""
    model_type = "emg"

    def __init__(
        self,
        vocab_size=50000,
        embedding_dim=512,
        hidden_dim=512,
        num_layers=2,
        dropout=0.1,
        use_layer_norm=True,
        use_gradient_checkpointing=False,
        tie_word_embeddings=True,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.use_layer_norm = use_layer_norm
        self.use_gradient_checkpointing = use_gradient_checkpointing
        self.tie_word_embeddings = tie_word_embeddings


class EMGLanguageModel(PreTrainedModel):
    """Hugging Face compatible EMG Language Model"""
    config_class = EMGConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embedding = nn.Embedding(config.vocab_size, config.embedding_dim)
        self.emg = OptimizedEMG(
            config.embedding_dim,
            config.hidden_dim,
            config.num_layers,
            config.dropout,
            config.use_gradient_checkpointing
        )
        self.output_projection = nn.Linear(config.hidden_dim, config.vocab_size)

        # Tie embedding and output weights if dimensions match
        if config.tie_word_embeddings and config.embedding_dim == config.hidden_dim:
            self.output_projection.weight = self.embedding.weight

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids, hidden=None, labels=None, **kwargs):
        embedded = self.embedding(input_ids)
        output, hidden = self.emg(embedded, hidden)
        logits = self.output_projection(output)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                            shift_labels.view(-1))

        return {'loss': loss, 'logits': logits, 'hidden_states': hidden}

    def generate(self, input_ids, max_length=50, temperature=1.0, top_k=50, **kwargs):
        # **kwargs absorbs extra Hugging Face generation arguments
        # (e.g. attention_mask) that this simple sampler does not use.
        self.eval()
        generated = input_ids
        hidden = None

        for step in range(max_length):
            # Feed the full prompt on the first step, then one token at a time,
            # carrying the recurrent state forward between steps.
            step_input = generated if step == 0 else generated[:, -1:]
            outputs = self.forward(step_input, hidden)
            hidden = outputs['hidden_states']
            logits = outputs['logits'][:, -1, :] / temperature

            # Top-k sampling
            top_k_logits, top_k_indices = torch.topk(logits, top_k)
            probs = F.softmax(top_k_logits, dim=-1)
            next_token = top_k_indices.gather(1, torch.multinomial(probs, num_samples=1))

            generated = torch.cat([generated, next_token], dim=1)

        return generated
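
# A minimal smoke test for the classes above (sizes are illustrative, not the
# released checkpoint's configuration); run this file directly to execute it.
if __name__ == "__main__":
    config = EMGConfig(vocab_size=100, embedding_dim=16, hidden_dim=16, num_layers=1)
    model = EMGLanguageModel(config)

    input_ids = torch.randint(0, 100, (2, 5))  # batch of 2, sequence length 5
    out = model(input_ids, labels=input_ids)
    print(out['logits'].shape)  # torch.Size([2, 5, 100])
    print(out['loss'])          # scalar cross-entropy loss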
modeling_emg.py
ADDED
@@ -0,0 +1,319 @@
"""
HuggingFace Integration for EMG Model and MorPiece Tokenizer
This file makes the custom model and tokenizer compatible with HuggingFace and lm_eval.
"""

import json
import os
from typing import List, Optional, Union, Dict, Any

import torch
import torch.nn as nn
from transformers import (
    PreTrainedModel,
    PretrainedConfig,
    PreTrainedTokenizer,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationMixin,  # mixed into EMGForCausalLM so .generate() works without warnings
)
from transformers.modeling_outputs import CausalLMOutputWithPast

# Import the existing classes
from model_eMG_simplified import EMGLanguageModel, EMGConfig, OptimizedEMG, OptimizedEMGCell
from tokenizer_MorPiece import MorPiece


class MorPieceTokenizer(PreTrainedTokenizer):
    """
    HuggingFace compatible wrapper for the MorPiece tokenizer
    """

    def __init__(self,
                 vocab_file=None,
                 model_file=None,
                 unk_token="<unk>",
                 pad_token="<pad>",
                 bos_token="<s>",
                 eos_token="</s>",
                 **kwargs):

        # Initialize the MorPiece tokenizer
        self.morpiece = MorPiece()

        # Load from file if provided
        if vocab_file or model_file:
            model_path = vocab_file or model_file
            if os.path.isdir(model_path):
                self.morpiece.from_pretrained(model_path)
            else:
                # Load from a JSON file
                with open(model_path, 'r') as f:
                    data = json.load(f)
                self.morpiece.roots = data.get('roots', data)
                if 'vocab' in data:
                    self.morpiece.vocab_to_id = data['vocab']
                else:
                    self.morpiece.build_vocab_lookup()

        # Get vocabulary
        self.vocab = self.morpiece.get_vocab()

        # Set special tokens
        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs
        )

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return self.vocab.copy()

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize text into tokens"""
        # For HuggingFace compatibility, we need to return string tokens
        token_ids = self.morpiece.encode(text)
        tokens = self.morpiece.decode(token_ids)
        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a token to its ID"""
        return self.vocab.get(token, self.vocab.get(self.unk_token, 0))

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an ID to its token"""
        for token, idx in self.vocab.items():
            if idx == index:
                return token
        return self.unk_token

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert tokens back to a string"""
        # Handle special tokens
        text = "".join(tokens)
        # Clean up special tokens for display
        for special_token in [self.pad_token, self.bos_token, self.eos_token]:
            if special_token:
                text = text.replace(special_token, "")
        return text.strip()

    def encode(self, text: str, add_special_tokens: bool = True, **kwargs) -> List[int]:
        """Encode text to token IDs"""
        if add_special_tokens and self.bos_token:
            text = f"{self.bos_token} {text}"
        if add_special_tokens and self.eos_token:
            text = f"{text} {self.eos_token}"

        return self.morpiece.encode(text)

    def decode(self, token_ids: List[int], skip_special_tokens: bool = True, **kwargs) -> str:
        """Decode token IDs to text"""
        tokens = []
        for token_id in token_ids:
            token = self._convert_id_to_token(token_id)
            if skip_special_tokens and token in [self.pad_token, self.bos_token, self.eos_token, self.unk_token]:
                continue
            tokens.append(token)
        return self.convert_tokens_to_string(tokens)

    def save_pretrained(self, save_directory: str, **kwargs):
        """Save the tokenizer"""
        os.makedirs(save_directory, exist_ok=True)

        # Save MorPiece data
        tokenizer_file = os.path.join(save_directory, "tokenizer.json")
        self.morpiece.save(tokenizer_file)

        # Save the tokenizer config
        config = {
            "tokenizer_class": "MorPieceTokenizer",
            "unk_token": self.unk_token,
            "pad_token": self.pad_token,
            "bos_token": self.bos_token,
            "eos_token": self.eos_token,
        }

        config_file = os.path.join(save_directory, "tokenizer_config.json")
        with open(config_file, 'w') as f:
            json.dump(config, f, indent=2)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
        """Load the tokenizer from a pretrained directory or file"""
        return cls(vocab_file=pretrained_model_name_or_path, **kwargs)


class EMGForCausalLM(EMGLanguageModel, GenerationMixin):
    """
    Enhanced EMG model with better HuggingFace compatibility for lm_eval.
    Inherits from GenerationMixin to silence the missing-mixin warning.
    Note: EMGLanguageModel.generate (a simple top-k sampler) takes precedence
    over GenerationMixin.generate in the method resolution order.
    """

    def __init__(self, config):
        # Initialize EMGLanguageModel first
        EMGLanguageModel.__init__(self, config)
        # Then initialize GenerationMixin
        GenerationMixin.__init__(self)
        self.config = config

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple] = None,
        use_cache: Optional[bool] = None,
        **kwargs
    ) -> CausalLMOutputWithPast:
        """
        Forward pass with HuggingFace compatible output format
        """
        # Get embeddings
        embedded = self.embedding(input_ids)

        # Pass through EMG layers (the recurrent state doubles as the "cache")
        output, hidden = self.emg(embedded, past_key_values)

        # Get logits
        logits = self.output_projection(output)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                            shift_labels.view(-1))

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=hidden if use_cache else None,
            hidden_states=output,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.Tensor,
        past_key_values=None,
        attention_mask=None,
        **kwargs
    ):
        """Prepare inputs for generation"""
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "attention_mask": attention_mask,
        }

    def _reorder_cache(self, past_key_values, beam_idx):
        """Reorder the cache for beam search"""
        if past_key_values is None:
            return None

        reordered_cache = []
        for layer_cache in past_key_values:
            if isinstance(layer_cache, tuple):
                reordered_cache.append(tuple(
                    cache.index_select(0, beam_idx) for cache in layer_cache
                ))
            else:
                reordered_cache.append(layer_cache.index_select(0, beam_idx))
        return tuple(reordered_cache)


# Register the custom classes with transformers
def register_emg_model():
    """Register the EMG model and tokenizer with transformers"""

    # Register config
    AutoConfig.register("emg", EMGConfig)

    # Register model
    AutoModel.register(EMGConfig, EMGLanguageModel)
    AutoModelForCausalLM.register(EMGConfig, EMGForCausalLM)

    # Register tokenizer
    AutoTokenizer.register(EMGConfig, MorPieceTokenizer)

    print("EMG model and MorPiece tokenizer registered with transformers!")


def load_emg_model_and_tokenizer(model_path: str):
    """
    Load the EMG model and MorPiece tokenizer from a saved directory

    Args:
        model_path: Path to the saved model directory

    Returns:
        tuple: (model, tokenizer)
    """
    # Register classes first
    register_emg_model()

    # Load model
    config = EMGConfig.from_pretrained(model_path)
    model = EMGForCausalLM.from_pretrained(model_path, config=config)

    # Load tokenizer
    tokenizer = MorPieceTokenizer.from_pretrained(model_path)

    # Set the pad token id in the model config if not set
    if not hasattr(config, 'pad_token_id') or config.pad_token_id is None:
        config.pad_token_id = tokenizer.pad_token_id
        model.config.pad_token_id = tokenizer.pad_token_id

    return model, tokenizer


def test_model_and_tokenizer(model_path: str):
    """Test the loaded model and tokenizer"""
    model, tokenizer = load_emg_model_and_tokenizer(model_path)

    # Test encoding/decoding
    test_text = "Hello world, this is a test."
    print(f"Original text: {test_text}")

    # Encode
    encoded = tokenizer.encode(test_text)
    print(f"Encoded: {encoded}")

    # Decode
    decoded = tokenizer.decode(encoded, skip_special_tokens=True)
    print(f"Decoded: {decoded}")

    # Test model forward pass
    input_ids = torch.tensor([encoded])
    with torch.no_grad():
        outputs = model(input_ids)
    print(f"Model output shape: {outputs.logits.shape}")
    print(f"Model output type: {type(outputs)}")

    print("Model and tokenizer are working correctly!")
    return model, tokenizer


if __name__ == "__main__":
    # Example usage
    model_path = "path/to/your/saved/model"  # Replace with your model path

    # Register the classes
    register_emg_model()

    # Test loading
    try:
        model, tokenizer = test_model_and_tokenizer(model_path)
        print("✅ Model and tokenizer loaded successfully!")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
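
# Usage sketch for the helpers above (the checkpoint path is hypothetical):
#
#   from modeling_emg import load_emg_model_and_tokenizer
#   model, tokenizer = load_emg_model_and_tokenizer("./emg-checkpoint")
#   ids = torch.tensor([tokenizer.encode("The future of AI is")])
#   out = model.generate(ids, max_length=20, temperature=0.8, top_k=50)
#   print(tokenizer.decode(out[0].tolist(), skip_special_tokens=True))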
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a5f54b4392c29c455472735d1de207e490f1ef9789ac39df15a50a5117feba81
size 163016093
requirements.txt
ADDED
@@ -0,0 +1,3 @@
torch>=1.9.0
transformers>=4.20.0
numpy
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_MorPiece.py
ADDED
@@ -0,0 +1,350 @@
import os
import json
from math import log


class MorPiece:
    def __init__(self, vocab_size=30000, min_frequency=2, cutoff=8, bf=10, special_tokens=None):
        self.tokenization_to_print = "TP left-right \t BF right-left \t TP right-left \t BP right-left\n"  # for debugging only
        if special_tokens is None:
            special_tokens = ['<unk>', '<pad>', '<s>', '</s>']
        self.special_tokens = special_tokens
        self.reserved_keys = {'[RSX]', '##', 'IDX', '++'}
        self.vocab_size = vocab_size
        self.min_frequency = min_frequency
        self.bf = bf
        self.roots = {'[RSX]': {}, '++': {}}
        self.roots_unoptimized = {}
        self.infls = {}
        self.types = {}
        self.last_item_in_trie = {}
        self.idx = 0
        self.vocab_to_id = {}  # populated by build_vocab_lookup()
        self.tokens = []
        self.suffixes = []
        self.tokens_bf = []
        self.suffixes_bf = []
        self.prefix = ""
        self.n_prefix = 0
        self.n_suffix = 0
        self.tokenized_words = []
        self.tokenized_word_longest = ""
        self.tokenized_word_idx_longest = ""
        self.cutoff = cutoff  # ln(8) is > 2, so non-branching paths will be ignored
        self.num_tokens_in_corpus = 0
        self.num_chars_in_corpus = 0
        self.num_chars_in_trie = 0
        self.num_chars_in_optimized_trie = 0
        self.set_special_tokens(self.special_tokens)

    def train(self, corpus: str):  # create the vocabulary
        words = corpus.split()
        print("MorPiece tokenizer training: processing words...")
        for word in words:
            word_alpha = ''.join([char for char in word if char.isalpha() or char == "'"])
            if not word_alpha:
                word = ''.join([char for char in word])
            else:
                word = word_alpha
            if word:
                self.build_trie(word, self.roots_unoptimized)  # create the roots trie
                self.build_trie(word[::-1], self.infls)  # create the inflections trie
                if word not in self.types:  # count tokens and chars in the corpus
                    self.types[word] = 1
                else:
                    self.types[word] += 1
                self.num_tokens_in_corpus += 1
                self.num_chars_in_corpus += len(word)
        self.types = dict(sorted(self.types.items(), key=lambda item: item[1], reverse=True))
        sort_trie_by_freq(self.roots_unoptimized)
        sort_trie_by_freq(self.infls)

        print("MorPiece tokenizer training: trie optimization...")
        self.optimize(self.types)

        print(f"Built final vocabulary with {self.get_vocab_size()} tokens")
        print(f"Most common tokens: {list(self.types.items())[:20]}")

    def build_trie(self, wordpiece, root):  # build the trie and register the number of traversals in '##'
        if wordpiece[0] in root:
            root[wordpiece[0]]['##'] += 1
            self.num_chars_in_trie += 1
            if len(wordpiece) > 1:
                self.build_trie(wordpiece[1:], root[wordpiece[0]])
            else:
                if 'END' not in root[wordpiece[0]]:
                    root[wordpiece[0]]['END'] = None
        else:
            root[wordpiece[0]] = {}
            root[wordpiece[0]]['##'] = 1
            if len(wordpiece) > 1:
                self.build_trie(wordpiece[1:], root[wordpiece[0]])

    def set_special_tokens(self, token_list):
        for item in token_list:
            if item not in self.roots['[RSX]'].keys():
                self.roots['[RSX]'][item] = {'IDX': None}
                self.roots['[RSX]'][item]['IDX'] = self.idx
                self.idx += 1

    # assign idx based on word frequency, add potential inflection links in the
    # root trie, and remove the frequency counts at the end
    def optimize(self, words):
        for word, freq in words.items():
            if freq >= self.min_frequency and self.idx <= self.vocab_size:
                self.tokens = []
                self.suffixes = []
                self.tokens_bf = []
                self.suffixes_bf = []
                self.tokens.append(word[0])
                self.suffixes.append(word[len(word) - 1])
                self.split_prefix(word, self.roots_unoptimized)
                if len(self.tokens) > 1:
                    self.split_suffix(word[::-1], self.infls)
                    self.suffixes = [word[::-1] for word in self.suffixes][::-1]
                    self.tokenization_to_print += str(self.tokens) + '\t' + str(self.tokens_bf) + '\t' + str(
                        self.suffixes) + '\t' + str(self.suffixes_bf) + '\n'  # for debugging only
                    for i in range(0, len(self.tokens)):  # experiments: use only self.suffixes or self.tokens (prefixes)
                        if i == 0:
                            self.last_item_in_trie = self.roots
                            self.add_items_to_trie(self.tokens[0])  # experiments: use only self.suffixes or self.tokens (prefixes)
                        else:
                            self.last_item_in_trie = self.roots['++']
                            self.add_items_to_trie(self.tokens[i])  # experiments: use only self.suffixes or self.tokens (prefixes)
                        if 'IDX' not in self.last_item_in_trie:
                            self.last_item_in_trie['IDX'] = self.idx
                            self.idx += 1
                else:
                    self.last_item_in_trie = self.roots
                    self.add_items_to_trie(word)
                    if 'IDX' not in self.last_item_in_trie:
                        self.last_item_in_trie['IDX'] = self.idx
                        self.idx += 1

        self.build_vocab_lookup()

    def build_vocab_lookup(self):
        self.vocab_to_id = {}

        def traverse(trie, path):
            for k, v in trie.items():
                if k == 'IDX':
                    token = ''.join(path)
                    self.vocab_to_id[token] = v
                elif isinstance(v, dict):
                    traverse(v, path + [k])

        traverse(self.roots, [])

    def encode(self, sentence: str):
        self.tokenized_words = []
        words = sentence.strip().split()
        token_ids = []
        for word in words:
            if word in self.roots['[RSX]']:
                token_ids.append(self.roots['[RSX]'][word]['IDX'])
            else:
                self.tokenized_word_longest = ""
                self.tokenized_word_idx_longest = None
                self.retrieve(word, self.roots)
                if self.tokenized_word_idx_longest is not None:
                    token_ids.append(self.tokenized_word_idx_longest)
                else:
                    token_ids.append(self.roots['[RSX]']['<unk>']['IDX'])
        return token_ids

    def decode(self, sentence_idxs):
        tokens = []
        for idx in sentence_idxs:
            keys_path = find_idx_path(self.roots, idx)
            if keys_path:
                token = "".join(keys_path)
                if token.startswith('[RSX]'):
                    token = token[5:]
                tokens.append(token)
        return tokens

    def retrieve(self, word, trie):
        self.longest_match_in_trie(word, trie)
        if self.tokenized_word_longest:
            self.tokenized_words.append([self.tokenized_word_longest, self.tokenized_word_idx_longest])
        else:
            self.tokenized_words.append(['<unk>', self.roots['[RSX]']['<unk>']['IDX']])

    def longest_match_in_trie(self, string, trie):
        if string[0] in trie:
            self.tokenized_word_longest += string[0]
            if 'IDX' in trie[string[0]]:
                self.tokenized_word_idx_longest = trie[string[0]]['IDX']
            if len(string) > 1:
                self.longest_match_in_trie(string[1:], trie[string[0]])
        else:
            # print(string[0], self.tokenized_word_longest)
            if string[0] in self.roots['++'] and self.tokenized_word_idx_longest:
                self.tokenized_words.append([self.tokenized_word_longest + '++', self.tokenized_word_idx_longest])
                self.tokenized_word_longest = '++'
                self.tokenized_word_idx_longest = 0
                self.longest_match_in_trie(string, self.roots['++'])
            else:
                self.tokenized_words.append(['<unk>', self.roots['[RSX]']['<unk>']['IDX']])
                self.tokenized_word_longest = None

    def split_prefix(self, word, trie):
        l = len(word)
        if l > 1:
            self.get_pair_in_trie(word[0], word[1], trie)
            if self.check_tp(self.n_prefix, self.n_suffix) and self.get_bf(trie[word[0]]) <= self.bf:
                self.tokens.append(word[1])
                self.tokens_bf.append(word[0] + str(self.get_bf(trie[word[0]])))
            else:
                self.tokens[len(self.tokens) - 1] = self.tokens[len(self.tokens) - 1] + word[1]
            if l > 2:
                self.split_prefix(word[1:], trie[word[0]])

    def split_suffix(self, word, trie):
        l = len(word)
        if l > 1:
            self.get_pair_in_trie(word[0], word[1], trie)
            if self.check_tp(self.n_prefix, self.n_suffix) and self.get_bf(trie[word[0]]) <= self.bf:
                self.suffixes.append(word[1])
                self.suffixes_bf.append(word[0] + str(self.get_bf(trie[word[0]])))
            else:
                self.suffixes[len(self.suffixes) - 1] = self.suffixes[len(self.suffixes) - 1] + word[1]
            if l > 2:
                if word[0] in trie.keys():
                    self.split_suffix(word[1:], trie[word[0]])

    def get_pair_in_trie(self, prefix, suffix, trie):
        self.n_prefix = 0
        self.n_suffix = 0
        if prefix in trie:
            if suffix in trie[prefix]:
                self.n_prefix = trie[prefix]["##"]
                self.n_suffix = trie[prefix][suffix]["##"]

    def check_tp(self, m, d):  # verify whether the Tolerance Principle applies between m(other) and d(aughter) nodes
        if not m > 1:
            return False
        else:
            tp = m / log(m)
            # chained comparison: cutoff <= m, m != d, and d > tp must all hold
            if self.cutoff <= m != d > tp:
                return True
            else:
                return False

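    # Worked example for check_tp above (illustrative numbers): with a mother
    # count m = 100 the Tolerance Principle threshold is tp = 100 / ln(100) ≈ 21.7,
    # so a daughter count d = 30 passes (cutoff <= m, m != d, d > tp) and a new
    # token boundary is opened, while d = 10 fails and the characters are merged
    # into the current token.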
    def get_bf(self, m):  # return the branching factor of the mother node
        keys = m.keys()
        n_keys = len(keys)
        for k in keys:
            if k in self.special_tokens:
                n_keys -= 1
        return n_keys

    def add_items_to_trie(self, items):
        for item in items:
            self.add_item_to_trie(item)

    def add_item_to_trie(self, item):
        if item not in self.last_item_in_trie:
            self.last_item_in_trie[item] = {}
        self.last_item_in_trie = self.last_item_in_trie[item]

    @staticmethod
    def pad_sentence(sentence, l):
        """
        Pads the given sentence with "[pad]" tokens at the beginning to reach the desired length.

        Parameters:
        - sentence (str): The original sentence to be padded.
        - l (int): The desired total number of tokens in the sentence after padding.

        Returns:
        - str: The padded sentence.
        """
        words = sentence.split()
        n_pad = max(l - len(words), 0)  # Ensure n_pad is not negative
        pad_tokens = ["[pad]"] * n_pad
        padded_sentence = ' '.join(pad_tokens + words)
        return padded_sentence

    def get_num_chars_in_trie(self):
        return self.num_chars_in_trie

    def get_num_chars_in_corpus(self):
        return self.num_chars_in_corpus

    def get_vocab_size(self) -> int:
        return self.idx

    def get_vocab(self):
        return self.vocab_to_id.copy()

    def get_num_tokens_in_corpus(self):
        return self.num_tokens_in_corpus

    def get_num_types_in_corpus(self):
        return len(self.types)

    def get_compression_ratio(self):
        return round(self.num_chars_in_trie / self.num_chars_in_corpus, 3)

    def get_ttr(self):
        return round(len(self.types) / self.num_tokens_in_corpus, 3)

    def save(self, save_file):
        self.build_vocab_lookup()
        with open(save_file, 'w') as f:
            json.dump({
                'roots': self.roots,
                'vocab': self.vocab_to_id
            }, f, indent=2)

    def from_pretrained(self, load_file):
        with open(load_file + '/tokenizer.json', 'r') as f:
            data = json.load(f)

        # Backward compatibility: in the old format, data is just the roots trie
        if isinstance(data, dict) and 'roots' in data:
            self.roots = data['roots']
            self.vocab_to_id = data.get('vocab', {})  # fall back to an empty dict if missing
        else:
            # Old format support (e.g., tokenizer.json only had roots)
            self.roots = data
            self.vocab_to_id = {}

        # Ensure [RSX] exists
        if '[RSX]' not in self.roots:
            raise ValueError("Invalid tokenizer format: missing [RSX] root node.")

    def save_types(self, file):
        with open(file, 'w') as f:
            json.dump(self.types, f, indent=2)


def sort_trie_by_freq(d):
    if not isinstance(d, dict):
        return d
    # Sort the dictionary items by the value of the nested key '##'
    sorted_items = sorted(
        d.items(),
        key=lambda item: item[1].get('##', float('-inf')) if isinstance(item[1], dict) else float('-inf'),
        reverse=True
    )
    # Clear the dictionary and update it with the sorted items
    d.clear()
    for k, v in sorted_items:
        d[k] = sort_trie_by_freq(v)
    return d


def find_idx_path(d, target_value, path=None):
    if path is None:
        path = []
    for key, value in d.items():
        if key == 'IDX' and value == target_value:
            return path
        elif isinstance(value, dict):
            result = find_idx_path(value, target_value, path + [key])
            if result is not None:
                return result
    return None
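
# A minimal end-to-end sketch of the MorPiece API above (the toy corpus and the
# output file name are illustrative); run this file directly to execute it.
if __name__ == "__main__":
    mp = MorPiece(vocab_size=1000, min_frequency=1)
    mp.train("the cats walked and the dogs walked while the cat walks")

    ids = mp.encode("the cats walked")
    print(ids)                     # one id per word, found via the trie lookup
    print(mp.decode(ids))          # surface tokens, e.g. ['the', 'cats', 'walked']
    mp.save("morpiece_demo.json")  # writes {'roots': ..., 'vocab': ...}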
tokenizer_config.json
ADDED
@@ -0,0 +1,7 @@
{
  "tokenizer_class": "MorPieceTokenizer",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "bos_token": "<s>",
  "eos_token": "</s>"
}