santoshdahal commited on
Commit
785f55b
·
verified ·
1 Parent(s): 2828220

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SETU - Script-agnostic English Translation Unifier
2
+
3
+ SETU is a neural translation model that unifies multiscript, multilingual, and informal text into clean, formal English.
4
+
5
+ ## Model Description
6
+
7
+ The SETU model can handle:
8
+ - Romanized Nepali to English translation
9
+ - Devanagari Nepali to English translation
10
+ - Code-mixed text to English translation
11
+ - Informal/slang to formal English translation
12
+
13
+ ## Usage
14
+
15
+ ```python
16
+ from transformers import AutoModel
17
+
18
+ # Load the model
19
+ model = AutoModel.from_pretrained("santoshdahal/setu", trust_remote_code=True)
20
+
21
+ # Translate text
22
+ result = model("mero name santosh ho")
23
+ print("Translation:", result)
24
+ # Output: "My name is Santosh."
25
+
26
+ # Works with Devanagari script too
27
+ result = model("मेरो नाम सन्तोष हो")
28
+ print("Translation:", result)
29
+ # Output: "My name is Santosh."
30
+
31
+ # Handles informal text
32
+ result = model("bro i gonna go ktm")
33
+ print("Translation:", result)
34
+ # Output: "I am going to Kathmandu."
35
+ ```
36
+
37
+ ## Model Details
38
+
39
+ - **Model Type**: Neural Machine Translation
40
+ - **Architecture**: Transformer (based on fairseq transformer_iwslt_de_en)
41
+ - **Vocabulary Size**: 40,253 tokens
42
+ - **Languages Supported**: Nepali (Romanized & Devanagari), English, Code-mixed text
43
+ - **Model Format**: ONNX for efficient inference
44
+
45
+ ## Technical Implementation
46
+
47
+ The model uses:
48
+ - ONNX Runtime for efficient inference
49
+ - SentencePiece for tokenization
50
+ - Beam search decoding with configurable beam size
51
+ - Separate encoder and decoder ONNX models
52
+
53
+ ## Files Included
54
+
55
+ - `encoder.onnx`: ONNX encoder model
56
+ - `decoder.onnx`: ONNX decoder model
57
+ - `spm.model`: SentencePiece tokenizer model
58
+ - `spm.vocab`: SentencePiece vocabulary
59
+ - `config.json`: Model configuration
60
+ - `modeling_setu_translation.py`: Model implementation
61
+ - `configuration_setu_translation.py`: Configuration class
62
+
63
+ ## Citation
64
+
65
+ If you use this model, please cite:
66
+
67
+ ```
68
+ @misc{setu2024,
69
+ title={SETU: Script-agnostic English Translation Unifier},
70
+ author={Santosh Dahal},
71
+ year={2024}
72
+ }
73
+ ```
__init__.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
SETU Translation Model for Hugging Face Transformers

This package provides the SETU (Script-agnostic English Translation Unifier) model
for translating multiscript, multilingual, and informal text into clean, formal English.

Usage:
    from transformers import AutoModel

    # Load the model
    model = AutoModel.from_pretrained("santoshdahal/setu", trust_remote_code=True)

    # Translate text
    result = model("mero name santosh ho")
    print("Translation:", result)
"""

from transformers import AutoConfig, AutoModel

# Support both import styles: relative imports when this directory is loaded
# as a package (e.g. via trust_remote_code), absolute imports when the files
# are used from a flat directory / sys.path.
try:
    from .configuration_setu_translation import SetuTranslationConfig
    from .modeling_setu_translation import SetuTranslationModel
except ImportError:
    from configuration_setu_translation import SetuTranslationConfig
    from modeling_setu_translation import SetuTranslationModel

# Register the model configuration and model class with the Auto* factories
# so that AutoConfig / AutoModel can resolve model_type "setu_translation".
AutoConfig.register("setu_translation", SetuTranslationConfig)
AutoModel.register(SetuTranslationConfig, SetuTranslationModel)

# Public API of this package.
__all__ = [
    "SetuTranslationConfig",
    "SetuTranslationModel",
]
__pycache__/configuration_setu_translation.cpython-310.pyc ADDED
Binary file (2.05 kB). View file
 
__pycache__/modeling_setu_translation.cpython-310.pyc ADDED
Binary file (7.33 kB). View file
 
assets/decoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b07150550ea258faac1ea62095ce63da348fd37a4ed560a274b6cb134ce649a
3
+ size 242959762
assets/encoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bf8f37df0f8f066023cc41b7c65d9e8a4dd82badeb3d0f3a7d6abe2e4587dfd
3
+ size 135159477
assets/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f398c42275e7925df4ac1d7a0b59c7cb2629e899ee2a24e86f323261504b321
3
+ size 790826829
assets/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d26da1faa7fa9c8b8b30f1ea44da83939be6656e7c077f63ab271d34abe877b
3
+ size 948113
assets/spm.vocab ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "setu_translation",
3
+ "architectures": ["SetuTranslationModel"],
4
+ "auto_map": {
5
+ "AutoConfig": "configuration_setu_translation.SetuTranslationConfig",
6
+ "AutoModel": "modeling_setu_translation.SetuTranslationModel"
7
+ },
8
+ "model_name": "SETU",
9
+ "full_name": "Script-agnostic English Translation Unifier",
10
+ "description": "A neural translation model that unifies multiscript, multilingual, and informal text into clean, formal English",
11
+ "version": "1.0.0",
12
+ "architecture": "transformer_iwslt_de_en",
13
+ "src_vocab_size": 40253,
14
+ "tgt_vocab_size": 40253,
15
+ "bos_idx": 0,
16
+ "eos_idx": 2,
17
+ "pad_idx": 1,
18
+ "unk_idx": 3,
19
+ "beam_size": 5,
20
+ "max_len": 200,
21
+ "len_penalty": 1.0,
22
+ "capabilities": [
23
+ "Romanized Nepali to English",
24
+ "Devanagari Nepali to English",
25
+ "Code-mixed text to English",
26
+ "Informal/slang to formal English"
27
+ ]
28
+ }
configuration_setu_translation.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
from typing import List, Optional

from transformers import PreTrainedModel, PretrainedConfig

class SetuTranslationConfig(PretrainedConfig):
    """Configuration class for the SETU Translation model.

    Handles the configuration for the SETU (Script-agnostic English
    Translation Unifier) model, which translates multiscript, multilingual,
    and informal text into clean, formal English.

    Args:
        model_name: Short display name of the model.
        full_name: Human-readable full model name.
        description: One-line description of the model.
        version: Model version string.
        architecture: Name of the underlying fairseq architecture preset.
        src_vocab_size: Source-side vocabulary size.
        tgt_vocab_size: Target-side vocabulary size.
        bos_idx: Beginning-of-sequence token id.
        eos_idx: End-of-sequence token id.
        pad_idx: Padding token id.
        unk_idx: Unknown token id.
        beam_size: Beam width used during beam-search decoding.
        max_len: Maximum number of decoding steps.
        len_penalty: Length-penalty exponent applied to finished hypotheses.
        capabilities: Human-readable capability strings; when None, defaults
            to the four supported translation directions.
        **kwargs: Forwarded to PretrainedConfig.
    """

    model_type = "setu_translation"

    def __init__(
        self,
        model_name: str = "SETU",
        full_name: str = "Script-agnostic English Translation Unifier",
        description: str = "A neural translation model that unifies multiscript, multilingual, and informal text into clean, formal English",
        version: str = "1.0.0",
        architecture: str = "transformer_iwslt_de_en",
        src_vocab_size: int = 40253,
        tgt_vocab_size: int = 40253,
        bos_idx: int = 0,
        eos_idx: int = 2,
        pad_idx: int = 1,
        unk_idx: int = 3,
        beam_size: int = 5,
        max_len: int = 200,
        len_penalty: float = 1.0,
        # Annotation fixed: was `list = None`, which does not admit the
        # None default. None is replaced with the default list below, so the
        # mutable-default-argument trap is avoided.
        capabilities: Optional[List[str]] = None,
        **kwargs
    ):
        super().__init__(**kwargs)

        self.model_name = model_name
        self.full_name = full_name
        self.description = description
        self.version = version
        self.architecture = architecture
        self.src_vocab_size = src_vocab_size
        self.tgt_vocab_size = tgt_vocab_size
        self.bos_idx = bos_idx
        self.eos_idx = eos_idx
        self.pad_idx = pad_idx
        self.unk_idx = unk_idx
        self.beam_size = beam_size
        self.max_len = max_len
        self.len_penalty = len_penalty

        # Default capability list (built per call, never shared).
        if capabilities is None:
            capabilities = [
                "Romanized Nepali to English",
                "Devanagari Nepali to English",
                "Code-mixed text to English",
                "Informal/slang to formal English"
            ]
        self.capabilities = capabilities
model_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "SETU",
3
+ "full_name": "Script-agnostic English Translation Unifier",
4
+ "description": "A neural translation model that unifies multiscript, multilingual, and informal text into clean, formal English",
5
+ "version": "1.0.0",
6
+ "architecture": "transformer_iwslt_de_en",
7
+ "src_vocab_size": 40253,
8
+ "tgt_vocab_size": 40253,
9
+ "bos_idx": 0,
10
+ "eos_idx": 2,
11
+ "pad_idx": 1,
12
+ "unk_idx": 3,
13
+ "capabilities": [
14
+ "Romanized Nepali to English",
15
+ "Devanagari Nepali to English",
16
+ "Code-mixed text to English",
17
+ "Informal/slang to formal English"
18
+ ]
19
+ }
modeling_setu_translation.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PreTrainedModel, AutoConfig, AutoModel
2
+ try:
3
+ from .configuration_setu_translation import SetuTranslationConfig
4
+ except ImportError:
5
+ from configuration_setu_translation import SetuTranslationConfig
6
+ import torch
7
+ import os
8
+ import numpy as np
9
+ import json
10
+ import onnxruntime as ort
11
+ import sentencepiece as spm
12
+ from typing import List, Tuple
13
+ from huggingface_hub import snapshot_download
14
+
15
+
class SetuTranslationModel(PreTrainedModel):
    """SETU Translation Model for Hugging Face Hub.

    Performs script-agnostic translation of multiscript, multilingual, and
    informal text to unified English output.

    Inference pipeline (CPU only):
      1. SentencePiece encodes the input string to token ids.
      2. An ONNX encoder session produces encoder states.
      3. A Python-level beam search drives an ONNX decoder session step by step.
      4. SentencePiece decodes the best hypothesis back to text.

    The ONNX sessions and the tokenizer are loaded from the ``assets/``
    subfolder of the model directory. Components whose files are missing stay
    ``None`` and the corresponding methods raise ``ValueError`` when called.
    """

    config_class = SetuTranslationConfig

    def __init__(self, config):
        super().__init__(config)

        self.config = config

        # Populated by _load_model_components(); each stays None when the
        # corresponding asset file does not exist on disk.
        self.encoder_session = None
        self.decoder_session = None
        self.sp = None

        # Load model files if they exist.
        self._load_model_components()

    def _load_model_components(self):
        """Load the ONNX encoder/decoder sessions and the SentencePiece model.

        Missing files are silently skipped: the attribute stays None and the
        method that needs it raises later.
        """
        # from_pretrained() stores the resolved (possibly downloaded) model
        # directory in config._name_or_path; fall back to the CWD otherwise.
        model_dir = getattr(self.config, '_name_or_path', '.')

        # Paths to model files in the assets folder.
        assets_dir = os.path.join(model_dir, 'assets')
        encoder_path = os.path.join(assets_dir, 'encoder.onnx')
        decoder_path = os.path.join(assets_dir, 'decoder.onnx')
        smp_path = os.path.join(assets_dir, 'spm.model')

        # Load ONNX models (CPU execution provider only).
        if os.path.exists(encoder_path):
            self.encoder_session = ort.InferenceSession(
                encoder_path,
                providers=['CPUExecutionProvider']
            )

        if os.path.exists(decoder_path):
            self.decoder_session = ort.InferenceSession(
                decoder_path,
                providers=['CPUExecutionProvider']
            )

        # Load the SentencePiece tokenizer model.
        if os.path.exists(smp_path):
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(smp_path)

    def encode_text(self, text: str) -> np.ndarray:
        """Encode *text* to a 1-D int64 array of token ids, EOS appended.

        Raises:
            ValueError: if the SentencePiece model was not loaded.
        """
        if self.sp is None:
            raise ValueError("SentencePiece model not loaded")

        # Encode using SentencePiece.
        tokens = self.sp.EncodeAsIds(text)

        # Terminate the source sequence with EOS.
        tokens = tokens + [self.config.eos_idx]

        return np.array(tokens, dtype=np.int64)

    def decode_tokens(self, tokens: List[int]) -> str:
        """Decode token ids to text, dropping BOS/EOS/PAD special tokens.

        Raises:
            ValueError: if the SentencePiece model was not loaded.
        """
        if self.sp is None:
            raise ValueError("SentencePiece model not loaded")

        # Strip special tokens before detokenization.
        tokens = [t for t in tokens if t not in [self.config.bos_idx, self.config.eos_idx, self.config.pad_idx]]

        # Decode using SentencePiece.
        text = self.sp.DecodeIds(tokens)

        return text.strip()

    def encode_source(self, src_tokens: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Run the ONNX encoder on a 1-D source-token array.

        Returns:
            (encoder_out, encoder_padding_mask); the mask is None when the
            exported encoder graph produces a single output.

        Raises:
            ValueError: if the encoder session was not loaded.
        """
        if self.encoder_session is None:
            raise ValueError("Encoder model not loaded")

        # Add the batch dimension: [1, src_len].
        src_tokens_batch = src_tokens.reshape(1, -1)
        src_lengths = np.array([len(src_tokens)], dtype=np.int64)

        # Some exports take only src_tokens, others also src_lengths; feed
        # src_lengths only when the graph declares that input.
        encoder_inputs = [inp.name for inp in self.encoder_session.get_inputs()]

        input_dict = {'src_tokens': src_tokens_batch}
        if 'src_lengths' in encoder_inputs:
            input_dict['src_lengths'] = src_lengths

        # Run encoder.
        outputs = self.encoder_session.run(None, input_dict)

        # Handle encoder outputs.
        encoder_out = outputs[0]
        encoder_padding_mask = outputs[1] if len(outputs) > 1 else None

        return encoder_out, encoder_padding_mask

    def decode_step(self, prev_tokens, encoder_out, encoder_padding_mask):
        """Run the decoder for one step and return the logits.

        Args:
            prev_tokens: Token-id prefix — either a list (wrapped to a
                [1, seq_len] int64 array here) or an already-batched array.
            encoder_out: Encoder states from encode_source().
            encoder_padding_mask: Padding mask from encode_source().

        Returns:
            The first decoder output (logits).

        Raises:
            ValueError: if the decoder session was not loaded.
            RuntimeError: if the ONNX decoder run fails (original error chained).
        """
        if self.decoder_session is None:
            raise ValueError("Decoder model not loaded")

        # Prepare inputs — check if already a numpy array.
        if isinstance(prev_tokens, np.ndarray):
            prev_tokens_np = prev_tokens  # Already formatted correctly
        else:
            prev_tokens_np = np.array([prev_tokens], dtype=np.int64)  # [1, seq_len]

        try:
            # Run decoder.
            outputs = self.decoder_session.run(
                None,  # Get all outputs
                {
                    'prev_output_tokens': prev_tokens_np,
                    'encoder_out': encoder_out,
                    'encoder_padding_mask': encoder_padding_mask
                }
            )

            # Return logits (first output).
            return outputs[0]

        except Exception as e:
            # Chain the underlying ONNX Runtime error (was `raise` without
            # `from e`, which discarded the original traceback context).
            raise RuntimeError(f"Decoder step failed: {e}") from e

    def beam_search_translate(self, src_tokens: np.ndarray) -> List[int]:
        """Translate encoded source tokens using beam search.

        Returns:
            Token-id sequence (including BOS and, usually, EOS) of the best
            hypothesis found within config.max_len decoding steps.
        """
        # Encode source once; states are reused for every decoder step.
        encoder_out, encoder_padding_mask = self.encode_source(src_tokens)

        beam_size = self.config.beam_size
        max_len = self.config.max_len
        len_penalty = self.config.len_penalty

        # Each beam entry is (token-id list, cumulative log-prob score).
        beams = [([self.config.bos_idx], 0.0)]

        for step in range(max_len):
            candidates = []

            for tokens, score in beams:
                # Finished hypotheses are carried over unchanged.
                if tokens[-1] == self.config.eos_idx:
                    candidates.append((tokens, score))
                    continue

                # Get next-token logits for this prefix.
                logits = self.decode_step(tokens, encoder_out, encoder_padding_mask)

                # Softmax over the vocabulary at the last position.
                probs = torch.softmax(torch.from_numpy(logits[0, -1, :]), dim=-1)

                # Expand each beam with its beam_size best continuations.
                top_probs, top_indices = torch.topk(probs, beam_size)

                for prob, idx in zip(top_probs, top_indices):
                    new_tokens = tokens + [idx.item()]
                    new_score = score + torch.log(prob).item()

                    # Length-normalize a hypothesis once, at the step where it
                    # emits EOS. NOTE(review): normalized (finished) and raw
                    # (unfinished) scores are then compared in the same sort,
                    # which mixes scales — confirm against fairseq's scorer.
                    if new_tokens[-1] == self.config.eos_idx:
                        new_score = new_score / (len(new_tokens) ** len_penalty)

                    candidates.append((new_tokens, new_score))

            # Prune to the top beam_size hypotheses.
            candidates.sort(key=lambda x: x[1], reverse=True)
            beams = candidates[:beam_size]

            # Stop early once every surviving beam has emitted EOS.
            if all(tokens[-1] == self.config.eos_idx for tokens, _ in beams):
                break

        # Return the best-scoring hypothesis.
        best_tokens, _ = max(beams, key=lambda x: x[1])
        return best_tokens

    def translate(self, text: str) -> str:
        """Translate input text to English.

        Args:
            text: Input text in any supported script/language.

        Returns:
            Translated English text.
        """
        # Encode input text.
        src_tokens = self.encode_text(text)

        # Perform beam search translation.
        output_tokens = self.beam_search_translate(src_tokens)

        # Decode output tokens.
        translated_text = self.decode_tokens(output_tokens)

        return translated_text

    def forward(self, text: str) -> str:
        """Forward pass — alias for translate() for simple usage."""
        return self.translate(text)

    def __call__(self, text: str) -> str:
        """Make the model callable: enables model("text") usage.

        NOTE(review): overriding __call__ bypasses nn.Module's hook
        machinery (forward pre/post hooks will not fire) — confirm this is
        intentional.
        """
        return self.translate(text)

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        *,
                        force_download=False,
                        resume_download=None,
                        proxies=None,
                        token=None,
                        cache_dir=None,
                        local_files_only=False,
                        revision=None,
                        **kwargs):
        """Load the model from the Hugging Face Hub or a local directory.

        Downloads the full repository snapshot (the ONNX assets are large)
        when given a hub repo id, builds a SetuTranslationConfig from
        config.json (preferred) or model_config.json, then instantiates the
        model, which loads its ONNX/SentencePiece assets from the resolved
        directory.

        NOTE(review): extra **kwargs only reach the config on the
        model_config.json / defaults fallback paths — they are ignored when
        config.json exists; confirm that is intended.
        """
        # Download the snapshot when given a hub repo id rather than a path.
        if not os.path.isdir(pretrained_model_name_or_path):
            model_dir = snapshot_download(
                repo_id=pretrained_model_name_or_path,
                token=token,
                cache_dir=cache_dir,
                force_download=force_download,
                resume_download=resume_download,
                proxies=proxies,
                local_files_only=local_files_only,
                revision=revision
            )
        else:
            model_dir = pretrained_model_name_or_path

        # Prefer config.json; fall back to model_config.json, then defaults.
        config_path = os.path.join(model_dir, 'config.json')
        if os.path.exists(config_path):
            config = SetuTranslationConfig.from_json_file(config_path)
        else:
            # Load from model_config.json if config.json doesn't exist.
            model_config_path = os.path.join(model_dir, 'model_config.json')
            if os.path.exists(model_config_path):
                with open(model_config_path, 'r') as f:
                    model_config = json.load(f)
                config = SetuTranslationConfig(**model_config, **kwargs)
            else:
                config = SetuTranslationConfig(**kwargs)

        # Record where the assets live; __init__ reads this via
        # _load_model_components().
        config._name_or_path = model_dir

        # Create model instance.
        model = cls(config)

        return model
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ transformers>=4.20.0
2
+ torch>=1.10.0
3
+ onnxruntime>=1.12.0
4
+ sentencepiece>=0.1.90
5
+ huggingface-hub>=0.10.0
6
+ numpy>=1.21.0