santoshdahal commited on
Commit
785f55b
·
verified ·
1 Parent(s): 2828220

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SETU - Script-agnostic English Translation Unifier
2
+
3
+ SETU is a neural translation model that unifies multiscript, multilingual, and informal text into clean, formal English.
4
+
5
+ ## Model Description
6
+
7
+ The SETU model can handle:
8
+ - Romanized Nepali to English translation
9
+ - Devanagari Nepali to English translation
10
+ - Code-mixed text to English translation
11
+ - Informal/slang to formal English translation
12
+
13
+ ## Usage
14
+
15
+ ```python
16
+ from transformers import AutoModel
17
+
18
+ # Load the model
19
+ model = AutoModel.from_pretrained("santoshdahal/setu", trust_remote_code=True)
20
+
21
+ # Translate text
22
+ result = model("mero name santosh ho")
23
+ print("Translation:", result)
24
+ # Output: "My name is Santosh."
25
+
26
+ # Works with Devanagari script too
27
+ result = model("मेरो नाम सन्तोष हो")
28
+ print("Translation:", result)
29
+ # Output: "My name is Santosh."
30
+
31
+ # Handles informal text
32
+ result = model("bro i gonna go ktm")
33
+ print("Translation:", result)
34
+ # Output: "I am going to Kathmandu."
35
+ ```
36
+
37
+ ## Model Details
38
+
39
+ - **Model Type**: Neural Machine Translation
40
+ - **Architecture**: Transformer (based on fairseq transformer_iwslt_de_en)
41
+ - **Vocabulary Size**: 40,253 tokens
42
+ - **Languages Supported**: Nepali (Romanized & Devanagari), English, Code-mixed text
43
+ - **Model Format**: ONNX for efficient inference
44
+
45
+ ## Technical Implementation
46
+
47
+ The model uses:
48
+ - ONNX Runtime for efficient inference
49
+ - SentencePiece for tokenization
50
+ - Beam search decoding with configurable beam size
51
+ - Separate encoder and decoder ONNX models
52
+
53
+ ## Files Included
54
+
55
+ - `encoder.onnx`: ONNX encoder model
56
+ - `decoder.onnx`: ONNX decoder model
57
+ - `spm.model`: SentencePiece tokenizer model
58
+ - `spm.vocab`: SentencePiece vocabulary
59
+ - `config.json`: Model configuration
60
+ - `modeling_setu_translation.py`: Model implementation
61
+ - `configuration_setu_translation.py`: Configuration class
62
+
63
+ ## Citation
64
+
65
+ If you use this model, please cite:
66
+
67
+ ```
68
+ @misc{setu2024,
69
+ title={SETU: Script-agnostic English Translation Unifier},
70
+ author={Santosh Dahal},
71
+ year={2024}
72
+ }
73
+ ```
__init__.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
SETU Translation Model for Hugging Face Transformers

This package provides the SETU (Script-agnostic English Translation Unifier) model
for translating multiscript, multilingual, and informal text into clean, formal English.

Usage:
    from transformers import AutoModel

    # Load the model
    model = AutoModel.from_pretrained("santoshdahal/setu", trust_remote_code=True)

    # Translate text
    result = model("mero name santosh ho")
    print("Translation:", result)
"""

from transformers import AutoConfig, AutoModel

# Support both import styles: relative imports when this directory is loaded
# as a package (e.g. via trust_remote_code), absolute imports when the files
# are used from a flat directory / sys.path.
try:
    from .configuration_setu_translation import SetuTranslationConfig
    from .modeling_setu_translation import SetuTranslationModel
except ImportError:
    from configuration_setu_translation import SetuTranslationConfig
    from modeling_setu_translation import SetuTranslationModel

# Register the model configuration and model class with the Auto* factories
# so that AutoConfig / AutoModel can resolve model_type "setu_translation".
AutoConfig.register("setu_translation", SetuTranslationConfig)
AutoModel.register(SetuTranslationConfig, SetuTranslationModel)

# Public API of this package.
__all__ = [
    "SetuTranslationConfig",
    "SetuTranslationModel",
]
__pycache__/configuration_setu_translation.cpython-310.pyc ADDED
Binary file (2.05 kB). View file
 
__pycache__/modeling_setu_translation.cpython-310.pyc ADDED
Binary file (7.33 kB). View file
 
assets/decoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b07150550ea258faac1ea62095ce63da348fd37a4ed560a274b6cb134ce649a
3
+ size 242959762
assets/encoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bf8f37df0f8f066023cc41b7c65d9e8a4dd82badeb3d0f3a7d6abe2e4587dfd
3
+ size 135159477
assets/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f398c42275e7925df4ac1d7a0b59c7cb2629e899ee2a24e86f323261504b321
3
+ size 790826829
assets/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d26da1faa7fa9c8b8b30f1ea44da83939be6656e7c077f63ab271d34abe877b
3
+ size 948113
assets/spm.vocab ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "setu_translation",
3
+ "architectures": ["SetuTranslationModel"],
4
+ "auto_map": {
5
+ "AutoConfig": "configuration_setu_translation.SetuTranslationConfig",
6
+ "AutoModel": "modeling_setu_translation.SetuTranslationModel"
7
+ },
8
+ "model_name": "SETU",
9
+ "full_name": "Script-agnostic English Translation Unifier",
10
+ "description": "A neural translation model that unifies multiscript, multilingual, and informal text into clean, formal English",
11
+ "version": "1.0.0",
12
+ "architecture": "transformer_iwslt_de_en",
13
+ "src_vocab_size": 40253,
14
+ "tgt_vocab_size": 40253,
15
+ "bos_idx": 0,
16
+ "eos_idx": 2,
17
+ "pad_idx": 1,
18
+ "unk_idx": 3,
19
+ "beam_size": 5,
20
+ "max_len": 200,
21
+ "len_penalty": 1.0,
22
+ "capabilities": [
23
+ "Romanized Nepali to English",
24
+ "Devanagari Nepali to English",
25
+ "Code-mixed text to English",
26
+ "Informal/slang to formal English"
27
+ ]
28
+ }
configuration_setu_translation.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
from typing import List, Optional

from transformers import PreTrainedModel, PretrainedConfig

class SetuTranslationConfig(PretrainedConfig):
    """Configuration class for the SETU Translation model.

    Handles the configuration for the SETU (Script-agnostic English
    Translation Unifier) model, which translates multiscript, multilingual,
    and informal text into clean, formal English.

    Args:
        model_name: Short display name of the model.
        full_name: Human-readable full model name.
        description: One-line description of the model.
        version: Model version string.
        architecture: Name of the underlying fairseq architecture preset.
        src_vocab_size: Source-side vocabulary size.
        tgt_vocab_size: Target-side vocabulary size.
        bos_idx: Beginning-of-sequence token id.
        eos_idx: End-of-sequence token id.
        pad_idx: Padding token id.
        unk_idx: Unknown token id.
        beam_size: Beam width used during beam-search decoding.
        max_len: Maximum number of decoding steps.
        len_penalty: Length-penalty exponent applied to finished hypotheses.
        capabilities: Human-readable capability strings; when None, defaults
            to the four supported translation directions.
        **kwargs: Forwarded to PretrainedConfig.
    """

    model_type = "setu_translation"

    def __init__(
        self,
        model_name: str = "SETU",
        full_name: str = "Script-agnostic English Translation Unifier",
        description: str = "A neural translation model that unifies multiscript, multilingual, and informal text into clean, formal English",
        version: str = "1.0.0",
        architecture: str = "transformer_iwslt_de_en",
        src_vocab_size: int = 40253,
        tgt_vocab_size: int = 40253,
        bos_idx: int = 0,
        eos_idx: int = 2,
        pad_idx: int = 1,
        unk_idx: int = 3,
        beam_size: int = 5,
        max_len: int = 200,
        len_penalty: float = 1.0,
        # Annotation fixed: was `list = None`, which does not admit the
        # None default. None is replaced with the default list below, so the
        # mutable-default-argument trap is avoided.
        capabilities: Optional[List[str]] = None,
        **kwargs
    ):
        super().__init__(**kwargs)

        self.model_name = model_name
        self.full_name = full_name
        self.description = description
        self.version = version
        self.architecture = architecture
        self.src_vocab_size = src_vocab_size
        self.tgt_vocab_size = tgt_vocab_size
        self.bos_idx = bos_idx
        self.eos_idx = eos_idx
        self.pad_idx = pad_idx
        self.unk_idx = unk_idx
        self.beam_size = beam_size
        self.max_len = max_len
        self.len_penalty = len_penalty

        # Default capability list (built per call, never shared).
        if capabilities is None:
            capabilities = [
                "Romanized Nepali to English",
                "Devanagari Nepali to English",
                "Code-mixed text to English",
                "Informal/slang to formal English"
            ]
        self.capabilities = capabilities
model_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "SETU",
3
+ "full_name": "Script-agnostic English Translation Unifier",
4
+ "description": "A neural translation model that unifies multiscript, multilingual, and informal text into clean, formal English",
5
+ "version": "1.0.0",
6
+ "architecture": "transformer_iwslt_de_en",
7
+ "src_vocab_size": 40253,
8
+ "tgt_vocab_size": 40253,
9
+ "bos_idx": 0,
10
+ "eos_idx": 2,
11
+ "pad_idx": 1,
12
+ "unk_idx": 3,
13
+ "capabilities": [
14
+ "Romanized Nepali to English",
15
+ "Devanagari Nepali to English",
16
+ "Code-mixed text to English",
17
+ "Informal/slang to formal English"
18
+ ]
19
+ }
modeling_setu_translation.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PreTrainedModel, AutoConfig, AutoModel
2
+ try:
3
+ from .configuration_setu_translation import SetuTranslationConfig
4
+ except ImportError:
5
+ from configuration_setu_translation import SetuTranslationConfig
6
+ import torch
7
+ import os
8
+ import numpy as np
9
+ import json
10
+ import onnxruntime as ort
11
+ import sentencepiece as spm
12
+ from typing import List, Tuple
13
+ from huggingface_hub import snapshot_download
14
+
15
+
class SetuTranslationModel(PreTrainedModel):
    """SETU Translation Model for Hugging Face Hub.

    Performs script-agnostic translation of multiscript, multilingual, and
    informal text to unified English output.

    Inference pipeline (CPU only):
      1. SentencePiece encodes the input string to token ids.
      2. An ONNX encoder session produces encoder states.
      3. A Python-level beam search drives an ONNX decoder session step by step.
      4. SentencePiece decodes the best hypothesis back to text.

    The ONNX sessions and the tokenizer are loaded from the ``assets/``
    subfolder of the model directory. Components whose files are missing stay
    ``None`` and the corresponding methods raise ``ValueError`` when called.
    """

    config_class = SetuTranslationConfig

    def __init__(self, config):
        super().__init__(config)

        self.config = config

        # Populated by _load_model_components(); each stays None when the
        # corresponding asset file does not exist on disk.
        self.encoder_session = None
        self.decoder_session = None
        self.sp = None

        # Load model files if they exist.
        self._load_model_components()

    def _load_model_components(self):
        """Load the ONNX encoder/decoder sessions and the SentencePiece model.

        Missing files are silently skipped: the attribute stays None and the
        method that needs it raises later.
        """
        # from_pretrained() stores the resolved (possibly downloaded) model
        # directory in config._name_or_path; fall back to the CWD otherwise.
        model_dir = getattr(self.config, '_name_or_path', '.')

        # Paths to model files in the assets folder.
        assets_dir = os.path.join(model_dir, 'assets')
        encoder_path = os.path.join(assets_dir, 'encoder.onnx')
        decoder_path = os.path.join(assets_dir, 'decoder.onnx')
        smp_path = os.path.join(assets_dir, 'spm.model')

        # Load ONNX models (CPU execution provider only).
        if os.path.exists(encoder_path):
            self.encoder_session = ort.InferenceSession(
                encoder_path,
                providers=['CPUExecutionProvider']
            )

        if os.path.exists(decoder_path):
            self.decoder_session = ort.InferenceSession(
                decoder_path,
                providers=['CPUExecutionProvider']
            )

        # Load the SentencePiece tokenizer model.
        if os.path.exists(smp_path):
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(smp_path)

    def encode_text(self, text: str) -> np.ndarray:
        """Encode *text* to a 1-D int64 array of token ids, EOS appended.

        Raises:
            ValueError: if the SentencePiece model was not loaded.
        """
        if self.sp is None:
            raise ValueError("SentencePiece model not loaded")

        # Encode using SentencePiece.
        tokens = self.sp.EncodeAsIds(text)

        # Terminate the source sequence with EOS.
        tokens = tokens + [self.config.eos_idx]

        return np.array(tokens, dtype=np.int64)

    def decode_tokens(self, tokens: List[int]) -> str:
        """Decode token ids to text, dropping BOS/EOS/PAD special tokens.

        Raises:
            ValueError: if the SentencePiece model was not loaded.
        """
        if self.sp is None:
            raise ValueError("SentencePiece model not loaded")

        # Strip special tokens before detokenization.
        tokens = [t for t in tokens if t not in [self.config.bos_idx, self.config.eos_idx, self.config.pad_idx]]

        # Decode using SentencePiece.
        text = self.sp.DecodeIds(tokens)

        return text.strip()

    def encode_source(self, src_tokens: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Run the ONNX encoder on a 1-D source-token array.

        Returns:
            (encoder_out, encoder_padding_mask); the mask is None when the
            exported encoder graph produces a single output.

        Raises:
            ValueError: if the encoder session was not loaded.
        """
        if self.encoder_session is None:
            raise ValueError("Encoder model not loaded")

        # Add the batch dimension: [1, src_len].
        src_tokens_batch = src_tokens.reshape(1, -1)
        src_lengths = np.array([len(src_tokens)], dtype=np.int64)

        # Some exports take only src_tokens, others also src_lengths; feed
        # src_lengths only when the graph declares that input.
        encoder_inputs = [inp.name for inp in self.encoder_session.get_inputs()]

        input_dict = {'src_tokens': src_tokens_batch}
        if 'src_lengths' in encoder_inputs:
            input_dict['src_lengths'] = src_lengths

        # Run encoder.
        outputs = self.encoder_session.run(None, input_dict)

        # Handle encoder outputs.
        encoder_out = outputs[0]
        encoder_padding_mask = outputs[1] if len(outputs) > 1 else None

        return encoder_out, encoder_padding_mask

    def decode_step(self, prev_tokens, encoder_out, encoder_padding_mask):
        """Run the decoder for one step and return the logits.

        Args:
            prev_tokens: Token-id prefix — either a list (wrapped to a
                [1, seq_len] int64 array here) or an already-batched array.
            encoder_out: Encoder states from encode_source().
            encoder_padding_mask: Padding mask from encode_source().

        Returns:
            The first decoder output (logits).

        Raises:
            ValueError: if the decoder session was not loaded.
            RuntimeError: if the ONNX decoder run fails (original error chained).
        """
        if self.decoder_session is None:
            raise ValueError("Decoder model not loaded")

        # Prepare inputs — check if already a numpy array.
        if isinstance(prev_tokens, np.ndarray):
            prev_tokens_np = prev_tokens  # Already formatted correctly
        else:
            prev_tokens_np = np.array([prev_tokens], dtype=np.int64)  # [1, seq_len]

        try:
            # Run decoder.
            outputs = self.decoder_session.run(
                None,  # Get all outputs
                {
                    'prev_output_tokens': prev_tokens_np,
                    'encoder_out': encoder_out,
                    'encoder_padding_mask': encoder_padding_mask
                }
            )

            # Return logits (first output).
            return outputs[0]

        except Exception as e:
            # Chain the underlying ONNX Runtime error (was `raise` without
            # `from e`, which discarded the original traceback context).
            raise RuntimeError(f"Decoder step failed: {e}") from e

    def beam_search_translate(self, src_tokens: np.ndarray) -> List[int]:
        """Translate encoded source tokens using beam search.

        Returns:
            Token-id sequence (including BOS and, usually, EOS) of the best
            hypothesis found within config.max_len decoding steps.
        """
        # Encode source once; states are reused for every decoder step.
        encoder_out, encoder_padding_mask = self.encode_source(src_tokens)

        beam_size = self.config.beam_size
        max_len = self.config.max_len
        len_penalty = self.config.len_penalty

        # Each beam entry is (token-id list, cumulative log-prob score).
        beams = [([self.config.bos_idx], 0.0)]

        for step in range(max_len):
            candidates = []

            for tokens, score in beams:
                # Finished hypotheses are carried over unchanged.
                if tokens[-1] == self.config.eos_idx:
                    candidates.append((tokens, score))
                    continue

                # Get next-token logits for this prefix.
                logits = self.decode_step(tokens, encoder_out, encoder_padding_mask)

                # Softmax over the vocabulary at the last position.
                probs = torch.softmax(torch.from_numpy(logits[0, -1, :]), dim=-1)

                # Expand each beam with its beam_size best continuations.
                top_probs, top_indices = torch.topk(probs, beam_size)

                for prob, idx in zip(top_probs, top_indices):
                    new_tokens = tokens + [idx.item()]
                    new_score = score + torch.log(prob).item()

                    # Length-normalize a hypothesis once, at the step where it
                    # emits EOS. NOTE(review): normalized (finished) and raw
                    # (unfinished) scores are then compared in the same sort,
                    # which mixes scales — confirm against fairseq's scorer.
                    if new_tokens[-1] == self.config.eos_idx:
                        new_score = new_score / (len(new_tokens) ** len_penalty)

                    candidates.append((new_tokens, new_score))

            # Prune to the top beam_size hypotheses.
            candidates.sort(key=lambda x: x[1], reverse=True)
            beams = candidates[:beam_size]

            # Stop early once every surviving beam has emitted EOS.
            if all(tokens[-1] == self.config.eos_idx for tokens, _ in beams):
                break

        # Return the best-scoring hypothesis.
        best_tokens, _ = max(beams, key=lambda x: x[1])
        return best_tokens

    def translate(self, text: str) -> str:
        """Translate input text to English.

        Args:
            text: Input text in any supported script/language.

        Returns:
            Translated English text.
        """
        # Encode input text.
        src_tokens = self.encode_text(text)

        # Perform beam search translation.
        output_tokens = self.beam_search_translate(src_tokens)

        # Decode output tokens.
        translated_text = self.decode_tokens(output_tokens)

        return translated_text

    def forward(self, text: str) -> str:
        """Forward pass — alias for translate() for simple usage."""
        return self.translate(text)

    def __call__(self, text: str) -> str:
        """Make the model callable: enables model("text") usage.

        NOTE(review): overriding __call__ bypasses nn.Module's hook
        machinery (forward pre/post hooks will not fire) — confirm this is
        intentional.
        """
        return self.translate(text)

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        *,
                        force_download=False,
                        resume_download=None,
                        proxies=None,
                        token=None,
                        cache_dir=None,
                        local_files_only=False,
                        revision=None,
                        **kwargs):
        """Load the model from the Hugging Face Hub or a local directory.

        Downloads the full repository snapshot (the ONNX assets are large)
        when given a hub repo id, builds a SetuTranslationConfig from
        config.json (preferred) or model_config.json, then instantiates the
        model, which loads its ONNX/SentencePiece assets from the resolved
        directory.

        NOTE(review): extra **kwargs only reach the config on the
        model_config.json / defaults fallback paths — they are ignored when
        config.json exists; confirm that is intended.
        """
        # Download the snapshot when given a hub repo id rather than a path.
        if not os.path.isdir(pretrained_model_name_or_path):
            model_dir = snapshot_download(
                repo_id=pretrained_model_name_or_path,
                token=token,
                cache_dir=cache_dir,
                force_download=force_download,
                resume_download=resume_download,
                proxies=proxies,
                local_files_only=local_files_only,
                revision=revision
            )
        else:
            model_dir = pretrained_model_name_or_path

        # Prefer config.json; fall back to model_config.json, then defaults.
        config_path = os.path.join(model_dir, 'config.json')
        if os.path.exists(config_path):
            config = SetuTranslationConfig.from_json_file(config_path)
        else:
            # Load from model_config.json if config.json doesn't exist.
            model_config_path = os.path.join(model_dir, 'model_config.json')
            if os.path.exists(model_config_path):
                with open(model_config_path, 'r') as f:
                    model_config = json.load(f)
                config = SetuTranslationConfig(**model_config, **kwargs)
            else:
                config = SetuTranslationConfig(**kwargs)

        # Record where the assets live; __init__ reads this via
        # _load_model_components().
        config._name_or_path = model_dir

        # Create model instance.
        model = cls(config)

        return model
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ transformers>=4.20.0
2
+ torch>=1.10.0
3
+ onnxruntime>=1.12.0
4
+ sentencepiece>=0.1.90
5
+ huggingface-hub>=0.10.0
6
+ numpy>=1.21.0