Abhishek11k committed on
Commit
724838e
·
verified ·
1 Parent(s): 3aa47c9

Upload 31 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ models/checkpoint-30/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ models/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,71 @@
1
- ---
2
- title: Project
3
- emoji:
4
- colorFrom: purple
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 6.3.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Multilingual Transliteration
3
+ emoji: 🌐
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.8.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # Multilingual Transliteration Model
13
+
14
+ This project implements a multilingual transliteration model (English -> Hindi, Bengali, Tamil) using a fine-tuned mBART-50 model. It focuses on optimization using CTranslate2 for fast inference and provides a Gradio-based web interface.
15
+
16
+ ## Project Structure
17
+ - `src/`: Source code for training, optimization, and deployment.
18
+ - `data/`: Directory for storing datasets (train/test/val).
19
+ - `models/`: Directory for saving trained and optimized models.
20
+ - `requirements.txt`: Python dependencies.
21
+
22
+ ## Setup
23
+
24
+ 1. **Clone the repository:**
25
+ ```bash
26
+ git clone <repo_url>
27
+ cd <repo_name>
28
+ ```
29
+
30
+ 2. **Create a virtual environment (optional but recommended):**
31
+ ```bash
32
+ python -m venv venv
33
+ .\venv\Scripts\activate # Windows
34
+ # source venv/bin/activate # Linux/Mac
35
+ ```
36
+
37
+ 3. **Install dependencies:**
38
+ ```bash
39
+ pip install -r requirements.txt
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ ### 1. Data Preparation
45
+ Generate dummy data for training:
46
+ ```bash
47
+ python src/prepare_data.py
48
+ ```
49
+
50
+ ### 2. Training
51
+ Train the mBART-50 model:
52
+ ```bash
53
+ python src/train.py
54
+ ```
55
+
56
+ ### 3. Optimization
57
+ Optimize the trained model using CTranslate2 and benchmark:
58
+ ```bash
59
+ python src/optimize.py
60
+ ```
61
+
62
+ ### 4. Run Demo
63
+ Launch the Gradio app:
64
+ ```bash
65
+ python app.py
66
+ ```
67
+
68
+ ## Approach
69
+ - **Model:** mBART-50 is used as the base model due to its multilingual capabilities (`app.py` falls back to `facebook/mbart-large-50-many-to-many-mmt` when no local weights are found).
70
+ - **Optimization:** CTranslate2 is used to quantize and optimize the model for faster CPU/GPU inference.
71
+ - **Deployment:** Gradio provides a simple and interactive UI for the model.
__pycache__/app.cpython-313.pyc ADDED
Binary file (4.51 kB). View file
 
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import ctranslate2
3
+ import transformers
4
+ import os
5
+
6
+ MODEL_DIR = "models"
7
+ TOKENIZER_DIR = "models" # Relative path for HF Space compatibility
8
+
9
+ # Check if optimized model exists, else fallback or warn
10
+ if not os.path.exists(MODEL_DIR):
11
+ print("Warning: CT2 Model not found. Please run src/optimize.py")
12
+
13
+ # Load Global resources
14
class _HypothesisResult:
    """Minimal stand-in for a CTranslate2 translation result.

    Exposes ``hypotheses`` as a one-element list holding the generated
    token list, mirroring how the result is read via
    ``results[0].hypotheses[0]``.
    """

    def __init__(self, tokens):
        self.hypotheses = [tokens]


class _TransformersTranslator:
    """Adapter that exposes a Hugging Face seq2seq model via ``translate_batch``.

    It lets the rest of the module call the fallback model with the same
    ``translate_batch(source_tokens, target_prefix=...)`` call it uses for
    the CTranslate2 translator.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def translate_batch(self, source_tokens, target_prefix):
        """Translate a batch of token lists.

        Args:
            source_tokens: List of token-string lists, one per input.
            target_prefix: List of token-string lists; the first token of
                the first entry is used as the target language code.

        Returns:
            A list of result objects, each with a ``hypotheses`` attribute
            containing one list of generated token strings.
        """
        # Drop special tokens when rebuilding the text: the tokenizer call
        # below adds its own special tokens again, so keeping them here
        # would feed the model duplicated language-code/EOS markers.
        texts = [
            self.tokenizer.decode(
                self.tokenizer.convert_tokens_to_ids(tokens),
                skip_special_tokens=True,
            )
            for tokens in source_tokens
        ]
        encoded = self.tokenizer(texts, return_tensors="pt", padding=True)

        # A single forced BOS id is passed to generate() for the whole batch,
        # so the first item's language code applies to every item.
        forced_bos_token_id = self.tokenizer.convert_tokens_to_ids(
            target_prefix[0][0]
        )
        generated = self.model.generate(
            **encoded,
            forced_bos_token_id=forced_bos_token_id,
        )
        return [
            _HypothesisResult(self.tokenizer.convert_ids_to_tokens(ids.tolist()))
            for ids in generated
        ]


def load_model():
    """Load the translator and tokenizer into the module-level globals.

    If ``model.bin`` exists under ``MODEL_DIR``, a CTranslate2 translator
    and the tokenizer stored in ``TOKENIZER_DIR`` are loaded. Otherwise the
    mBART-50 many-to-many checkpoint is loaded through ``transformers`` and
    wrapped so it offers the same ``translate_batch`` call.

    Any exception during loading is reported on stdout and leaves both
    ``translator`` and ``tokenizer`` set to ``None``.
    """
    global translator, tokenizer
    try:
        # 1. Prefer the optimized CTranslate2 model stored locally.
        if os.path.exists(os.path.join(MODEL_DIR, "model.bin")):
            print("Loading CTranslate2 model from local storage...")
            translator = ctranslate2.Translator(MODEL_DIR)
            tokenizer = transformers.MBart50TokenizerFast.from_pretrained(TOKENIZER_DIR)

        # 2. Fallback: load the base checkpoint through transformers.
        else:
            base_model_id = "facebook/mbart-large-50-many-to-many-mmt"
            print(
                "Local weights not found. Downloading fallback model "
                f"from HF Hub ({base_model_id})..."
            )
            tokenizer = transformers.MBart50TokenizerFast.from_pretrained(base_model_id)
            hf_model = transformers.MBartForConditionalGeneration.from_pretrained(
                base_model_id
            )
            translator = _TransformersTranslator(hf_model, tokenizer)
            print("Fallback model loaded successfully.")

    except Exception as e:
        # Top-level boundary: report the failure and leave the app in a
        # "model not loaded" state instead of crashing at import time.
        print(f"Error loading model: {e}")
        translator = None
        tokenizer = None
59
+
60
+ load_model()
61
+ if tokenizer:
62
+ tokenizer.src_lang = "en_XX"
63
+
64
+ LANG_CODES = {
65
+ "Hindi": "hi_IN",
66
+ "Bengali": "bn_IN",
67
+ "Tamil": "ta_IN"
68
+ }
69
+
70
def transliterate(text, target_language):
    """Transliterate ``text`` into the script of ``target_language``.

    Args:
        text: Input text to transliterate.
        target_language: A key of ``LANG_CODES`` (e.g. ``"Hindi"``).

    Returns:
        The decoded output string, or a short human-readable message when
        the input is blank, the model is not loaded, or the language is
        not in ``LANG_CODES``.
    """
    # Reject blank input first so the user gets an accurate message and the
    # model is never called on empty or whitespace-only text.
    if not text or not text.strip():
        return "Please enter some text to transliterate."

    if translator is None or tokenizer is None:
        return "Model not loaded. Check the application logs for details."

    target_code = LANG_CODES.get(target_language)
    if not target_code:
        return "Invalid Language"

    # Tokenize into token strings, the input form translate_batch expects.
    source = tokenizer.convert_ids_to_tokens(tokenizer.encode(text))

    # Translate, steering the decoder with the target language code.
    results = translator.translate_batch(
        [source],
        target_prefix=[[target_code]],
    )

    # Decode the first hypothesis, dropping special tokens.
    target = results[0].hypotheses[0]
    return tokenizer.decode(
        tokenizer.convert_tokens_to_ids(target),
        skip_special_tokens=True,
    )
90
+
91
def create_demo():
    """Build and return the Gradio Blocks interface.

    The layout has an input column (text box, target-language dropdown and
    a button) and an output column, plus clickable examples. Clicking the
    button calls ``transliterate`` with the text and the selected language.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    # Derive the choices from LANG_CODES so the dropdown and the lookup in
    # transliterate() cannot drift apart.
    languages = list(LANG_CODES)

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🌐 Multilingual Transliteration Model")
        gr.Markdown("Transliterate English text to Hindi, Bengali, or Tamil.")

        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Input Text (English/Roman)",
                    placeholder="e.g. Namaste",
                    lines=3,
                )
                target_lang = gr.Dropdown(
                    choices=languages,
                    value="Hindi",
                    label="Target Language",
                )
                btn = gr.Button("🚀 Transliterate", variant="primary")

            with gr.Column():
                output_text = gr.Textbox(label="Transliterated Output", lines=5)

        gr.Examples(
            examples=[
                ["Namaste", "Hindi"],
                ["Kemon achen", "Bengali"],
                ["Vanakkam", "Tamil"],
            ],
            inputs=[input_text, target_lang],
        )

        btn.click(
            fn=transliterate,
            inputs=[input_text, target_lang],
            outputs=output_text,
        )

    return demo


if __name__ == "__main__":
    # Without this the interface is built but never served when the file is
    # run as a script (e.g. `python app.py`).
    create_demo().launch()
data/test.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ source,target,lang
2
+ aap,आप,hi
3
+ hai,है,hi
4
+ namoshkar,নমস্কার,bn
5
+ amar,আমার,bn
6
+ vanakkam,வணக்கம்,ta
7
+ jal,জল,bn
8
+ nadu,நாடு,ta
9
+ amar,আমার,bn
10
+ namaste,नमस्ते,hi
11
+ kar,कर,hi
12
+ jal,জল,bn
13
+ namoshkar,নমস্কার,bn
14
+ nam,নাম,bn
15
+ nam,নাম,bn
16
+ kya,क्या,hi
data/train.csv ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ source,target,lang
2
+ irukkeenga,இருக்கிறீர்கள்,ta
3
+ naam,नाम,hi
4
+ thanni,தண்ணீர்,ta
5
+ aap,आप,hi
6
+ nam,নাম,bn
7
+ naam,नाम,hi
8
+ achen,আছেন,bn
9
+ bharat,भारत,hi
10
+ peyar,பெயர்,ta
11
+ naam,नाम,hi
12
+ bharat,भारत,hi
13
+ kya,क्या,hi
14
+ en,என்,ta
15
+ nadu,நாடு,ta
16
+ eppadi,எப்படி,ta
17
+ amar,আমার,bn
18
+ en,என்,ta
19
+ kemon,কেমন,bn
20
+ achen,আছেন,bn
21
+ achen,আছেন,bn
22
+ ho,हो,hi
23
+ naam,नाम,hi
24
+ ho,हो,hi
25
+ namaste,नमस्ते,hi
26
+ neengal,நீங்கள்,ta
27
+ bangla,বাংলা,bn
28
+ sapadu,சாப்பாடு,ta
29
+ bharat,भारत,hi
30
+ kya,क्या,hi
31
+ achen,আছেন,bn
32
+ thanni,தண்ணீர்,ta
33
+ khabar,খাবার,bn
34
+ kya,क्या,hi
35
+ mera,मेरा,hi
36
+ vanakkam,வணக்கம்,ta
37
+ bangla,বাংলা,bn
38
+ peyar,பெயர்,ta
39
+ thanni,தண்ணீர்,ta
40
+ hai,है,hi
41
+ irukkeenga,இருக்கிறீர்கள்,ta
42
+ neengal,நீங்கள்,ta
43
+ bangla,বাংলা,bn
44
+ vanakkam,வணக்கம்,ta
45
+ namaste,नमस्ते,hi
46
+ mera,मेरा,hi
47
+ kar,कर,hi
48
+ bangla,বাংলা,bn
49
+ aap,आप,hi
50
+ en,என்,ta
51
+ eppadi,எப்படி,ta
52
+ ho,हो,hi
53
+ en,என்,ta
54
+ desh,দেশ,bn
55
+ amar,আমার,bn
56
+ sapadu,சாப்பாடு,ta
57
+ neengal,நீங்கள்,ta
58
+ kya,क्या,hi
59
+ tamil,தமிழ்,ta
60
+ apni,আপনি,bn
61
+ nam,নাম,bn
62
+ bharat,भारत,hi
63
+ tamil,தமிழ்,ta
64
+ neengal,நீங்கள்,ta
65
+ khabar,খাবার,bn
66
+ rahe,रहे,hi
67
+ eppadi,எப்படி,ta
68
+ apni,আপনি,bn
69
+ aap,आप,hi
70
+ jal,জল,bn
71
+ eppadi,எப்படி,ta
72
+ eppadi,எப்படி,ta
73
+ kar,कर,hi
74
+ khabar,খাবার,bn
75
+ nadu,நாடு,ta
76
+ irukkeenga,இருக்கிறீர்கள்,ta
77
+ thanni,தண்ணீர்,ta
78
+ mera,मेरा,hi
79
+ tamil,தமிழ்,ta
80
+ bangla,বাংলা,bn
81
+ peyar,பெயர்,ta
82
+ kemon,কেমন,bn
83
+ tamil,தமிழ்,ta
84
+ sapadu,சாப்பாடு,ta
85
+ kemon,কেমন,bn
86
+ irukkeenga,இருக்கிறீர்கள்,ta
87
+ peyar,பெயர்,ta
88
+ ho,हो,hi
89
+ kar,कर,hi
90
+ bharat,भारत,hi
91
+ desh,দেশ,bn
92
+ khabar,খাবার,bn
93
+ khabar,খাবার,bn
94
+ apni,আপনি,bn
95
+ desh,দেশ,bn
96
+ desh,দেশ,bn
97
+ namoshkar,নমস্কার,bn
98
+ namaste,नमस्ते,hi
99
+ kemon,কেমন,bn
100
+ rahe,रहे,hi
101
+ jal,জল,bn
102
+ rahe,रहे,hi
103
+ rahe,रहे,hi
104
+ thanni,தண்ணீர்,ta
105
+ mera,मेरा,hi
106
+ mera,मेरा,hi
107
+ en,என்,ta
108
+ sapadu,சாப்பாடு,ta
109
+ kemon,কেমন,bn
110
+ kar,कर,hi
111
+ tamil,தமிழ்,ta
112
+ vanakkam,வணக்கம்,ta
113
+ naam,नाम,hi
114
+ desh,দেশ,bn
115
+ namaste,नमस्ते,hi
116
+ nadu,நாடு,ta
117
+ jal,জল,bn
118
+ nadu,நாடு,ta
119
+ aap,आप,hi
120
+ hai,है,hi
121
+ namoshkar,নমস্কার,bn
data/val.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ source,target,lang
2
+ amar,আমার,bn
3
+ apni,আপনি,bn
4
+ sapadu,சாப்பாடு,ta
5
+ neengal,நீங்கள்,ta
6
+ irukkeenga,இருக்கிறீர்கள்,ta
7
+ peyar,பெயர்,ta
8
+ rahe,रहे,hi
9
+ hai,है,hi
10
+ namoshkar,নমস্কার,bn
11
+ nam,নাম,bn
12
+ achen,আছেন,bn
13
+ ho,हो,hi
14
+ hai,है,hi
15
+ apni,আপনি,bn
16
+ vanakkam,வணக்கம்,ta
models/checkpoint-30/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_num_labels": 3,
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "relu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": true,
7
+ "architectures": [
8
+ "MBartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 12,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "dtype": "float32",
22
+ "early_stopping": null,
23
+ "encoder_attention_heads": 16,
24
+ "encoder_ffn_dim": 4096,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 12,
27
+ "eos_token_id": 2,
28
+ "forced_eos_token_id": 2,
29
+ "gradient_checkpointing": false,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1",
33
+ "2": "LABEL_2"
34
+ },
35
+ "init_std": 0.02,
36
+ "is_encoder_decoder": true,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1,
40
+ "LABEL_2": 2
41
+ },
42
+ "max_length": null,
43
+ "max_position_embeddings": 1024,
44
+ "model_type": "mbart",
45
+ "normalize_before": true,
46
+ "normalize_embedding": true,
47
+ "num_beams": null,
48
+ "num_hidden_layers": 12,
49
+ "output_past": true,
50
+ "pad_token_id": 1,
51
+ "scale_embedding": true,
52
+ "static_position_embeddings": false,
53
+ "tokenizer_class": "MBart50Tokenizer",
54
+ "transformers_version": "4.57.3",
55
+ "use_cache": true,
56
+ "vocab_size": 250054
57
+ }
models/checkpoint-30/generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "early_stopping": true,
6
+ "eos_token_id": [
7
+ 2
8
+ ],
9
+ "forced_eos_token_id": 2,
10
+ "max_length": 200,
11
+ "num_beams": 5,
12
+ "pad_token_id": 1,
13
+ "transformers_version": "4.57.3"
14
+ }
models/checkpoint-30/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf06980fc3200df90cdd62120cbad96ec7378e2bb8faae0509e98d67fea85727
3
+ size 14645
models/checkpoint-30/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:973e7699cf118c0ef2f285910efd67abb42d0d1ae7bae40cb22396d19a64328c
3
+ size 1383
models/checkpoint-30/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c01134b5ae1edcac974086698aba68af7d61c087c24b035fd0502482c1fac02
3
+ size 1465
models/checkpoint-30/sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
models/checkpoint-30/special_tokens_map.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "ar_AR",
4
+ "cs_CZ",
5
+ "de_DE",
6
+ "en_XX",
7
+ "es_XX",
8
+ "et_EE",
9
+ "fi_FI",
10
+ "fr_XX",
11
+ "gu_IN",
12
+ "hi_IN",
13
+ "it_IT",
14
+ "ja_XX",
15
+ "kk_KZ",
16
+ "ko_KR",
17
+ "lt_LT",
18
+ "lv_LV",
19
+ "my_MM",
20
+ "ne_NP",
21
+ "nl_XX",
22
+ "ro_RO",
23
+ "ru_RU",
24
+ "si_LK",
25
+ "tr_TR",
26
+ "vi_VN",
27
+ "zh_CN",
28
+ "af_ZA",
29
+ "az_AZ",
30
+ "bn_IN",
31
+ "fa_IR",
32
+ "he_IL",
33
+ "hr_HR",
34
+ "id_ID",
35
+ "ka_GE",
36
+ "km_KH",
37
+ "mk_MK",
38
+ "ml_IN",
39
+ "mn_MN",
40
+ "mr_IN",
41
+ "pl_PL",
42
+ "ps_AF",
43
+ "pt_XX",
44
+ "sv_SE",
45
+ "sw_KE",
46
+ "ta_IN",
47
+ "te_IN",
48
+ "th_TH",
49
+ "tl_XX",
50
+ "uk_UA",
51
+ "ur_PK",
52
+ "xh_ZA",
53
+ "gl_ES",
54
+ "sl_SI"
55
+ ],
56
+ "bos_token": "<s>",
57
+ "cls_token": "<s>",
58
+ "eos_token": "</s>",
59
+ "mask_token": {
60
+ "content": "<mask>",
61
+ "lstrip": true,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ "pad_token": "<pad>",
67
+ "sep_token": "</s>",
68
+ "unk_token": "<unk>"
69
+ }
models/checkpoint-30/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0da4e7af9b86e84c844ce9b0d58a845dd3b0d9724abef93bc226aeb17d5110a0
3
+ size 17110186
models/checkpoint-30/tokenizer_config.json ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "ar_AR",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "250002": {
44
+ "content": "cs_CZ",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "250003": {
52
+ "content": "de_DE",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "250004": {
60
+ "content": "en_XX",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "250005": {
68
+ "content": "es_XX",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "250006": {
76
+ "content": "et_EE",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "250007": {
84
+ "content": "fi_FI",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "250008": {
92
+ "content": "fr_XX",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "250009": {
100
+ "content": "gu_IN",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "250010": {
108
+ "content": "hi_IN",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "250011": {
116
+ "content": "it_IT",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "250012": {
124
+ "content": "ja_XX",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "250013": {
132
+ "content": "kk_KZ",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "250014": {
140
+ "content": "ko_KR",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "250015": {
148
+ "content": "lt_LT",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "250016": {
156
+ "content": "lv_LV",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "250017": {
164
+ "content": "my_MM",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "250018": {
172
+ "content": "ne_NP",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "250019": {
180
+ "content": "nl_XX",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "250020": {
188
+ "content": "ro_RO",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "250021": {
196
+ "content": "ru_RU",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "250022": {
204
+ "content": "si_LK",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "250023": {
212
+ "content": "tr_TR",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "250024": {
220
+ "content": "vi_VN",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "250025": {
228
+ "content": "zh_CN",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ },
235
+ "250026": {
236
+ "content": "af_ZA",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": true
242
+ },
243
+ "250027": {
244
+ "content": "az_AZ",
245
+ "lstrip": false,
246
+ "normalized": false,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": true
250
+ },
251
+ "250028": {
252
+ "content": "bn_IN",
253
+ "lstrip": false,
254
+ "normalized": false,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "250029": {
260
+ "content": "fa_IR",
261
+ "lstrip": false,
262
+ "normalized": false,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": true
266
+ },
267
+ "250030": {
268
+ "content": "he_IL",
269
+ "lstrip": false,
270
+ "normalized": false,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": true
274
+ },
275
+ "250031": {
276
+ "content": "hr_HR",
277
+ "lstrip": false,
278
+ "normalized": false,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "250032": {
284
+ "content": "id_ID",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "250033": {
292
+ "content": "ka_GE",
293
+ "lstrip": false,
294
+ "normalized": false,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "250034": {
300
+ "content": "km_KH",
301
+ "lstrip": false,
302
+ "normalized": false,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "250035": {
308
+ "content": "mk_MK",
309
+ "lstrip": false,
310
+ "normalized": false,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "250036": {
316
+ "content": "ml_IN",
317
+ "lstrip": false,
318
+ "normalized": false,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "250037": {
324
+ "content": "mn_MN",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "250038": {
332
+ "content": "mr_IN",
333
+ "lstrip": false,
334
+ "normalized": false,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "250039": {
340
+ "content": "pl_PL",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "250040": {
348
+ "content": "ps_AF",
349
+ "lstrip": false,
350
+ "normalized": false,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "250041": {
356
+ "content": "pt_XX",
357
+ "lstrip": false,
358
+ "normalized": false,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "250042": {
364
+ "content": "sv_SE",
365
+ "lstrip": false,
366
+ "normalized": false,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "250043": {
372
+ "content": "sw_KE",
373
+ "lstrip": false,
374
+ "normalized": false,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "250044": {
380
+ "content": "ta_IN",
381
+ "lstrip": false,
382
+ "normalized": false,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "250045": {
388
+ "content": "te_IN",
389
+ "lstrip": false,
390
+ "normalized": false,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "250046": {
396
+ "content": "th_TH",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "250047": {
404
+ "content": "tl_XX",
405
+ "lstrip": false,
406
+ "normalized": false,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "250048": {
412
+ "content": "uk_UA",
413
+ "lstrip": false,
414
+ "normalized": false,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "250049": {
420
+ "content": "ur_PK",
421
+ "lstrip": false,
422
+ "normalized": false,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "250050": {
428
+ "content": "xh_ZA",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "250051": {
436
+ "content": "gl_ES",
437
+ "lstrip": false,
438
+ "normalized": false,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "250052": {
444
+ "content": "sl_SI",
445
+ "lstrip": false,
446
+ "normalized": false,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "250053": {
452
+ "content": "<mask>",
453
+ "lstrip": true,
454
+ "normalized": true,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": true
458
+ }
459
+ },
460
+ "additional_special_tokens": [
461
+ "ar_AR",
462
+ "cs_CZ",
463
+ "de_DE",
464
+ "en_XX",
465
+ "es_XX",
466
+ "et_EE",
467
+ "fi_FI",
468
+ "fr_XX",
469
+ "gu_IN",
470
+ "hi_IN",
471
+ "it_IT",
472
+ "ja_XX",
473
+ "kk_KZ",
474
+ "ko_KR",
475
+ "lt_LT",
476
+ "lv_LV",
477
+ "my_MM",
478
+ "ne_NP",
479
+ "nl_XX",
480
+ "ro_RO",
481
+ "ru_RU",
482
+ "si_LK",
483
+ "tr_TR",
484
+ "vi_VN",
485
+ "zh_CN",
486
+ "af_ZA",
487
+ "az_AZ",
488
+ "bn_IN",
489
+ "fa_IR",
490
+ "he_IL",
491
+ "hr_HR",
492
+ "id_ID",
493
+ "ka_GE",
494
+ "km_KH",
495
+ "mk_MK",
496
+ "ml_IN",
497
+ "mn_MN",
498
+ "mr_IN",
499
+ "pl_PL",
500
+ "ps_AF",
501
+ "pt_XX",
502
+ "sv_SE",
503
+ "sw_KE",
504
+ "ta_IN",
505
+ "te_IN",
506
+ "th_TH",
507
+ "tl_XX",
508
+ "uk_UA",
509
+ "ur_PK",
510
+ "xh_ZA",
511
+ "gl_ES",
512
+ "sl_SI"
513
+ ],
514
+ "bos_token": "<s>",
515
+ "clean_up_tokenization_spaces": false,
516
+ "cls_token": "<s>",
517
+ "eos_token": "</s>",
518
+ "extra_special_tokens": {},
519
+ "language_codes": "ML50",
520
+ "mask_token": "<mask>",
521
+ "model_max_length": 1000000000000000019884624838656,
522
+ "pad_token": "<pad>",
523
+ "sep_token": "</s>",
524
+ "sp_model_kwargs": {},
525
+ "src_lang": "en_XX",
526
+ "tgt_lang": "hi_IN",
527
+ "tokenizer_class": "MBart50Tokenizer",
528
+ "unk_token": "<unk>"
529
+ }
models/checkpoint-30/trainer_state.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 30,
3
+ "best_metric": 7.648374557495117,
4
+ "best_model_checkpoint": "/content/drive/MyDrive/Nagina-2/models/mbart-transliteration/checkpoint-30",
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 30,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.3333333333333333,
14
+ "grad_norm": 94.37104797363281,
15
+ "learning_rate": 3.5e-05,
16
+ "loss": 11.4112,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.6666666666666666,
21
+ "grad_norm": 101.393798828125,
22
+ "learning_rate": 1.8333333333333333e-05,
23
+ "loss": 9.2075,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 1.0,
28
+ "grad_norm": 103.45658111572266,
29
+ "learning_rate": 1.6666666666666667e-06,
30
+ "loss": 8.0438,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 1.0,
35
+ "eval_loss": 7.648374557495117,
36
+ "eval_runtime": 0.2819,
37
+ "eval_samples_per_second": 53.202,
38
+ "eval_steps_per_second": 14.187,
39
+ "step": 30
40
+ }
41
+ ],
42
+ "logging_steps": 10,
43
+ "max_steps": 30,
44
+ "num_input_tokens_seen": 0,
45
+ "num_train_epochs": 1,
46
+ "save_steps": 500,
47
+ "stateful_callbacks": {
48
+ "TrainerControl": {
49
+ "args": {
50
+ "should_epoch_stop": false,
51
+ "should_evaluate": false,
52
+ "should_log": false,
53
+ "should_save": true,
54
+ "should_training_stop": true
55
+ },
56
+ "attributes": {}
57
+ }
58
+ },
59
+ "total_flos": 32506946519040.0,
60
+ "train_batch_size": 4,
61
+ "trial_name": null,
62
+ "trial_params": null
63
+ }
models/checkpoint-30/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9013c117330a5e2e1042c93ca678d33d3f6c2afa498e8a5c8079ab49db2ccd69
3
+ size 6033
models/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_num_labels": 3,
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "relu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": true,
7
+ "architectures": [
8
+ "MBartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 12,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "dtype": "float32",
22
+ "early_stopping": null,
23
+ "encoder_attention_heads": 16,
24
+ "encoder_ffn_dim": 4096,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 12,
27
+ "eos_token_id": 2,
28
+ "forced_eos_token_id": 2,
29
+ "gradient_checkpointing": false,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1",
33
+ "2": "LABEL_2"
34
+ },
35
+ "init_std": 0.02,
36
+ "is_encoder_decoder": true,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1,
40
+ "LABEL_2": 2
41
+ },
42
+ "max_length": null,
43
+ "max_position_embeddings": 1024,
44
+ "model_type": "mbart",
45
+ "normalize_before": true,
46
+ "normalize_embedding": true,
47
+ "num_beams": null,
48
+ "num_hidden_layers": 12,
49
+ "output_past": true,
50
+ "pad_token_id": 1,
51
+ "scale_embedding": true,
52
+ "static_position_embeddings": false,
53
+ "tokenizer_class": "MBart50Tokenizer",
54
+ "transformers_version": "4.57.3",
55
+ "use_cache": true,
56
+ "vocab_size": 250054
57
+ }
models/generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "early_stopping": true,
6
+ "eos_token_id": [
7
+ 2
8
+ ],
9
+ "forced_eos_token_id": 2,
10
+ "max_length": 200,
11
+ "num_beams": 5,
12
+ "pad_token_id": 1,
13
+ "transformers_version": "4.57.3"
14
+ }
models/sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
models/special_tokens_map.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "ar_AR",
4
+ "cs_CZ",
5
+ "de_DE",
6
+ "en_XX",
7
+ "es_XX",
8
+ "et_EE",
9
+ "fi_FI",
10
+ "fr_XX",
11
+ "gu_IN",
12
+ "hi_IN",
13
+ "it_IT",
14
+ "ja_XX",
15
+ "kk_KZ",
16
+ "ko_KR",
17
+ "lt_LT",
18
+ "lv_LV",
19
+ "my_MM",
20
+ "ne_NP",
21
+ "nl_XX",
22
+ "ro_RO",
23
+ "ru_RU",
24
+ "si_LK",
25
+ "tr_TR",
26
+ "vi_VN",
27
+ "zh_CN",
28
+ "af_ZA",
29
+ "az_AZ",
30
+ "bn_IN",
31
+ "fa_IR",
32
+ "he_IL",
33
+ "hr_HR",
34
+ "id_ID",
35
+ "ka_GE",
36
+ "km_KH",
37
+ "mk_MK",
38
+ "ml_IN",
39
+ "mn_MN",
40
+ "mr_IN",
41
+ "pl_PL",
42
+ "ps_AF",
43
+ "pt_XX",
44
+ "sv_SE",
45
+ "sw_KE",
46
+ "ta_IN",
47
+ "te_IN",
48
+ "th_TH",
49
+ "tl_XX",
50
+ "uk_UA",
51
+ "ur_PK",
52
+ "xh_ZA",
53
+ "gl_ES",
54
+ "sl_SI"
55
+ ],
56
+ "bos_token": "<s>",
57
+ "cls_token": "<s>",
58
+ "eos_token": "</s>",
59
+ "mask_token": {
60
+ "content": "<mask>",
61
+ "lstrip": true,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ "pad_token": "<pad>",
67
+ "sep_token": "</s>",
68
+ "unk_token": "<unk>"
69
+ }
models/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0da4e7af9b86e84c844ce9b0d58a845dd3b0d9724abef93bc226aeb17d5110a0
3
+ size 17110186
models/tokenizer_config.json ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "ar_AR",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "250002": {
44
+ "content": "cs_CZ",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "250003": {
52
+ "content": "de_DE",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "250004": {
60
+ "content": "en_XX",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "250005": {
68
+ "content": "es_XX",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "250006": {
76
+ "content": "et_EE",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "250007": {
84
+ "content": "fi_FI",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "250008": {
92
+ "content": "fr_XX",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "250009": {
100
+ "content": "gu_IN",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "250010": {
108
+ "content": "hi_IN",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "250011": {
116
+ "content": "it_IT",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "250012": {
124
+ "content": "ja_XX",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "250013": {
132
+ "content": "kk_KZ",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "250014": {
140
+ "content": "ko_KR",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "250015": {
148
+ "content": "lt_LT",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "250016": {
156
+ "content": "lv_LV",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "250017": {
164
+ "content": "my_MM",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "250018": {
172
+ "content": "ne_NP",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "250019": {
180
+ "content": "nl_XX",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "250020": {
188
+ "content": "ro_RO",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "250021": {
196
+ "content": "ru_RU",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "250022": {
204
+ "content": "si_LK",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "250023": {
212
+ "content": "tr_TR",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "250024": {
220
+ "content": "vi_VN",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "250025": {
228
+ "content": "zh_CN",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ },
235
+ "250026": {
236
+ "content": "af_ZA",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": true
242
+ },
243
+ "250027": {
244
+ "content": "az_AZ",
245
+ "lstrip": false,
246
+ "normalized": false,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": true
250
+ },
251
+ "250028": {
252
+ "content": "bn_IN",
253
+ "lstrip": false,
254
+ "normalized": false,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "250029": {
260
+ "content": "fa_IR",
261
+ "lstrip": false,
262
+ "normalized": false,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": true
266
+ },
267
+ "250030": {
268
+ "content": "he_IL",
269
+ "lstrip": false,
270
+ "normalized": false,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": true
274
+ },
275
+ "250031": {
276
+ "content": "hr_HR",
277
+ "lstrip": false,
278
+ "normalized": false,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "250032": {
284
+ "content": "id_ID",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "250033": {
292
+ "content": "ka_GE",
293
+ "lstrip": false,
294
+ "normalized": false,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "250034": {
300
+ "content": "km_KH",
301
+ "lstrip": false,
302
+ "normalized": false,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "250035": {
308
+ "content": "mk_MK",
309
+ "lstrip": false,
310
+ "normalized": false,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "250036": {
316
+ "content": "ml_IN",
317
+ "lstrip": false,
318
+ "normalized": false,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "250037": {
324
+ "content": "mn_MN",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "250038": {
332
+ "content": "mr_IN",
333
+ "lstrip": false,
334
+ "normalized": false,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "250039": {
340
+ "content": "pl_PL",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "250040": {
348
+ "content": "ps_AF",
349
+ "lstrip": false,
350
+ "normalized": false,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "250041": {
356
+ "content": "pt_XX",
357
+ "lstrip": false,
358
+ "normalized": false,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "250042": {
364
+ "content": "sv_SE",
365
+ "lstrip": false,
366
+ "normalized": false,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "250043": {
372
+ "content": "sw_KE",
373
+ "lstrip": false,
374
+ "normalized": false,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "250044": {
380
+ "content": "ta_IN",
381
+ "lstrip": false,
382
+ "normalized": false,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "250045": {
388
+ "content": "te_IN",
389
+ "lstrip": false,
390
+ "normalized": false,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "250046": {
396
+ "content": "th_TH",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "250047": {
404
+ "content": "tl_XX",
405
+ "lstrip": false,
406
+ "normalized": false,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "250048": {
412
+ "content": "uk_UA",
413
+ "lstrip": false,
414
+ "normalized": false,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "250049": {
420
+ "content": "ur_PK",
421
+ "lstrip": false,
422
+ "normalized": false,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "250050": {
428
+ "content": "xh_ZA",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "250051": {
436
+ "content": "gl_ES",
437
+ "lstrip": false,
438
+ "normalized": false,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "250052": {
444
+ "content": "sl_SI",
445
+ "lstrip": false,
446
+ "normalized": false,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "250053": {
452
+ "content": "<mask>",
453
+ "lstrip": true,
454
+ "normalized": true,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": true
458
+ }
459
+ },
460
+ "additional_special_tokens": [
461
+ "ar_AR",
462
+ "cs_CZ",
463
+ "de_DE",
464
+ "en_XX",
465
+ "es_XX",
466
+ "et_EE",
467
+ "fi_FI",
468
+ "fr_XX",
469
+ "gu_IN",
470
+ "hi_IN",
471
+ "it_IT",
472
+ "ja_XX",
473
+ "kk_KZ",
474
+ "ko_KR",
475
+ "lt_LT",
476
+ "lv_LV",
477
+ "my_MM",
478
+ "ne_NP",
479
+ "nl_XX",
480
+ "ro_RO",
481
+ "ru_RU",
482
+ "si_LK",
483
+ "tr_TR",
484
+ "vi_VN",
485
+ "zh_CN",
486
+ "af_ZA",
487
+ "az_AZ",
488
+ "bn_IN",
489
+ "fa_IR",
490
+ "he_IL",
491
+ "hr_HR",
492
+ "id_ID",
493
+ "ka_GE",
494
+ "km_KH",
495
+ "mk_MK",
496
+ "ml_IN",
497
+ "mn_MN",
498
+ "mr_IN",
499
+ "pl_PL",
500
+ "ps_AF",
501
+ "pt_XX",
502
+ "sv_SE",
503
+ "sw_KE",
504
+ "ta_IN",
505
+ "te_IN",
506
+ "th_TH",
507
+ "tl_XX",
508
+ "uk_UA",
509
+ "ur_PK",
510
+ "xh_ZA",
511
+ "gl_ES",
512
+ "sl_SI"
513
+ ],
514
+ "bos_token": "<s>",
515
+ "clean_up_tokenization_spaces": false,
516
+ "cls_token": "<s>",
517
+ "eos_token": "</s>",
518
+ "extra_special_tokens": {},
519
+ "language_codes": "ML50",
520
+ "mask_token": "<mask>",
521
+ "model_max_length": 1000000000000000019884624838656,
522
+ "pad_token": "<pad>",
523
+ "sep_token": "</s>",
524
+ "sp_model_kwargs": {},
525
+ "src_lang": "en_XX",
526
+ "tgt_lang": "hi_IN",
527
+ "tokenizer_class": "MBart50Tokenizer",
528
+ "unk_token": "<unk>"
529
+ }
models/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9013c117330a5e2e1042c93ca678d33d3f6c2afa498e8a5c8079ab49db2ccd69
3
+ size 6033
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ datasets
4
+ sentencepiece
5
+ sacremoses
6
+ ctranslate2
7
+ gradio
8
+ pandas
9
+ scikit-learn
10
+ accelerate
src/data/test.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ source,target,lang
2
+ nadu,நாடு,ta
3
+ tamil,தமிழ்,ta
4
+ irukkeenga,இருக்கிறீர்கள்,ta
5
+ khabar,খাবার,bn
6
+ rahe,रहे,hi
7
+ neengal,நீங்கள்,ta
8
+ ho,हो,hi
9
+ nadu,நாடு,ta
10
+ bharat,भारत,hi
11
+ desh,দেশ,bn
12
+ vanakkam,வணக்கம்,ta
13
+ achen,আছেন,bn
14
+ kya,क्या,hi
15
+ kar,कर,hi
16
+ desh,দেশ,bn
src/data/train.csv ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ source,target,lang
2
+ tamil,தமிழ்,ta
3
+ kya,क्या,hi
4
+ aap,आप,hi
5
+ apni,আপনি,bn
6
+ amar,আমার,bn
7
+ khabar,খাবার,bn
8
+ bharat,भारत,hi
9
+ apni,আপনি,bn
10
+ bharat,भारत,hi
11
+ vanakkam,வணக்கம்,ta
12
+ en,என்,ta
13
+ achen,আছেন,bn
14
+ mera,मेरा,hi
15
+ achen,আছেন,bn
16
+ neengal,நீங்கள்,ta
17
+ bharat,भारत,hi
18
+ en,என்,ta
19
+ sapadu,சாப்பாடு,ta
20
+ rahe,रहे,hi
21
+ hai,है,hi
22
+ naam,नाम,hi
23
+ namoshkar,নমস্কার,bn
24
+ mera,मेरा,hi
25
+ namoshkar,নমস্কার,bn
26
+ aap,आप,hi
27
+ kar,कर,hi
28
+ jal,জল,bn
29
+ rahe,रहे,hi
30
+ eppadi,எப்படி,ta
31
+ vanakkam,வணக்கம்,ta
32
+ kar,कर,hi
33
+ khabar,খাবার,bn
34
+ tamil,தமிழ்,ta
35
+ kemon,কেমন,bn
36
+ jal,জল,bn
37
+ thanni,தண்ணீர்,ta
38
+ en,என்,ta
39
+ kya,क्या,hi
40
+ eppadi,எப்படி,ta
41
+ khabar,খাবার,bn
42
+ vanakkam,வணக்கம்,ta
43
+ namaste,नमस्ते,hi
44
+ desh,দেশ,bn
45
+ thanni,தண்ணீர்,ta
46
+ bangla,বাংলা,bn
47
+ mera,मेरा,hi
48
+ apni,আপনি,bn
49
+ mera,मेरा,hi
50
+ achen,আছেন,bn
51
+ nam,নাম,bn
52
+ irukkeenga,இருக்கிறீர்கள்,ta
53
+ namoshkar,নমস্কার,bn
54
+ desh,দেশ,bn
55
+ mera,मेरा,hi
56
+ nadu,நாடு,ta
57
+ kar,कर,hi
58
+ desh,দেশ,bn
59
+ ho,हो,hi
60
+ nam,নাম,bn
61
+ rahe,रहे,hi
62
+ rahe,रहे,hi
63
+ bangla,বাংলা,bn
64
+ apni,আপনি,bn
65
+ naam,नाम,hi
66
+ eppadi,எப்படி,ta
67
+ namoshkar,নমস্কার,bn
68
+ thanni,தண்ணீர்,ta
69
+ eppadi,எப்படி,ta
70
+ peyar,பெயர்,ta
71
+ peyar,பெயர்,ta
72
+ kar,कर,hi
73
+ amar,আমার,bn
74
+ thanni,தண்ணீர்,ta
75
+ naam,नाम,hi
76
+ kemon,কেমন,bn
77
+ neengal,நீங்கள்,ta
78
+ irukkeenga,இருக்கிறீர்கள்,ta
79
+ bangla,বাংলা,bn
80
+ en,என்,ta
81
+ bangla,বাংলা,bn
82
+ ho,हो,hi
83
+ hai,है,hi
84
+ nadu,நாடு,ta
85
+ irukkeenga,இருக்கிறீர்கள்,ta
86
+ tamil,தமிழ்,ta
87
+ namaste,नमस्ते,hi
88
+ vanakkam,வணக்கம்,ta
89
+ naam,नाम,hi
90
+ eppadi,எப்படி,ta
91
+ bharat,भारत,hi
92
+ amar,আমার,bn
93
+ ho,हो,hi
94
+ jal,জল,bn
95
+ aap,आप,hi
96
+ sapadu,சாப்பாடு,ta
97
+ peyar,பெயர்,ta
98
+ aap,आप,hi
99
+ kya,क्या,hi
100
+ kemon,কেমন,bn
101
+ kemon,কেমন,bn
102
+ amar,আমার,bn
103
+ peyar,பெயர்,ta
104
+ namaste,नमस्ते,hi
105
+ nam,নাম,bn
106
+ kya,क्या,hi
107
+ irukkeenga,இருக்கிறீர்கள்,ta
108
+ jal,জল,bn
109
+ amar,আমার,bn
110
+ nadu,நாடு,ta
111
+ tamil,தமிழ்,ta
112
+ bangla,বাংলা,bn
113
+ hai,है,hi
114
+ namaste,नमस्ते,hi
115
+ thanni,தண்ணீர்,ta
116
+ neengal,நீங்கள்,ta
117
+ aap,आप,hi
118
+ nam,নাম,bn
119
+ hai,है,hi
120
+ jal,জল,bn
121
+ nam,নাম,bn
src/data/val.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ source,target,lang
2
+ khabar,খাবার,bn
3
+ kemon,কেমন,bn
4
+ namoshkar,নমস্কার,bn
5
+ sapadu,சாப்பாடு,ta
6
+ sapadu,சாப்பாடு,ta
7
+ namaste,नमस्ते,hi
8
+ hai,है,hi
9
+ neengal,நீங்கள்,ta
10
+ apni,আপনি,bn
11
+ peyar,பெயர்,ta
12
+ en,என்,ta
13
+ ho,हो,hi
14
+ sapadu,சாப்பாடு,ta
15
+ naam,नाम,hi
16
+ achen,আছেন,bn
src/optimize.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import ctranslate2
4
+ import transformers
5
+ from datasets import load_dataset
6
+ import pandas as pd
7
+
8
+ MODEL_DIR = "models"
9
+ CT2_MODEL_DIR = "models" # Set to models for HF Spaces compatibility (outputs model.bin here)
10
+
11
def optimize_model():
    """Convert the Transformers checkpoint in MODEL_DIR to int8 CTranslate2 format.

    The converted files end up in CT2_MODEL_DIR. If MODEL_DIR contains no
    ``pytorch_model*`` or ``*.safetensors`` file, an error is printed and the
    function returns without converting anything.
    """
    # Local imports: only this function needs them.
    import shutil
    import tempfile

    print("Converting model to CTranslate2 format...")
    # Ensure source files exist
    has_weights = any(
        name.startswith("pytorch_model") or name.endswith(".safetensors")
        for name in os.listdir(MODEL_DIR)
    )
    if not has_weights:
        print(f"Error: No source weights found in {MODEL_DIR}. Cannot convert.")
        return

    # Converter for mBART
    converter = ctranslate2.converters.TransformersConverter(
        MODEL_DIR,
        activation_scales=None,
        # Ensure tokenizer files are copied next to the converted model.
        copy_files=["tokenizer.json", "sentencepiece.bpe.model"],
    )

    source_dir = os.path.abspath(MODEL_DIR)
    output_dir = os.path.abspath(CT2_MODEL_DIR)

    # Quantization often helps speed. Int8 is common.
    if output_dir != source_dir:
        converter.convert(CT2_MODEL_DIR, quantization="int8", force=True)
    else:
        # With force=True the converter clears an existing output directory
        # before saving. Converting straight into the source directory would
        # therefore delete the checkpoint and tokenizer files it still has to
        # read, so convert into a sibling staging directory and move the
        # results in afterwards.
        staging_dir = tempfile.mkdtemp(
            prefix="ct2_", dir=os.path.dirname(source_dir)
        )
        try:
            converter.convert(staging_dir, quantization="int8", force=True)
            # Converter outputs replace same-named files in the model directory.
            for name in os.listdir(staging_dir):
                os.replace(
                    os.path.join(staging_dir, name),
                    os.path.join(output_dir, name),
                )
        finally:
            shutil.rmtree(staging_dir, ignore_errors=True)

    print(f"Model converted and saved to {CT2_MODEL_DIR}")
32
+
33
def benchmark():
    """Time CTranslate2 inference on a few sample inputs and print the outputs.

    Loads the converted model from CT2_MODEL_DIR and the tokenizer from
    MODEL_DIR, translates a fixed list of English inputs with Hindi forced as
    the target language, then prints the elapsed time, the throughput and
    each input/output pair.
    """
    print("\nStarting Benchmarking...")

    # Load CT2 model
    translator = ctranslate2.Translator(CT2_MODEL_DIR)
    tokenizer = transformers.MBart50TokenizerFast.from_pretrained(MODEL_DIR)

    # Test data
    texts = ["Namaste", "Hello", "How are you", "Good morning", "India"]
    target_lang = "hi_IN"  # Test with Hindi

    tokenizer.src_lang = "en_XX"

    start_time = time.perf_counter()

    # CTranslate2 takes a list of token-string lists. encode() adds the source
    # language code and </s> that mBART-50 expects; tokenize() alone would
    # leave them out.
    input_tokens_batch = [
        tokenizer.convert_ids_to_tokens(tokenizer.encode(text)) for text in texts
    ]

    # Translate
    results = translator.translate_batch(
        input_tokens_batch,
        target_prefix=[[target_lang]] * len(texts),  # Force target lang
    )

    end_time = time.perf_counter()

    # The first hypothesis token is the forced target-language prefix; drop it
    # so it does not show up in the decoded text.
    decoded = [
        tokenizer.decode(tokenizer.convert_tokens_to_ids(result.hypotheses[0][1:]))
        for result in results
    ]

    duration = end_time - start_time
    print(f"Inference Time for {len(texts)} sentences: {duration:.4f}s")
    print(f"Speed: {len(texts)/duration:.2f} sentences/s")

    for src, tgt in zip(texts, decoded):
        print(f"{src} -> {tgt}")
85
+
86
def get_dir_size(path):
    """Return the combined size in bytes of every file under ``path``.

    Subdirectories are walked as well; entries that are neither files nor
    directories contribute nothing.
    """
    size_in_bytes = 0
    pending = [path]  # directories still to be scanned
    while pending:
        with os.scandir(pending.pop()) as entries:
            for item in entries:
                if item.is_file():
                    size_in_bytes += item.stat().st_size
                elif item.is_dir():
                    pending.append(item.path)
    return size_in_bytes
95
+
96
# Script entry point: convert the trained model, then benchmark the result.
if __name__ == "__main__":
    if os.path.exists(MODEL_DIR):
        optimize_model()
        benchmark()
    else:
        # Nothing to convert until training has produced the model directory.
        print(f"Model directory {MODEL_DIR} not found. Please train first.")
src/prepare_data.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import random
4
+
5
def create_dummy_data(output_dir="data", seed=None, copies=5):
    """Generate dummy transliteration data for Hindi, Bengali, and Tamil.

    Writes ``train.csv``, ``val.csv`` and ``test.csv`` (80/10/10 split) with
    the columns ``source``, ``target`` and ``lang`` into ``output_dir`` and
    prints the size of each split.

    Args:
        output_dir: Directory the CSV files are written to; created if missing.
        seed: Optional seed for the shuffle. ``None`` shuffles with the global
            ``random`` state, so the split changes from run to run.
        copies: How many times the base word list is repeated before it is
            shuffled and split. Because pairs are repeated, the same pair can
            land in more than one split.

    Raises:
        ValueError: If ``copies`` is smaller than 1.
    """
    if copies < 1:
        raise ValueError("copies must be at least 1")

    # Minimal dummy dataset
    data = [
        # Hindi
        ("namaste", "नमस्ते", "hi"),
        ("aap", "आप", "hi"),
        ("kya", "क्या", "hi"),
        ("kar", "कर", "hi"),
        ("rahe", "रहे", "hi"),
        ("ho", "हो", "hi"),
        ("mera", "मेरा", "hi"),
        ("naam", "नाम", "hi"),
        ("hai", "है", "hi"),
        ("bharat", "भारत", "hi"),

        # Bengali
        ("namoshkar", "নমস্কার", "bn"),
        ("apni", "আপনি", "bn"),
        ("kemon", "কেমন", "bn"),
        ("achen", "আছেন", "bn"),
        ("amar", "আমার", "bn"),
        ("nam", "নাম", "bn"),
        ("bangla", "বাংলা", "bn"),
        ("desh", "দেশ", "bn"),
        ("khabar", "খাবার", "bn"),
        ("jal", "জল", "bn"),

        # Tamil
        ("vanakkam", "வணக்கம்", "ta"),
        ("neengal", "நீங்கள்", "ta"),
        ("eppadi", "எப்படி", "ta"),
        ("irukkeenga", "இருக்கிறீர்கள்", "ta"),
        ("en", "என்", "ta"),
        ("peyar", "பெயர்", "ta"),
        ("tamil", "தமிழ்", "ta"),
        ("nadu", "நாடு", "ta"),
        ("sapadu", "சாப்பாடு", "ta"),
        ("thanni", "தண்ணீர்", "ta"),
    ]

    # Expand data slightly by duplicating to simulate a larger set for split
    data = data * copies
    if seed is None:
        random.shuffle(data)
    else:
        # A private generator keeps the global random state untouched.
        random.Random(seed).shuffle(data)

    df = pd.DataFrame(data, columns=["source", "target", "lang"])

    # Split into train, val, test (80-10-10)
    train_size = int(0.8 * len(df))
    val_size = int(0.1 * len(df))

    train_df = df[:train_size]
    val_df = df[train_size:train_size + val_size]
    test_df = df[train_size + val_size:]

    os.makedirs(output_dir, exist_ok=True)

    train_df.to_csv(os.path.join(output_dir, "train.csv"), index=False)
    val_df.to_csv(os.path.join(output_dir, "val.csv"), index=False)
    test_df.to_csv(os.path.join(output_dir, "test.csv"), index=False)

    print("Data generation complete.")
    print(f"Train size: {len(train_df)}")
    print(f"Val size: {len(val_df)}")
    print(f"Test size: {len(test_df)}")
72
+
73
+ if __name__ == "__main__":
74
+ create_dummy_data()
src/train.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import torch
4
+ from datasets import Dataset, DatasetDict
5
+ from transformers import (
6
+ MBartForConditionalGeneration,
7
+ MBart50TokenizerFast,
8
+ Seq2SeqTrainingArguments,
9
+ Seq2SeqTrainer,
10
+ DataCollatorForSeq2Seq,
11
+ )
12
+
13
+ # ======================
14
+ # CONFIG
15
+ # ======================
16
+ MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
17
+ OUTPUT_DIR = "models/mbart-transliteration"
18
+
19
+ MAX_INPUT_LENGTH = 128
20
+ MAX_TARGET_LENGTH = 128
21
+
22
+ BATCH_SIZE = 4 # CPU-safe
23
+ EPOCHS = 1 # Increase later
24
+ LEARNING_RATE = 5e-5
25
+
26
+ SRC_LANG = "en_XX"
27
+ TGT_LANG = "hi_IN" # Hindi
28
+
29
+ # ======================
30
+ # LOAD DATA
31
+ # ======================
32
def load_data():
    """Load the train/validation/test CSV splits into a ``DatasetDict``.

    The files are read from ``data/train.csv``, ``data/val.csv`` and
    ``data/test.csv`` relative to the current working directory.

    Returns:
        A ``DatasetDict`` with the keys ``train``, ``validation`` and ``test``.

    Raises:
        ValueError: If a CSV file lacks the ``source`` or ``target`` column.
    """
    data_files = {
        "train": "data/train.csv",
        "validation": "data/val.csv",
        "test": "data/test.csv",
    }
    required_columns = ("source", "target")

    dataset_dict = {}
    for split, path in data_files.items():
        df = pd.read_csv(path)

        # Explicit check instead of `assert`, which is stripped under `python -O`.
        missing = [name for name in required_columns if name not in df.columns]
        if missing:
            raise ValueError(
                f"{path} is missing required column(s): {', '.join(missing)}"
            )

        dataset_dict[split] = Dataset.from_pandas(df)

    return DatasetDict(dataset_dict)
50
+
51
+ # ======================
52
+ # PREPROCESS (✅ FIXED)
53
+ # ======================
54
def preprocess_function(examples):
    """Tokenize a batch of source/target pairs for mBART fine-tuning.

    Args:
        examples: Batch mapping with the columns ``source`` and ``target`` and,
            optionally, ``lang`` (``hi``, ``bn`` or ``ta``).

    Returns:
        The tokenized sources with an added ``labels`` entry. Label positions
        holding the pad token are set to -100 so the loss skips them.
    """
    # Short dataset codes -> mBART-50 language codes. Rows with an unknown or
    # missing code fall back to TGT_LANG.
    lang_codes = {"hi": "hi_IN", "bn": "bn_IN", "ta": "ta_IN"}

    # MUST set every call (critical for mBART)
    tokenizer.src_lang = SRC_LANG

    inputs = examples["source"]
    targets = examples["target"]
    try:
        langs = examples["lang"]
    except KeyError:
        langs = [None] * len(targets)

    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length",
    )

    # Group rows by target language so each group is tokenized in one call
    # with the matching language code.
    rows_by_code = {}
    for row, lang in enumerate(langs):
        rows_by_code.setdefault(lang_codes.get(lang, TGT_LANG), []).append(row)

    pad_id = tokenizer.pad_token_id
    label_rows = [None] * len(targets)
    for code, rows in rows_by_code.items():
        tokenizer.tgt_lang = code
        encoded = tokenizer(
            text_target=[targets[row] for row in rows],
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
            padding="max_length",
        )
        for row, ids in zip(rows, encoded["input_ids"]):
            # Labels are padded to max_length; without masking, the loss
            # would be dominated by pad tokens.
            label_rows[row] = [tok if tok != pad_id else -100 for tok in ids]

    model_inputs["labels"] = label_rows
    return model_inputs
78
+
79
+ # ======================
80
+ # TRAIN
81
+ # ======================
82
def main():
    """Fine-tune the mBART checkpoint on the CSV data and save it to OUTPUT_DIR.

    Loads the tokenizer and model named by MODEL_NAME, tokenizes the splits
    returned by ``load_data()``, trains with ``Seq2SeqTrainer`` and writes the
    model and tokenizer to OUTPUT_DIR.
    """
    print("Loading tokenizer and model...")
    # preprocess_function reads the tokenizer from module scope.
    global tokenizer

    tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_NAME)
    model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME, low_cpu_mem_usage=True)

    print("Loading datasets...")
    raw_datasets = load_data()

    print("Tokenizing datasets...")
    tokenized_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        # Drop the raw text columns once they have been tokenized.
        remove_columns=raw_datasets["train"].column_names,
    )

    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
    )

    training_args = Seq2SeqTrainingArguments(
        output_dir=OUTPUT_DIR,
        eval_strategy="epoch",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        save_total_limit=1,
        save_strategy="epoch",
        predict_with_generate=True,
        logging_steps=10,
        load_best_model_at_end=True,
        report_to="none",
        fp16=False,  # CPU safe
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        # `tokenizer=` is deprecated on the Trainer in favour of
        # `processing_class` and is slated for removal in transformers 5.
        processing_class=tokenizer,
        data_collator=data_collator,
    )

    print("Training started...")
    trainer.train()

    print("Saving model...")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    print(f"Training complete. Model saved to `{OUTPUT_DIR}`")
138
+
139
+ # ======================
140
+ if __name__ == "__main__":
141
+ main()