endurasolution committed on
Commit
8faa42b
·
verified ·
1 Parent(s): cb6b24d

Upload folder using huggingface_hub

Browse files
.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🅰️ Akshara-ML — Malayalam Transliteration Model
2
+
3
+ **Akshara-ML** is a neural transliteration model that converts **Manglish (Romanized Malayalam)** into **Malayalam script**.
4
+
5
+ Developed by **EnduraSolution**, in association with **Aksharakuppy**.
6
+
7
+ 🌐 https://aksharakuppy.com
8
+
9
+ ---
10
+ [![Hugging Face](https://img.shields.io/badge/HuggingFace-Model-yellow)](https://huggingface.co/endurasolution/akshara-ml)
11
+ ## ✨ Features
12
+
13
+ - 🔤 Manglish → Malayalam transliteration
14
+ - ⚡ Fast inference (greedy decoding)
15
+ - 🎯 High accuracy (beam search decoding)
16
+ - 🧠 Transformer-based architecture
17
+ - 🇮🇳 Built specifically for the Malayalam language
18
+
19
+ ---
20
+
21
+ ## 🧪 Example
22
+
23
+ | Manglish | Malayalam |
24
+ |--------|----------|
25
+ | namaskaram | നമസ്കാരം |
26
+ | sugam aano | സുഖം ആണോ |
27
+ | ente peru | എന്റെ പേര് |
28
+
29
+ ---
30
+
31
+ ## 🚀 Usage (Python)
32
+
33
+ ```python
34
+ from model import build_model
35
+ from train import load_checkpoint
36
+ from dataset import load_vocab, get_inverse_vocab
37
+ from config import Config
38
+ import torch
39
+
40
+ # Load vocab
41
+ src_vocab = load_vocab("src_vocab.json")
42
+ tgt_vocab = load_vocab("tgt_vocab.json")
43
+ inv_vocab = get_inverse_vocab(tgt_vocab)
44
+
45
+ # Build model
46
+ model = build_model(len(src_vocab), len(tgt_vocab))
47
+ load_checkpoint("pytorch_model.bin", model)
48
+ model.eval()
49
+
50
def _transliterate_word(word):
    """Transliterate one whitespace-free, lowercase Manglish word.

    The word is encoded character by character with ``src_vocab``
    (unknown characters map to ``Config.UNK_IDX``), wrapped in SOS/EOS
    markers, and decoded with the model's greedy decoder.
    """
    ids = (
        [Config.SOS_IDX]
        + [src_vocab.get(ch, Config.UNK_IDX) for ch in word]
        + [Config.EOS_IDX]
    )
    src = torch.tensor([ids])  # a batch containing a single sequence

    # Inference only: do not build an autograd graph while decoding.
    with torch.no_grad():
        pred_ids = model.greedy_decode(src)

    chars = []
    for idx in pred_ids:
        # NOTE(review): the return type of greedy_decode is not visible
        # here; int() makes the comparison and dict lookup work for plain
        # ints as well as 0-d tensors -- confirm against model.py.
        idx = int(idx)
        if idx == Config.EOS_IDX:
            break
        chars.append(inv_vocab.get(idx, ""))
    return "".join(chars)


def transliterate(text):
    """Convert Manglish ``text`` to Malayalam script.

    The source vocabulary contains only the lowercase letters ``a``-``z``
    and neither vocabulary contains a space, so the input is lowercased
    and each whitespace-separated word is transliterated on its own; the
    results are joined with single spaces.

    Args:
        text: Romanized Malayalam, one or more words.

    Returns:
        The transliterated string; ``""`` for empty or blank input.
    """
    return " ".join(_transliterate_word(word) for word in text.lower().split())
61
+
62
+ print(transliterate("namaskaram"))
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🅰️ Akshara-ML — Malayalam Transliteration Model
2
+
3
+ **Akshara-ML** is a neural transliteration model that converts **Manglish (Romanized Malayalam)** into **Malayalam script**.
4
+
5
+ Developed by **EnduraSolution**, in association with **Aksharakuppy**.
6
+
7
+ 🌐 https://aksharakuppy.com
8
+
9
+ ---
10
+ [![Hugging Face](https://img.shields.io/badge/HuggingFace-Model-yellow)](https://huggingface.co/endurasolution/akshara-ml)
11
+ ## ✨ Features
12
+
13
+ - 🔤 Manglish → Malayalam transliteration
14
+ - ⚡ Fast inference (greedy decoding)
15
+ - 🎯 High accuracy (beam search decoding)
16
+ - 🧠 Transformer-based architecture
17
+ - 🇮🇳 Built specifically for the Malayalam language
18
+
19
+ ---
20
+
21
+ ## 🧪 Example
22
+
23
+ | Manglish | Malayalam |
24
+ |--------|----------|
25
+ | namaskaram | നമസ്കാരം |
26
+ | sugam aano | സുഖം ആണോ |
27
+ | ente peru | എന്റെ പേര് |
28
+
29
+ ---
30
+
31
+ ## 🚀 Usage (Python)
32
+
33
+ ```python
34
+ from model import build_model
35
+ from train import load_checkpoint
36
+ from dataset import load_vocab, get_inverse_vocab
37
+ from config import Config
38
+ import torch
39
+
40
+ # Load vocab
41
+ src_vocab = load_vocab("src_vocab.json")
42
+ tgt_vocab = load_vocab("tgt_vocab.json")
43
+ inv_vocab = get_inverse_vocab(tgt_vocab)
44
+
45
+ # Build model
46
+ model = build_model(len(src_vocab), len(tgt_vocab))
47
+ load_checkpoint("pytorch_model.bin", model)
48
+ model.eval()
49
+
50
def _transliterate_word(word):
    """Transliterate one whitespace-free, lowercase Manglish word.

    The word is encoded character by character with ``src_vocab``
    (unknown characters map to ``Config.UNK_IDX``), wrapped in SOS/EOS
    markers, and decoded with the model's greedy decoder.
    """
    ids = (
        [Config.SOS_IDX]
        + [src_vocab.get(ch, Config.UNK_IDX) for ch in word]
        + [Config.EOS_IDX]
    )
    src = torch.tensor([ids])  # a batch containing a single sequence

    # Inference only: do not build an autograd graph while decoding.
    with torch.no_grad():
        pred_ids = model.greedy_decode(src)

    chars = []
    for idx in pred_ids:
        # NOTE(review): the return type of greedy_decode is not visible
        # here; int() makes the comparison and dict lookup work for plain
        # ints as well as 0-d tensors -- confirm against model.py.
        idx = int(idx)
        if idx == Config.EOS_IDX:
            break
        chars.append(inv_vocab.get(idx, ""))
    return "".join(chars)


def transliterate(text):
    """Convert Manglish ``text`` to Malayalam script.

    The source vocabulary contains only the lowercase letters ``a``-``z``
    and neither vocabulary contains a space, so the input is lowercased
    and each whitespace-separated word is transliterated on its own; the
    results are joined with single spaces.

    Args:
        text: Romanized Malayalam, one or more words.

    Returns:
        The transliterated string; ``""`` for empty or blank input.
    """
    return " ".join(_transliterate_word(word) for word in text.lower().split())
61
+
62
+ print(transliterate("namaskaram"))
config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "malayalam-transliteration-transformer",
3
+ "d_model": 512,
4
+ "nhead": 8,
5
+ "num_encoder_layers": 6,
6
+ "num_decoder_layers": 6,
7
+ "dim_feedforward": 2048,
8
+ "dropout": 0.1,
9
+ "src_vocab_size": 30,
10
+ "tgt_vocab_size": 74,
11
+ "max_position_embeddings": 1024
12
+ }
modeling_transliterator.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from model import TransliterationTransformer
4
+
5
class HFTransliterator(nn.Module):
    """Thin ``nn.Module`` wrapper around ``TransliterationTransformer``.

    ``config`` is subscripted by key, so it must be a mapping that provides
    ``"src_vocab_size"`` and ``"tgt_vocab_size"``. Only these two entries
    are read here; no other configuration value is passed to the wrapped
    model.
    """

    def __init__(self, config):
        """Build the wrapped transformer from the two vocabulary sizes."""
        super().__init__()
        src_size = config["src_vocab_size"]
        tgt_size = config["tgt_vocab_size"]
        self.model = TransliterationTransformer(src_size, tgt_size)

    def forward(self, src, tgt):
        """Delegate to the wrapped model and return its output unchanged."""
        output = self.model(src, tgt)
        return output
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:866a807590fcfdb9c6dc7290033f9cd9c087286adf843561bc2475b7075f1d09
3
+ size 181196059
src_vocab.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<pad>": 0,
3
+ "<sos>": 1,
4
+ "<eos>": 2,
5
+ "<unk>": 3,
6
+ "a": 4,
7
+ "n": 5,
8
+ "i": 6,
9
+ "u": 7,
10
+ "h": 8,
11
+ "k": 9,
12
+ "t": 10,
13
+ "l": 11,
14
+ "e": 12,
15
+ "m": 13,
16
+ "r": 14,
17
+ "y": 15,
18
+ "d": 16,
19
+ "o": 17,
20
+ "p": 18,
21
+ "s": 19,
22
+ "v": 20,
23
+ "c": 21,
24
+ "g": 22,
25
+ "b": 23,
26
+ "j": 24,
27
+ "z": 25,
28
+ "w": 26,
29
+ "f": 27,
30
+ "x": 28,
31
+ "q": 29
32
+ }
tgt_vocab.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<pad>": 0,
3
+ "<sos>": 1,
4
+ "<eos>": 2,
5
+ "<unk>": 3,
6
+ "്": 4,
7
+ "ി": 5,
8
+ "ക": 6,
9
+ "ന": 7,
10
+ "ു": 8,
11
+ "ാ": 9,
12
+ "ത": 10,
13
+ "യ": 11,
14
+ "ട": 12,
15
+ "ര": 13,
16
+ "മ": 14,
17
+ "ല": 15,
18
+ "െ": 16,
19
+ "പ": 17,
20
+ "വ": 18,
21
+ "ണ": 19,
22
+ "ള": 20,
23
+ "ം": 21,
24
+ "റ": 22,
25
+ "സ": 23,
26
+ "ച": 24,
27
+ "ോ": 25,
28
+ "ങ": 26,
29
+ "േ": 27,
30
+ "ദ": 28,
31
+ "ീ": 29,
32
+ "ൂ": 30,
33
+ "ശ": 31,
34
+ "ഷ": 32,
35
+ "അ": 33,
36
+ "ർ": 34,
37
+ "ഗ": 35,
38
+ "ൊ": 36,
39
+ "ൽ": 37,
40
+ "ജ": 38,
41
+ "ബ": 39,
42
+ "ധ": 40,
43
+ "ഞ": 41,
44
+ "ഹ": 42,
45
+ "ഴ": 43,
46
+ "ഭ": 44,
47
+ "ൾ": 45,
48
+ "ൻ": 46,
49
+ "ആ": 47,
50
+ "ഡ": 48,
51
+ "ഇ": 49,
52
+ "ൈ": 50,
53
+ "ഥ": 51,
54
+ "ഉ": 52,
55
+ "ഫ": 53,
56
+ "എ": 54,
57
+ "ൃ": 55,
58
+ "ഖ": 56,
59
+ "ഒ": 57,
60
+ "ഘ": 58,
61
+ "ൌ": 59,
62
+ "ഓ": 60,
63
+ "ഠ": 61,
64
+ "ഏ": 62,
65
+ "ൺ": 63,
66
+ "ഈ": 64,
67
+ "ഊ": 65,
68
+ "ഐ": 66,
69
+ "ഛ": 67,
70
+ "ഔ": 68,
71
+ "ഢ": 69,
72
+ "ഃ": 70,
73
+ "ഋ": 71,
74
+ "ൗ": 72,
75
+ "ഝ": 73
76
+ }