Rahul2020 committed on
Commit
1103cec
·
verified ·
1 Parent(s): c1989d5

Upload 6 files

Browse files
app.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import gradio as gr
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from transformers import PreTrainedTokenizerFast

# --------------------------------------
# LOAD TOKENIZER
# --------------------------------------

TOKENIZER_JSON = "tokenizer_hindi_bpe_8k_stream/tokenizer.json"
HF_DIR = "tokenizer_hindi_bpe_8k_stream/hf"

# Prefer the full HF directory (carries tokenizer_config/special tokens);
# fall back to the bare `tokenizers` JSON file if that is all we have.
if os.path.exists(HF_DIR):
    tokenizer = PreTrainedTokenizerFast.from_pretrained(HF_DIR)
elif os.path.exists(TOKENIZER_JSON):
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_JSON)
else:
    raise ValueError("Tokenizer not found!")

print("Tokenizer loaded: vocab =", tokenizer.vocab_size)
22
+
23
+ # --------------------------------------
24
+ # ENCODE / DECODE FUNCTIONS
25
+ # --------------------------------------
26
+
27
def encode_text(text: str):
    """Encode *text* with the Hindi BPE tokenizer (no special tokens).

    Args:
        text: Raw input string.

    Returns:
        tuple: (list of token strings, comma-separated token-id string).
    """
    enc = tokenizer(text, add_special_tokens=False)
    ids = enc["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(ids)
    # CSV form keeps the IDs copy-pasteable into the Decode tab.
    csv_ids = ",".join(str(i) for i in ids)
    return tokens, csv_ids
34
+
35
def encode_plus(text: str):
    """Full HF-style encoding: ids, attention mask and character offsets."""
    result = tokenizer(
        text,
        add_special_tokens=True,
        truncation=False,
        return_attention_mask=True,
        return_offsets_mapping=True,
    )
    # Convenience field: the ids as one comma-separated string.
    result["input_ids_csv"] = ",".join(map(str, result["input_ids"]))
    return result
45
+
46
+
47
def decode_ids(ids: str):
    """Decode a comma-separated list of token IDs back to text.

    Args:
        ids: String such as "12,345,7"; blank entries are skipped.

    Returns:
        The decoded text, or an error marker when *ids* is not a valid
        comma-separated integer list.
    """
    # Keep the try body minimal and the exception narrow: only malformed
    # integers are a user error.  The original bare `except:` also hid
    # real failures (NameError, KeyboardInterrupt, tokenizer bugs).
    try:
        arr = [int(x) for x in ids.split(",") if x.strip()]
    except ValueError:
        return "❌ Invalid ID list"
    return tokenizer.decode(arr)
54
+
55
def batch_encode(text_list):
    """Tokenize each non-empty line of a newline-separated input string.

    Returns a list of dicts with the original line, its token strings,
    and its token ids as a comma-separated string.
    """
    lines = [line.strip() for line in text_list.split("\n") if line.strip()]
    encoded = tokenizer(lines, add_special_tokens=False)
    results = []
    for line, ids in zip(lines, encoded["input_ids"]):
        results.append({
            "input": line,
            "tokens": tokenizer.convert_ids_to_tokens(ids),
            "ids_csv": ",".join(str(i) for i in ids),
        })
    return results
68
+
69
+ # --------------------------------------
70
+ # FASTAPI REST BACKEND
71
+ # --------------------------------------
72
+
73
api = FastAPI(title="Hindi Tokenizer API")

# Wide-open CORS so browser frontends on any origin can call the REST API.
api.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_headers=["*"],
    allow_methods=["*"],
)
81
+
82
@api.get("/")
def home():
    """Service metadata: API name and tokenizer vocabulary size."""
    info = {
        "message": "Hindi Tokenizer API",
        "vocab_size": tokenizer.vocab_size,
    }
    return info
88
+
89
@api.get("/tokenize")
def tokenize_endpoint(text: str):
    """Tokenize *text* (no special tokens); returns tokens and raw ids."""
    ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    return {"tokens": tokenizer.convert_ids_to_tokens(ids), "ids": ids}
94
+
95
@api.get("/decode")
def decode_endpoint(ids: str):
    """Decode comma-separated token IDs; JSON error on malformed input."""
    # Narrowed from a bare `except:`: only malformed integers count as a
    # client error; anything else (e.g. tokenizer failures) should
    # propagate instead of being silently masked.
    try:
        arr = [int(x) for x in ids.split(",") if x.strip()]
    except ValueError:
        return {"error": "Invalid id list"}
    return {"text": tokenizer.decode(arr)}
102
+
103
+ # --------------------------------------
104
+ # GRADIO FRONTEND
105
+ # --------------------------------------
106
+
107
# Gradio frontend: one tab per operation, each wired to the function above.
with gr.Blocks(title="Hindi Tokenizer") as demo:
    gr.Markdown("## 🔡 Hindi BPE Tokenizer — Encode / Decode / Batch")

    with gr.Tab("Encode"):
        encode_input = gr.Textbox(label="Enter text")
        encode_tokens = gr.JSON(label="Tokens")
        encode_ids = gr.Textbox(label="Token IDs (CSV)")
        encode_button = gr.Button("Encode")
        encode_button.click(encode_text, encode_input, [encode_tokens, encode_ids])

    with gr.Tab("Encode+ (HF full)"):
        plus_input = gr.Textbox(label="Enter text (HF encode_plus)")
        plus_output = gr.JSON(label="Output")
        plus_button = gr.Button("Run encode_plus")
        plus_button.click(encode_plus, plus_input, plus_output)

    with gr.Tab("Decode"):
        decode_input = gr.Textbox(label="Comma-separated token IDs")
        decode_output = gr.Textbox(label="Decoded text")
        decode_button = gr.Button("Decode")
        decode_button.click(decode_ids, decode_input, decode_output)

    with gr.Tab("Batch Encode"):
        batch_input = gr.Textbox(
            label="Enter multiple lines (newline separated)",
            placeholder="Line 1\nLine 2\nLine 3",
        )
        batch_output = gr.JSON(label="Batch output (CSV per line)")
        batch_button = gr.Button("Batch Encode")
        batch_button.click(batch_encode, batch_input, batch_output)
138
+
139
+ # Mount FastAPI + Gradio
140
+
141
+ if "app" not in globals():
142
+ app = gr.mount_gradio_app(api, demo, path="/gradio")
143
+
144
+ if __name__ == "__main__":
145
+ demo.launch(server_port=7860, share=False)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ fastapi
3
+ uvicorn
4
+ transformers
5
+ tokenizers
6
+ torch --index-url https://download.pytorch.org/whl/cpu
tokenizer_hindi_bpe_8k_stream/hf/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer_hindi_bpe_8k_stream/hf/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_hindi_bpe_8k_stream/hf/tokenizer_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "mask_token": "[MASK]",
47
+ "model_max_length": 1000000000000000019884624838656,
48
+ "pad_token": "[PAD]",
49
+ "sep_token": "[SEP]",
50
+ "tokenizer_class": "PreTrainedTokenizerFast",
51
+ "unk_token": "[UNK]"
52
+ }
tokenizer_hindi_bpe_8k_stream/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff