Rahul2020 committed on
Commit
e5ce8eb
·
verified ·
1 Parent(s): b2032af

Upload 6 files

Browse files
app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from fastapi import FastAPI
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from transformers import PreTrainedTokenizerFast
5
+ import os
6
+
7
# --------------------------------------
# LOAD TOKENIZER
# --------------------------------------
# Prefer the full HF export directory (carries tokenizer_config.json and
# special-token metadata); fall back to the bare `tokenizers` JSON file.

TOKENIZER_JSON = "tokenizer_hindi_bpe_8k_stream/tokenizer.json"
HF_DIR = "tokenizer_hindi_bpe_8k_stream/hf"

if os.path.exists(HF_DIR):
    # Full HF-format save: from_pretrained picks up config + special tokens.
    tokenizer = PreTrainedTokenizerFast.from_pretrained(HF_DIR)
elif os.path.exists(TOKENIZER_JSON):
    # Raw tokenizers-library JSON: wrap it in the fast-tokenizer shim
    # (no special-token config beyond what the JSON itself declares).
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_JSON)
else:
    # Fail fast at import time — the whole app is useless without it.
    raise ValueError("Tokenizer not found!")

print("Tokenizer loaded: vocab =", tokenizer.vocab_size)
22
+
23
# --------------------------------------
# ENCODE / DECODE FUNCTIONS
# --------------------------------------

def encode_text(text: str):
    """Tokenize *text* with no special tokens.

    Returns:
        A ``(tokens, ids)`` pair — the subword strings and the matching
        vocabulary ids.
    """
    ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    return tokenizer.convert_ids_to_tokens(ids), ids
32
+
33
def encode_plus(text: str):
    """Full HF encoding of *text*.

    Adds special tokens, never truncates, and returns the attention
    mask plus character offset mapping alongside the input ids.
    """
    options = dict(
        truncation=False,
        return_attention_mask=True,
        return_offsets_mapping=True,
        add_special_tokens=True,
    )
    return tokenizer(text, **options)
43
+
44
def decode_ids(ids: str):
    """Decode a comma-separated string of token IDs back to text.

    Args:
        ids: e.g. ``"12,345,7"``; blank entries between commas are ignored.

    Returns:
        The decoded text, or the error marker string when *ids* cannot
        be parsed or decoded.
    """
    try:
        arr = [int(x) for x in ids.split(",") if x.strip()]
        return tokenizer.decode(arr)
    except Exception:
        # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
        # still propagate; anything else becomes a friendly UI message.
        return "❌ Invalid ID list"
51
+
52
def batch_encode(text_list):
    """Encode every non-blank line of a newline-separated string.

    Returns:
        A list of dicts, one per line, each with the original ``input``
        line, its subword ``tokens``, and the token ``ids``.
    """
    lines = [stripped for raw in text_list.split("\n") if (stripped := raw.strip())]
    encoded = tokenizer(lines, add_special_tokens=False)["input_ids"]
    return [
        {
            "input": line,
            "tokens": tokenizer.convert_ids_to_tokens(ids),
            "ids": ids,
        }
        for line, ids in zip(lines, encoded)
    ]
65
+
66
# --------------------------------------
# FASTAPI REST BACKEND
# --------------------------------------

api = FastAPI(title="Hindi Tokenizer API")

# Allow any origin/method/header so browser frontends hosted elsewhere
# can call the REST endpoints directly.
# NOTE(review): wildcard CORS is acceptable for a public demo, but
# tighten allow_origins before exposing anything sensitive.
api.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"]
)
78
+
79
@api.get("/")
def home():
    """Root endpoint: service name and tokenizer vocabulary size."""
    return {"message": "Hindi Tokenizer API", "vocab_size": tokenizer.vocab_size}
85
+
86
@api.get("/tokenize")
def tokenize_endpoint(text: str):
    """GET /tokenize?text=... — subword tokens and ids (no special tokens)."""
    ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    return {"tokens": tokenizer.convert_ids_to_tokens(ids), "ids": ids}
91
+
92
@api.get("/decode")
def decode_endpoint(ids: str):
    """GET /decode?ids=1,2,3 — decode comma-separated token ids to text.

    Returns:
        ``{"text": ...}`` on success, ``{"error": ...}`` when the id
        list cannot be parsed or decoded.
    """
    try:
        arr = [int(x) for x in ids.split(",") if x.strip()]
        return {"text": tokenizer.decode(arr)}
    except Exception:
        # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
        # still propagate; other failures become a JSON error payload.
        return {"error": "Invalid id list"}
99
+
100
# --------------------------------------
# GRADIO FRONTEND
# --------------------------------------
# Four tabs, each wiring a Button click to one of the functions above.

with gr.Blocks(title="Hindi Tokenizer") as demo:
    gr.Markdown("## 🔡 Hindi BPE Tokenizer — Encode / Decode / Batch")

    # Tab 1: plain encode — text in, tokens + ids out (two JSON panels).
    with gr.Tab("Encode"):
        text_in = gr.Textbox(label="Enter text")
        tokens_out = gr.JSON(label="Tokens")
        ids_out = gr.JSON(label="Token IDs")
        btn = gr.Button("Encode")
        btn.click(encode_text, text_in, [tokens_out, ids_out])

    # Tab 2: full HF encoding (attention mask, offsets, special tokens).
    with gr.Tab("Encode+ (HF full)"):
        text2_in = gr.Textbox(label="Enter text (HF encode_plus)")
        enc_plus_out = gr.JSON(label="Output")
        btn2 = gr.Button("Run encode_plus")
        btn2.click(encode_plus, text2_in, enc_plus_out)

    # Tab 3: decode a comma-separated id list back to text.
    with gr.Tab("Decode"):
        ids_in = gr.Textbox(label="Comma-separated token IDs")
        text_out = gr.Textbox(label="Decoded text")
        btn3 = gr.Button("Decode")
        btn3.click(decode_ids, ids_in, text_out)

    # Tab 4: batch encode — one input line per entry in the output list.
    with gr.Tab("Batch Encode"):
        batch_in = gr.Textbox(
            label="Enter multiple lines (newline separated)",
            placeholder="Line 1\nLine 2\nLine 3"
        )
        batch_out = gr.JSON(label="Batch output list")
        btn4 = gr.Button("Batch Encode")
        btn4.click(batch_encode, batch_in, batch_out)

# Mount FastAPI + Gradio
# `app` is the ASGI entry point for uvicorn: REST endpoints are served
# from the root and the Gradio UI lives under /gradio.
app = gr.mount_gradio_app(api, demo, path="/gradio")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ fastapi
3
+ uvicorn
4
+ transformers
5
+ tokenizers
tokenizer_hindi_bpe_8k_stream/hf/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer_hindi_bpe_8k_stream/hf/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_hindi_bpe_8k_stream/hf/tokenizer_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "mask_token": "[MASK]",
47
+ "model_max_length": 1000000000000000019884624838656,
48
+ "pad_token": "[PAD]",
49
+ "sep_token": "[SEP]",
50
+ "tokenizer_class": "PreTrainedTokenizerFast",
51
+ "unk_token": "[UNK]"
52
+ }
tokenizer_hindi_bpe_8k_stream/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff