Rahul2020 committed on
Commit
b5259f8
·
verified ·
1 Parent(s): 1103cec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -145
app.py CHANGED
@@ -1,145 +1,100 @@
1
- import gradio as gr
2
- from fastapi import FastAPI
3
- from fastapi.middleware.cors import CORSMiddleware
4
- from transformers import PreTrainedTokenizerFast
5
- import os
6
-
7
- # --------------------------------------
8
- # LOAD TOKENIZER
9
- # --------------------------------------
10
-
11
- TOKENIZER_JSON = "tokenizer_hindi_bpe_8k_stream/tokenizer.json"
12
- HF_DIR = "tokenizer_hindi_bpe_8k_stream/hf"
13
-
14
- if os.path.exists(HF_DIR):
15
- tokenizer = PreTrainedTokenizerFast.from_pretrained(HF_DIR)
16
- elif os.path.exists(TOKENIZER_JSON):
17
- tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_JSON)
18
- else:
19
- raise ValueError("Tokenizer not found!")
20
-
21
- print("Tokenizer loaded: vocab =", tokenizer.vocab_size)
22
-
23
- # --------------------------------------
24
- # ENCODE / DECODE FUNCTIONS
25
- # --------------------------------------
26
-
27
- def encode_text(text: str):
28
- """Basic encode: returns tokens + ids."""
29
- enc = tokenizer(text, add_special_tokens=False)
30
- tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
31
- # return tokens, enc["input_ids"]
32
- csv_ids = ",".join(str(x) for x in enc["input_ids"])
33
- return tokens, csv_ids
34
-
35
- def encode_plus(text: str):
36
- enc = tokenizer(
37
- text,
38
- truncation=False,
39
- return_attention_mask=True,
40
- return_offsets_mapping=True,
41
- add_special_tokens=True
42
- )
43
- enc["input_ids_csv"] = ",".join(str(x) for x in enc["input_ids"])
44
- return enc
45
-
46
-
47
- def decode_ids(ids: str):
48
- """Decode from comma-separated IDs to text."""
49
- try:
50
- arr = [int(x) for x in ids.split(",") if x.strip()]
51
- return tokenizer.decode(arr)
52
- except:
53
- return "❌ Invalid ID list"
54
-
55
- def batch_encode(text_list):
56
- """Batch encode multiple lines separated by newline."""
57
- lines = [ln.strip() for ln in text_list.split("\n") if ln.strip()]
58
- enc = tokenizer(lines, add_special_tokens=False)
59
- out = []
60
- for i, ids in enumerate(enc["input_ids"]):
61
- toks = tokenizer.convert_ids_to_tokens(ids)
62
- out.append({
63
- "input": lines[i],
64
- "tokens": toks,
65
- "ids_csv": ",".join(str(x) for x in ids)
66
- })
67
- return out
68
-
69
- # --------------------------------------
70
- # FASTAPI REST BACKEND
71
- # --------------------------------------
72
-
73
- api = FastAPI(title="Hindi Tokenizer API")
74
-
75
- api.add_middleware(
76
- CORSMiddleware,
77
- allow_origins=["*"],
78
- allow_methods=["*"],
79
- allow_headers=["*"]
80
- )
81
-
82
- @api.get("/")
83
- def home():
84
- return {
85
- "message": "Hindi Tokenizer API",
86
- "vocab_size": tokenizer.vocab_size
87
- }
88
-
89
- @api.get("/tokenize")
90
- def tokenize_endpoint(text: str):
91
- enc = tokenizer(text, add_special_tokens=False)
92
- tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
93
- return {"tokens": tokens, "ids": enc["input_ids"]}
94
-
95
- @api.get("/decode")
96
- def decode_endpoint(ids: str):
97
- try:
98
- arr = [int(x) for x in ids.split(",") if x.strip()]
99
- return {"text": tokenizer.decode(arr)}
100
- except:
101
- return {"error": "Invalid id list"}
102
-
103
- # --------------------------------------
104
- # GRADIO FRONTEND
105
- # --------------------------------------
106
-
107
- with gr.Blocks(title="Hindi Tokenizer") as demo:
108
- gr.Markdown("## 🔡 Hindi BPE Tokenizer — Encode / Decode / Batch")
109
-
110
- with gr.Tab("Encode"):
111
- text_in = gr.Textbox(label="Enter text")
112
- tokens_out = gr.JSON(label="Tokens")
113
- ids_out = gr.Textbox(label="Token IDs (CSV)")
114
- btn = gr.Button("Encode")
115
- btn.click(encode_text, text_in, [tokens_out, ids_out])
116
-
117
- with gr.Tab("Encode+ (HF full)"):
118
- text2_in = gr.Textbox(label="Enter text (HF encode_plus)")
119
- enc_plus_out = gr.JSON(label="Output")
120
- btn2 = gr.Button("Run encode_plus")
121
- btn2.click(encode_plus, text2_in, enc_plus_out)
122
-
123
- with gr.Tab("Decode"):
124
- ids_in = gr.Textbox(label="Comma-separated token IDs")
125
- text_out = gr.Textbox(label="Decoded text")
126
- btn3 = gr.Button("Decode")
127
- btn3.click(decode_ids, ids_in, text_out)
128
-
129
- with gr.Tab("Batch Encode"):
130
- batch_in = gr.Textbox(
131
- label="Enter multiple lines (newline separated)",
132
- placeholder="Line 1\nLine 2\nLine 3"
133
- )
134
- batch_out = gr.JSON(label="Batch output (CSV per line)")
135
-
136
- btn4 = gr.Button("Batch Encode")
137
- btn4.click(batch_encode, batch_in, batch_out)
138
-
139
- # Mount FastAPI + Gradio
140
-
141
- if "app" not in globals():
142
- app = gr.mount_gradio_app(api, demo, path="/gradio")
143
-
144
- if __name__ == "__main__":
145
- demo.launch(server_port=7860, share=False)
 
1
+ import gradio as gr
2
+ from fastapi import FastAPI
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from transformers import PreTrainedTokenizerFast
5
+ import os
6
+
7
# --------------------------------------
# LOAD TOKENIZER
# --------------------------------------

# Candidate artifact locations for the trained Hindi BPE tokenizer.
TOKENIZER_JSON = "tokenizer_hindi_bpe_8k_stream/tokenizer.json"
HF_DIR = "tokenizer_hindi_bpe_8k_stream/hf"

# Prefer the full HF save directory; fall back to the raw tokenizer.json.
if os.path.exists(HF_DIR):
    tokenizer = PreTrainedTokenizerFast.from_pretrained(HF_DIR)
else:
    if not os.path.exists(TOKENIZER_JSON):
        raise ValueError("Tokenizer not found!")
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_JSON)

print("Tokenizer loaded: vocab =", tokenizer.vocab_size)
22
+
23
+ # --------------------------------------
24
+ # ENCODE / DECODE FUNCTIONS
25
+ # --------------------------------------
26
+
27
def encode_text(text: str):
    """Tokenize *text* (no special tokens) and return the token IDs as one
    comma-separated string, suitable for a plain Gradio textbox."""
    ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    return ",".join(map(str, ids))
32
+
33
def decode_ids(ids: str):
    """Decode a comma-separated list of token IDs back into text.

    Returns a friendly error-marker string (rather than raising) when the
    input cannot be parsed, so the Gradio UI shows a readable message.
    """
    try:
        arr = [int(x) for x in ids.split(",") if x.strip()]
        return tokenizer.decode(arr)
    # Narrowed from a bare `except:` so programming errors, SystemExit and
    # KeyboardInterrupt are no longer silently swallowed. ValueError covers
    # non-integer fragments; OverflowError covers absurdly large IDs.
    except (ValueError, OverflowError):
        return "❌ Invalid ID list"
40
+
41
+ # --------------------------------------
42
+ # FASTAPI REST BACKEND
43
+ # --------------------------------------
44
+
45
# REST backend; the Gradio UI is mounted onto this app further below.
api = FastAPI(title="Hindi Tokenizer API")

# Fully open CORS policy: any origin, method, and header is accepted.
# NOTE(review): acceptable for a public demo Space; tighten allow_origins
# before any production deployment.
api.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"]
)
53
+
54
@api.get("/")
def home():
    """Root endpoint: service banner plus the loaded tokenizer's vocab size."""
    payload = {
        "message": "Hindi Tokenizer API",
        "vocab_size": tokenizer.vocab_size,
    }
    return payload
60
+
61
@api.get("/tokenize")
def tokenize_endpoint(text: str):
    """Encode *text* (no special tokens); return token strings and their IDs."""
    ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    return {"tokens": tokenizer.convert_ids_to_tokens(ids), "ids": ids}
66
+
67
+ @api.get("/decode")
68
+ def decode_endpoint(ids: str):
69
+ try:
70
+ arr = [int(x) for x in ids.split(",") if x.strip()]
71
+ return {"text": tokenizer.decode(arr)}
72
+ except:
73
+ return {"error": "Invalid id list"}
74
+
75
+ # --------------------------------------
76
+ # GRADIO FRONTEND
77
+ # --------------------------------------
78
+
79
# Two-tab Gradio UI wired to the encode/decode helpers defined above.
with gr.Blocks(title="Hindi Tokenizer") as demo:
    gr.Markdown("## 🔡 Hindi BPE Tokenizer — Encode / Decode")

    with gr.Tab("Encode"):
        encode_input = gr.Textbox(label="Enter text", lines=3)
        encode_output = gr.Textbox(label="Token IDs", lines=8, max_lines=20)
        encode_button = gr.Button("Encode")
        encode_button.click(encode_text, encode_input, encode_output)

    with gr.Tab("Decode"):
        decode_input = gr.Textbox(label="Comma-separated token IDs", lines=4)
        decode_output = gr.Textbox(label="Decoded text", lines=8, max_lines=20)
        decode_button = gr.Button("Decode")
        decode_button.click(decode_ids, decode_input, decode_output)
93
+
94
# Mount FastAPI + Gradio.
# The `globals()` guard prevents a second mount when the module is
# re-executed (e.g. by a hot-reloading runner); `app` is the ASGI entry
# point the hosting runtime serves.

if "app" not in globals():
    app = gr.mount_gradio_app(api, demo, path="/gradio")

if __name__ == "__main__":
    # Direct local run: serve the Gradio UI itself on port 7860.
    demo.launch(server_port=7860, share=False)