Rahul2020 committed on
Commit
3cb5649
·
verified ·
1 Parent(s): e5ce8eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -136
app.py CHANGED
@@ -1,136 +1,137 @@
1
- import gradio as gr
2
- from fastapi import FastAPI
3
- from fastapi.middleware.cors import CORSMiddleware
4
- from transformers import PreTrainedTokenizerFast
5
- import os
6
-
7
- # --------------------------------------
8
- # LOAD TOKENIZER
9
- # --------------------------------------
10
-
11
- TOKENIZER_JSON = "tokenizer_hindi_bpe_8k_stream/tokenizer.json"
12
- HF_DIR = "tokenizer_hindi_bpe_8k_stream/hf"
13
-
14
- if os.path.exists(HF_DIR):
15
- tokenizer = PreTrainedTokenizerFast.from_pretrained(HF_DIR)
16
- elif os.path.exists(TOKENIZER_JSON):
17
- tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_JSON)
18
- else:
19
- raise ValueError("Tokenizer not found!")
20
-
21
- print("Tokenizer loaded: vocab =", tokenizer.vocab_size)
22
-
23
- # --------------------------------------
24
- # ENCODE / DECODE FUNCTIONS
25
- # --------------------------------------
26
-
27
- def encode_text(text: str):
28
- """Basic encode: returns tokens + ids."""
29
- enc = tokenizer(text, add_special_tokens=False)
30
- tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
31
- return tokens, enc["input_ids"]
32
-
33
- def encode_plus(text: str):
34
- """HF encode_plus with attention mask etc."""
35
- enc = tokenizer(
36
- text,
37
- truncation=False,
38
- return_attention_mask=True,
39
- return_offsets_mapping=True,
40
- add_special_tokens=True
41
- )
42
- return enc
43
-
44
- def decode_ids(ids: str):
45
- """Decode from comma-separated IDs to text."""
46
- try:
47
- arr = [int(x) for x in ids.split(",") if x.strip()]
48
- return tokenizer.decode(arr)
49
- except:
50
- return "❌ Invalid ID list"
51
-
52
- def batch_encode(text_list):
53
- """Batch encode multiple lines separated by newline."""
54
- lines = [ln.strip() for ln in text_list.split("\n") if ln.strip()]
55
- enc = tokenizer(lines, add_special_tokens=False)
56
- out = []
57
- for i, ids in enumerate(enc["input_ids"]):
58
- toks = tokenizer.convert_ids_to_tokens(ids)
59
- out.append({
60
- "input": lines[i],
61
- "tokens": toks,
62
- "ids": ids
63
- })
64
- return out
65
-
66
- # --------------------------------------
67
- # FASTAPI REST BACKEND
68
- # --------------------------------------
69
-
70
- api = FastAPI(title="Hindi Tokenizer API")
71
-
72
- api.add_middleware(
73
- CORSMiddleware,
74
- allow_origins=["*"],
75
- allow_methods=["*"],
76
- allow_headers=["*"]
77
- )
78
-
79
- @api.get("/")
80
- def home():
81
- return {
82
- "message": "Hindi Tokenizer API",
83
- "vocab_size": tokenizer.vocab_size
84
- }
85
-
86
- @api.get("/tokenize")
87
- def tokenize_endpoint(text: str):
88
- enc = tokenizer(text, add_special_tokens=False)
89
- tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
90
- return {"tokens": tokens, "ids": enc["input_ids"]}
91
-
92
- @api.get("/decode")
93
- def decode_endpoint(ids: str):
94
- try:
95
- arr = [int(x) for x in ids.split(",") if x.strip()]
96
- return {"text": tokenizer.decode(arr)}
97
- except:
98
- return {"error": "Invalid id list"}
99
-
100
- # --------------------------------------
101
- # GRADIO FRONTEND
102
- # --------------------------------------
103
-
104
- with gr.Blocks(title="Hindi Tokenizer") as demo:
105
- gr.Markdown("## 🔡 Hindi BPE Tokenizer — Encode / Decode / Batch")
106
-
107
- with gr.Tab("Encode"):
108
- text_in = gr.Textbox(label="Enter text")
109
- tokens_out = gr.JSON(label="Tokens")
110
- ids_out = gr.JSON(label="Token IDs")
111
- btn = gr.Button("Encode")
112
- btn.click(encode_text, text_in, [tokens_out, ids_out])
113
-
114
- with gr.Tab("Encode+ (HF full)"):
115
- text2_in = gr.Textbox(label="Enter text (HF encode_plus)")
116
- enc_plus_out = gr.JSON(label="Output")
117
- btn2 = gr.Button("Run encode_plus")
118
- btn2.click(encode_plus, text2_in, enc_plus_out)
119
-
120
- with gr.Tab("Decode"):
121
- ids_in = gr.Textbox(label="Comma-separated token IDs")
122
- text_out = gr.Textbox(label="Decoded text")
123
- btn3 = gr.Button("Decode")
124
- btn3.click(decode_ids, ids_in, text_out)
125
-
126
- with gr.Tab("Batch Encode"):
127
- batch_in = gr.Textbox(
128
- label="Enter multiple lines (newline separated)",
129
- placeholder="Line 1\nLine 2\nLine 3"
130
- )
131
- batch_out = gr.JSON(label="Batch output list")
132
- btn4 = gr.Button("Batch Encode")
133
- btn4.click(batch_encode, batch_in, batch_out)
134
-
135
- # Mount FastAPI + Gradio
136
- app = gr.mount_gradio_app(api, demo, path="/gradio")
 
 
1
+ import gradio as gr
2
+ from fastapi import FastAPI
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from transformers import PreTrainedTokenizerFast
5
+ import os
6
+
7
# --------------------------------------
# LOAD TOKENIZER
# --------------------------------------

# Candidate tokenizer locations, checked in priority order below.
TOKENIZER_JSON = "tokenizer_hindi_bpe_8k_stream/tokenizer.json"
HF_DIR = "tokenizer_hindi_bpe_8k_stream/hf"

if os.path.exists(HF_DIR):
    # Preferred: a directory in Hugging Face `save_pretrained` layout.
    tokenizer = PreTrainedTokenizerFast.from_pretrained(HF_DIR)
elif os.path.exists(TOKENIZER_JSON):
    # Fallback: wrap a bare `tokenizers`-library JSON file directly.
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_JSON)
else:
    # Fail fast at import time — every function and endpoint below
    # depends on `tokenizer` being loaded.
    raise ValueError("Tokenizer not found!")

print("Tokenizer loaded: vocab =", tokenizer.vocab_size)
22
+
23
+ # --------------------------------------
24
+ # ENCODE / DECODE FUNCTIONS
25
+ # --------------------------------------
26
+
27
def encode_text(text: str):
    """Tokenize *text* (no special tokens) and return (token strings, token ids)."""
    encoded = tokenizer(text, add_special_tokens=False)
    token_ids = encoded["input_ids"]
    return tokenizer.convert_ids_to_tokens(token_ids), token_ids
32
+
33
def encode_plus(text: str):
    """Full HF encoding of *text*: ids, attention mask, and character offsets."""
    return tokenizer(
        text,
        add_special_tokens=True,
        truncation=False,
        return_attention_mask=True,
        return_offsets_mapping=True,
    )
43
+
44
def decode_ids(ids: str):
    """Decode a comma-separated string of token IDs back into text.

    Args:
        ids: e.g. ``"12, 873, 55"``. Empty items between commas are skipped.

    Returns:
        The decoded text, or an error marker string when *ids* is not a
        valid comma-separated integer list (user-facing best-effort).
    """
    try:
        id_list = [int(x) for x in ids.split(",") if x.strip()]
        return tokenizer.decode(id_list)
    # Was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt.
    # `Exception` keeps the original best-effort behavior without that hazard.
    except Exception:
        return "❌ Invalid ID list"
51
+
52
def batch_encode(text_list):
    """Encode every non-empty line of *text_list*; report input/tokens/ids per line."""
    lines = [line.strip() for line in text_list.split("\n") if line.strip()]
    encoded = tokenizer(lines, add_special_tokens=False)
    return [
        {
            "input": line,
            "tokens": tokenizer.convert_ids_to_tokens(ids),
            "ids": ids,
        }
        for line, ids in zip(lines, encoded["input_ids"])
    ]
65
+
66
# --------------------------------------
# FASTAPI REST BACKEND
# --------------------------------------

api = FastAPI(title="Hindi Tokenizer API")

# Wide-open CORS so the API can be called from any web page.
# NOTE(review): "*" origins/methods/headers are acceptable for a public
# demo; tighten before exposing anything sensitive.
api.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"]
)
78
+
79
@api.get("/")
def home():
    """Service banner reporting the loaded tokenizer's vocabulary size."""
    payload = {"message": "Hindi Tokenizer API"}
    payload["vocab_size"] = tokenizer.vocab_size
    return payload
85
+
86
@api.get("/tokenize")
def tokenize_endpoint(text: str):
    """Tokenize query *text* (no special tokens); return token strings and ids."""
    ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    return {"tokens": tokenizer.convert_ids_to_tokens(ids), "ids": ids}
91
+
92
@api.get("/decode")
def decode_endpoint(ids: str):
    """Decode comma-separated token IDs.

    Returns ``{"text": ...}`` on success, ``{"error": ...}`` when *ids*
    is not a valid comma-separated integer list.
    """
    try:
        id_list = [int(x) for x in ids.split(",") if x.strip()]
        return {"text": tokenizer.decode(id_list)}
    # Was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt.
    # `Exception` keeps the original best-effort behavior without that hazard.
    except Exception:
        return {"error": "Invalid id list"}
99
+
100
# --------------------------------------
# GRADIO FRONTEND
# --------------------------------------

# Four tabs, one per tokenizer operation; each wires a button to the
# corresponding function defined above.
with gr.Blocks(title="Hindi Tokenizer") as demo:
    gr.Markdown("## 🔡 Hindi BPE Tokenizer — Encode / Decode / Batch")

    with gr.Tab("Encode"):
        encode_input = gr.Textbox(label="Enter text")
        encode_tokens = gr.JSON(label="Tokens")
        encode_id_list = gr.JSON(label="Token IDs")
        encode_button = gr.Button("Encode")
        encode_button.click(encode_text, encode_input, [encode_tokens, encode_id_list])

    with gr.Tab("Encode+ (HF full)"):
        plus_input = gr.Textbox(label="Enter text (HF encode_plus)")
        plus_output = gr.JSON(label="Output")
        plus_button = gr.Button("Run encode_plus")
        plus_button.click(encode_plus, plus_input, plus_output)

    with gr.Tab("Decode"):
        decode_input = gr.Textbox(label="Comma-separated token IDs")
        decode_output = gr.Textbox(label="Decoded text")
        decode_button = gr.Button("Decode")
        decode_button.click(decode_ids, decode_input, decode_output)

    with gr.Tab("Batch Encode"):
        batch_input = gr.Textbox(
            label="Enter multiple lines (newline separated)",
            placeholder="Line 1\nLine 2\nLine 3"
        )
        batch_output = gr.JSON(label="Batch output list")
        batch_button = gr.Button("Batch Encode")
        batch_button.click(batch_encode, batch_input, batch_output)
134
+
135
# Mount FastAPI + Gradio
# NOTE(review): the `globals()` guard only has an effect when this module is
# re-executed into an existing namespace (an exec-style reload); on a normal
# fresh import `app` is never predefined, so the branch always runs.
# Presumably added to avoid re-mounting on reload — confirm against the
# deployment's reload behavior.
if "app" not in globals():
    app = gr.mount_gradio_app(api, demo, path="/gradio")