sidharthg committed on
Commit
8e1870d
·
verified ·
1 Parent(s): 6cf0afb

Upload 4 files

Browse files
Files changed (4) hide show
  1. .gitattributes +35 -35
  2. README.md +13 -13
  3. app.py +258 -0
  4. model/vocab.json +0 -0
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,13 @@
1
- ---
2
- title: Marathi Bpe Tokenizer
3
- emoji: 💻
4
- colorFrom: purple
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.49.1
8
- app_file: app.py
9
- pinned: false
10
- short_description: Visualize a Byte-Pair Encoding (BPE) tokenizer for Marathi
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: Marathi Bpe Tokenizer
3
+ emoji: 💻
4
+ colorFrom: purple
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 5.49.1
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: Visualize a Byte-Pair Encoding (BPE) tokenizer for Marathi
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ...existing code...
2
+ """
3
+ Gradio app for Marathi BPE Tokenizer — redesigned UI: elegant, business-oriented styling.
4
+ Usage: python app.py
5
+ """
6
+ from typing import Tuple, List, Dict
7
+ import re
8
+
9
+ import gradio as gr
10
+
11
+ from tokenizer import MarathiBPETokenizer # type: ignore
12
+
13
# Accent palette: bright but refined accents for token chips.
# Colors are handed out round-robin, keyed by each token id's first appearance.
ACCENTS = [
    "#1FB6FF",  # azure
    "#00D4B8",  # teal
    "#FFB86B",  # warm amber
    "#FF6B6B",  # coral
    "#A78BFA",  # muted violet
    "#FFD166",  # soft yellow
    "#8ED1FC",  # light sky
    "#6CE0B6",  # mint
]
24
+
25
+
26
def _token_text(tokenizer: MarathiBPETokenizer, tid: int) -> str:
    """Best-effort conversion of a token id into readable text.

    Lookup order: ``tokenizer.decode`` → an ``id_to_token`` dict →
    the ``vocab`` dict (tried both as id->token and token->id) →
    a ``<id>`` placeholder so the caller always gets something printable.
    """
    # Preferred path: let the tokenizer decode the single id itself.
    try:
        if hasattr(tokenizer, "decode"):
            decoded = tokenizer.decode([tid])
            if decoded:
                return decoded
    except Exception:
        pass  # fall through to the mapping-based fallbacks

    # Direct id -> token mapping, if the tokenizer exposes one.
    # NOTE: matches the original behavior — when this dict exists we return
    # from it (with a placeholder default) without trying `vocab` at all.
    id_map = getattr(tokenizer, "id_to_token", None)
    if isinstance(id_map, dict):
        return id_map.get(tid, f"<{tid}>")

    # Vocab dict: may be keyed by id or by token string, so try both views.
    vocab = getattr(tokenizer, "vocab", None)
    if isinstance(vocab, dict):
        if tid in vocab:
            return vocab[tid]
        for token, token_id in vocab.items():
            if token_id == tid:
                return token

    # Nothing resolved: render a visible placeholder.
    return f"<{tid}>"
48
+
49
+
50
def tokenize_and_visualize(text: str, tokenizer: MarathiBPETokenizer) -> Tuple[str, str, str]:
    """Tokenize *text* and build the three HTML panels for the UI.

    Returns ``(visual_html, count_card_html, token_ids_table_html)``: the
    colored token-chip tile, the token-count card, and the per-token detail
    table.

    Fix over the original: token text is now HTML-escaped before being
    interpolated into markup, so tokens containing ``&``, ``<`` or ``>``
    can no longer break the page or inject markup.
    """
    import html  # local stdlib import: only needed for escaping token text

    if not text or not text.strip():
        placeholder = (
            "<div style='color:#9CA3AF; font-size:15px; padding:12px;'>"
            "Enter Marathi text and click Analyze.</div>"
        )
        return placeholder, "<div style='color:#9CA3AF;'>Token count will appear here</div>", placeholder

    # Try bulk encode; fall back to per-word encoding if the tokenizer chokes.
    try:
        token_ids: List[int] = tokenizer.encode(text)
    except Exception:
        token_ids = []
        for part in text.split():
            try:
                token_ids.extend(tokenizer.encode(part))
            except Exception:
                continue

    # Map each unique token id to an accent color (round-robin by first appearance).
    tid_to_color: Dict[int, str] = {}
    unique_tids: List[int] = []
    for tid in token_ids:
        if tid not in tid_to_color:
            tid_to_color[tid] = ACCENTS[len(unique_tids) % len(ACCENTS)]
            unique_tids.append(tid)

    # Visualization tile (azure-toned business tile).
    vis_outer = [
        '<div style="padding:18px; border-radius:12px; background:linear-gradient(180deg,#063b66 0%,#0a2b48 100%);'
        'color:#F8FAFC; font-family:Inter, \'Noto Sans Devanagari\', Arial, sans-serif; font-size:18px; line-height:2;">'
    ]

    # Split text into chunks for visualization (use tokenizer.pattern if available).
    pattern = getattr(tokenizer, "pattern", r"\S+")
    chunks = re.findall(pattern, text)
    token_idx = 0
    token_rows = []  # list of (idx, tid, token_text, color) for the table

    for chunk in chunks:
        # Prefer the tokenizer's _apply_bpe if available, else encode the chunk.
        if hasattr(tokenizer, "_apply_bpe"):
            try:
                chunk_tids = tokenizer._apply_bpe(chunk)
            except Exception:
                chunk_tids = tokenizer.encode(chunk) if hasattr(tokenizer, "encode") else []
        else:
            try:
                chunk_tids = tokenizer.encode(chunk)
            except Exception:
                chunk_tids = []

        for tid in chunk_tids:
            token_text = _token_text(tokenizer, tid)
            color = tid_to_color.get(tid, ACCENTS[0])
            token_rows.append((token_idx, tid, token_text, color))

            # Escape before interpolating: raw token text may contain HTML metacharacters.
            safe_text = html.escape(token_text)
            # Bright chip with subtle border and drop shadow.
            vis_outer.append(
                f'<span class="token-chip" data-idx="{token_idx}" '
                f'style="background:{color}; color:#fff; padding:8px 12px; margin:6px 6px 6px 0; '
                f'border-radius:10px; display:inline-block; font-weight:600; box-shadow:0 6px 18px rgba(3,12,26,0.35); '
                f'text-shadow:0 1px 2px rgba(0,0,0,0.25);">'
                f'{safe_text}</span>'
            )
            token_idx += 1

    vis_outer.append("</div>")
    visual_html = "".join(vis_outer)

    # Count card: clean slate card.
    count_html = (
        '<div style="padding:14px; border-radius:10px; background:linear-gradient(180deg,#f8fbff 0%,#eaf3ff 100%);'
        'color:#0b2540; text-align:center; font-family:Inter, Arial, sans-serif;">'
        f'<div style="font-size:28px; font-weight:700;">{len(token_ids)}</div>'
        f'<div style="color:#567096; margin-top:6px;">Total tokens • {len(unique_tids)} unique</div>'
        "</div>"
    )

    # Token IDs table: white text on soft azure panel.
    # NOTE(review): the header row uses color:red, which clashes with the
    # stated white/azure design — kept as-is; confirm whether it is intentional.
    table_parts = [
        '<div style="padding:12px; border-radius:10px; background:#083E8C; color:#FFFFFF; max-height:420px; overflow:auto;">',
        '<table style="width:100%; border-collapse:collapse; font-family:Menlo, Monaco, monospace; font-size:13px;">',
        '<thead><tr style="text-align:left; color:red;"><th style="padding:8px 10px;">Idx</th><th style="padding:8px 10px;">Token ID</th><th style="padding:8px 10px;">Token</th><th style="padding:8px 10px;">Color</th></tr></thead>',
        "<tbody>"
    ]

    for idx, tid, ttext, color in token_rows:
        table_parts.append(
            '<tr style="border-bottom:1px solid rgba(255,255,255,0.05);">'
            f'<td style="padding:8px 10px; color:#C9D6E6;">{idx}</td>'
            f'<td style="padding:8px 10px; font-weight:700; color:#FFFFFF;">{tid}</td>'
            # repr() preserves the original quoting style; escape it for safe HTML.
            f'<td style="padding:8px 10px; color:#FFFFFF;">{html.escape(repr(ttext))}</td>'
            f'<td style="padding:8px 10px;"><span style="display:inline-block; background:{color}; padding:6px 14px; border-radius:8px; box-shadow:0 6px 14px rgba(3,12,26,0.28);"></span></td>'
            "</tr>"
        )

    table_parts.extend(["</tbody></table></div>"])
    token_ids_html = "".join(table_parts)

    return visual_html, count_html, token_ids_html
152
+
153
+
154
def create_app(tokenizer: MarathiBPETokenizer) -> gr.Blocks:
    """Build the Gradio Blocks UI (refined, business styling).

    Layout: a header row, then a two-column body — input/examples on the
    left, visualization tile + count card + token table on the right.
    The returned Blocks object is launched by the caller.
    """
    css = """
    <style>
    :root{
        --panel-bg:#0b2540;
        --tile-azure:#083E8C;
        --muted-text:#9CA3AF;
        --header-grey:#374151;
    }
    /* Page baseline */
    body { background: linear-gradient(180deg,#061328 0%, #071627 100%); font-family:Inter, "Noto Sans Devanagari", Arial, sans-serif; }

    /* Header */
    #header { margin-bottom:14px; }
    .app-title { color: var(--header-grey); font-weight:700; font-size:20px; margin:0; }
    .app-sub { color: var(--muted-text); margin:4px 0 0 0; }

    /* Token chip hover */
    .token-chip { transition: transform 0.16s ease, box-shadow 0.16s ease; cursor:default; }
    .token-chip:hover { transform: translateY(-6px); box-shadow:0 28px 48px rgba(3,12,26,0.55); }

    /* Examples styling (compatible across Gradio versions) */
    .gr-examples, .gr-examples td, .gr-examples th { background: transparent !important; color: #E6EEF7 !important; }

    /* Make Gradio tooltips readable */
    .gradio-tooltip { color:#081026 !important; background:#F3F7FB !important; }

    /* Responsive columns spacing */
    .gr-row { gap:18px; }

    /* Small utility */
    .muted { color: var(--muted-text); font-size:13px; }
    </style>
    """

    with gr.Blocks(css="") as ui:
        # Inject CSS as a hidden HTML component.
        # NOTE(review): this assumes the <style> tag is still rendered while the
        # component is hidden — confirm on the pinned Gradio version; passing the
        # rules via gr.Blocks(css=...) is the conventional alternative.
        gr.HTML(css, visible=False)

        # Header area ('classes' kwarg avoided for cross-version compatibility).
        with gr.Row(elem_id="header"):
            with gr.Column(scale=1):
                gr.Markdown(
                    "<div><h1 class='app-title' style='display:inline-block;'>Marathi BPE Tokenizer</h1>"
                    "<div class='app-sub'>Enterprise token inspection & visualization</div></div>"
                )

        # Main content: two-column layout.
        with gr.Row():
            # Left column: text input, action button, and sample inputs.
            with gr.Column(scale=1):
                text_box = gr.Textbox(
                    label="Input Text",
                    placeholder="नमस्ते, मी एक मराठी टोकनायझर आहे",
                    lines=6
                )
                run_btn = gr.Button("Analyze", variant="primary")
                gr.Markdown("<div class='muted' style='margin-top:8px;'>Sample inputs</div>")
                gr.Examples(
                    examples=[
                        ["नमस्ते, मी एक मराठी टोकनायझर आहे."],
                        ["क्रिकेट - लहान मुले बागेत क्रिकेट खेळत आहेत."],
                        ["गाडी हळूहू चालवा किंवा आपल्याला अपघात होऊ शकतो."],
                        ["सचिन तेंडुलकर हा आमचा अव्वल क्रिकेटपटू आहे."],
                    ],
                    inputs=[text_box],
                )

            # Right column: visualization tile, count card, token-detail table.
            with gr.Column(scale=1):
                chip_panel = gr.HTML("<div class='muted'>Token visualization will appear here</div>")
                count_panel = gr.HTML("<div class='muted'>Token count will appear here</div>")
                detail_panel = gr.HTML("<div class='muted'>Token details will appear here</div>")

        # Closure binds the tokenizer instance into the event handler.
        def _on_submit(text: str):
            return tokenize_and_visualize(text or "", tokenizer)

        panels = [chip_panel, count_panel, detail_panel]
        run_btn.click(fn=_on_submit, inputs=[text_box], outputs=panels)
        text_box.submit(fn=_on_submit, inputs=[text_box], outputs=panels)

    return ui
240
+
241
+
242
def main():
    """Entry point: load the trained vocabulary, then launch the Gradio app."""
    tokenizer = MarathiBPETokenizer()
    try:
        tokenizer.load_vocab("model/vocab.json")
        print("✓ Loaded vocabulary successfully")
    except FileNotFoundError:
        # Without a trained vocab the app cannot tokenize anything useful,
        # so bail out with instructions instead of launching a broken UI.
        print("ERROR: Vocabulary file not found at 'model/vocab.json'")
        print("Run: python train.py to train and save the tokenizer.")
        return

    create_app(tokenizer).launch()


if __name__ == "__main__":
    main()
model/vocab.json ADDED
The diff for this file is too large to render. See raw diff