Upload 6 files
app.py
ADDED
@@ -0,0 +1,145 @@
import gradio as gr
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from transformers import PreTrainedTokenizerFast
import os

# --------------------------------------
# LOAD TOKENIZER
# --------------------------------------

TOKENIZER_JSON = "tokenizer_hindi_bpe_8k_stream/tokenizer.json"
HF_DIR = "tokenizer_hindi_bpe_8k_stream/hf"

# Prefer the full HF directory (config + special tokens); fall back to the raw tokenizer.json.
if os.path.exists(HF_DIR):
    tokenizer = PreTrainedTokenizerFast.from_pretrained(HF_DIR)
elif os.path.exists(TOKENIZER_JSON):
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_JSON)
else:
    raise FileNotFoundError("Tokenizer not found!")

print("Tokenizer loaded: vocab =", tokenizer.vocab_size)

# --------------------------------------
# ENCODE / DECODE FUNCTIONS
# --------------------------------------

def encode_text(text: str):
    """Basic encode: returns tokens + comma-separated ids."""
    enc = tokenizer(text, add_special_tokens=False)
    tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
    csv_ids = ",".join(str(x) for x in enc["input_ids"])
    return tokens, csv_ids

def encode_plus(text: str):
    """Full HF encoding: attention mask, character offsets, special tokens."""
    enc = tokenizer(
        text,
        truncation=False,
        return_attention_mask=True,
        return_offsets_mapping=True,
        add_special_tokens=True,
    )
    enc["input_ids_csv"] = ",".join(str(x) for x in enc["input_ids"])
    # BatchEncoding is not JSON-serializable; hand Gradio a plain dict.
    return dict(enc)

def decode_ids(ids: str):
    """Decode comma-separated IDs back to text."""
    try:
        arr = [int(x) for x in ids.split(",") if x.strip()]
        return tokenizer.decode(arr)
    except ValueError:
        return "❌ Invalid ID list"

def batch_encode(text_list):
    """Batch-encode multiple newline-separated lines."""
    lines = [ln.strip() for ln in text_list.split("\n") if ln.strip()]
    enc = tokenizer(lines, add_special_tokens=False)
    out = []
    for i, ids in enumerate(enc["input_ids"]):
        toks = tokenizer.convert_ids_to_tokens(ids)
        out.append({
            "input": lines[i],
            "tokens": toks,
            "ids_csv": ",".join(str(x) for x in ids),
        })
    return out

# --------------------------------------
# FASTAPI REST BACKEND
# --------------------------------------

api = FastAPI(title="Hindi Tokenizer API")

api.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

@api.get("/")
def home():
    return {
        "message": "Hindi Tokenizer API",
        "vocab_size": tokenizer.vocab_size,
    }

@api.get("/tokenize")
def tokenize_endpoint(text: str):
    enc = tokenizer(text, add_special_tokens=False)
    tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
    return {"tokens": tokens, "ids": enc["input_ids"]}

@api.get("/decode")
def decode_endpoint(ids: str):
    try:
        arr = [int(x) for x in ids.split(",") if x.strip()]
        return {"text": tokenizer.decode(arr)}
    except ValueError:
        return {"error": "Invalid id list"}

# --------------------------------------
# GRADIO FRONTEND
# --------------------------------------

with gr.Blocks(title="Hindi Tokenizer") as demo:
    gr.Markdown("## 🔡 Hindi BPE Tokenizer — Encode / Decode / Batch")

    with gr.Tab("Encode"):
        text_in = gr.Textbox(label="Enter text")
        tokens_out = gr.JSON(label="Tokens")
        ids_out = gr.Textbox(label="Token IDs (CSV)")
        btn = gr.Button("Encode")
        btn.click(encode_text, text_in, [tokens_out, ids_out])

    with gr.Tab("Encode+ (HF full)"):
        text2_in = gr.Textbox(label="Enter text (HF encode_plus)")
        enc_plus_out = gr.JSON(label="Output")
        btn2 = gr.Button("Run encode_plus")
        btn2.click(encode_plus, text2_in, enc_plus_out)

    with gr.Tab("Decode"):
        ids_in = gr.Textbox(label="Comma-separated token IDs")
        text_out = gr.Textbox(label="Decoded text")
        btn3 = gr.Button("Decode")
        btn3.click(decode_ids, ids_in, text_out)

    with gr.Tab("Batch Encode"):
        batch_in = gr.Textbox(
            label="Enter multiple lines (newline separated)",
            placeholder="Line 1\nLine 2\nLine 3",
        )
        batch_out = gr.JSON(label="Batch output (CSV per line)")
        btn4 = gr.Button("Batch Encode")
        btn4.click(batch_encode, batch_in, batch_out)

# Mount the Gradio UI onto the FastAPI app; Spaces/uvicorn pick up the module-level `app`.
if "app" not in globals():
    app = gr.mount_gradio_app(api, demo, path="/gradio")

if __name__ == "__main__":
    demo.launch(server_port=7860, share=False)
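For reference, a minimal client sketch against the REST routes above. It assumes the module is served with `uvicorn app:app --port 7860` (so the FastAPI routes sit at the root and the Gradio UI lives under /gradio) and that `requests` is installed; neither assumption is pinned in requirements.txt.

import requests

BASE = "http://localhost:7860"  # assumption: local uvicorn run

# Encode a Hindi sentence via GET /tokenize
enc = requests.get(f"{BASE}/tokenize", params={"text": "नमस्ते दुनिया"}).json()
print(enc["tokens"])

# Round-trip the ids through GET /decode
csv_ids = ",".join(str(i) for i in enc["ids"])
dec = requests.get(f"{BASE}/decode", params={"ids": csv_ids}).json()
print(dec["text"])  # should reproduce the input text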
requirements.txt
ADDED
@@ -0,0 +1,6 @@
gradio
fastapi
uvicorn
transformers
tokenizers
torch --index-url https://download.pytorch.org/whl/cpu
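All six dependencies are left unpinned; the last line routes the torch install through PyTorch's CPU-only wheel index, a common pattern on CPU Spaces to keep the image small, since nothing in app.py needs a GPU.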
tokenizer_hindi_bpe_8k_stream/hf/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
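This file registers the five BERT-style special tokens; the `PreTrainedTokenizerFast.from_pretrained(HF_DIR)` branch in app.py reads it automatically (together with tokenizer_config.json below), so no extra wiring is needed.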
tokenizer_hindi_bpe_8k_stream/hf/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_hindi_bpe_8k_stream/hf/tokenizer_config.json
ADDED
@@ -0,0 +1,52 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "4": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": false,
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "tokenizer_class": "PreTrainedTokenizerFast",
  "unk_token": "[UNK]"
}
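A quick sanity check of the token layout declared above (ids 0-4 in added_tokens_decoder); a minimal sketch, assuming the hf/ directory from this commit is available locally:

from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast.from_pretrained("tokenizer_hindi_bpe_8k_stream/hf")
for special in ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]:
    print(special, tok.convert_tokens_to_ids(special))  # expected: 0, 1, 2, 3, 4
print(tok.model_max_length)  # the "no limit" sentinel from the config above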
tokenizer_hindi_bpe_8k_stream/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.