# Hindi BPE tokenizer demo: Gradio UI + FastAPI REST backend.
import gradio as gr
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from transformers import PreTrainedTokenizerFast
import os
# --------------------------------------
# LOAD TOKENIZER
# --------------------------------------
# Prefer the full HF-format directory (carries tokenizer_config/special tokens);
# fall back to the raw tokenizer.json emitted by the training script.
TOKENIZER_JSON = "tokenizer_hindi_bpe_8k_stream/tokenizer.json"
HF_DIR = "tokenizer_hindi_bpe_8k_stream/hf"

if os.path.exists(HF_DIR):
    tokenizer = PreTrainedTokenizerFast.from_pretrained(HF_DIR)
elif os.path.exists(TOKENIZER_JSON):
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_JSON)
else:
    raise ValueError("Tokenizer not found!")

print("Tokenizer loaded: vocab =", tokenizer.vocab_size)
# --------------------------------------
# ENCODE / DECODE FUNCTIONS
# --------------------------------------
def encode_text(text: str):
    """Tokenize *text*; return (token strings, comma-separated id string).

    Special tokens are NOT added, so the output mirrors the raw BPE
    segmentation. IDs are returned as CSV so the Gradio Textbox can show them.
    """
    enc = tokenizer(text, add_special_tokens=False)
    tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
    csv_ids = ",".join(str(x) for x in enc["input_ids"])
    return tokens, csv_ids
def encode_plus(text: str):
    """Full HF encoding: ids, attention mask, and character offset mapping.

    Unlike ``encode_text`` this DOES add special tokens. A CSV rendering of
    the ids is attached under ``input_ids_csv`` for easy copy/paste in the UI.
    """
    enc = tokenizer(
        text,
        truncation=False,
        return_attention_mask=True,
        return_offsets_mapping=True,
        add_special_tokens=True,
    )
    enc["input_ids_csv"] = ",".join(str(x) for x in enc["input_ids"])
    return enc
def decode_ids(ids: str):
    """Decode a comma-separated ID string back to text.

    Returns an error-marker string instead of raising, so the Gradio UI can
    display the failure directly.
    """
    try:
        arr = [int(x) for x in ids.split(",") if x.strip()]
        return tokenizer.decode(arr)
    except ValueError:
        # Narrowed from a bare except: only non-numeric entries are expected
        # user errors; anything else should surface as a real traceback.
        return "❌ Invalid ID list"
def batch_encode(text_list):
    """Encode each non-empty line of *text_list* (newline separated).

    Returns a list of dicts per line: the original input, its token strings,
    and the ids as a CSV string.
    """
    lines = [ln.strip() for ln in text_list.split("\n") if ln.strip()]
    enc = tokenizer(lines, add_special_tokens=False)
    out = []
    # zip pairs each cleaned line with its id sequence (same order as input).
    for line, ids in zip(lines, enc["input_ids"]):
        out.append({
            "input": line,
            "tokens": tokenizer.convert_ids_to_tokens(ids),
            "ids_csv": ",".join(str(x) for x in ids),
        })
    return out
# --------------------------------------
# FASTAPI REST BACKEND
# --------------------------------------
api = FastAPI(title="Hindi Tokenizer API")

# Wide-open CORS so the demo endpoints are callable from any origin
# (acceptable for a public demo; tighten for production use).
api.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
@api.get("/")
def home():
    """Root endpoint: service name and tokenizer vocab size."""
    return {
        "message": "Hindi Tokenizer API",
        "vocab_size": tokenizer.vocab_size,
    }
@api.get("/tokenize")
def tokenize_endpoint(text: str):
    """GET /tokenize?text=... -> token strings and ids (no special tokens)."""
    enc = tokenizer(text, add_special_tokens=False)
    tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
    return {"tokens": tokens, "ids": enc["input_ids"]}
@api.get("/decode")
def decode_endpoint(ids: str):
    """GET /decode?ids=1,2,3 -> decoded text, or an error payload."""
    try:
        arr = [int(x) for x in ids.split(",") if x.strip()]
        return {"text": tokenizer.decode(arr)}
    except ValueError:
        # Narrowed from a bare except: only malformed id lists are expected.
        return {"error": "Invalid id list"}
# --------------------------------------
# GRADIO FRONTEND
# --------------------------------------
# Four tabs, each wiring a Button to one of the encode/decode helpers above.
with gr.Blocks(title="Hindi Tokenizer") as demo:
    gr.Markdown("## 🔡 Hindi BPE Tokenizer — Encode / Decode / Batch")

    with gr.Tab("Encode"):
        text_in = gr.Textbox(label="Enter text")
        tokens_out = gr.JSON(label="Tokens")
        ids_out = gr.Textbox(label="Token IDs (CSV)")
        btn = gr.Button("Encode")
        btn.click(encode_text, text_in, [tokens_out, ids_out])

    with gr.Tab("Encode+ (HF full)"):
        text2_in = gr.Textbox(label="Enter text (HF encode_plus)")
        enc_plus_out = gr.JSON(label="Output")
        btn2 = gr.Button("Run encode_plus")
        btn2.click(encode_plus, text2_in, enc_plus_out)

    with gr.Tab("Decode"):
        ids_in = gr.Textbox(label="Comma-separated token IDs")
        text_out = gr.Textbox(label="Decoded text")
        btn3 = gr.Button("Decode")
        btn3.click(decode_ids, ids_in, text_out)

    with gr.Tab("Batch Encode"):
        batch_in = gr.Textbox(
            label="Enter multiple lines (newline separated)",
            placeholder="Line 1\nLine 2\nLine 3"
        )
        batch_out = gr.JSON(label="Batch output (CSV per line)")
        btn4 = gr.Button("Batch Encode")
        btn4.click(batch_encode, batch_in, batch_out)
# Mount FastAPI + Gradio: REST API at the root, UI under /gradio.
# The globals() guard avoids re-mounting when a server (e.g. uvicorn reload)
# re-executes the module while `app` already exists.
if "app" not in globals():
    app = gr.mount_gradio_app(api, demo, path="/gradio")

if __name__ == "__main__":
    # Hosting platforms inject PORT; default to Gradio's standard 7860 locally.
    # (os is already imported at the top of the file; the duplicate import and
    # leftover '@@@' debug print were removed.)
    port = int(os.environ.get("PORT", 7860))
    print("Serving on port", port)
    demo.launch(
        server_name="0.0.0.0",
        server_port=port,
    )