Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -28,10 +28,11 @@ def encode_text(text: str):
|
|
| 28 |
"""Basic encode: returns tokens + ids."""
|
| 29 |
enc = tokenizer(text, add_special_tokens=False)
|
| 30 |
tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
|
| 31 |
-
return tokens, enc["input_ids"]
|
|
|
|
|
|
|
| 32 |
|
| 33 |
def encode_plus(text: str):
|
| 34 |
-
"""HF encode_plus with attention mask etc."""
|
| 35 |
enc = tokenizer(
|
| 36 |
text,
|
| 37 |
truncation=False,
|
|
@@ -39,8 +40,10 @@ def encode_plus(text: str):
|
|
| 39 |
return_offsets_mapping=True,
|
| 40 |
add_special_tokens=True
|
| 41 |
)
|
|
|
|
| 42 |
return enc
|
| 43 |
|
|
|
|
| 44 |
def decode_ids(ids: str):
|
| 45 |
"""Decode from comma-separated IDs to text."""
|
| 46 |
try:
|
|
@@ -56,11 +59,11 @@ def batch_encode(text_list):
|
|
| 56 |
out = []
|
| 57 |
for i, ids in enumerate(enc["input_ids"]):
|
| 58 |
toks = tokenizer.convert_ids_to_tokens(ids)
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
return out
|
| 65 |
|
| 66 |
# --------------------------------------
|
|
@@ -107,7 +110,7 @@ with gr.Blocks(title="Hindi Tokenizer") as demo:
|
|
| 107 |
with gr.Tab("Encode"):
|
| 108 |
text_in = gr.Textbox(label="Enter text")
|
| 109 |
tokens_out = gr.JSON(label="Tokens")
|
| 110 |
-
ids_out = gr.
|
| 111 |
btn = gr.Button("Encode")
|
| 112 |
btn.click(encode_text, text_in, [tokens_out, ids_out])
|
| 113 |
|
|
@@ -128,10 +131,15 @@ with gr.Blocks(title="Hindi Tokenizer") as demo:
|
|
| 128 |
label="Enter multiple lines (newline separated)",
|
| 129 |
placeholder="Line 1\nLine 2\nLine 3"
|
| 130 |
)
|
| 131 |
-
batch_out = gr.JSON(label="Batch output
|
|
|
|
| 132 |
btn4 = gr.Button("Batch Encode")
|
| 133 |
btn4.click(batch_encode, batch_in, batch_out)
|
| 134 |
|
| 135 |
# Mount FastAPI + Gradio
|
|
|
|
| 136 |
if "app" not in globals():
|
| 137 |
-
app = gr.mount_gradio_app(api, demo, path="/gradio")
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
"""Basic encode: returns tokens + ids."""
|
| 29 |
enc = tokenizer(text, add_special_tokens=False)
|
| 30 |
tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
|
| 31 |
+
# return tokens, enc["input_ids"]
|
| 32 |
+
csv_ids = ",".join(str(x) for x in enc["input_ids"])
|
| 33 |
+
return tokens, csv_ids
|
| 34 |
|
| 35 |
def encode_plus(text: str):
|
|
|
|
| 36 |
enc = tokenizer(
|
| 37 |
text,
|
| 38 |
truncation=False,
|
|
|
|
| 40 |
return_offsets_mapping=True,
|
| 41 |
add_special_tokens=True
|
| 42 |
)
|
| 43 |
+
enc["input_ids_csv"] = ",".join(str(x) for x in enc["input_ids"])
|
| 44 |
return enc
|
| 45 |
|
| 46 |
+
|
| 47 |
def decode_ids(ids: str):
|
| 48 |
"""Decode from comma-separated IDs to text."""
|
| 49 |
try:
|
|
|
|
| 59 |
out = []
|
| 60 |
for i, ids in enumerate(enc["input_ids"]):
|
| 61 |
toks = tokenizer.convert_ids_to_tokens(ids)
|
| 62 |
+
out.append({
|
| 63 |
+
"input": lines[i],
|
| 64 |
+
"tokens": toks,
|
| 65 |
+
"ids_csv": ",".join(str(x) for x in ids)
|
| 66 |
+
})
|
| 67 |
return out
|
| 68 |
|
| 69 |
# --------------------------------------
|
|
|
|
| 110 |
with gr.Tab("Encode"):
|
| 111 |
text_in = gr.Textbox(label="Enter text")
|
| 112 |
tokens_out = gr.JSON(label="Tokens")
|
| 113 |
+
ids_out = gr.Textbox(label="Token IDs (CSV)")
|
| 114 |
btn = gr.Button("Encode")
|
| 115 |
btn.click(encode_text, text_in, [tokens_out, ids_out])
|
| 116 |
|
|
|
|
| 131 |
label="Enter multiple lines (newline separated)",
|
| 132 |
placeholder="Line 1\nLine 2\nLine 3"
|
| 133 |
)
|
| 134 |
+
batch_out = gr.JSON(label="Batch output (CSV per line)")
|
| 135 |
+
|
| 136 |
btn4 = gr.Button("Batch Encode")
|
| 137 |
btn4.click(batch_encode, batch_in, batch_out)
|
| 138 |
|
| 139 |
# Mount FastAPI + Gradio
|
| 140 |
+
|
| 141 |
if "app" not in globals():
|
| 142 |
+
app = gr.mount_gradio_app(api, demo, path="/gradio")
|
| 143 |
+
|
| 144 |
+
if __name__ == "__main__":
|
| 145 |
+
demo.launch(server_port=7860, share=False)
|