Rahul2020 committed on
Commit
44758ce
·
verified ·
1 Parent(s): 3cb5649

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -10
app.py CHANGED
@@ -28,10 +28,11 @@ def encode_text(text: str):
28
  """Basic encode: returns tokens + ids."""
29
  enc = tokenizer(text, add_special_tokens=False)
30
  tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
31
- return tokens, enc["input_ids"]
 
 
32
 
33
  def encode_plus(text: str):
34
- """HF encode_plus with attention mask etc."""
35
  enc = tokenizer(
36
  text,
37
  truncation=False,
@@ -39,8 +40,10 @@ def encode_plus(text: str):
39
  return_offsets_mapping=True,
40
  add_special_tokens=True
41
  )
 
42
  return enc
43
 
 
44
  def decode_ids(ids: str):
45
  """Decode from comma-separated IDs to text."""
46
  try:
@@ -56,11 +59,11 @@ def batch_encode(text_list):
56
  out = []
57
  for i, ids in enumerate(enc["input_ids"]):
58
  toks = tokenizer.convert_ids_to_tokens(ids)
59
- out.append({
60
- "input": lines[i],
61
- "tokens": toks,
62
- "ids": ids
63
- })
64
  return out
65
 
66
  # --------------------------------------
@@ -107,7 +110,7 @@ with gr.Blocks(title="Hindi Tokenizer") as demo:
107
  with gr.Tab("Encode"):
108
  text_in = gr.Textbox(label="Enter text")
109
  tokens_out = gr.JSON(label="Tokens")
110
- ids_out = gr.JSON(label="Token IDs")
111
  btn = gr.Button("Encode")
112
  btn.click(encode_text, text_in, [tokens_out, ids_out])
113
 
@@ -128,10 +131,15 @@ with gr.Blocks(title="Hindi Tokenizer") as demo:
128
  label="Enter multiple lines (newline separated)",
129
  placeholder="Line 1\nLine 2\nLine 3"
130
  )
131
- batch_out = gr.JSON(label="Batch output list")
 
132
  btn4 = gr.Button("Batch Encode")
133
  btn4.click(batch_encode, batch_in, batch_out)
134
 
135
  # Mount FastAPI + Gradio
 
136
  if "app" not in globals():
137
- app = gr.mount_gradio_app(api, demo, path="/gradio")
 
 
 
 
28
  """Basic encode: returns tokens + ids."""
29
  enc = tokenizer(text, add_special_tokens=False)
30
  tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
31
+ # return tokens, enc["input_ids"]
32
+ csv_ids = ",".join(str(x) for x in enc["input_ids"])
33
+ return tokens, csv_ids
34
 
35
  def encode_plus(text: str):
 
36
  enc = tokenizer(
37
  text,
38
  truncation=False,
 
40
  return_offsets_mapping=True,
41
  add_special_tokens=True
42
  )
43
+ enc["input_ids_csv"] = ",".join(str(x) for x in enc["input_ids"])
44
  return enc
45
 
46
+
47
  def decode_ids(ids: str):
48
  """Decode from comma-separated IDs to text."""
49
  try:
 
59
  out = []
60
  for i, ids in enumerate(enc["input_ids"]):
61
  toks = tokenizer.convert_ids_to_tokens(ids)
62
+ out.append({
63
+ "input": lines[i],
64
+ "tokens": toks,
65
+ "ids_csv": ",".join(str(x) for x in ids)
66
+ })
67
  return out
68
 
69
  # --------------------------------------
 
110
  with gr.Tab("Encode"):
111
  text_in = gr.Textbox(label="Enter text")
112
  tokens_out = gr.JSON(label="Tokens")
113
+ ids_out = gr.Textbox(label="Token IDs (CSV)")
114
  btn = gr.Button("Encode")
115
  btn.click(encode_text, text_in, [tokens_out, ids_out])
116
 
 
131
  label="Enter multiple lines (newline separated)",
132
  placeholder="Line 1\nLine 2\nLine 3"
133
  )
134
+ batch_out = gr.JSON(label="Batch output (CSV per line)")
135
+
136
  btn4 = gr.Button("Batch Encode")
137
  btn4.click(batch_encode, batch_in, batch_out)
138
 
139
  # Mount FastAPI + Gradio
140
+
141
  if "app" not in globals():
142
+ app = gr.mount_gradio_app(api, demo, path="/gradio")
143
+
144
+ if __name__ == "__main__":
145
+ demo.launch(server_port=7860, share=False)