Prgckwb committed · 6c94b18
Parent(s): 93013c6

Add app.py

- app.py +66 -4
- requirements.txt +5 -0
app.py
CHANGED
@@ -1,7 +1,69 @@
 import gradio as gr
+import pandas as pd
+from gradio.themes import colors
+from transformers import AutoTokenizer
 
-
-
+# Function to map tokenized text to IDs
+def inference(
+    text="",
+    model_id="openai/clip-vit-large-patch14",
+) -> tuple[list[tuple[str, str]], pd.DataFrame]:
+    if text == "":
+        return [], pd.DataFrame()
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-
-
+    # Use tokenizer to tokenize the text
+    text_inputs = tokenizer(text, return_tensors='pt')
+
+    input_ids = text_inputs['input_ids'].tolist()[0]  # Convert tensor to list
+
+    # Create pairs of tokens and IDs
+    tokens = [tokenizer.decode([id_]) for id_ in input_ids]
+    token_pairs = []
+
+    for token, id_ in zip(tokens, input_ids):
+        token_pairs.append((token, str(id_)))
+
+    # Count the number of characters and tokens
+    pos_count = pd.DataFrame({
+        "Char Count": [len(text)],
+        "Token Count": [len(token_pairs)]
+    })
+    return token_pairs, pos_count
+
+
+if __name__ == '__main__':
+    iface = gr.Interface(
+        fn=inference,
+        inputs=[
+            gr.Textbox(label="Text"),
+            gr.Dropdown(
+                label="Model",
+                choices=[
+                    "openai/clip-vit-large-patch14",
+                    "google-bert/bert-base-uncased",
+                    "google/flan-t5-base",
+                    "openai-community/gpt2",
+                ],
+                value="openai/clip-vit-large-patch14"
+            ),
+        ],
+        outputs=[
+            gr.HighlightedText(label="Highlighted Text"),
+            gr.Dataframe(label="Position Count"),
+        ],
+        examples=[
+            ["When I told my computer I needed a break, it froze.", "openai/clip-vit-large-patch14"],
+            ["Yesterday, I thought my cat was studying for her degree in philosophy because she sat on my book, "
+             "but turns out she was just trying to hatch a plot to steal my dinner.", "openai/clip-vit-large-patch14"],
+            ["The square root of x is the cube root of y. What is y to the power of 2, if x = 4?",
+             "google/flan-t5-base"]
+        ],
+        cache_examples=True,
+        title="TokenVisor",
+        description="Visualize how the Tokenizer used in Hugging Face's Transformers library tokenizes text.",
+        theme=gr.Theme(primary_hue=colors.green, secondary_hue=colors.yellow),
+        allow_flagging="never",
+
+    )
+    iface.launch()
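For reference, the token/ID mapping that inference feeds into gr.HighlightedText can be reproduced outside the Gradio UI. A minimal sketch, assuming transformers, torch, and pandas are installed; the sample sentence is taken from the commit's own examples, and the printed output format is illustrative rather than part of the commit:

import pandas as pd
from transformers import AutoTokenizer

# Same mapping as inference(), minus the Gradio UI.
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text = "When I told my computer I needed a break, it froze."

input_ids = tokenizer(text, return_tensors='pt')['input_ids'].tolist()[0]
token_pairs = [(tokenizer.decode([id_]), str(id_)) for id_ in input_ids]

# token_pairs is the (token, label) payload HighlightedText expects,
# with the token ID serving as the label.
for token, id_ in token_pairs:
    print(f"{token!r} -> {id_}")

# Mirrors the "Position Count" dataframe shown in the app.
print(pd.DataFrame({"Char Count": [len(text)], "Token Count": [len(token_pairs)]}))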
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+torch
+transformers
+safetensors
+accelerate
+diffusers
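Of the five pinned packages, this revision of app.py only exercises transformers directly, with torch backing the return_tensors='pt' call; gradio and pandas come with the Gradio Space runtime, while safetensors, accelerate, and diffusers are not referenced by the code. A quick, illustrative sanity check that the built environment resolves every pinned dependency (a hypothetical helper, not part of the commit):

import importlib

# Confirm each package from requirements.txt imports in the environment.
for pkg in ["torch", "transformers", "safetensors", "accelerate", "diffusers"]:
    importlib.import_module(pkg)
    print(f"{pkg} OK")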