dnzblgn committed on
Commit
7d7f8be
·
verified ·
1 Parent(s): e946a7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -3
app.py CHANGED
@@ -20,15 +20,21 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
20
  token_ids = encoded["input_ids"]
21
  tokens = tokenizer.convert_ids_to_tokens(token_ids)
22
 
23
- # Generate a random pastel color
24
  def random_pastel():
25
  r = lambda: random.randint(100, 255)
26
  return f"rgb({r()},{r()},{r()})"
27
 
28
- # Create HTML tokens with random colors and bigger size
 
 
 
29
  html_tokens = []
30
  for idx, token in enumerate(tokens):
31
- color = random_pastel()
 
 
 
 
32
  html_token = f"""
33
  <span style='
34
  display:inline-block;
@@ -50,6 +56,7 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
50
 
51
  return html_output, f"🔢 Token Count: {len(tokens)}"
52
 
 
53
  # Gradio app
54
  with gr.Blocks() as app:
55
  gr.Markdown("# 🚀 Tokenizer Playground (Tiktokenizer-Style)")
 
20
  token_ids = encoded["input_ids"]
21
  tokens = tokenizer.convert_ids_to_tokens(token_ids)
22
 
 
23
  def random_pastel():
24
  r = lambda: random.randint(100, 255)
25
  return f"rgb({r()},{r()},{r()})"
26
 
27
+ # Helper to detect special tokens
28
+ def is_special_token(token):
29
+ return token.startswith('[') and token.endswith(']') or token.startswith('<') and token.endswith('>') or token in tokenizer.all_special_tokens
30
+
31
  html_tokens = []
32
  for idx, token in enumerate(tokens):
33
+ if is_special_token(token):
34
+ color = "lightgray" # special token color
35
+ else:
36
+ color = random_pastel()
37
+
38
  html_token = f"""
39
  <span style='
40
  display:inline-block;
 
56
 
57
  return html_output, f"🔢 Token Count: {len(tokens)}"
58
 
59
+
60
  # Gradio app
61
  with gr.Blocks() as app:
62
  gr.Markdown("# 🚀 Tokenizer Playground (Tiktokenizer-Style)")