Spaces:
Sleeping
Sleeping
Gül Sena Altıntaş
committed on
Commit
·
199862a
1
Parent(s):
d9779a0
Small improvement for visualization
Browse files
app.py
CHANGED
|
@@ -81,14 +81,23 @@ def generate_interactive_tokenization(results):
|
|
| 81 |
|
| 82 |
# Add styles first
|
| 83 |
html_parts.append("""
|
| 84 |
-
<div id="tokenizer-container">
|
| 85 |
<style>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
.tokenizer-section {
|
| 87 |
margin-bottom: 20px;
|
| 88 |
border: 1px solid #e0e0e0;
|
| 89 |
border-radius: 8px;
|
| 90 |
padding: 15px;
|
| 91 |
background: white;
|
|
|
|
|
|
|
|
|
|
| 92 |
}
|
| 93 |
.tokenizer-header {
|
| 94 |
font-weight: bold;
|
|
@@ -157,6 +166,9 @@ def generate_interactive_tokenization(results):
|
|
| 157 |
font-size: 12px;
|
| 158 |
display: none;
|
| 159 |
z-index: 1000;
|
|
|
|
|
|
|
|
|
|
| 160 |
}
|
| 161 |
</style>
|
| 162 |
|
|
|
|
| 81 |
|
| 82 |
# Add styles first
|
| 83 |
html_parts.append("""
|
| 84 |
+
<div id="tokenizer-container" class="tokenizer-container">
|
| 85 |
<style>
|
| 86 |
+
.tokenizer-container {
|
| 87 |
+
display: flex;
|
| 88 |
+
flex-wrap: wrap;
|
| 89 |
+
justify-content: space-between;
|
| 90 |
+
gap: 20px;
|
| 91 |
+
}
|
| 92 |
.tokenizer-section {
|
| 93 |
margin-bottom: 20px;
|
| 94 |
border: 1px solid #e0e0e0;
|
| 95 |
border-radius: 8px;
|
| 96 |
padding: 15px;
|
| 97 |
background: white;
|
| 98 |
+
flex-wrap: wrap;
|
| 99 |
+
display: inline-block;
|
| 100 |
+
justify-content: space-between;
|
| 101 |
}
|
| 102 |
.tokenizer-header {
|
| 103 |
font-weight: bold;
|
|
|
|
| 166 |
font-size: 12px;
|
| 167 |
display: none;
|
| 168 |
z-index: 1000;
|
| 169 |
+
flex-wrap: wrap;
|
| 170 |
+
display: inline-block;
|
| 171 |
+
justify-content: space-between;
|
| 172 |
}
|
| 173 |
</style>
|
| 174 |
|
utils.py
CHANGED
|
@@ -8,6 +8,8 @@ from transformers import AutoTokenizer
|
|
| 8 |
|
| 9 |
from mappings import MODEL_MAP, TOKENIZER_INFO
|
| 10 |
|
|
|
|
|
|
|
| 11 |
|
| 12 |
class TokenMonsterTokenizer:
|
| 13 |
def __init__(self, name):
|
|
@@ -116,25 +118,33 @@ def tokenize_with_tiktoken(text, model):
|
|
| 116 |
}
|
| 117 |
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
def tokenize_with_hf(text, model):
|
| 120 |
try:
|
| 121 |
-
|
| 122 |
-
# Get token from environment
|
| 123 |
-
hf_token = os.getenv("HF_TOKEN")
|
| 124 |
-
if not hf_token:
|
| 125 |
-
return {
|
| 126 |
-
"model": TOKENIZER_INFO[model]["name"],
|
| 127 |
-
"token_count": 0,
|
| 128 |
-
"tokens": [],
|
| 129 |
-
"error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
|
| 130 |
-
}
|
| 131 |
-
|
| 132 |
-
if "tokenmonster" in model_name:
|
| 133 |
-
tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
|
| 134 |
-
else:
|
| 135 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
| 136 |
-
model_name, token=hf_token, trust_remote_code=True
|
| 137 |
-
)
|
| 138 |
token_data = []
|
| 139 |
for text_ in text.split("\n"):
|
| 140 |
text_ = text_ + "\n"
|
|
|
|
| 8 |
|
| 9 |
from mappings import MODEL_MAP, TOKENIZER_INFO
|
| 10 |
|
| 11 |
+
TOKENIZER_CACHE = {}
|
| 12 |
+
|
| 13 |
|
| 14 |
class TokenMonsterTokenizer:
|
| 15 |
def __init__(self, name):
|
|
|
|
| 118 |
}
|
| 119 |
|
| 120 |
|
| 121 |
+
def get_hf_tokenizer(model):
|
| 122 |
+
model_name = MODEL_MAP.get(model, "gpt2")
|
| 123 |
+
if model_name in TOKENIZER_CACHE:
|
| 124 |
+
return TOKENIZER_CACHE[model_name]
|
| 125 |
+
# Get token from environment
|
| 126 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 127 |
+
if not hf_token:
|
| 128 |
+
return {
|
| 129 |
+
"model": TOKENIZER_INFO[model]["name"],
|
| 130 |
+
"token_count": 0,
|
| 131 |
+
"tokens": [],
|
| 132 |
+
"error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
if "tokenmonster" in model_name:
|
| 136 |
+
tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
|
| 137 |
+
else:
|
| 138 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
| 139 |
+
model_name, token=hf_token, trust_remote_code=True
|
| 140 |
+
)
|
| 141 |
+
TOKENIZER_CACHE[model_name] = tokenizer
|
| 142 |
+
return tokenizer
|
| 143 |
+
|
| 144 |
+
|
| 145 |
def tokenize_with_hf(text, model):
|
| 146 |
try:
|
| 147 |
+
tokenizer = get_hf_tokenizer(model)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
token_data = []
|
| 149 |
for text_ in text.split("\n"):
|
| 150 |
text_ = text_ + "\n"
|