Gül Sena Altıntaş committed
Commit 44cdae3 · 1 parent: 199862a
Improvements
app.py
CHANGED
@@ -576,7 +576,13 @@ with gr.Blocks(
             "tokenmonster",
             "byt5",
         ],
-        value=[
+        value=[
+            "gpt-4",
+            "llama-3",
+            "gemma-2",
+            "qwen2.5",
+            "tokenmonster",
+        ],
         label="Select tokenizers to compare",
     )
     show_details = gr.Checkbox(
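For orientation, the selector this hunk edits presumably looks something like the sketch below; only the tail of the choices list, the new value list, and the label come from the diff, while the variable name and the full choices list are assumptions.

import gradio as gr

# Hypothetical reconstruction of the widget the hunk above touches; the
# variable name and the leading `choices` entries are assumptions.
tokenizer_choices = gr.CheckboxGroup(
    choices=[
        "gpt-4",
        "llama-3",
        "gemma-2",
        "qwen2.5",
        "tokenmonster",
        "byt5",
    ],
    value=[  # tokenizers pre-selected on load, per this commit
        "gpt-4",
        "llama-3",
        "gemma-2",
        "qwen2.5",
        "tokenmonster",
    ],
    label="Select tokenizers to compare",
)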
@@ -679,7 +685,7 @@ with gr.Blocks(
         )

         # Combine or show separately
-        combined_html = f"<h3>Normalized Text: {normalized_text}</h3>{norm_html}\n<h2>Original</h2>{orig_html}"
+        combined_html = f"<h3>Normalized ({norm_method}) Text: {normalized_text} </h3>{norm_html}\n<h2>Original</h2>{orig_html}"

         return (
             orig_eff,
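The second hunk only threads the normalization method name into the page heading. With placeholder values (none of these literals come from the app), the string renders as shown:

norm_method = "NFKC"                        # assumed example value
normalized_text = "cafe"                    # assumed example value
norm_html = "<div>normalized tokens</div>"  # placeholder fragment
orig_html = "<div>original tokens</div>"    # placeholder fragment

combined_html = f"<h3>Normalized ({norm_method}) Text: {normalized_text} </h3>{norm_html}\n<h2>Original</h2>{orig_html}"
# -> <h3>Normalized (NFKC) Text: cafe </h3><div>normalized tokens</div>
#    <h2>Original</h2><div>original tokens</div>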
utils.py
CHANGED
@@ -110,9 +110,9 @@ def tokenize_with_tiktoken(text, model):

     return {
         "model": TOKENIZER_INFO[model]["name"],
-        "token_count": len(
+        "token_count": len(token_data),
         "tokens": token_data,
-        "compression_ratio": len(text) / len(
+        "compression_ratio": len(text) / len(token_data) if token_data else 0,
         "encoding": TOKENIZER_INFO[model]["encoding"],
         "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
     }
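Both utils.py hunks complete return-dict lines that were previously cut off and guard the compression ratio against empty input. A minimal runnable sketch of the tiktoken path under those assumptions (the TOKENIZER_INFO entry and the token_data fields here are illustrative, not copied from the repo):

import tiktoken

# Illustrative entry only; the real TOKENIZER_INFO in utils.py may differ.
TOKENIZER_INFO = {
    "gpt-4": {"name": "GPT-4", "encoding": "cl100k_base", "vocab_size": 100277},
}

def tokenize_with_tiktoken(text, model):
    enc = tiktoken.get_encoding(TOKENIZER_INFO[model]["encoding"])
    token_ids = enc.encode(text)
    # Assumed per-token structure; the app may store different fields.
    token_data = [{"id": tid, "text": enc.decode([tid])} for tid in token_ids]
    return {
        "model": TOKENIZER_INFO[model]["name"],
        "token_count": len(token_data),
        "tokens": token_data,
        # Empty input would otherwise raise ZeroDivisionError here.
        "compression_ratio": len(text) / len(token_data) if token_data else 0,
        "encoding": TOKENIZER_INFO[model]["encoding"],
        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
    }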
@@ -187,9 +187,9 @@ def tokenize_with_hf(text, model):

     return {
         "model": TOKENIZER_INFO[model]["name"],
-        "token_count": len(
+        "token_count": len(token_data),
         "tokens": token_data,
-        "compression_ratio": len(text) / len(
+        "compression_ratio": len(text) / len(token_data) if token_data else 0,
         "encoding": TOKENIZER_INFO[model]["encoding"],
         "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
     }
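The same guard lands in tokenize_with_hf; its effect can be checked in isolation with a tiny hypothetical helper (safe_ratio is not a name from the repo):

def safe_ratio(text, token_data):
    # Mirrors the guard added to both return dicts.
    return len(text) / len(token_data) if token_data else 0

print(safe_ratio("", []))                              # 0, not a ZeroDivisionError
print(safe_ratio("hello world", ["hello", " world"]))  # 5.5 characters per token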