tamang0000
committed on
Commit
·
80caa24
1
Parent(s):
eec601a
added assamese
Browse files
app.py
CHANGED
|
@@ -15,7 +15,7 @@ def load_test_phrases(filename):
|
|
| 15 |
|
| 16 |
models = ["Xenova/claude-tokenizer", # Anthropic
|
| 17 |
"meta-llama/Llama-2-7b-chat-hf", # LLAMA-2
|
| 18 |
-
"beomi/llama-2-ko-7b", # LLAMA-2-ko
|
| 19 |
"ai4bharat/Airavata", # ARIVATA
|
| 20 |
"openaccess-ai-collective/tiny-mistral", # Mistral
|
| 21 |
"gpt-3.5-turbo", # GPT3.5
|
|
@@ -23,7 +23,9 @@ models = ["Xenova/claude-tokenizer", # Anthropic
|
|
| 23 |
"CohereForAI/aya-23-8B", # AYA
|
| 24 |
"google/gemma-1.1-2b-it", # GEMMA
|
| 25 |
"gpt-4o", # GPT4o
|
| 26 |
-
"TWO/sutra-mlt256-v2"
|
|
|
|
|
|
|
| 27 |
|
| 28 |
test_phrase_set = [
|
| 29 |
"I am going for a walk later today",
|
|
@@ -111,8 +113,9 @@ def generate_split_token_table(text):
|
|
| 111 |
with gr.Blocks() as sutra_token_count:
|
| 112 |
gr.Markdown(
|
| 113 |
"""
|
| 114 |
-
#
|
| 115 |
## Tokenize paragraphs in multiple languages and compare token counts.
|
|
|
|
| 116 |
""")
|
| 117 |
textbox = gr.Textbox(label="Input Text")
|
| 118 |
submit_button = gr.Button("Submit")
|
|
@@ -140,9 +143,10 @@ def generate_tokens_table(text):
|
|
| 140 |
with gr.Blocks() as sutra_tokenize:
|
| 141 |
gr.Markdown(
|
| 142 |
"""
|
| 143 |
-
#
|
| 144 |
## Tokenize a sentence with various tokenizers and inspect how it's broken down.
|
| 145 |
-
|
|
|
|
| 146 |
textbox = gr.Textbox(label="Input Text")
|
| 147 |
submit_button = gr.Button("Submit")
|
| 148 |
output = gr.Dataframe()
|
|
@@ -156,7 +160,7 @@ if __name__ == '__main__':
|
|
| 156 |
with gr.Row():
|
| 157 |
gr.Markdown(
|
| 158 |
"""
|
| 159 |
-
## <img src="https://
|
| 160 |
"""
|
| 161 |
)
|
| 162 |
with gr.Row():
|
|
|
|
| 15 |
|
| 16 |
models = ["Xenova/claude-tokenizer", # Anthropic
|
| 17 |
"meta-llama/Llama-2-7b-chat-hf", # LLAMA-2
|
| 18 |
+
# "beomi/llama-2-ko-7b", # LLAMA-2-ko
|
| 19 |
"ai4bharat/Airavata", # ARIVATA
|
| 20 |
"openaccess-ai-collective/tiny-mistral", # Mistral
|
| 21 |
"gpt-3.5-turbo", # GPT3.5
|
|
|
|
| 23 |
"CohereForAI/aya-23-8B", # AYA
|
| 24 |
"google/gemma-1.1-2b-it", # GEMMA
|
| 25 |
"gpt-4o", # GPT4o
|
| 26 |
+
"TWO/sutra-mlt256-v2", # SUTRA
|
| 27 |
+
"tamang0000/assamese-tokenizer-50k" # Assamese
|
| 28 |
+
]
|
| 29 |
|
| 30 |
test_phrase_set = [
|
| 31 |
"I am going for a walk later today",
|
|
|
|
| 113 |
with gr.Blocks() as sutra_token_count:
|
| 114 |
gr.Markdown(
|
| 115 |
"""
|
| 116 |
+
# Multilingual Tokenizer Specs & Stats.
|
| 117 |
## Tokenize paragraphs in multiple languages and compare token counts.
|
| 118 |
+
Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
|
| 119 |
""")
|
| 120 |
textbox = gr.Textbox(label="Input Text")
|
| 121 |
submit_button = gr.Button("Submit")
|
|
|
|
| 143 |
with gr.Blocks() as sutra_tokenize:
|
| 144 |
gr.Markdown(
|
| 145 |
"""
|
| 146 |
+
# Multilingual Tokenizer Sentence Inspector.
|
| 147 |
## Tokenize a sentence with various tokenizers and inspect how it's broken down.
|
| 148 |
+
Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
|
| 149 |
+
""")
|
| 150 |
textbox = gr.Textbox(label="Input Text")
|
| 151 |
submit_button = gr.Button("Submit")
|
| 152 |
output = gr.Dataframe()
|
|
|
|
| 160 |
with gr.Row():
|
| 161 |
gr.Markdown(
|
| 162 |
"""
|
| 163 |
+
## <img src="https://raw.githubusercontent.com/SAGAR-TAMANG/sagar-tamang-official-website-new/master/img/pi.jpg" height="30"/>
|
| 164 |
"""
|
| 165 |
)
|
| 166 |
with gr.Row():
|