tamang0000
committed on
Commit
·
80caa24
1
Parent(s):
eec601a
added assamese
Browse files
app.py
CHANGED
|
@@ -15,7 +15,7 @@ def load_test_phrases(filename):
|
|
| 15 |
|
| 16 |
models = ["Xenova/claude-tokenizer", # Anthropic
|
| 17 |
"meta-llama/Llama-2-7b-chat-hf", # LLAMA-2
|
| 18 |
-
"beomi/llama-2-ko-7b", # LLAMA-2-ko
|
| 19 |
"ai4bharat/Airavata", # ARIVATA
|
| 20 |
"openaccess-ai-collective/tiny-mistral", # Mistral
|
| 21 |
"gpt-3.5-turbo", # GPT3.5
|
|
@@ -23,7 +23,9 @@ models = ["Xenova/claude-tokenizer", # Anthropic
|
|
| 23 |
"CohereForAI/aya-23-8B", # AYA
|
| 24 |
"google/gemma-1.1-2b-it", # GEMMA
|
| 25 |
"gpt-4o", # GPT4o
|
| 26 |
-
"TWO/sutra-mlt256-v2"
|
|
|
|
|
|
|
| 27 |
|
| 28 |
test_phrase_set = [
|
| 29 |
"I am going for a walk later today",
|
|
@@ -111,8 +113,9 @@ def generate_split_token_table(text):
|
|
| 111 |
with gr.Blocks() as sutra_token_count:
|
| 112 |
gr.Markdown(
|
| 113 |
"""
|
| 114 |
-
#
|
| 115 |
## Tokenize paragraphs in multiple languages and compare token counts.
|
|
|
|
| 116 |
""")
|
| 117 |
textbox = gr.Textbox(label="Input Text")
|
| 118 |
submit_button = gr.Button("Submit")
|
|
@@ -140,9 +143,10 @@ def generate_tokens_table(text):
|
|
| 140 |
with gr.Blocks() as sutra_tokenize:
|
| 141 |
gr.Markdown(
|
| 142 |
"""
|
| 143 |
-
#
|
| 144 |
## Tokenize a sentence with various tokenizers and inspect how it's broken down.
|
| 145 |
-
|
|
|
|
| 146 |
textbox = gr.Textbox(label="Input Text")
|
| 147 |
submit_button = gr.Button("Submit")
|
| 148 |
output = gr.Dataframe()
|
|
@@ -156,7 +160,7 @@ if __name__ == '__main__':
|
|
| 156 |
with gr.Row():
|
| 157 |
gr.Markdown(
|
| 158 |
"""
|
| 159 |
-
## <img src="https://
|
| 160 |
"""
|
| 161 |
)
|
| 162 |
with gr.Row():
|
|
|
|
| 15 |
|
| 16 |
models = ["Xenova/claude-tokenizer", # Anthropic
|
| 17 |
"meta-llama/Llama-2-7b-chat-hf", # LLAMA-2
|
| 18 |
+
# "beomi/llama-2-ko-7b", # LLAMA-2-ko
|
| 19 |
"ai4bharat/Airavata", # ARIVATA
|
| 20 |
"openaccess-ai-collective/tiny-mistral", # Mistral
|
| 21 |
"gpt-3.5-turbo", # GPT3.5
|
|
|
|
| 23 |
"CohereForAI/aya-23-8B", # AYA
|
| 24 |
"google/gemma-1.1-2b-it", # GEMMA
|
| 25 |
"gpt-4o", # GPT4o
|
| 26 |
+
"TWO/sutra-mlt256-v2", # SUTRA
|
| 27 |
+
"tamang0000/assamese-tokenizer-50k" # Assamese
|
| 28 |
+
]
|
| 29 |
|
| 30 |
test_phrase_set = [
|
| 31 |
"I am going for a walk later today",
|
|
|
|
| 113 |
with gr.Blocks() as sutra_token_count:
|
| 114 |
gr.Markdown(
|
| 115 |
"""
|
| 116 |
+
# Multilingual Tokenizer Specs & Stats.
|
| 117 |
## Tokenize paragraphs in multiple languages and compare token counts.
|
| 118 |
+
Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
|
| 119 |
""")
|
| 120 |
textbox = gr.Textbox(label="Input Text")
|
| 121 |
submit_button = gr.Button("Submit")
|
|
|
|
| 143 |
with gr.Blocks() as sutra_tokenize:
|
| 144 |
gr.Markdown(
|
| 145 |
"""
|
| 146 |
+
# Multilingual Tokenizer Sentence Inspector.
|
| 147 |
## Tokenize a sentence with various tokenizers and inspect how it's broken down.
|
| 148 |
+
Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
|
| 149 |
+
""")
|
| 150 |
textbox = gr.Textbox(label="Input Text")
|
| 151 |
submit_button = gr.Button("Submit")
|
| 152 |
output = gr.Dataframe()
|
|
|
|
| 160 |
with gr.Row():
|
| 161 |
gr.Markdown(
|
| 162 |
"""
|
| 163 |
+
## <img src="https://raw.githubusercontent.com/SAGAR-TAMANG/sagar-tamang-official-website-new/master/img/pi.jpg" height="30"/>
|
| 164 |
"""
|
| 165 |
)
|
| 166 |
with gr.Row():
|