Spaces:
Runtime error
Runtime error
Gül Sena Altıntaş committed on
Commit ·
5d0049d
1
Parent(s): 9cada5f
Updated hf paths for toksuite models
Browse files
app.py
CHANGED
|
@@ -111,6 +111,7 @@ TOKSUITE_MODELS = [
|
|
| 111 |
"google-byt5-small",
|
| 112 |
"google-bert-bert-base-multilingual-cased",
|
| 113 |
"Qwen-Qwen3-8B",
|
|
|
|
| 114 |
]
|
| 115 |
# Global cache for loaded models
|
| 116 |
model_cache = dict()
|
|
@@ -331,7 +332,7 @@ def calculate_choice_likelihood(
|
|
| 331 |
if normalization_method == "token-length":
|
| 332 |
normalization_term = answer_len
|
| 333 |
elif normalization_method == "byte-length":
|
| 334 |
-
decoded_text = tokenizer.decode(target_ids)
|
| 335 |
byte_len = len(decoded_text.encode("utf-8"))
|
| 336 |
normalization_term = byte_len
|
| 337 |
total_log_prob /= normalization_term
|
|
@@ -439,10 +440,10 @@ def run_evaluation(
|
|
| 439 |
toksuite_selector,
|
| 440 |
custom_models_text="",
|
| 441 |
delimiter: str = "\t",
|
| 442 |
-
progress=gr.Progress(),
|
| 443 |
save_summary=False,
|
| 444 |
normalization_method: str = "token-length",
|
| 445 |
prefix: str = "",
|
|
|
|
| 446 |
):
|
| 447 |
import gc
|
| 448 |
|
|
@@ -476,7 +477,7 @@ def run_evaluation(
|
|
| 476 |
# Add predefined models
|
| 477 |
all_models.extend(selected_predefined)
|
| 478 |
all_models.extend(
|
| 479 |
-
[f"
|
| 480 |
)
|
| 481 |
all_models.extend(custom_models)
|
| 482 |
|
|
@@ -1192,8 +1193,8 @@ What is the capital of France?,Paris,London,Berlin,Paris""",
|
|
| 1192 |
lines=1,
|
| 1193 |
)
|
| 1194 |
with gr.Column(scale=1):
|
| 1195 |
-
save_summary_checkbox =
|
| 1196 |
-
prefix = ""
|
| 1197 |
slurm_id = os.environ.get("SLURM_JOB_ID", "")
|
| 1198 |
if slurm_id:
|
| 1199 |
save_summary_checkbox = gr.Checkbox(
|
|
@@ -1322,7 +1323,6 @@ bigscience/bloom-560m""",
|
|
| 1322 |
fn=run_evaluation,
|
| 1323 |
inputs=[
|
| 1324 |
dataset_input,
|
| 1325 |
-
# predefined_selector,
|
| 1326 |
industry_selector,
|
| 1327 |
toksuite_selector,
|
| 1328 |
custom_models_input,
|
|
@@ -1365,4 +1365,9 @@ bigscience/bloom-560m""",
|
|
| 1365 |
""")
|
| 1366 |
|
| 1367 |
if __name__ == "__main__":
|
| 1368 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
"google-byt5-small",
|
| 112 |
"google-bert-bert-base-multilingual-cased",
|
| 113 |
"Qwen-Qwen3-8B",
|
| 114 |
+
"tiktoken-gpt-4o",
|
| 115 |
]
|
| 116 |
# Global cache for loaded models
|
| 117 |
model_cache = dict()
|
|
|
|
| 332 |
if normalization_method == "token-length":
|
| 333 |
normalization_term = answer_len
|
| 334 |
elif normalization_method == "byte-length":
|
| 335 |
+
decoded_text = tokenizer.decode(target_ids).strip()
|
| 336 |
byte_len = len(decoded_text.encode("utf-8"))
|
| 337 |
normalization_term = byte_len
|
| 338 |
total_log_prob /= normalization_term
|
|
|
|
| 440 |
toksuite_selector,
|
| 441 |
custom_models_text="",
|
| 442 |
delimiter: str = "\t",
|
|
|
|
| 443 |
save_summary=False,
|
| 444 |
normalization_method: str = "token-length",
|
| 445 |
prefix: str = "",
|
| 446 |
+
progress=gr.Progress(),
|
| 447 |
):
|
| 448 |
import gc
|
| 449 |
|
|
|
|
| 477 |
# Add predefined models
|
| 478 |
all_models.extend(selected_predefined)
|
| 479 |
all_models.extend(
|
| 480 |
+
[f"toksuite/{model}" for model in toksuite_selector]
|
| 481 |
)
|
| 482 |
all_models.extend(custom_models)
|
| 483 |
|
|
|
|
| 1193 |
lines=1,
|
| 1194 |
)
|
| 1195 |
with gr.Column(scale=1):
|
| 1196 |
+
save_summary_checkbox = gr.Checkbox(value=True)
|
| 1197 |
+
prefix = gr.Textbox(visible=False, value="")
|
| 1198 |
slurm_id = os.environ.get("SLURM_JOB_ID", "")
|
| 1199 |
if slurm_id:
|
| 1200 |
save_summary_checkbox = gr.Checkbox(
|
|
|
|
| 1323 |
fn=run_evaluation,
|
| 1324 |
inputs=[
|
| 1325 |
dataset_input,
|
|
|
|
| 1326 |
industry_selector,
|
| 1327 |
toksuite_selector,
|
| 1328 |
custom_models_input,
|
|
|
|
| 1365 |
""")
|
| 1366 |
|
| 1367 |
if __name__ == "__main__":
|
| 1368 |
+
import argparse
|
| 1369 |
+
|
| 1370 |
+
argparser = argparse.ArgumentParser()
|
| 1371 |
+
argparser.add_argument("--share", action="store_true", default=True)
|
| 1372 |
+
args = argparser.parse_args()
|
| 1373 |
+
demo.launch(share=args.share)
|