Spaces:

liujch1998
/

infini-gram

Running

liujch1998 committed on Feb 14, 2024

Commit

106f995

1 Parent(s): cb08e07

Sync changes

Files changed (2) hide show

app.py CHANGED Viewed

@@ -46,6 +46,8 @@ def prob(corpus_desc, engine_desc, query, request: gr.Request):
     tokenized = '' if 'tokenized' not in result else result['tokenized']
     if 'error' in result:
         prob = result['error']
     else:
         prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
     return latency, tokenized, prob
@@ -58,6 +60,8 @@ def ntd(corpus_desc, engine_desc, query, request: gr.Request):
         ntd = result['error']
     else:
         ntd = result['ntd']
     return latency, tokenized, ntd
 def infgram_prob(corpus_desc, engine_desc, query, request: gr.Request):

     tokenized = '' if 'tokenized' not in result else result['tokenized']
     if 'error' in result:
         prob = result['error']
+    elif result['prompt_cnt'] == 0:
+        prob = '(n-1)-gram is not found in the corpus'
     else:
         prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
     return latency, tokenized, prob
         ntd = result['error']
     else:
         ntd = result['ntd']
+        if ntd == {}:
+            ntd = '(n-1)-gram is not found in the corpus'
     return latency, tokenized, ntd
 def infgram_prob(corpus_desc, engine_desc, query, request: gr.Request):

constants.py CHANGED Viewed

@@ -2,10 +2,10 @@ import os
 # options
 CORPUS_BY_DESC = {
-    'RedPajama (LLaMA tokenizer), 1.4T tokens': 'v3_rpj_llama_c4',
     'Pile-train (LLaMA tokenizer), 380B tokens': 'v4_piletrain_llama',
-    'Pile-val (LLaMA tokenizer), 390M tokens': 'v3_pileval_llama',
-    'Pile-val (GPT-2 tokenizer), 380M tokens': 'v3_pileval_gpt2',
     'Dolma-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
 }
 CORPUS_DESCS = list(CORPUS_BY_DESC.keys())

 # options
 CORPUS_BY_DESC = {
+    'RedPajama (LLaMA tokenizer), 1.4T tokens': 'v4_rpj_llama_s4',
     'Pile-train (LLaMA tokenizer), 380B tokens': 'v4_piletrain_llama',
+    'Pile-val (LLaMA tokenizer), 390M tokens': 'v4_pileval_llama',
+    'Pile-val (GPT-2 tokenizer), 380M tokens': 'v4_pileval_gpt2',
     'Dolma-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
 }
 CORPUS_DESCS = list(CORPUS_BY_DESC.keys())