Spaces:

liujch1998
/

infini-gram

Running

App Files Files Community

liujch1998 committed on Dec 7, 2025

Commit

7f5f844

1 Parent(s): 9679701

sync: add olmo3 indexes; improve UI

Browse files

Files changed (3) hide show

app.py +38 -21
constants.py +15 -8
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -5,6 +5,19 @@ import random
 import requests
 from constants import *
 def process(query_type, index_desc, **kwargs):
     timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
     index = INDEX_BY_DESC[index_desc]
@@ -19,7 +32,7 @@ def process(query_type, index_desc, **kwargs):
     if API_URL is None:
         raise ValueError(f'API_URL envvar is not set!')
     try:
-        response = requests.post(API_URL, json=data, timeout=10)
     except requests.exceptions.Timeout:
         raise ValueError('Web request timed out. Please try again later.')
     except requests.exceptions.RequestException as e:
@@ -80,7 +93,7 @@ def prob(index_desc, query):
     if 'error' in result:
         prob = result['error']
     elif result['prompt_cnt'] == 0:
-        prob = '(n-1)-gram is not found in the corpus'
     else:
         prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
     return latency, tokenization_info, prob
@@ -97,7 +110,7 @@ def ntd(index_desc, query, max_support):
         for token_id, r in result_by_token_id.items():
             ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
         if ntd == {}:
-            ntd = '(n-1)-gram is not found in the corpus'
     return latency, tokenization_info, ntd
 def infgram_prob(index_desc, query):
@@ -173,11 +186,11 @@ def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_t
         ptrs_by_shard = find_result['ptrs_by_shard']
         cnt_retrievable = sum([len(ptrs) for ptrs in ptrs_by_shard])
         if find_result["approx"]:
-            message = f'Approximately {find_result["cnt"]} occurrences found, of which {cnt_retrievable} are retrievable'
         else:
-            message = f'{find_result["cnt"]} occurrences found'
     else: # simple query
-        message = f'{find_result["cnt"]} occurrences found'
         cnt_retrievable = find_result['cnt']
     if cnt_retrievable == 0:
         idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
@@ -229,24 +242,28 @@ def get_another_doc(index_desc, idx, max_disp_len, state):
 with gr.Blocks() as demo:
     with gr.Column():
         gr.HTML(
-            '''<h1 text-align="center">Infini-gram: An Efficient Search Engine over the Massive Pretraining Datasets of Language Models</h1>
-            <p style='font-size: 16px;'>This engine does exact-match search over several open pretraining datasets of language models. Please first select the corpus and the type of query, then enter your query and submit.</p>
             <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
             <p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
-            <p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
             '''
         )
         with gr.Row():
             with gr.Column(scale=1, min_width=240):
-                index_desc = gr.Radio(choices=INDEX_DESCS, label='Corpus', value=INDEX_DESCS[0])
             with gr.Column(scale=7):
                 with gr.Tab('1. Count an n-gram'):
                     with gr.Column():
                         gr.HTML('<h2>1. Count an n-gram</h2>')
                         with gr.Accordion(label='Click to view instructions', open=False):
-                            gr.HTML(f'''<p style="font-size: 16px;">This counts the number of times an n-gram appears in the corpus. If you submit an empty input, it will return the total number of tokens in the corpus. You can also make more complex queries by connecting multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>.</p>
                                         <br>
                                         <p style="font-size: 16px;">Example queries:</p>
                                         <ul style="font-size: 16px;">
@@ -291,7 +308,7 @@ with gr.Blocks() as demo:
                                         <br>
                                         <p style="font-size: 16px;">Notes:</p>
                                         <ul style="font-size: 16px;">
-                                            <li>The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</li>
                                         </ul>
                                     ''')
                         with gr.Row():
@@ -317,8 +334,8 @@ with gr.Blocks() as demo:
                                         <br>
                                         <p style="font-size: 16px;">Notes:</p>
                                         <ul style="font-size: 16px;">
-                                            <li>The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</li>
-                                            <li>If the (n-1)-gram appears more than {max_support} times in the corpus, the result will be approximate: we will estimate the distribution by examining a subset of {max_support} occurrences of the (n-1)-gram. This value can be adjusted within range [1, {MAX_SUPPORT}] in "Advanced options".</li>
                                         </ul>
                                     ''')
@@ -341,9 +358,9 @@ with gr.Blocks() as demo:
                     with gr.Column():
                         gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
                         with gr.Accordion(label='Click to view instructions', open=False):
-                            gr.HTML(f'''<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query Type 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the corpus.</p>
                                         <br>
-                                        <p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (if "natural language" appears in the corpus but "love natural language" doesn't, the output is P(processing | natural language); in this case the effective n = 3)</p>
                                         <br>
                                         <p style="font-size: 16px;">Notes:</p>
                                         <ul style="font-size: 16px;">
@@ -370,7 +387,7 @@ with gr.Blocks() as demo:
                         with gr.Accordion(label='Click to view instructions', open=False):
                             gr.HTML(f'''<p style="font-size: 16px;">This is similar to Query Type 3, but with ∞-gram instead of n-gram.</p>
                                         <br>
-                                        <p style="font-size: 16px;">Example query: <b>I love natural language</b> (if "natural language" appears in the corpus but "love natural language" doesn't, the output is P(* | natural language), for the top-10 tokens *)</p>
                                     ''')
                         with gr.Row():
                             with gr.Column(scale=1):
@@ -392,7 +409,7 @@ with gr.Blocks() as demo:
                     with gr.Column():
                         gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
                         with gr.Accordion(label='Click to view instructions', open=False):
-                            gr.HTML(f'''<p style="font-size: 16px;">This displays a few random documents in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
                                         <br>
                                         <p style="font-size: 16px;">Example queries:</p>
                                         <ul style="font-size: 16px;">
@@ -412,7 +429,7 @@ with gr.Blocks() as demo:
                                             <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
                                         </ul>
                                         <br>
-                                        <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
                                     ''')
                         with gr.Row():
                             with gr.Column(scale=1):
@@ -443,7 +460,7 @@ with gr.Blocks() as demo:
                     with gr.Column():
                         gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
                         with gr.Accordion(label='Click to view instructions', open=False):
-                            gr.HTML(f'''<p style="font-size: 16px;">This displays the documents in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
                                         <br>
                                         <p style="font-size: 16px;">Example queries:</p>
                                         <ul style="font-size: 16px;">
@@ -461,7 +478,7 @@ with gr.Blocks() as demo:
                                             <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
                                         </ul>
                                         <br>
-                                        <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
                                     ''')
                         with gr.Row():
                             with gr.Column(scale=1):

 import requests
 from constants import *
+# def get_demo_indexes():
+#     try:
+#         response = requests.get(API_URL)
+#         print(response)
+#         return response.json()
+#     except:
+#         return []
+# INDEXES = get_demo_indexes()
+# print(INDEXES)
+# INDEX_BY_DESC = {index['desc']: index['index'] for index in INDEXES}
+# INDEX_DESCS = [index['desc'] for index in INDEXES]
 def process(query_type, index_desc, **kwargs):
     timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
     index = INDEX_BY_DESC[index_desc]
     if API_URL is None:
         raise ValueError(f'API_URL envvar is not set!')
     try:
+        response = requests.post(API_URL, json=data, timeout=30)
     except requests.exceptions.Timeout:
         raise ValueError('Web request timed out. Please try again later.')
     except requests.exceptions.RequestException as e:
     if 'error' in result:
         prob = result['error']
     elif result['prompt_cnt'] == 0:
+        prob = '(n-1)-gram is not found in the dataset'
     else:
         prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
     return latency, tokenization_info, prob
         for token_id, r in result_by_token_id.items():
             ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
         if ntd == {}:
+            ntd = '(n-1)-gram is not found in the dataset'
     return latency, tokenization_info, ntd
 def infgram_prob(index_desc, query):
         ptrs_by_shard = find_result['ptrs_by_shard']
         cnt_retrievable = sum([len(ptrs) for ptrs in ptrs_by_shard])
         if find_result["approx"]:
+            message = f'Approximately {find_result["cnt"]:,} occurrences found, of which {cnt_retrievable:,} are retrievable'
         else:
+            message = f'{find_result["cnt"]:,} occurrences found'
     else: # simple query
+        message = f'{find_result["cnt"]:,} occurrences found'
         cnt_retrievable = find_result['cnt']
     if cnt_retrievable == 0:
         idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
 with gr.Blocks() as demo:
     with gr.Column():
         gr.HTML(
+            '''<h1 text-align="center">Infini-gram: An Efficient Search Engine over the Massive Pretraining Datasets of LLMs</h1>
+            <p style='font-size: 16px;'>Infini-gram does exact-match search over several open pretraining datasets of language models. Please first select the dataset and the type of query, then enter your query and submit.</p>
             <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
             <p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
+            <p style='font-size: 16px;'><b>Notes:</b></p>
+            <ul style="font-size: 16px;">
+                <li>The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified). The total number of tokens in each dataset is shown in parentheses in the dataset selection panel.</li>
+                <li>Dolma 3 and the Olmo 3 training datasets use the Olmo 3 tokenizer. Also, these are served with a more cost-efficient technique, meaning: (1) each query typically takes 12-15 seconds; (2) they only support n-gram counting and document search, and CNF queries are not supported.</li>
+            </ul>
             '''
         )
         with gr.Row():
             with gr.Column(scale=1, min_width=240):
+                index_desc = gr.Radio(choices=INDEX_DESCS, label='Dataset', value=INDEX_DESCS[0])
             with gr.Column(scale=7):
                 with gr.Tab('1. Count an n-gram'):
                     with gr.Column():
                         gr.HTML('<h2>1. Count an n-gram</h2>')
                         with gr.Accordion(label='Click to view instructions', open=False):
+                            gr.HTML(f'''<p style="font-size: 16px;">This counts the number of times an n-gram appears in the dataset. If you submit an empty input, it will return the total number of tokens in the dataset. You can also make more complex queries by connecting multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>.</p>
                                         <br>
                                         <p style="font-size: 16px;">Example queries:</p>
                                         <ul style="font-size: 16px;">
                                         <br>
                                         <p style="font-size: 16px;">Notes:</p>
                                         <ul style="font-size: 16px;">
+                                            <li>The (n-1)-gram needs to exist in the dataset. If the (n-1)-gram is not found in the dataset, an error message will appear.</li>
                                         </ul>
                                     ''')
                         with gr.Row():
                                         <br>
                                         <p style="font-size: 16px;">Notes:</p>
                                         <ul style="font-size: 16px;">
+                                            <li>The (n-1)-gram needs to exist in the dataset. If the (n-1)-gram is not found in the dataset, an error message will appear.</li>
+                                            <li>If the (n-1)-gram appears more than {max_support} times in the dataset, the result will be approximate: we will estimate the distribution by examining a subset of {max_support} occurrences of the (n-1)-gram. This value can be adjusted within range [1, {MAX_SUPPORT}] in "Advanced options".</li>
                                         </ul>
                                     ''')
                     with gr.Column():
                         gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
                         with gr.Accordion(label='Click to view instructions', open=False):
+                            gr.HTML(f'''<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query Type 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the dataset.</p>
                                         <br>
+                                        <p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (if "natural language" appears in the dataset but "love natural language" doesn't, the output is P(processing | natural language); in this case the effective n = 3)</p>
                                         <br>
                                         <p style="font-size: 16px;">Notes:</p>
                                         <ul style="font-size: 16px;">
                         with gr.Accordion(label='Click to view instructions', open=False):
                             gr.HTML(f'''<p style="font-size: 16px;">This is similar to Query Type 3, but with ∞-gram instead of n-gram.</p>
                                         <br>
+                                        <p style="font-size: 16px;">Example query: <b>I love natural language</b> (if "natural language" appears in the dataset but "love natural language" doesn't, the output is P(* | natural language), for the top-10 tokens *)</p>
                                     ''')
                         with gr.Row():
                             with gr.Column(scale=1):
                     with gr.Column():
                         gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
                         with gr.Accordion(label='Click to view instructions', open=False):
+                            gr.HTML(f'''<p style="font-size: 16px;">This displays a few random documents in the dataset that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
                                         <br>
                                         <p style="font-size: 16px;">Example queries:</p>
                                         <ul style="font-size: 16px;">
                                             <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
                                         </ul>
                                         <br>
+                                        <p style="font-size: 16px;">❗️WARNING: Dataset may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the dataset, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text datasets. Please use with caution. Don't be evil :)</p>
                                     ''')
                         with gr.Row():
                             with gr.Column(scale=1):
                     with gr.Column():
                         gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
                         with gr.Accordion(label='Click to view instructions', open=False):
+                            gr.HTML(f'''<p style="font-size: 16px;">This displays the documents in the dataset that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
                                         <br>
                                         <p style="font-size: 16px;">Example queries:</p>
                                         <ul style="font-size: 16px;">
                                             <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
                                         </ul>
                                         <br>
+                                        <p style="font-size: 16px;">❗️WARNING: Dataset may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the dataset, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text datasets. Please use with caution. Don't be evil :)</p>
                                     ''')
                         with gr.Row():
                             with gr.Column(scale=1):

constants.py CHANGED Viewed

@@ -2,14 +2,21 @@ import os
 # options
 INDEX_BY_DESC = {
-    'OLMo 2 32B Instruct (4.6T tokens)': 'v4_olmo-2-0325-32b-instruct_llama',
-    'OLMo 2 13B Instruct (4.6T tokens)': 'v4_olmo-2-1124-13b-instruct_llama',
-    'OLMoE 1B 7B Instruct (4.6T tokens)': 'v4_olmoe-0125-1b-7b-instruct_llama',
-    'Dolma-v1.7 (2.6T tokens)': 'v4_dolma-v1_7_llama',
-    'RedPajama (1.4T tokens)': 'v4_rpj_llama_s4',
-    'Pile-train (380B tokens)': 'v4_piletrain_llama',
-    'C4-train (200B tokens)': 'v4_c4train_llama',
-    'Pile-val (390M tokens)': 'v4_pileval_llama',
     # 'Pile-val (GPT-2 tokenizer), 380M tokens': 'v4_pileval_gpt2',
     # 'Dolma-v1.6-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
     # 'Dolma-v1.6-sample (9.2B tokens)': 'v4_dolma-v1_6-sample_llama',

 # options
 INDEX_BY_DESC = {
+    "Olmo 3 32B Think (6.1T)": "olmo-3-32b-think",
+    "OLMo 3 7B Think (6.1T)": "olmo-3-7b-think",
+    "OLMo 3 7B Instruct (6.0T)": "olmo-3-7b-instruct",
+    "Dolma 3 (5.9T)": "dolma3",
+    'OLMo 2 32B Instruct (4.6T)': 'v4_olmo-2-0325-32b-instruct_llama',
+    'OLMo 2 13B Instruct (4.6T)': 'v4_olmo-2-1124-13b-instruct_llama',
+    'OLMoE 1B 7B Instruct (4.6T)': 'v4_olmoe-0125-1b-7b-instruct_llama',
+    'dolmino-mix-1124-minus-olmo-mix-1124 (34B)': 'v4_dolmino-mix-1124-minus-olmo-mix-1124_llama',
+    'olmo-mix-1124 (4.6T)': 'v4_olmo-mix-1124_llama',
+    'DCLM-baseline (4.3T)': 'v4_dclm-baseline_llama',
+    'Dolma-v1.7 (2.6T)': 'v4_dolma-v1_7_llama',
+    'RedPajama (1.4T)': 'v4_rpj_llama_s4',
+    'Pile-train (380B)': 'v4_piletrain_llama',
+    'C4-train (200B)': 'v4_c4train_llama',
+    'Pile-val (390M)': 'v4_pileval_llama',
     # 'Pile-val (GPT-2 tokenizer), 380M tokens': 'v4_pileval_gpt2',
     # 'Dolma-v1.6-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
     # 'Dolma-v1.6-sample (9.2B tokens)': 'v4_dolma-v1_6-sample_llama',

requirements.txt CHANGED Viewed

@@ -2,5 +2,5 @@ torch==1.13.1
 transformers==4.31.0
 tokenizers==0.13.3
 sentencepiece==0.1.96
-huggingface_hub==0.14.1
 requests

 transformers==4.31.0
 tokenizers==0.13.3
 sentencepiece==0.1.96
+huggingface_hub==0.34.3
 requests