Spaces:
Running
Running
Commit
·
7f5f844
1
Parent(s):
9679701
sync: add olmo3 indexes; improve UI
Browse files
- app.py +38 -21
- constants.py +15 -8
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -5,6 +5,19 @@ import random
|
|
| 5 |
import requests
|
| 6 |
from constants import *
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
def process(query_type, index_desc, **kwargs):
|
| 9 |
timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
|
| 10 |
index = INDEX_BY_DESC[index_desc]
|
|
@@ -19,7 +32,7 @@ def process(query_type, index_desc, **kwargs):
|
|
| 19 |
if API_URL is None:
|
| 20 |
raise ValueError(f'API_URL envvar is not set!')
|
| 21 |
try:
|
| 22 |
-
response = requests.post(API_URL, json=data, timeout=
|
| 23 |
except requests.exceptions.Timeout:
|
| 24 |
raise ValueError('Web request timed out. Please try again later.')
|
| 25 |
except requests.exceptions.RequestException as e:
|
|
@@ -80,7 +93,7 @@ def prob(index_desc, query):
|
|
| 80 |
if 'error' in result:
|
| 81 |
prob = result['error']
|
| 82 |
elif result['prompt_cnt'] == 0:
|
| 83 |
-
prob = '(n-1)-gram is not found in the
|
| 84 |
else:
|
| 85 |
prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
|
| 86 |
return latency, tokenization_info, prob
|
|
@@ -97,7 +110,7 @@ def ntd(index_desc, query, max_support):
|
|
| 97 |
for token_id, r in result_by_token_id.items():
|
| 98 |
ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
|
| 99 |
if ntd == {}:
|
| 100 |
-
ntd = '(n-1)-gram is not found in the
|
| 101 |
return latency, tokenization_info, ntd
|
| 102 |
|
| 103 |
def infgram_prob(index_desc, query):
|
|
@@ -173,11 +186,11 @@ def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_t
|
|
| 173 |
ptrs_by_shard = find_result['ptrs_by_shard']
|
| 174 |
cnt_retrievable = sum([len(ptrs) for ptrs in ptrs_by_shard])
|
| 175 |
if find_result["approx"]:
|
| 176 |
-
message = f'Approximately {find_result["cnt"]} occurrences found, of which {cnt_retrievable} are retrievable'
|
| 177 |
else:
|
| 178 |
-
message = f'{find_result["cnt"]} occurrences found'
|
| 179 |
else: # simple query
|
| 180 |
-
message = f'{find_result["cnt"]} occurrences found'
|
| 181 |
cnt_retrievable = find_result['cnt']
|
| 182 |
if cnt_retrievable == 0:
|
| 183 |
idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
|
|
@@ -229,24 +242,28 @@ def get_another_doc(index_desc, idx, max_disp_len, state):
|
|
| 229 |
with gr.Blocks() as demo:
|
| 230 |
with gr.Column():
|
| 231 |
gr.HTML(
|
| 232 |
-
'''<h1 text-align="center">Infini-gram: An Efficient Search Engine over the Massive Pretraining Datasets of
|
| 233 |
|
| 234 |
-
<p style='font-size: 16px;'>
|
| 235 |
<p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
|
| 236 |
<p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
|
| 237 |
-
<p style='font-size: 16px;'><b>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
'''
|
| 239 |
)
|
| 240 |
with gr.Row():
|
| 241 |
with gr.Column(scale=1, min_width=240):
|
| 242 |
-
index_desc = gr.Radio(choices=INDEX_DESCS, label='
|
| 243 |
|
| 244 |
with gr.Column(scale=7):
|
| 245 |
with gr.Tab('1. Count an n-gram'):
|
| 246 |
with gr.Column():
|
| 247 |
gr.HTML('<h2>1. Count an n-gram</h2>')
|
| 248 |
with gr.Accordion(label='Click to view instructions', open=False):
|
| 249 |
-
gr.HTML(f'''<p style="font-size: 16px;">This counts the number of times an n-gram appears in the
|
| 250 |
<br>
|
| 251 |
<p style="font-size: 16px;">Example queries:</p>
|
| 252 |
<ul style="font-size: 16px;">
|
|
@@ -291,7 +308,7 @@ with gr.Blocks() as demo:
|
|
| 291 |
<br>
|
| 292 |
<p style="font-size: 16px;">Notes:</p>
|
| 293 |
<ul style="font-size: 16px;">
|
| 294 |
-
<li>The (n-1)-gram needs to exist in the
|
| 295 |
</ul>
|
| 296 |
''')
|
| 297 |
with gr.Row():
|
|
@@ -317,8 +334,8 @@ with gr.Blocks() as demo:
|
|
| 317 |
<br>
|
| 318 |
<p style="font-size: 16px;">Notes:</p>
|
| 319 |
<ul style="font-size: 16px;">
|
| 320 |
-
<li>The (n-1)-gram needs to exist in the
|
| 321 |
-
<li>If the (n-1)-gram appears more than {max_support} times in the
|
| 322 |
</ul>
|
| 323 |
''')
|
| 324 |
|
|
@@ -341,9 +358,9 @@ with gr.Blocks() as demo:
|
|
| 341 |
with gr.Column():
|
| 342 |
gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
|
| 343 |
with gr.Accordion(label='Click to view instructions', open=False):
|
| 344 |
-
gr.HTML(f'''<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query Type 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the
|
| 345 |
<br>
|
| 346 |
-
<p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (if "natural language" appears in the
|
| 347 |
<br>
|
| 348 |
<p style="font-size: 16px;">Notes:</p>
|
| 349 |
<ul style="font-size: 16px;">
|
|
@@ -370,7 +387,7 @@ with gr.Blocks() as demo:
|
|
| 370 |
with gr.Accordion(label='Click to view instructions', open=False):
|
| 371 |
gr.HTML(f'''<p style="font-size: 16px;">This is similar to Query Type 3, but with ∞-gram instead of n-gram.</p>
|
| 372 |
<br>
|
| 373 |
-
<p style="font-size: 16px;">Example query: <b>I love natural language</b> (if "natural language" appears in the
|
| 374 |
''')
|
| 375 |
with gr.Row():
|
| 376 |
with gr.Column(scale=1):
|
|
@@ -392,7 +409,7 @@ with gr.Blocks() as demo:
|
|
| 392 |
with gr.Column():
|
| 393 |
gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
|
| 394 |
with gr.Accordion(label='Click to view instructions', open=False):
|
| 395 |
-
gr.HTML(f'''<p style="font-size: 16px;">This displays a few random documents in the
|
| 396 |
<br>
|
| 397 |
<p style="font-size: 16px;">Example queries:</p>
|
| 398 |
<ul style="font-size: 16px;">
|
|
@@ -412,7 +429,7 @@ with gr.Blocks() as demo:
|
|
| 412 |
<li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
|
| 413 |
</ul>
|
| 414 |
<br>
|
| 415 |
-
<p style="font-size: 16px;">❗️WARNING:
|
| 416 |
''')
|
| 417 |
with gr.Row():
|
| 418 |
with gr.Column(scale=1):
|
|
@@ -443,7 +460,7 @@ with gr.Blocks() as demo:
|
|
| 443 |
with gr.Column():
|
| 444 |
gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
|
| 445 |
with gr.Accordion(label='Click to view instructions', open=False):
|
| 446 |
-
gr.HTML(f'''<p style="font-size: 16px;">This displays the documents in the
|
| 447 |
<br>
|
| 448 |
<p style="font-size: 16px;">Example queries:</p>
|
| 449 |
<ul style="font-size: 16px;">
|
|
@@ -461,7 +478,7 @@ with gr.Blocks() as demo:
|
|
| 461 |
<li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
|
| 462 |
</ul>
|
| 463 |
<br>
|
| 464 |
-
<p style="font-size: 16px;">❗️WARNING:
|
| 465 |
''')
|
| 466 |
with gr.Row():
|
| 467 |
with gr.Column(scale=1):
|
|
|
|
| 5 |
import requests
|
| 6 |
from constants import *
|
| 7 |
|
| 8 |
+
# def get_demo_indexes():
|
| 9 |
+
# try:
|
| 10 |
+
# response = requests.get(API_URL)
|
| 11 |
+
# print(response)
|
| 12 |
+
# return response.json()
|
| 13 |
+
# except:
|
| 14 |
+
# return []
|
| 15 |
+
|
| 16 |
+
# INDEXES = get_demo_indexes()
|
| 17 |
+
# print(INDEXES)
|
| 18 |
+
# INDEX_BY_DESC = {index['desc']: index['index'] for index in INDEXES}
|
| 19 |
+
# INDEX_DESCS = [index['desc'] for index in INDEXES]
|
| 20 |
+
|
| 21 |
def process(query_type, index_desc, **kwargs):
|
| 22 |
timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
|
| 23 |
index = INDEX_BY_DESC[index_desc]
|
|
|
|
| 32 |
if API_URL is None:
|
| 33 |
raise ValueError(f'API_URL envvar is not set!')
|
| 34 |
try:
|
| 35 |
+
response = requests.post(API_URL, json=data, timeout=30)
|
| 36 |
except requests.exceptions.Timeout:
|
| 37 |
raise ValueError('Web request timed out. Please try again later.')
|
| 38 |
except requests.exceptions.RequestException as e:
|
|
|
|
| 93 |
if 'error' in result:
|
| 94 |
prob = result['error']
|
| 95 |
elif result['prompt_cnt'] == 0:
|
| 96 |
+
prob = '(n-1)-gram is not found in the dataset'
|
| 97 |
else:
|
| 98 |
prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
|
| 99 |
return latency, tokenization_info, prob
|
|
|
|
| 110 |
for token_id, r in result_by_token_id.items():
|
| 111 |
ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
|
| 112 |
if ntd == {}:
|
| 113 |
+
ntd = '(n-1)-gram is not found in the dataset'
|
| 114 |
return latency, tokenization_info, ntd
|
| 115 |
|
| 116 |
def infgram_prob(index_desc, query):
|
|
|
|
| 186 |
ptrs_by_shard = find_result['ptrs_by_shard']
|
| 187 |
cnt_retrievable = sum([len(ptrs) for ptrs in ptrs_by_shard])
|
| 188 |
if find_result["approx"]:
|
| 189 |
+
message = f'Approximately {find_result["cnt"]:,} occurrences found, of which {cnt_retrievable:,} are retrievable'
|
| 190 |
else:
|
| 191 |
+
message = f'{find_result["cnt"]:,} occurrences found'
|
| 192 |
else: # simple query
|
| 193 |
+
message = f'{find_result["cnt"]:,} occurrences found'
|
| 194 |
cnt_retrievable = find_result['cnt']
|
| 195 |
if cnt_retrievable == 0:
|
| 196 |
idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
|
|
|
|
| 242 |
with gr.Blocks() as demo:
|
| 243 |
with gr.Column():
|
| 244 |
gr.HTML(
|
| 245 |
+
'''<h1 text-align="center">Infini-gram: An Efficient Search Engine over the Massive Pretraining Datasets of LLMs</h1>
|
| 246 |
|
| 247 |
+
<p style='font-size: 16px;'>Infini-gram does exact-match search over several open pretraining datasets of language models. Please first select the dataset and the type of query, then enter your query and submit.</p>
|
| 248 |
<p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
|
| 249 |
<p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
|
| 250 |
+
<p style='font-size: 16px;'><b>Notes:</b></p>
|
| 251 |
+
<ul style="font-size: 16px;">
|
| 252 |
+
<li>The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified). The total number of tokens in each dataset is shown in parentheses in the dataset selection panel.</li>
|
| 253 |
+
<li>Dolma 3 and the Olmo 3 training datasets use the Olmo 3 tokenizer. Also, these are served with a more cost-efficient technique, meaning: (1) each query typically takes 12-15 seconds; (2) they only support n-gram counting and document search, and CNF queries are not supported.</li>
|
| 254 |
+
</ul>
|
| 255 |
'''
|
| 256 |
)
|
| 257 |
with gr.Row():
|
| 258 |
with gr.Column(scale=1, min_width=240):
|
| 259 |
+
index_desc = gr.Radio(choices=INDEX_DESCS, label='Dataset', value=INDEX_DESCS[0])
|
| 260 |
|
| 261 |
with gr.Column(scale=7):
|
| 262 |
with gr.Tab('1. Count an n-gram'):
|
| 263 |
with gr.Column():
|
| 264 |
gr.HTML('<h2>1. Count an n-gram</h2>')
|
| 265 |
with gr.Accordion(label='Click to view instructions', open=False):
|
| 266 |
+
gr.HTML(f'''<p style="font-size: 16px;">This counts the number of times an n-gram appears in the dataset. If you submit an empty input, it will return the total number of tokens in the dataset. You can also make more complex queries by connecting multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>.</p>
|
| 267 |
<br>
|
| 268 |
<p style="font-size: 16px;">Example queries:</p>
|
| 269 |
<ul style="font-size: 16px;">
|
|
|
|
| 308 |
<br>
|
| 309 |
<p style="font-size: 16px;">Notes:</p>
|
| 310 |
<ul style="font-size: 16px;">
|
| 311 |
+
<li>The (n-1)-gram needs to exist in the dataset. If the (n-1)-gram is not found in the dataset, an error message will appear.</li>
|
| 312 |
</ul>
|
| 313 |
''')
|
| 314 |
with gr.Row():
|
|
|
|
| 334 |
<br>
|
| 335 |
<p style="font-size: 16px;">Notes:</p>
|
| 336 |
<ul style="font-size: 16px;">
|
| 337 |
+
<li>The (n-1)-gram needs to exist in the dataset. If the (n-1)-gram is not found in the dataset, an error message will appear.</li>
|
| 338 |
+
<li>If the (n-1)-gram appears more than {max_support} times in the dataset, the result will be approximate: we will estimate the distribution by examining a subset of {max_support} occurrences of the (n-1)-gram. This value can be adjusted within range [1, {MAX_SUPPORT}] in "Advanced options".</li>
|
| 339 |
</ul>
|
| 340 |
''')
|
| 341 |
|
|
|
|
| 358 |
with gr.Column():
|
| 359 |
gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
|
| 360 |
with gr.Accordion(label='Click to view instructions', open=False):
|
| 361 |
+
gr.HTML(f'''<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query Type 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the dataset.</p>
|
| 362 |
<br>
|
| 363 |
+
<p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (if "natural language" appears in the dataset but "love natural language" doesn't, the output is P(processing | natural language); in this case the effective n = 3)</p>
|
| 364 |
<br>
|
| 365 |
<p style="font-size: 16px;">Notes:</p>
|
| 366 |
<ul style="font-size: 16px;">
|
|
|
|
| 387 |
with gr.Accordion(label='Click to view instructions', open=False):
|
| 388 |
gr.HTML(f'''<p style="font-size: 16px;">This is similar to Query Type 3, but with ∞-gram instead of n-gram.</p>
|
| 389 |
<br>
|
| 390 |
+
<p style="font-size: 16px;">Example query: <b>I love natural language</b> (if "natural language" appears in the dataset but "love natural language" doesn't, the output is P(* | natural language), for the top-10 tokens *)</p>
|
| 391 |
''')
|
| 392 |
with gr.Row():
|
| 393 |
with gr.Column(scale=1):
|
|
|
|
| 409 |
with gr.Column():
|
| 410 |
gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
|
| 411 |
with gr.Accordion(label='Click to view instructions', open=False):
|
| 412 |
+
gr.HTML(f'''<p style="font-size: 16px;">This displays a few random documents in the dataset that satisfy your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
|
| 413 |
<br>
|
| 414 |
<p style="font-size: 16px;">Example queries:</p>
|
| 415 |
<ul style="font-size: 16px;">
|
|
|
|
| 429 |
<li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
|
| 430 |
</ul>
|
| 431 |
<br>
|
| 432 |
+
<p style="font-size: 16px;">❗️WARNING: Dataset may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the dataset, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text datasets. Please use with caution. Don't be evil :)</p>
|
| 433 |
''')
|
| 434 |
with gr.Row():
|
| 435 |
with gr.Column(scale=1):
|
|
|
|
| 460 |
with gr.Column():
|
| 461 |
gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
|
| 462 |
with gr.Accordion(label='Click to view instructions', open=False):
|
| 463 |
+
gr.HTML(f'''<p style="font-size: 16px;">This displays the documents in the dataset that satisfy your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
|
| 464 |
<br>
|
| 465 |
<p style="font-size: 16px;">Example queries:</p>
|
| 466 |
<ul style="font-size: 16px;">
|
|
|
|
| 478 |
<li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
|
| 479 |
</ul>
|
| 480 |
<br>
|
| 481 |
+
<p style="font-size: 16px;">❗️WARNING: Dataset may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the dataset, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text datasets. Please use with caution. Don't be evil :)</p>
|
| 482 |
''')
|
| 483 |
with gr.Row():
|
| 484 |
with gr.Column(scale=1):
|
constants.py
CHANGED
|
@@ -2,14 +2,21 @@ import os
|
|
| 2 |
|
| 3 |
# options
|
| 4 |
INDEX_BY_DESC = {
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
'
|
| 10 |
-
'
|
| 11 |
-
'
|
| 12 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
# 'Pile-val (GPT-2 tokenizer), 380M tokens': 'v4_pileval_gpt2',
|
| 14 |
# 'Dolma-v1.6-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
|
| 15 |
# 'Dolma-v1.6-sample (9.2B tokens)': 'v4_dolma-v1_6-sample_llama',
|
|
|
|
| 2 |
|
| 3 |
# options
|
| 4 |
INDEX_BY_DESC = {
|
| 5 |
+
"Olmo 3 32B Think (6.1T)": "olmo-3-32b-think",
|
| 6 |
+
"OLMo 3 7B Think (6.1T)": "olmo-3-7b-think",
|
| 7 |
+
"OLMo 3 7B Instruct (6.0T)": "olmo-3-7b-instruct",
|
| 8 |
+
"Dolma 3 (5.9T)": "dolma3",
|
| 9 |
+
'OLMo 2 32B Instruct (4.6T)': 'v4_olmo-2-0325-32b-instruct_llama',
|
| 10 |
+
'OLMo 2 13B Instruct (4.6T)': 'v4_olmo-2-1124-13b-instruct_llama',
|
| 11 |
+
'OLMoE 1B 7B Instruct (4.6T)': 'v4_olmoe-0125-1b-7b-instruct_llama',
|
| 12 |
+
'dolmino-mix-1124-minus-olmo-mix-1124 (34B)': 'v4_dolmino-mix-1124-minus-olmo-mix-1124_llama',
|
| 13 |
+
'olmo-mix-1124 (4.6T)': 'v4_olmo-mix-1124_llama',
|
| 14 |
+
'DCLM-baseline (4.3T)': 'v4_dclm-baseline_llama',
|
| 15 |
+
'Dolma-v1.7 (2.6T)': 'v4_dolma-v1_7_llama',
|
| 16 |
+
'RedPajama (1.4T)': 'v4_rpj_llama_s4',
|
| 17 |
+
'Pile-train (380B)': 'v4_piletrain_llama',
|
| 18 |
+
'C4-train (200B)': 'v4_c4train_llama',
|
| 19 |
+
'Pile-val (390M)': 'v4_pileval_llama',
|
| 20 |
# 'Pile-val (GPT-2 tokenizer), 380M tokens': 'v4_pileval_gpt2',
|
| 21 |
# 'Dolma-v1.6-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
|
| 22 |
# 'Dolma-v1.6-sample (9.2B tokens)': 'v4_dolma-v1_6-sample_llama',
|
requirements.txt
CHANGED
|
@@ -2,5 +2,5 @@ torch==1.13.1
|
|
| 2 |
transformers==4.31.0
|
| 3 |
tokenizers==0.13.3
|
| 4 |
sentencepiece==0.1.96
|
| 5 |
-
huggingface_hub==0.
|
| 6 |
requests
|
|
|
|
| 2 |
transformers==4.31.0
|
| 3 |
tokenizers==0.13.3
|
| 4 |
sentencepiece==0.1.96
|
| 5 |
+
huggingface_hub==0.34.3
|
| 6 |
requests
|