Spaces:
Running
Running
Commit
·
2195005
1
Parent(s):
aa7da7f
Sync changes
Browse files
app.py
CHANGED
|
@@ -40,23 +40,19 @@ def format_tokenization_info(result):
|
|
| 40 |
return ''
|
| 41 |
token_ids = result['token_ids']
|
| 42 |
tokens = result['tokens']
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
tt = '\n'.join(tt)
|
| 57 |
-
ttt.append(tt)
|
| 58 |
-
ttt = '\n\n'.join(ttt)
|
| 59 |
-
return ttt
|
| 60 |
def format_doc(doc):
|
| 61 |
formatted = []
|
| 62 |
if doc['doc_len'] == doc['disp_len']:
|
|
@@ -134,7 +130,7 @@ def infgram_ntd(corpus_desc, engine_desc, query, request: gr.Request):
|
|
| 134 |
def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
|
| 135 |
result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
|
| 136 |
latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
|
| 137 |
-
tokenization_info =
|
| 138 |
if 'error' in result:
|
| 139 |
message = result['error']
|
| 140 |
docs = [[] for _ in range(10)]
|
|
@@ -157,7 +153,7 @@ with gr.Blocks() as demo:
|
|
| 157 |
'''<h1 text-align="center">Infini-gram: An Engine for n-gram / ∞-gram Language Modeling with Trillion-Token Corpora</h1>
|
| 158 |
|
| 159 |
<p style='font-size: 16px;'>This is an engine that processes n-gram / ∞-gram queries on massive text corpora. Please first select the corpus and the type of query, then enter your query and submit.</p>
|
| 160 |
-
<p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng (Gary) Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>.</p>
|
| 161 |
<p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
|
| 162 |
<p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
|
| 163 |
'''
|
|
|
|
| 40 |
return ''
|
| 41 |
token_ids = result['token_ids']
|
| 42 |
tokens = result['tokens']
|
| 43 |
+
if type(token_ids) == list and all([type(token_id) == int for token_id in token_ids]):
|
| 44 |
+
output = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
|
| 45 |
+
else:
|
| 46 |
+
ttt = []
|
| 47 |
+
for token_idss, tokenss in zip(token_ids, tokens):
|
| 48 |
+
tt = []
|
| 49 |
+
for token_ids, tokens in zip(token_idss, tokenss):
|
| 50 |
+
t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
|
| 51 |
+
tt.append(t)
|
| 52 |
+
tt = '\n'.join(tt)
|
| 53 |
+
ttt.append(tt)
|
| 54 |
+
output = '\n\n'.join(ttt)
|
| 55 |
+
return output
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
def format_doc(doc):
|
| 57 |
formatted = []
|
| 58 |
if doc['doc_len'] == doc['disp_len']:
|
|
|
|
| 130 |
def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
|
| 131 |
result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
|
| 132 |
latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
|
| 133 |
+
tokenization_info = format_tokenization_info(result)
|
| 134 |
if 'error' in result:
|
| 135 |
message = result['error']
|
| 136 |
docs = [[] for _ in range(10)]
|
|
|
|
| 153 |
'''<h1 text-align="center">Infini-gram: An Engine for n-gram / ∞-gram Language Modeling with Trillion-Token Corpora</h1>
|
| 154 |
|
| 155 |
<p style='font-size: 16px;'>This is an engine that processes n-gram / ∞-gram queries on massive text corpora. Please first select the corpus and the type of query, then enter your query and submit.</p>
|
| 156 |
+
<p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng (Gary) Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
|
| 157 |
<p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
|
| 158 |
<p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
|
| 159 |
'''
|