Spaces:

liujch1998
/

infini-gram

Running

App Files Community

liujch1998 committed on May 8, 2024

Commit

2195005

1 Parent(s): aa7da7f

Sync changes

Browse files

Files changed (1) hide show

app.py +15 -19

app.py CHANGED Viewed

@@ -40,23 +40,19 @@ def format_tokenization_info(result):
         return ''
     token_ids = result['token_ids']
     tokens = result['tokens']
-    t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
-    return t
-def format_tokenization_info_nested(result):
-    if not ('token_idsss' in result and 'tokensss' in result):
-        return ''
-    token_idsss = result['token_idsss']
-    tokensss = result['tokensss']
-    ttt = []
-    for token_idss, tokenss in zip(token_idsss, tokensss):
-        tt = []
-        for token_ids, tokens in zip(token_idss, tokenss):
-            t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
-            tt.append(t)
-        tt = '\n'.join(tt)
-        ttt.append(tt)
-    ttt = '\n\n'.join(ttt)
-    return ttt
 def format_doc(doc):
     formatted = []
     if doc['doc_len'] == doc['disp_len']:
@@ -134,7 +130,7 @@ def infgram_ntd(corpus_desc, engine_desc, query, request: gr.Request):
 def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
     result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
-    tokenization_info = format_tokenization_info_nested(result)
     if 'error' in result:
         message = result['error']
         docs = [[] for _ in range(10)]
@@ -157,7 +153,7 @@ with gr.Blocks() as demo:
             '''<h1 text-align="center">Infini-gram: An Engine for n-gram / ∞-gram Language Modeling with Trillion-Token Corpora</h1>
             <p style='font-size: 16px;'>This is an engine that processes n-gram / ∞-gram queries on massive text corpora. Please first select the corpus and the type of query, then enter your query and submit.</p>
-            <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng (Gary) Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>.</p>
             <p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
             <p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
             '''

         return ''
     token_ids = result['token_ids']
     tokens = result['tokens']
+    if type(token_ids) == list and all([type(token_id) == int for token_id in token_ids]):
+        output = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
+    else:
+        ttt = []
+        for token_idss, tokenss in zip(token_ids, tokens):
+            tt = []
+            for token_ids, tokens in zip(token_idss, tokenss):
+                t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
+                tt.append(t)
+            tt = '\n'.join(tt)
+            ttt.append(tt)
+        output = '\n\n'.join(ttt)
+    return output
 def format_doc(doc):
     formatted = []
     if doc['doc_len'] == doc['disp_len']:
 def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
     result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
+    tokenization_info = format_tokenization_info(result)
     if 'error' in result:
         message = result['error']
         docs = [[] for _ in range(10)]
             '''<h1 text-align="center">Infini-gram: An Engine for n-gram / ∞-gram Language Modeling with Trillion-Token Corpora</h1>
             <p style='font-size: 16px;'>This is an engine that processes n-gram / ∞-gram queries on massive text corpora. Please first select the corpus and the type of query, then enter your query and submit.</p>
+            <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng (Gary) Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
             <p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
             <p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
             '''