Spaces:

liujch1998
/

infini-gram

Running

App Files Files Community

liujch1998 committed on Feb 9, 2024

Commit

cb08e07

1 Parent(s): 7474206

Sync changes

Browse files

Files changed (2) hide show

app.py +5 -4
constants.py +2 -9

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Req
     corpus = CORPUS_BY_DESC[corpus_desc]
     engine = ENGINE_BY_DESC[engine_desc]
     data = {
         'timestamp': timestamp,
         'query_type': query_type,
         'corpus': corpus,
@@ -18,9 +19,9 @@ def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Req
     if maxnum is not None:
         data['maxnum'] = maxnum
     print(json.dumps(data))
-    if API_IPADDR is None:
-        raise ValueError(f'API_IPADDR envvar is not set!')
-    response = requests.post(f'http://{API_IPADDR}:5000/', json=data)
     if response.status_code == 200:
         result = response.json()
     else:
@@ -230,7 +231,7 @@ with gr.Blocks() as demo:
                                         <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
                                         <li>If the document is too long, it will be truncated to {MAX_OUTPUT_DOC_TOKENS} tokens.</li>
                                         <li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
-                                        <li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD} matches (per shard), we will estimate the count from a random subset of all documents containing that clause. This might cause a zero count on conjuction of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
                                         <li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
                                     </ul>
                                     <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>

     corpus = CORPUS_BY_DESC[corpus_desc]
     engine = ENGINE_BY_DESC[engine_desc]
     data = {
+        'source': 'hf' if not DEBUG else 'hf-dev',
         'timestamp': timestamp,
         'query_type': query_type,
         'corpus': corpus,
     if maxnum is not None:
         data['maxnum'] = maxnum
     print(json.dumps(data))
+    if API_URL is None:
+        raise ValueError(f'API_URL envvar is not set!')
+    response = requests.post(API_URL, json=data)
     if response.status_code == 200:
         result = response.json()
     else:
                                         <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
                                         <li>If the document is too long, it will be truncated to {MAX_OUTPUT_DOC_TOKENS} tokens.</li>
                                         <li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
+                                        <li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ_PER_SHARD} matches (per shard), we will estimate the count from a random subset of all documents containing that clause. This might cause a zero count on conjuction of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
                                         <li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
                                     </ul>
                                     <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>

constants.py CHANGED Viewed

@@ -22,22 +22,15 @@ MAX_INPUT_DOC_TOKENS = int(os.environ.get('MAX_INPUT_DOC_TOKENS', 1000))
 MAX_OUTPUT_DOC_TOKENS = int(os.environ.get('MAX_OUTPUT_DOC_TOKENS', 5000))
 MAX_OUTPUT_NUM_DOCS = int(os.environ.get('MAX_OUTPUT_NUM_DOCS', 10)) # This number is also hard-coded in app.py
 MAX_CNT_FOR_NTD = int(os.environ.get('MAX_CNT_FOR_NTD', 1000))
-MAX_CLAUSE_FREQ = int(os.environ.get('MAX_CLAUSE_FREQ', 10000))
-MAX_CLAUSE_FREQ_FAST = int(os.environ.get('MAX_CLAUSE_FREQ_FAST', 1000000))
-MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD = int(os.environ.get('MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD', 50000))
 MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 100))
 MAX_DIFF_BYTES = 2 * MAX_DIFF_TOKENS
 MAX_CLAUSES_IN_CNF = int(os.environ.get('MAX_CLAUSES_IN_CNF', 4))
 MAX_TERMS_IN_DISJ_CLAUSE = int(os.environ.get('MAX_TERMS_IN_DISJ_CLAUSE', 4))
 # HF demo
-API_IPADDR = os.environ.get('API_IPADDR', None)
 DEFAULT_CONCURRENCY_LIMIT = os.environ.get('DEFAULT_CONCURRENCY_LIMIT', 10)
 MAX_SIZE = os.environ.get('MAX_SIZE', 100)
 MAX_THREADS = os.environ.get('MAX_THREADS', 40)
 DEBUG = (os.environ.get('DEBUG', 'False') != 'False')
-# C++ engine
-CPP_PORT = int(os.environ.get('CPP_PORT', 3786))
-SOCKET_IN_BUFFER_SIZE = 2048
-SOCKET_OUT_BUFFER_SIZE = 65536

 MAX_OUTPUT_DOC_TOKENS = int(os.environ.get('MAX_OUTPUT_DOC_TOKENS', 5000))
 MAX_OUTPUT_NUM_DOCS = int(os.environ.get('MAX_OUTPUT_NUM_DOCS', 10)) # This number is also hard-coded in app.py
 MAX_CNT_FOR_NTD = int(os.environ.get('MAX_CNT_FOR_NTD', 1000))
+MAX_CLAUSE_FREQ_PER_SHARD = int(os.environ.get('MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD', 50000))
 MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 100))
 MAX_DIFF_BYTES = 2 * MAX_DIFF_TOKENS
 MAX_CLAUSES_IN_CNF = int(os.environ.get('MAX_CLAUSES_IN_CNF', 4))
 MAX_TERMS_IN_DISJ_CLAUSE = int(os.environ.get('MAX_TERMS_IN_DISJ_CLAUSE', 4))
 # HF demo
+API_URL = os.environ.get('API_URL', None)
 DEFAULT_CONCURRENCY_LIMIT = os.environ.get('DEFAULT_CONCURRENCY_LIMIT', 10)
 MAX_SIZE = os.environ.get('MAX_SIZE', 100)
 MAX_THREADS = os.environ.get('MAX_THREADS', 40)
 DEBUG = (os.environ.get('DEBUG', 'False') != 'False')