Spaces:
Running
Running
Commit
·
cb08e07
1
Parent(s):
7474206
Sync changes
Browse files
- app.py +5 -4
- constants.py +2 -9
app.py
CHANGED
|
@@ -9,6 +9,7 @@ def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Req
|
|
| 9 |
corpus = CORPUS_BY_DESC[corpus_desc]
|
| 10 |
engine = ENGINE_BY_DESC[engine_desc]
|
| 11 |
data = {
|
|
|
|
| 12 |
'timestamp': timestamp,
|
| 13 |
'query_type': query_type,
|
| 14 |
'corpus': corpus,
|
|
@@ -18,9 +19,9 @@ def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Req
|
|
| 18 |
if maxnum is not None:
|
| 19 |
data['maxnum'] = maxnum
|
| 20 |
print(json.dumps(data))
|
| 21 |
-
if
|
| 22 |
-
raise ValueError(f'
|
| 23 |
-
response = requests.post(
|
| 24 |
if response.status_code == 200:
|
| 25 |
result = response.json()
|
| 26 |
else:
|
|
@@ -230,7 +231,7 @@ with gr.Blocks() as demo:
|
|
| 230 |
<li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
|
| 231 |
<li>If the document is too long, it will be truncated to {MAX_OUTPUT_DOC_TOKENS} tokens.</li>
|
| 232 |
<li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
|
| 233 |
-
<li>If you query for two or more clauses, and a clause has more than {
|
| 234 |
<li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
|
| 235 |
</ul>
|
| 236 |
<p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
|
|
|
|
| 9 |
corpus = CORPUS_BY_DESC[corpus_desc]
|
| 10 |
engine = ENGINE_BY_DESC[engine_desc]
|
| 11 |
data = {
|
| 12 |
+
'source': 'hf' if not DEBUG else 'hf-dev',
|
| 13 |
'timestamp': timestamp,
|
| 14 |
'query_type': query_type,
|
| 15 |
'corpus': corpus,
|
|
|
|
| 19 |
if maxnum is not None:
|
| 20 |
data['maxnum'] = maxnum
|
| 21 |
print(json.dumps(data))
|
| 22 |
+
if API_URL is None:
|
| 23 |
+
raise ValueError(f'API_URL envvar is not set!')
|
| 24 |
+
response = requests.post(API_URL, json=data)
|
| 25 |
if response.status_code == 200:
|
| 26 |
result = response.json()
|
| 27 |
else:
|
|
|
|
| 231 |
<li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
|
| 232 |
<li>If the document is too long, it will be truncated to {MAX_OUTPUT_DOC_TOKENS} tokens.</li>
|
| 233 |
<li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
|
| 234 |
+
<li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ_PER_SHARD} matches (per shard), we will estimate the count from a random subset of all documents containing that clause. This might cause a zero count on the conjunction of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
|
| 235 |
<li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
|
| 236 |
</ul>
|
| 237 |
<p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
|
constants.py
CHANGED
|
@@ -22,22 +22,15 @@ MAX_INPUT_DOC_TOKENS = int(os.environ.get('MAX_INPUT_DOC_TOKENS', 1000))
|
|
| 22 |
MAX_OUTPUT_DOC_TOKENS = int(os.environ.get('MAX_OUTPUT_DOC_TOKENS', 5000))
|
| 23 |
MAX_OUTPUT_NUM_DOCS = int(os.environ.get('MAX_OUTPUT_NUM_DOCS', 10)) # This number is also hard-coded in app.py
|
| 24 |
MAX_CNT_FOR_NTD = int(os.environ.get('MAX_CNT_FOR_NTD', 1000))
|
| 25 |
-
|
| 26 |
-
MAX_CLAUSE_FREQ_FAST = int(os.environ.get('MAX_CLAUSE_FREQ_FAST', 1000000))
|
| 27 |
-
MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD = int(os.environ.get('MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD', 50000))
|
| 28 |
MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 100))
|
| 29 |
MAX_DIFF_BYTES = 2 * MAX_DIFF_TOKENS
|
| 30 |
MAX_CLAUSES_IN_CNF = int(os.environ.get('MAX_CLAUSES_IN_CNF', 4))
|
| 31 |
MAX_TERMS_IN_DISJ_CLAUSE = int(os.environ.get('MAX_TERMS_IN_DISJ_CLAUSE', 4))
|
| 32 |
|
| 33 |
# HF demo
|
| 34 |
-
|
| 35 |
DEFAULT_CONCURRENCY_LIMIT = os.environ.get('DEFAULT_CONCURRENCY_LIMIT', 10)
|
| 36 |
MAX_SIZE = os.environ.get('MAX_SIZE', 100)
|
| 37 |
MAX_THREADS = os.environ.get('MAX_THREADS', 40)
|
| 38 |
DEBUG = (os.environ.get('DEBUG', 'False') != 'False')
|
| 39 |
-
|
| 40 |
-
# C++ engine
|
| 41 |
-
CPP_PORT = int(os.environ.get('CPP_PORT', 3786))
|
| 42 |
-
SOCKET_IN_BUFFER_SIZE = 2048
|
| 43 |
-
SOCKET_OUT_BUFFER_SIZE = 65536
|
|
|
|
| 22 |
MAX_OUTPUT_DOC_TOKENS = int(os.environ.get('MAX_OUTPUT_DOC_TOKENS', 5000))
|
| 23 |
MAX_OUTPUT_NUM_DOCS = int(os.environ.get('MAX_OUTPUT_NUM_DOCS', 10)) # This number is also hard-coded in app.py
|
| 24 |
MAX_CNT_FOR_NTD = int(os.environ.get('MAX_CNT_FOR_NTD', 1000))
|
| 25 |
+
MAX_CLAUSE_FREQ_PER_SHARD = int(os.environ.get('MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD', 50000))
|
|
|
|
|
|
|
| 26 |
MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 100))
|
| 27 |
MAX_DIFF_BYTES = 2 * MAX_DIFF_TOKENS
|
| 28 |
MAX_CLAUSES_IN_CNF = int(os.environ.get('MAX_CLAUSES_IN_CNF', 4))
|
| 29 |
MAX_TERMS_IN_DISJ_CLAUSE = int(os.environ.get('MAX_TERMS_IN_DISJ_CLAUSE', 4))
|
| 30 |
|
| 31 |
# HF demo
|
| 32 |
+
API_URL = os.environ.get('API_URL', None)
|
| 33 |
DEFAULT_CONCURRENCY_LIMIT = os.environ.get('DEFAULT_CONCURRENCY_LIMIT', 10)
|
| 34 |
MAX_SIZE = os.environ.get('MAX_SIZE', 100)
|
| 35 |
MAX_THREADS = os.environ.get('MAX_THREADS', 40)
|
| 36 |
DEBUG = (os.environ.get('DEBUG', 'False') != 'False')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|