Spaces:
Running
Running
Commit
·
e59eb9e
1
Parent(s):
ce324c1
Customizable consts
Browse files- app.py +1 -29
- constants.py +30 -0
app.py
CHANGED
|
@@ -2,35 +2,7 @@ import gradio as gr
|
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
import requests
|
| 5 |
-
|
| 6 |
-
CORPUS_BY_DESC = {
|
| 7 |
-
'RedPajama (LLaMA tokenizer)': 'rpj_v3_c4_llama2',
|
| 8 |
-
'Pile-val (GPT-2 tokenizer)': 'pile_v3_val',
|
| 9 |
-
}
|
| 10 |
-
CORPUS_DESCS = list(CORPUS_BY_DESC.keys())
|
| 11 |
-
QUERY_TYPE_BY_DESC = {
|
| 12 |
-
'1. Count an n-gram': 'count',
|
| 13 |
-
'2. Compute the probability of the last token in an n-gram': 'compute_prob',
|
| 14 |
-
'3. Compute the next-token distribution of an (n-1)-gram': 'get_next_token_distribution_approx',
|
| 15 |
-
'4. Compute the ∞-gram probability of the last token': 'compute_infgram_prob',
|
| 16 |
-
'5. Compute the ∞-gram next-token distribution': 'get_infgram_next_token_distribution_approx',
|
| 17 |
-
'6. Searching for document containing n-gram(s)': 'get_a_random_document_from_cnf_query_fast_approx',
|
| 18 |
-
# '7. Analyze an (AI-generated) document using ∞-gram': 'analyze_document',
|
| 19 |
-
}
|
| 20 |
-
QUERY_DESC_BY_TYPE = {v: k for k, v in QUERY_TYPE_BY_DESC.items()}
|
| 21 |
-
QUERY_DESCS = list(QUERY_TYPE_BY_DESC.keys())
|
| 22 |
-
|
| 23 |
-
MAX_QUERY_CHARS = 1000
|
| 24 |
-
MAX_INPUT_DOC_TOKENS = 1000
|
| 25 |
-
MAX_OUTPUT_DOC_TOKENS = 5000 # must be an even number!
|
| 26 |
-
MAX_CNT_FOR_NTD = 1000
|
| 27 |
-
MAX_CLAUSE_FREQ = 10000
|
| 28 |
-
MAX_CLAUSE_FREQ_FAST = 1000000
|
| 29 |
-
MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD = 50000
|
| 30 |
-
MAX_DIFF_TOKENS = 100
|
| 31 |
-
MAX_DIFF_BYTES = 2 * MAX_DIFF_TOKENS
|
| 32 |
-
MAX_CLAUSES_IN_CNF = 4
|
| 33 |
-
MAX_TERMS_IN_DISJ_CLAUSE = 4
|
| 34 |
|
| 35 |
API_IPADDR = os.environ.get('API_IPADDR', None)
|
| 36 |
default_concurrency_limit = os.environ.get('default_concurrency_limit', 10)
|
|
|
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
import requests
|
| 5 |
+
from .constants import *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
API_IPADDR = os.environ.get('API_IPADDR', None)
|
| 8 |
default_concurrency_limit = os.environ.get('default_concurrency_limit', 10)
|
constants.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
CORPUS_BY_DESC = {
|
| 4 |
+
'RedPajama (LLaMA tokenizer)': 'rpj_v3_c4_llama2',
|
| 5 |
+
'Pile-val (GPT-2 tokenizer)': 'pile_v3_val',
|
| 6 |
+
}
|
| 7 |
+
CORPUS_DESCS = list(CORPUS_BY_DESC.keys())
|
| 8 |
+
QUERY_TYPE_BY_DESC = {
|
| 9 |
+
'1. Count an n-gram': 'count',
|
| 10 |
+
'2. Compute the probability of the last token in an n-gram': 'compute_prob',
|
| 11 |
+
'3. Compute the next-token distribution of an (n-1)-gram': 'get_next_token_distribution_approx',
|
| 12 |
+
'4. Compute the ∞-gram probability of the last token': 'compute_infgram_prob',
|
| 13 |
+
'5. Compute the ∞-gram next-token distribution': 'get_infgram_next_token_distribution_approx',
|
| 14 |
+
'6. Searching for document containing n-gram(s)': 'get_a_random_document_from_cnf_query_fast_approx',
|
| 15 |
+
# '7. Analyze an (AI-generated) document using ∞-gram': 'analyze_document',
|
| 16 |
+
}
|
| 17 |
+
QUERY_DESC_BY_TYPE = {v: k for k, v in QUERY_TYPE_BY_DESC.items()}
|
| 18 |
+
QUERY_DESCS = list(QUERY_TYPE_BY_DESC.keys())
|
| 19 |
+
|
| 20 |
+
MAX_QUERY_CHARS = os.environ.get('MAX_QUERY_CHARS', 1000)
|
| 21 |
+
MAX_INPUT_DOC_TOKENS = os.environ.get('MAX_INPUT_DOC_TOKENS', 1000)
|
| 22 |
+
MAX_OUTPUT_DOC_TOKENS = os.environ.get('MAX_OUTPUT_DOC_TOKENS', 5000)
|
| 23 |
+
MAX_CNT_FOR_NTD = os.environ.get('MAX_CNT_FOR_NTD', 1000)
|
| 24 |
+
MAX_CLAUSE_FREQ = os.environ.get('MAX_CLAUSE_FREQ', 10000)
|
| 25 |
+
MAX_CLAUSE_FREQ_FAST = os.environ.get('MAX_CLAUSE_FREQ_FAST', 1000000)
|
| 26 |
+
MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD = os.environ.get('MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD', 50000)
|
| 27 |
+
MAX_DIFF_TOKENS = os.environ.get('MAX_DIFF_TOKENS', 100)
|
| 28 |
+
MAX_DIFF_BYTES = 2 * MAX_DIFF_TOKENS
|
| 29 |
+
MAX_CLAUSES_IN_CNF = os.environ.get('MAX_CLAUSES_IN_CNF', 4)
|
| 30 |
+
MAX_TERMS_IN_DISJ_CLAUSE = os.environ.get('MAX_TERMS_IN_DISJ_CLAUSE', 4)
|