Spaces:
Running
Running
import os


def _env_int(name, default):
    """Return the integer value of environment variable *name*.

    ``os.environ`` only ever stores strings, so an override has to be parsed
    explicitly; otherwise a limit would be an ``int`` when unset but a ``str``
    when set, and arithmetic on it (e.g. ``2 * MAX_DIFF_TOKENS``) would turn
    into string repetition.

    Args:
        name: Name of the environment variable to read.
        default: Integer returned when the variable is unset or blank.

    Returns:
        The parsed override, or *default*.

    Raises:
        ValueError: If the variable is set to something that is not an integer.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(
            f'Environment variable {name} must be an integer, got {raw!r}'
        ) from err


# Human-readable corpus description -> corpus identifier.
CORPUS_BY_DESC = {
    'RedPajama (LLaMA tokenizer)': 'rpj_v3_c4_llama2',
    'Pile-val (GPT-2 tokenizer)': 'pile_v3_val',
}
# Descriptions in declaration order.
CORPUS_DESCS = list(CORPUS_BY_DESC)

# Human-readable query description -> query type identifier.
# NOTE(review): "β-gram" in the labels below looks like a mis-encoded
# "∞-gram" (the identifiers say "infgram") - confirm before changing, since
# these strings are user-facing keys.
QUERY_TYPE_BY_DESC = {
    '1. Count an n-gram': 'count',
    '2. Compute the probability of the last token in an n-gram': 'compute_prob',
    '3. Compute the next-token distribution of an (n-1)-gram': 'get_next_token_distribution_approx',
    '4. Compute the β-gram probability of the last token': 'compute_infgram_prob',
    '5. Compute the β-gram next-token distribution': 'get_infgram_next_token_distribution_approx',
    '6. Searching for document containing n-gram(s)': 'get_a_random_document_from_cnf_query_fast_approx',
    # '7. Analyze an (AI-generated) document using β-gram': 'analyze_document',
}
# Reverse lookup: query type identifier -> description.
QUERY_DESC_BY_TYPE = {v: k for k, v in QUERY_TYPE_BY_DESC.items()}
# Descriptions in declaration order.
QUERY_DESCS = list(QUERY_TYPE_BY_DESC)

# Integer limits; each can be overridden by the environment variable of the
# same name and is always exposed as an int.
MAX_QUERY_CHARS = _env_int('MAX_QUERY_CHARS', 1000)
MAX_INPUT_DOC_TOKENS = _env_int('MAX_INPUT_DOC_TOKENS', 1000)
MAX_OUTPUT_DOC_TOKENS = _env_int('MAX_OUTPUT_DOC_TOKENS', 5000)
MAX_CNT_FOR_NTD = _env_int('MAX_CNT_FOR_NTD', 1000)
MAX_CLAUSE_FREQ = _env_int('MAX_CLAUSE_FREQ', 10000)
MAX_CLAUSE_FREQ_FAST = _env_int('MAX_CLAUSE_FREQ_FAST', 1000000)
MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD = _env_int('MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD', 50000)
MAX_DIFF_TOKENS = _env_int('MAX_DIFF_TOKENS', 100)
# Derived limit: twice the token limit (presumably 2 bytes per token - TODO
# confirm). Requires MAX_DIFF_TOKENS to be an int, not a str.
MAX_DIFF_BYTES = 2 * MAX_DIFF_TOKENS
MAX_CLAUSES_IN_CNF = _env_int('MAX_CLAUSES_IN_CNF', 4)
MAX_TERMS_IN_DISJ_CLAUSE = _env_int('MAX_TERMS_IN_DISJ_CLAUSE', 4)