Commit 619c9ac
1 Parent(s): 2e63f1e
Adapt to API updates

app.py CHANGED

@@ -30,78 +30,117 @@ def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Request):

Before:

     print(result)
     return result

 def count(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('count', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
-
     if 'error' in result:
         count = result['error']
     else:
         count = f'{result["count"]:,}'
-    return latency,

 def prob(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('prob', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
-
     if 'error' in result:
         prob = result['error']
     elif result['prompt_cnt'] == 0:
         prob = '(n-1)-gram is not found in the corpus'
     else:
         prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
-    return latency,

 def ntd(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('ntd', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
-
     if 'error' in result:
         ntd = result['error']
     else:
-
         if ntd == {}:
             ntd = '(n-1)-gram is not found in the corpus'
-    return latency,

 def infgram_prob(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('infgram_prob', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
-
     if 'error' in result:
         longest_suffix = ''
         prob = result['error']
     else:
         longest_suffix = result['longest_suffix']
         prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
-    return latency,

 def infgram_ntd(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('infgram_ntd', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
-
     if 'error' in result:
         longest_suffix = ''
         ntd = result['error']
     else:
         longest_suffix = result['longest_suffix']
-
-

 def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
     result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
-
     if 'error' in result:
         message = result['error']
         docs = [[] for _ in range(10)]
     else:
         message = result['message']
-        docs = result['
     docs = docs[:maxnum]
     while len(docs) < 10:
         docs.append([])
-    return latency,

 def analyze_document(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('analyze_document', corpus_desc, engine_desc, query, None, request)

After:

     print(result)
     return result

+def format_tokenization_info(result):
+    if not ('token_ids' in result and 'tokens' in result):
+        return ''
+    token_ids = result['token_ids']
+    tokens = result['tokens']
+    t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
+    return t
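
Note: as a quick illustration of the new helper, here is a minimal sketch of what format_tokenization_info would return for a hypothetical API payload. The field names ('tokens', 'token_ids') follow the diff; the sample values, and treating 'Ġ' as a leading-space marker, are assumptions.

    # Hypothetical payload; field names mirror the code above, values are invented.
    sample = {
        'token_ids': [5661, 318, 257, 1332],
        'tokens': ['This', 'Ġis', 'Ġa', 'Ġtest'],
    }
    print(format_tokenization_info(sample))
    # -> ["This" " is" " a" " test"] [5661, 318, 257, 1332]
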
+def format_tokenization_info_nested(result):
+    if not ('token_idsss' in result and 'tokensss' in result):
+        return ''
+    token_idsss = result['token_idsss']
+    tokensss = result['tokensss']
+    ttt = []
+    for token_idss, tokenss in zip(token_idsss, tokensss):
+        tt = []
+        for token_ids, tokens in zip(token_idss, tokenss):
+            t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
+            tt.append(t)
+        tt = '\n'.join(tt)
+        ttt.append(tt)
+    ttt = '\n\n'.join(ttt)
+    return ttt
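
Note: the nested variant collapses a doubly nested token structure into newline-separated blocks (it is used by search_docs further down). A minimal sketch with invented values; the exact outer/inner grouping shown here is an assumption about the API's shape.

    # Invented doubly nested payload.
    sample = {
        'token_idsss': [[[464, 3797]], [[64, 3290]]],
        'tokensss': [[['The', 'Ġcat']], [['a', 'Ġdog']]],
    }
    print(format_tokenization_info_nested(sample))
    # -> ["The" " cat"] [464, 3797]
    #    (blank line)
    #    ["a" " dog"] [64, 3290]
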
+def format_doc(doc):
+    formatted = []
+    if doc['doc_len'] == doc['disp_len']:
+        header = f'[Document #{doc["doc_ix"]}, length = {doc["doc_len"]} tokens]\n\n'
+    else:
+        header = f'[Document #{doc["doc_ix"]}, length = {doc["doc_len"]} tokens ({doc["disp_len"]} tokens displayed)]\n\n'
+    formatted.append((header, None))
+    formatted += doc['spans']
+    return formatted
+
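
Note: format_doc turns one retrieved document into a header tuple followed by (text, label) spans, the shape a Gradio highlighted-text output appears to expect. A sketch with a made-up document record; the field names mirror the code above, the values are invented.

    doc = {
        'doc_ix': 42,
        'doc_len': 5000,
        'disp_len': 1000,
        'spans': [('The quick brown fox ', None), ('jumps', 'match'), (' over the lazy dog.', None)],
    }
    formatted = format_doc(doc)
    # formatted[0] == ('[Document #42, length = 5000 tokens (1000 tokens displayed)]\n\n', None)
    # followed by the three spans.
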
 def count(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('count', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
+    tokenization_info = format_tokenization_info(result)
     if 'error' in result:
         count = result['error']
     else:
         count = f'{result["count"]:,}'
+    return latency, tokenization_info, count

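
Note: each handler now returns an extra tokenization-info string, so the Gradio wiring (outside this hunk) presumably needs one more output component per tab. A hedged sketch of what the count tab's hookup might look like; every component name below is invented, only the three-output arity comes from the diff.

    import gradio as gr

    with gr.Blocks() as demo:
        corpus_desc = gr.Dropdown(choices=['(corpus)'], label='Corpus')
        engine_desc = gr.Dropdown(choices=['(engine)'], label='Engine')
        count_query = gr.Textbox(label='Query')
        count_latency = gr.Textbox(label='Latency (s)')
        count_tokenized = gr.Textbox(label='Tokenized')
        count_result = gr.Label(label='Count')
        gr.Button('Submit').click(
            count,  # count() now returns (latency, tokenization_info, count)
            inputs=[corpus_desc, engine_desc, count_query],
            outputs=[count_latency, count_tokenized, count_result],
        )
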
 def prob(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('prob', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
+    tokenization_info = format_tokenization_info(result)
     if 'error' in result:
         prob = result['error']
     elif result['prompt_cnt'] == 0:
         prob = '(n-1)-gram is not found in the corpus'
     else:
         prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
+    return latency, tokenization_info, prob

 def ntd(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('ntd', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
+    tokenization_info = format_tokenization_info(result)
     if 'error' in result:
         ntd = result['error']
     else:
+        result_by_token_id = result['result_by_token_id']
+        ntd = {}
+        for token_id, r in result_by_token_id.items():
+            ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
         if ntd == {}:
             ntd = '(n-1)-gram is not found in the corpus'
+    return latency, tokenization_info, ntd

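
Note: the next-token distribution is now assembled client-side from result_by_token_id. A sketch with an invented response fragment showing the dict that ends up in the output (display string -> probability):

    result = {
        'prompt_cnt': 100,
        'result_by_token_id': {
            262: {'token': ' the', 'cont_cnt': 60, 'prob': 0.6},
            257: {'token': ' a', 'cont_cnt': 40, 'prob': 0.4},
        },
    }
    ntd = {}
    for token_id, r in result['result_by_token_id'].items():
        ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
    # ntd == {' the (60 / 100)': 0.6, ' a (40 / 100)': 0.4}
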
 def infgram_prob(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('infgram_prob', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
+    tokenization_info = format_tokenization_info(result)
     if 'error' in result:
         longest_suffix = ''
         prob = result['error']
     else:
         longest_suffix = result['longest_suffix']
         prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
+    return latency, tokenization_info, longest_suffix, prob

 def infgram_ntd(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('infgram_ntd', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
+    tokenization_info = format_tokenization_info(result)
     if 'error' in result:
         longest_suffix = ''
         ntd = result['error']
     else:
         longest_suffix = result['longest_suffix']
+        result_by_token_id = result['result_by_token_id']
+        ntd = {}
+        for token_id, r in result_by_token_id.items():
+            ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
+    return latency, tokenization_info, longest_suffix, ntd

 def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
     result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
+    tokenization_info = format_tokenization_info_nested(result)
     if 'error' in result:
         message = result['error']
         docs = [[] for _ in range(10)]
     else:
         message = result['message']
+        docs = result['documents']
+        docs = [format_doc(doc) for doc in docs]
     docs = docs[:maxnum]
     while len(docs) < 10:
         docs.append([])
+    return latency, tokenization_info, message, docs[0], docs[1], docs[2], docs[3], docs[4], docs[5], docs[6], docs[7], docs[8], docs[9]

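
Note: search_docs pads its result list so that it always yields exactly ten document slots, presumably because the interface has a fixed set of ten output components. A small sketch of the truncation-and-padding step with invented inputs:

    docs = [[('doc one', None)], [('doc two', None)], [('doc three', None)]]
    maxnum = 2
    docs = docs[:maxnum]          # keep at most maxnum documents
    while len(docs) < 10:         # pad with empty lists so ten outputs always exist
        docs.append([])
    # len(docs) == 10; docs[0] and docs[1] hold content, docs[2:] are empty.
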
 def analyze_document(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('analyze_document', corpus_desc, engine_desc, query, None, request)