Spaces:
Runtime error
Runtime error
maintenance message
Browse files
app.py
CHANGED
|
@@ -10,9 +10,7 @@ from huggingface_hub import HfApi
|
|
| 10 |
hf_api = HfApi()
|
| 11 |
roots_datasets = {
|
| 12 |
dset.id.split("/")[-1]: dset
|
| 13 |
-
for dset in hf_api.list_datasets(
|
| 14 |
-
author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token")
|
| 15 |
-
)
|
| 16 |
}
|
| 17 |
|
| 18 |
|
|
@@ -64,9 +62,7 @@ def process_pii(text):
|
|
| 64 |
for tag in PII_TAGS:
|
| 65 |
text = text.replace(
|
| 66 |
PII_PREFIX + tag,
|
| 67 |
-
"""<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
|
| 68 |
-
tag
|
| 69 |
-
),
|
| 70 |
)
|
| 71 |
return text
|
| 72 |
|
|
@@ -133,9 +129,7 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
|
|
| 133 |
return "<p>" + result_html + "</p>"
|
| 134 |
|
| 135 |
|
| 136 |
-
def format_result_page(
|
| 137 |
-
language, results, highlight_terms, num_results, exact_search, datasets_filter=None
|
| 138 |
-
) -> gr.HTML:
|
| 139 |
filtered_num_results = 0
|
| 140 |
header_html = ""
|
| 141 |
|
|
@@ -160,9 +154,7 @@ def format_result_page(
|
|
| 160 |
continue
|
| 161 |
results_for_lang_html = ""
|
| 162 |
for result in results_for_lang:
|
| 163 |
-
result_html = format_result(
|
| 164 |
-
result, highlight_terms, exact_search, datasets_filter
|
| 165 |
-
)
|
| 166 |
if result_html != "":
|
| 167 |
filtered_num_results += 1
|
| 168 |
results_for_lang_html += result_html
|
|
@@ -204,9 +196,7 @@ def extract_results_from_payload(query, language, payload, exact_search):
|
|
| 204 |
text = result["text"]
|
| 205 |
url = (
|
| 206 |
result["meta"]["url"]
|
| 207 |
-
if "meta" in result
|
| 208 |
-
and result["meta"] is not None
|
| 209 |
-
and "url" in result["meta"]
|
| 210 |
else None
|
| 211 |
)
|
| 212 |
docid = result["docid"]
|
|
@@ -244,11 +234,7 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
|
|
| 244 |
post_data = {"query": query, "k": num_results, "received_results": received_results}
|
| 245 |
if language != "detect_language":
|
| 246 |
post_data["lang"] = language
|
| 247 |
-
address = (
|
| 248 |
-
os.environ.get("address_exact_search")
|
| 249 |
-
if exact_search
|
| 250 |
-
else os.environ.get("address")
|
| 251 |
-
)
|
| 252 |
output = requests.post(
|
| 253 |
address,
|
| 254 |
headers={"Content-type": "application/json"},
|
|
@@ -259,10 +245,12 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
|
|
| 259 |
return payload
|
| 260 |
|
| 261 |
|
| 262 |
-
title = (
|
| 263 |
-
"""<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
|
| 264 |
-
)
|
| 265 |
description = """
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
|
| 267 |
of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search
|
| 268 |
Tool allows you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages
|
|
@@ -379,9 +367,7 @@ if __name__ == "__main__":
|
|
| 379 |
payload,
|
| 380 |
exact_search,
|
| 381 |
)
|
| 382 |
-
result_page = format_result_page(
|
| 383 |
-
lang, processed_results, highlight_terms, num_results, exact_search
|
| 384 |
-
)
|
| 385 |
return (
|
| 386 |
processed_results,
|
| 387 |
highlight_terms,
|
|
@@ -402,19 +388,13 @@ if __name__ == "__main__":
|
|
| 402 |
datasets,
|
| 403 |
) = run_query(query, lang, k, dropdown_input, 0)
|
| 404 |
has_more_results = exact_search and (num_results > k)
|
| 405 |
-
current_results = (
|
| 406 |
-
len(next(iter(processed_results.values())))
|
| 407 |
-
if len(processed_results) > 0
|
| 408 |
-
else 0
|
| 409 |
-
)
|
| 410 |
return [
|
| 411 |
processed_results,
|
| 412 |
highlight_terms,
|
| 413 |
num_results,
|
| 414 |
exact_search,
|
| 415 |
-
gr.update(visible=True)
|
| 416 |
-
if current_results > 0
|
| 417 |
-
else gr.update(visible=False),
|
| 418 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
| 419 |
gr.update(visible=has_more_results),
|
| 420 |
current_results,
|
|
@@ -437,12 +417,8 @@ if __name__ == "__main__":
|
|
| 437 |
result_page,
|
| 438 |
datasets,
|
| 439 |
) = run_query(query, lang, k, dropdown_input, received_results)
|
| 440 |
-
current_results = sum(
|
| 441 |
-
len(results) for results in processed_results.values()
|
| 442 |
-
)
|
| 443 |
-
has_more_results = exact_search and (
|
| 444 |
-
received_results + current_results < num_results
|
| 445 |
-
)
|
| 446 |
print("received_results", received_results)
|
| 447 |
print("current_results", current_results)
|
| 448 |
print("has_more_results", has_more_results)
|
|
@@ -451,9 +427,7 @@ if __name__ == "__main__":
|
|
| 451 |
highlight_terms,
|
| 452 |
num_results,
|
| 453 |
exact_search,
|
| 454 |
-
gr.update(visible=True)
|
| 455 |
-
if current_results > 0
|
| 456 |
-
else gr.update(visible=False),
|
| 457 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
| 458 |
gr.update(visible=current_results >= k and has_more_results),
|
| 459 |
received_results + current_results,
|
|
|
|
| 10 |
hf_api = HfApi()
|
| 11 |
roots_datasets = {
|
| 12 |
dset.id.split("/")[-1]: dset
|
| 13 |
+
for dset in hf_api.list_datasets(author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token"))
|
|
|
|
|
|
|
| 14 |
}
|
| 15 |
|
| 16 |
|
|
|
|
| 62 |
for tag in PII_TAGS:
|
| 63 |
text = text.replace(
|
| 64 |
PII_PREFIX + tag,
|
| 65 |
+
"""<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(tag),
|
|
|
|
|
|
|
| 66 |
)
|
| 67 |
return text
|
| 68 |
|
|
|
|
| 129 |
return "<p>" + result_html + "</p>"
|
| 130 |
|
| 131 |
|
| 132 |
+
def format_result_page(language, results, highlight_terms, num_results, exact_search, datasets_filter=None) -> gr.HTML:
|
|
|
|
|
|
|
| 133 |
filtered_num_results = 0
|
| 134 |
header_html = ""
|
| 135 |
|
|
|
|
| 154 |
continue
|
| 155 |
results_for_lang_html = ""
|
| 156 |
for result in results_for_lang:
|
| 157 |
+
result_html = format_result(result, highlight_terms, exact_search, datasets_filter)
|
|
|
|
|
|
|
| 158 |
if result_html != "":
|
| 159 |
filtered_num_results += 1
|
| 160 |
results_for_lang_html += result_html
|
|
|
|
| 196 |
text = result["text"]
|
| 197 |
url = (
|
| 198 |
result["meta"]["url"]
|
| 199 |
+
if "meta" in result and result["meta"] is not None and "url" in result["meta"]
|
|
|
|
|
|
|
| 200 |
else None
|
| 201 |
)
|
| 202 |
docid = result["docid"]
|
|
|
|
| 234 |
post_data = {"query": query, "k": num_results, "received_results": received_results}
|
| 235 |
if language != "detect_language":
|
| 236 |
post_data["lang"] = language
|
| 237 |
+
address = os.environ.get("address_exact_search") if exact_search else os.environ.get("address")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
output = requests.post(
|
| 239 |
address,
|
| 240 |
headers={"Content-type": "application/json"},
|
|
|
|
| 245 |
return payload
|
| 246 |
|
| 247 |
|
| 248 |
+
title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
|
|
|
|
|
|
|
| 249 |
description = """
|
| 250 |
+
# We're running maintenance works on the exact search index, so it may not work properly until the end of the day,
|
| 251 |
+
Monday 27th of March.
|
| 252 |
+
|
| 253 |
+
|
| 254 |
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
|
| 255 |
of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search
|
| 256 |
Tool allows you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages
|
|
|
|
| 367 |
payload,
|
| 368 |
exact_search,
|
| 369 |
)
|
| 370 |
+
result_page = format_result_page(lang, processed_results, highlight_terms, num_results, exact_search)
|
|
|
|
|
|
|
| 371 |
return (
|
| 372 |
processed_results,
|
| 373 |
highlight_terms,
|
|
|
|
| 388 |
datasets,
|
| 389 |
) = run_query(query, lang, k, dropdown_input, 0)
|
| 390 |
has_more_results = exact_search and (num_results > k)
|
| 391 |
+
current_results = len(next(iter(processed_results.values()))) if len(processed_results) > 0 else 0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
return [
|
| 393 |
processed_results,
|
| 394 |
highlight_terms,
|
| 395 |
num_results,
|
| 396 |
exact_search,
|
| 397 |
+
gr.update(visible=True) if current_results > 0 else gr.update(visible=False),
|
|
|
|
|
|
|
| 398 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
| 399 |
gr.update(visible=has_more_results),
|
| 400 |
current_results,
|
|
|
|
| 417 |
result_page,
|
| 418 |
datasets,
|
| 419 |
) = run_query(query, lang, k, dropdown_input, received_results)
|
| 420 |
+
current_results = sum(len(results) for results in processed_results.values())
|
| 421 |
+
has_more_results = exact_search and (received_results + current_results < num_results)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
print("received_results", received_results)
|
| 423 |
print("current_results", current_results)
|
| 424 |
print("has_more_results", has_more_results)
|
|
|
|
| 427 |
highlight_terms,
|
| 428 |
num_results,
|
| 429 |
exact_search,
|
| 430 |
+
gr.update(visible=True) if current_results > 0 else gr.update(visible=False),
|
|
|
|
|
|
|
| 431 |
gr.Dropdown.update(choices=datasets, value=datasets),
|
| 432 |
gr.update(visible=current_results >= k and has_more_results),
|
| 433 |
received_results + current_results,
|