Spaces:
Running
Running
UI help tooltips + simplify incomplete shard upload behavior
Browse files
app.py
CHANGED
|
@@ -267,6 +267,24 @@ APP_CSS = """
|
|
| 267 |
font-size: 0.83rem;
|
| 268 |
padding: 0.24rem 0.3rem;
|
| 269 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
"""
|
| 271 |
|
| 272 |
THEME_JS = """
|
|
@@ -328,6 +346,42 @@ SEED_WIDGET_JS = """
|
|
| 328 |
}
|
| 329 |
"""
|
| 330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
def utc_now_iso() -> str:
|
| 333 |
return datetime.now(timezone.utc).isoformat(timespec="seconds")
|
|
@@ -461,7 +515,6 @@ def build_crawler_config(
|
|
| 461 |
shard_size_rows: int,
|
| 462 |
enable_hf_upload: bool,
|
| 463 |
upload_incomplete_shards: bool,
|
| 464 |
-
incomplete_shard_flush_seconds: float,
|
| 465 |
hf_repo_id: str,
|
| 466 |
hf_token: str,
|
| 467 |
hf_private_repo: bool,
|
|
@@ -480,7 +533,6 @@ def build_crawler_config(
|
|
| 480 |
output_dir=Path(__file__).resolve().parent / "shards",
|
| 481 |
enable_hf_upload=bool(enable_hf_upload),
|
| 482 |
upload_incomplete_shards=bool(upload_incomplete_shards),
|
| 483 |
-
incomplete_shard_flush_seconds=float(incomplete_shard_flush_seconds),
|
| 484 |
hf_repo_id=hf_repo_id.strip(),
|
| 485 |
hf_token=hf_token.strip(),
|
| 486 |
hf_private_repo=bool(hf_private_repo),
|
|
@@ -699,7 +751,6 @@ def _start_crawl(
|
|
| 699 |
shard_size_rows: int,
|
| 700 |
enable_hf_upload: bool,
|
| 701 |
upload_incomplete_shards: bool,
|
| 702 |
-
incomplete_shard_flush_seconds: float,
|
| 703 |
hf_repo_id: str,
|
| 704 |
hf_token: str,
|
| 705 |
hf_private_repo: bool,
|
|
@@ -714,7 +765,6 @@ def _start_crawl(
|
|
| 714 |
shard_size_rows=shard_size_rows,
|
| 715 |
enable_hf_upload=enable_hf_upload,
|
| 716 |
upload_incomplete_shards=upload_incomplete_shards,
|
| 717 |
-
incomplete_shard_flush_seconds=incomplete_shard_flush_seconds,
|
| 718 |
hf_repo_id=hf_repo_id,
|
| 719 |
hf_token=hf_token,
|
| 720 |
hf_private_repo=hf_private_repo,
|
|
@@ -737,7 +787,6 @@ def start_crawl_standard(
|
|
| 737 |
shard_size_rows: int,
|
| 738 |
enable_hf_upload: bool,
|
| 739 |
upload_incomplete_shards: bool,
|
| 740 |
-
incomplete_shard_flush_seconds: float,
|
| 741 |
hf_repo_id: str,
|
| 742 |
hf_token: str,
|
| 743 |
hf_private_repo: bool,
|
|
@@ -752,7 +801,6 @@ def start_crawl_standard(
|
|
| 752 |
shard_size_rows=shard_size_rows,
|
| 753 |
enable_hf_upload=enable_hf_upload,
|
| 754 |
upload_incomplete_shards=upload_incomplete_shards,
|
| 755 |
-
incomplete_shard_flush_seconds=incomplete_shard_flush_seconds,
|
| 756 |
hf_repo_id=hf_repo_id,
|
| 757 |
hf_token=hf_token,
|
| 758 |
hf_private_repo=hf_private_repo,
|
|
@@ -768,7 +816,6 @@ def start_crawl_super(
|
|
| 768 |
shard_size_rows: int,
|
| 769 |
enable_hf_upload: bool,
|
| 770 |
upload_incomplete_shards: bool,
|
| 771 |
-
incomplete_shard_flush_seconds: float,
|
| 772 |
hf_repo_id: str,
|
| 773 |
hf_token: str,
|
| 774 |
hf_private_repo: bool,
|
|
@@ -783,7 +830,6 @@ def start_crawl_super(
|
|
| 783 |
shard_size_rows=shard_size_rows,
|
| 784 |
enable_hf_upload=enable_hf_upload,
|
| 785 |
upload_incomplete_shards=upload_incomplete_shards,
|
| 786 |
-
incomplete_shard_flush_seconds=incomplete_shard_flush_seconds,
|
| 787 |
hf_repo_id=hf_repo_id,
|
| 788 |
hf_token=hf_token,
|
| 789 |
hf_private_repo=hf_private_repo,
|
|
@@ -807,13 +853,6 @@ def toggle_hf_fields(enable_hf_upload: bool) -> tuple[Any, Any, Any, Any, Any]:
|
|
| 807 |
return update, update, update, update, update
|
| 808 |
|
| 809 |
|
| 810 |
-
def toggle_incomplete_flush_field(
|
| 811 |
-
enable_hf_upload: bool,
|
| 812 |
-
upload_incomplete_shards: bool,
|
| 813 |
-
) -> Any:
|
| 814 |
-
return gr.update(visible=bool(enable_hf_upload and upload_incomplete_shards))
|
| 815 |
-
|
| 816 |
-
|
| 817 |
def build_ui() -> gr.Blocks:
|
| 818 |
defaults = CrawlerConfig(
|
| 819 |
seed_urls=[
|
|
@@ -931,14 +970,6 @@ def build_ui() -> gr.Blocks:
|
|
| 931 |
value=False,
|
| 932 |
visible=False,
|
| 933 |
)
|
| 934 |
-
incomplete_shard_flush_seconds = gr.Slider(
|
| 935 |
-
label="Incomplete Upload Flush Interval (seconds)",
|
| 936 |
-
minimum=5,
|
| 937 |
-
maximum=300,
|
| 938 |
-
step=1,
|
| 939 |
-
value=int(defaults.incomplete_shard_flush_seconds),
|
| 940 |
-
visible=False,
|
| 941 |
-
)
|
| 942 |
|
| 943 |
with gr.Row():
|
| 944 |
start_button = gr.Button("Start Crawl (12 Threads)", variant="primary")
|
|
@@ -958,7 +989,6 @@ def build_ui() -> gr.Blocks:
|
|
| 958 |
shard_size_rows,
|
| 959 |
enable_hf_upload,
|
| 960 |
upload_incomplete_shards,
|
| 961 |
-
incomplete_shard_flush_seconds,
|
| 962 |
hf_repo_id,
|
| 963 |
hf_token,
|
| 964 |
hf_private_repo,
|
|
@@ -982,16 +1012,6 @@ def build_ui() -> gr.Blocks:
|
|
| 982 |
upload_incomplete_shards,
|
| 983 |
],
|
| 984 |
)
|
| 985 |
-
enable_hf_upload.change(
|
| 986 |
-
toggle_incomplete_flush_field,
|
| 987 |
-
inputs=[enable_hf_upload, upload_incomplete_shards],
|
| 988 |
-
outputs=[incomplete_shard_flush_seconds],
|
| 989 |
-
)
|
| 990 |
-
upload_incomplete_shards.change(
|
| 991 |
-
toggle_incomplete_flush_field,
|
| 992 |
-
inputs=[enable_hf_upload, upload_incomplete_shards],
|
| 993 |
-
outputs=[incomplete_shard_flush_seconds],
|
| 994 |
-
)
|
| 995 |
|
| 996 |
seed_urls_input.change(
|
| 997 |
fn=None,
|
|
@@ -1001,6 +1021,7 @@ def build_ui() -> gr.Blocks:
|
|
| 1001 |
)
|
| 1002 |
|
| 1003 |
theme_name.change(fn=None, inputs=theme_name, outputs=[], js=THEME_JS)
|
|
|
|
| 1004 |
demo.load(
|
| 1005 |
fn=None,
|
| 1006 |
inputs=[],
|
|
@@ -1014,6 +1035,8 @@ def build_ui() -> gr.Blocks:
|
|
| 1014 |
js=SEED_WIDGET_JS,
|
| 1015 |
)
|
| 1016 |
demo.load(fn=poll_dashboard, inputs=[], outputs=outputs)
|
|
|
|
|
|
|
| 1017 |
|
| 1018 |
timer = gr.Timer(value=1.0)
|
| 1019 |
timer.tick(fn=poll_dashboard, inputs=[], outputs=outputs)
|
|
|
|
| 267 |
font-size: 0.83rem;
|
| 268 |
padding: 0.24rem 0.3rem;
|
| 269 |
}
|
| 270 |
+
|
| 271 |
+
.setting-help-q {
|
| 272 |
+
display: inline-flex;
|
| 273 |
+
align-items: center;
|
| 274 |
+
justify-content: center;
|
| 275 |
+
width: 1.05rem;
|
| 276 |
+
height: 1.05rem;
|
| 277 |
+
margin-left: 0.42rem;
|
| 278 |
+
border: 1px solid var(--border);
|
| 279 |
+
border-radius: 999px;
|
| 280 |
+
color: var(--text-main);
|
| 281 |
+
background: color-mix(in srgb, var(--bg-panel) 90%, transparent);
|
| 282 |
+
font-size: 0.74rem;
|
| 283 |
+
font-weight: 700;
|
| 284 |
+
cursor: help;
|
| 285 |
+
line-height: 1;
|
| 286 |
+
vertical-align: middle;
|
| 287 |
+
}
|
| 288 |
"""
|
| 289 |
|
| 290 |
THEME_JS = """
|
|
|
|
| 346 |
}
|
| 347 |
"""
|
| 348 |
|
| 349 |
+
SETTING_HELP_JS = """
|
| 350 |
+
() => {
|
| 351 |
+
const helpByPrefix = [
|
| 352 |
+
["Theme", "Switch between visual color themes."],
|
| 353 |
+
["Seed URL List (one URL per line)", "Provide crawl entry points. Put one URL per line; duplicates are ignored."],
|
| 354 |
+
["Shard Size Rows", "Rows written per parquet shard before a full shard is emitted."],
|
| 355 |
+
["Max Links Per Page", "Maximum discovered links to enqueue from each parsed page."],
|
| 356 |
+
["Request Timeout (seconds)", "HTTP request timeout per URL."],
|
| 357 |
+
["Max Response Bytes", "Maximum response body bytes to read per page."],
|
| 358 |
+
["Upload shards to my HF repo", "Enable direct upload of produced shards to your Hugging Face Space repo."],
|
| 359 |
+
["HF Repo ID", "Target Hugging Face repo in owner/name format."],
|
| 360 |
+
["HF Token (write permissions)", "Token with write access to the target repo."],
|
| 361 |
+
["Private HF Repo", "Create the target repo as private if it does not exist."],
|
| 362 |
+
["HF Path Prefix", "Folder path inside the repo where shards are uploaded."],
|
| 363 |
+
["Upload incomplete shard buffers", "On crawl finish/stop, flush the current partial shard buffer and upload it too."],
|
| 364 |
+
];
|
| 365 |
+
|
| 366 |
+
const clean = (value) => String(value || "").replace(/\\s+/g, " ").trim();
|
| 367 |
+
const labels = document.querySelectorAll(".gradio-container label");
|
| 368 |
+
|
| 369 |
+
for (const label of labels) {
|
| 370 |
+
if (label.querySelector(".setting-help-q")) continue;
|
| 371 |
+
const text = clean(label.textContent);
|
| 372 |
+
const match = helpByPrefix.find(([prefix]) => text.startsWith(prefix));
|
| 373 |
+
if (!match) continue;
|
| 374 |
+
|
| 375 |
+
const q = document.createElement("span");
|
| 376 |
+
q.className = "setting-help-q";
|
| 377 |
+
q.textContent = "?";
|
| 378 |
+
q.title = match[1];
|
| 379 |
+
label.appendChild(q);
|
| 380 |
+
}
|
| 381 |
+
return [];
|
| 382 |
+
}
|
| 383 |
+
"""
|
| 384 |
+
|
| 385 |
|
| 386 |
def utc_now_iso() -> str:
|
| 387 |
return datetime.now(timezone.utc).isoformat(timespec="seconds")
|
|
|
|
| 515 |
shard_size_rows: int,
|
| 516 |
enable_hf_upload: bool,
|
| 517 |
upload_incomplete_shards: bool,
|
|
|
|
| 518 |
hf_repo_id: str,
|
| 519 |
hf_token: str,
|
| 520 |
hf_private_repo: bool,
|
|
|
|
| 533 |
output_dir=Path(__file__).resolve().parent / "shards",
|
| 534 |
enable_hf_upload=bool(enable_hf_upload),
|
| 535 |
upload_incomplete_shards=bool(upload_incomplete_shards),
|
|
|
|
| 536 |
hf_repo_id=hf_repo_id.strip(),
|
| 537 |
hf_token=hf_token.strip(),
|
| 538 |
hf_private_repo=bool(hf_private_repo),
|
|
|
|
| 751 |
shard_size_rows: int,
|
| 752 |
enable_hf_upload: bool,
|
| 753 |
upload_incomplete_shards: bool,
|
|
|
|
| 754 |
hf_repo_id: str,
|
| 755 |
hf_token: str,
|
| 756 |
hf_private_repo: bool,
|
|
|
|
| 765 |
shard_size_rows=shard_size_rows,
|
| 766 |
enable_hf_upload=enable_hf_upload,
|
| 767 |
upload_incomplete_shards=upload_incomplete_shards,
|
|
|
|
| 768 |
hf_repo_id=hf_repo_id,
|
| 769 |
hf_token=hf_token,
|
| 770 |
hf_private_repo=hf_private_repo,
|
|
|
|
| 787 |
shard_size_rows: int,
|
| 788 |
enable_hf_upload: bool,
|
| 789 |
upload_incomplete_shards: bool,
|
|
|
|
| 790 |
hf_repo_id: str,
|
| 791 |
hf_token: str,
|
| 792 |
hf_private_repo: bool,
|
|
|
|
| 801 |
shard_size_rows=shard_size_rows,
|
| 802 |
enable_hf_upload=enable_hf_upload,
|
| 803 |
upload_incomplete_shards=upload_incomplete_shards,
|
|
|
|
| 804 |
hf_repo_id=hf_repo_id,
|
| 805 |
hf_token=hf_token,
|
| 806 |
hf_private_repo=hf_private_repo,
|
|
|
|
| 816 |
shard_size_rows: int,
|
| 817 |
enable_hf_upload: bool,
|
| 818 |
upload_incomplete_shards: bool,
|
|
|
|
| 819 |
hf_repo_id: str,
|
| 820 |
hf_token: str,
|
| 821 |
hf_private_repo: bool,
|
|
|
|
| 830 |
shard_size_rows=shard_size_rows,
|
| 831 |
enable_hf_upload=enable_hf_upload,
|
| 832 |
upload_incomplete_shards=upload_incomplete_shards,
|
|
|
|
| 833 |
hf_repo_id=hf_repo_id,
|
| 834 |
hf_token=hf_token,
|
| 835 |
hf_private_repo=hf_private_repo,
|
|
|
|
| 853 |
return update, update, update, update, update
|
| 854 |
|
| 855 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 856 |
def build_ui() -> gr.Blocks:
|
| 857 |
defaults = CrawlerConfig(
|
| 858 |
seed_urls=[
|
|
|
|
| 970 |
value=False,
|
| 971 |
visible=False,
|
| 972 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 973 |
|
| 974 |
with gr.Row():
|
| 975 |
start_button = gr.Button("Start Crawl (12 Threads)", variant="primary")
|
|
|
|
| 989 |
shard_size_rows,
|
| 990 |
enable_hf_upload,
|
| 991 |
upload_incomplete_shards,
|
|
|
|
| 992 |
hf_repo_id,
|
| 993 |
hf_token,
|
| 994 |
hf_private_repo,
|
|
|
|
| 1012 |
upload_incomplete_shards,
|
| 1013 |
],
|
| 1014 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1015 |
|
| 1016 |
seed_urls_input.change(
|
| 1017 |
fn=None,
|
|
|
|
| 1021 |
)
|
| 1022 |
|
| 1023 |
theme_name.change(fn=None, inputs=theme_name, outputs=[], js=THEME_JS)
|
| 1024 |
+
demo.load(fn=None, inputs=[], outputs=[], js=SETTING_HELP_JS)
|
| 1025 |
demo.load(
|
| 1026 |
fn=None,
|
| 1027 |
inputs=[],
|
|
|
|
| 1035 |
js=SEED_WIDGET_JS,
|
| 1036 |
)
|
| 1037 |
demo.load(fn=poll_dashboard, inputs=[], outputs=outputs)
|
| 1038 |
+
enable_hf_upload.change(fn=None, inputs=[], outputs=[], js=SETTING_HELP_JS)
|
| 1039 |
+
upload_incomplete_shards.change(fn=None, inputs=[], outputs=[], js=SETTING_HELP_JS)
|
| 1040 |
|
| 1041 |
timer = gr.Timer(value=1.0)
|
| 1042 |
timer.tick(fn=poll_dashboard, inputs=[], outputs=outputs)
|