Spaces:
Running
Running
Add shard limit setting (1-15) to side panel and wire through crawl config
Browse files
app.py
CHANGED
|
@@ -37,6 +37,7 @@ import gradio as gr
|
|
| 37 |
|
| 38 |
from crawler import (
|
| 39 |
MAX_SHARD_ROWS,
|
|
|
|
| 40 |
NORMAL_TOTAL_WORKERS,
|
| 41 |
SUPER_TOTAL_WORKERS,
|
| 42 |
AsyncCrawler,
|
|
@@ -352,6 +353,7 @@ SETTING_HELP_JS = """
|
|
| 352 |
["Theme", "Switch between visual color themes."],
|
| 353 |
["Seed URL List (one URL per line)", "Provide crawl entry points. Put one URL per line; duplicates are ignored."],
|
| 354 |
["Shard Size Rows", "Rows written per parquet shard before a full shard is emitted."],
|
|
|
|
| 355 |
["Max Links Per Page", "Maximum discovered links to enqueue from each parsed page."],
|
| 356 |
["Request Timeout (seconds)", "HTTP request timeout per URL."],
|
| 357 |
["Max Response Bytes", "Maximum response body bytes to read per page."],
|
|
@@ -533,6 +535,7 @@ def build_crawler_config(
|
|
| 533 |
request_timeout_seconds: float,
|
| 534 |
max_response_bytes: int,
|
| 535 |
shard_size_rows: int,
|
|
|
|
| 536 |
enable_hf_upload: bool,
|
| 537 |
upload_incomplete_shards: bool,
|
| 538 |
hf_repo_id: str,
|
|
@@ -550,6 +553,7 @@ def build_crawler_config(
|
|
| 550 |
request_timeout_seconds=float(request_timeout_seconds),
|
| 551 |
max_response_bytes=int(max_response_bytes),
|
| 552 |
shard_size_rows=int(shard_size_rows),
|
|
|
|
| 553 |
output_dir=Path(__file__).resolve().parent / "shards",
|
| 554 |
enable_hf_upload=bool(enable_hf_upload),
|
| 555 |
upload_incomplete_shards=bool(upload_incomplete_shards),
|
|
@@ -769,6 +773,7 @@ def _start_crawl(
|
|
| 769 |
request_timeout_seconds: float,
|
| 770 |
max_response_bytes: int,
|
| 771 |
shard_size_rows: int,
|
|
|
|
| 772 |
enable_hf_upload: bool,
|
| 773 |
upload_incomplete_shards: bool,
|
| 774 |
hf_repo_id: str,
|
|
@@ -783,6 +788,7 @@ def _start_crawl(
|
|
| 783 |
request_timeout_seconds=request_timeout_seconds,
|
| 784 |
max_response_bytes=max_response_bytes,
|
| 785 |
shard_size_rows=shard_size_rows,
|
|
|
|
| 786 |
enable_hf_upload=enable_hf_upload,
|
| 787 |
upload_incomplete_shards=upload_incomplete_shards,
|
| 788 |
hf_repo_id=hf_repo_id,
|
|
@@ -805,6 +811,7 @@ def start_crawl_standard(
|
|
| 805 |
request_timeout_seconds: float,
|
| 806 |
max_response_bytes: int,
|
| 807 |
shard_size_rows: int,
|
|
|
|
| 808 |
enable_hf_upload: bool,
|
| 809 |
upload_incomplete_shards: bool,
|
| 810 |
hf_repo_id: str,
|
|
@@ -819,6 +826,7 @@ def start_crawl_standard(
|
|
| 819 |
request_timeout_seconds=request_timeout_seconds,
|
| 820 |
max_response_bytes=max_response_bytes,
|
| 821 |
shard_size_rows=shard_size_rows,
|
|
|
|
| 822 |
enable_hf_upload=enable_hf_upload,
|
| 823 |
upload_incomplete_shards=upload_incomplete_shards,
|
| 824 |
hf_repo_id=hf_repo_id,
|
|
@@ -834,6 +842,7 @@ def start_crawl_super(
|
|
| 834 |
request_timeout_seconds: float,
|
| 835 |
max_response_bytes: int,
|
| 836 |
shard_size_rows: int,
|
|
|
|
| 837 |
enable_hf_upload: bool,
|
| 838 |
upload_incomplete_shards: bool,
|
| 839 |
hf_repo_id: str,
|
|
@@ -848,6 +857,7 @@ def start_crawl_super(
|
|
| 848 |
request_timeout_seconds=request_timeout_seconds,
|
| 849 |
max_response_bytes=max_response_bytes,
|
| 850 |
shard_size_rows=shard_size_rows,
|
|
|
|
| 851 |
enable_hf_upload=enable_hf_upload,
|
| 852 |
upload_incomplete_shards=upload_incomplete_shards,
|
| 853 |
hf_repo_id=hf_repo_id,
|
|
@@ -947,6 +957,13 @@ def build_ui() -> gr.Blocks:
|
|
| 947 |
step=100,
|
| 948 |
value=min(defaults.shard_size_rows, MAX_SHARD_ROWS),
|
| 949 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 950 |
max_links_per_page = gr.Slider(
|
| 951 |
label="Max Links Per Page",
|
| 952 |
minimum=10,
|
|
@@ -1017,6 +1034,7 @@ def build_ui() -> gr.Blocks:
|
|
| 1017 |
request_timeout_seconds,
|
| 1018 |
max_response_bytes,
|
| 1019 |
shard_size_rows,
|
|
|
|
| 1020 |
enable_hf_upload,
|
| 1021 |
upload_incomplete_shards,
|
| 1022 |
hf_repo_id,
|
|
|
|
| 37 |
|
| 38 |
from crawler import (
|
| 39 |
MAX_SHARD_ROWS,
|
| 40 |
+
MAX_SHARDS,
|
| 41 |
NORMAL_TOTAL_WORKERS,
|
| 42 |
SUPER_TOTAL_WORKERS,
|
| 43 |
AsyncCrawler,
|
|
|
|
| 353 |
["Theme", "Switch between visual color themes."],
|
| 354 |
["Seed URL List (one URL per line)", "Provide crawl entry points. Put one URL per line; duplicates are ignored."],
|
| 355 |
["Shard Size Rows", "Rows written per parquet shard before a full shard is emitted."],
|
| 356 |
+
["Shard Limit", "Maximum number of shards to produce for a run (1 to 15)."],
|
| 357 |
["Max Links Per Page", "Maximum discovered links to enqueue from each parsed page."],
|
| 358 |
["Request Timeout (seconds)", "HTTP request timeout per URL."],
|
| 359 |
["Max Response Bytes", "Maximum response body bytes to read per page."],
|
|
|
|
| 535 |
request_timeout_seconds: float,
|
| 536 |
max_response_bytes: int,
|
| 537 |
shard_size_rows: int,
|
| 538 |
+
max_shards: int,
|
| 539 |
enable_hf_upload: bool,
|
| 540 |
upload_incomplete_shards: bool,
|
| 541 |
hf_repo_id: str,
|
|
|
|
| 553 |
request_timeout_seconds=float(request_timeout_seconds),
|
| 554 |
max_response_bytes=int(max_response_bytes),
|
| 555 |
shard_size_rows=int(shard_size_rows),
|
| 556 |
+
max_shards=int(max_shards),
|
| 557 |
output_dir=Path(__file__).resolve().parent / "shards",
|
| 558 |
enable_hf_upload=bool(enable_hf_upload),
|
| 559 |
upload_incomplete_shards=bool(upload_incomplete_shards),
|
|
|
|
| 773 |
request_timeout_seconds: float,
|
| 774 |
max_response_bytes: int,
|
| 775 |
shard_size_rows: int,
|
| 776 |
+
max_shards: int,
|
| 777 |
enable_hf_upload: bool,
|
| 778 |
upload_incomplete_shards: bool,
|
| 779 |
hf_repo_id: str,
|
|
|
|
| 788 |
request_timeout_seconds=request_timeout_seconds,
|
| 789 |
max_response_bytes=max_response_bytes,
|
| 790 |
shard_size_rows=shard_size_rows,
|
| 791 |
+
max_shards=max_shards,
|
| 792 |
enable_hf_upload=enable_hf_upload,
|
| 793 |
upload_incomplete_shards=upload_incomplete_shards,
|
| 794 |
hf_repo_id=hf_repo_id,
|
|
|
|
| 811 |
request_timeout_seconds: float,
|
| 812 |
max_response_bytes: int,
|
| 813 |
shard_size_rows: int,
|
| 814 |
+
max_shards: int,
|
| 815 |
enable_hf_upload: bool,
|
| 816 |
upload_incomplete_shards: bool,
|
| 817 |
hf_repo_id: str,
|
|
|
|
| 826 |
request_timeout_seconds=request_timeout_seconds,
|
| 827 |
max_response_bytes=max_response_bytes,
|
| 828 |
shard_size_rows=shard_size_rows,
|
| 829 |
+
max_shards=max_shards,
|
| 830 |
enable_hf_upload=enable_hf_upload,
|
| 831 |
upload_incomplete_shards=upload_incomplete_shards,
|
| 832 |
hf_repo_id=hf_repo_id,
|
|
|
|
| 842 |
request_timeout_seconds: float,
|
| 843 |
max_response_bytes: int,
|
| 844 |
shard_size_rows: int,
|
| 845 |
+
max_shards: int,
|
| 846 |
enable_hf_upload: bool,
|
| 847 |
upload_incomplete_shards: bool,
|
| 848 |
hf_repo_id: str,
|
|
|
|
| 857 |
request_timeout_seconds=request_timeout_seconds,
|
| 858 |
max_response_bytes=max_response_bytes,
|
| 859 |
shard_size_rows=shard_size_rows,
|
| 860 |
+
max_shards=max_shards,
|
| 861 |
enable_hf_upload=enable_hf_upload,
|
| 862 |
upload_incomplete_shards=upload_incomplete_shards,
|
| 863 |
hf_repo_id=hf_repo_id,
|
|
|
|
| 957 |
step=100,
|
| 958 |
value=min(defaults.shard_size_rows, MAX_SHARD_ROWS),
|
| 959 |
)
|
| 960 |
+
max_shards = gr.Slider(
|
| 961 |
+
label=f"Shard Limit (1-{MAX_SHARDS})",
|
| 962 |
+
minimum=1,
|
| 963 |
+
maximum=MAX_SHARDS,
|
| 964 |
+
step=1,
|
| 965 |
+
value=min(defaults.max_shards, MAX_SHARDS),
|
| 966 |
+
)
|
| 967 |
max_links_per_page = gr.Slider(
|
| 968 |
label="Max Links Per Page",
|
| 969 |
minimum=10,
|
|
|
|
| 1034 |
request_timeout_seconds,
|
| 1035 |
max_response_bytes,
|
| 1036 |
shard_size_rows,
|
| 1037 |
+
max_shards,
|
| 1038 |
enable_hf_upload,
|
| 1039 |
upload_incomplete_shards,
|
| 1040 |
hf_repo_id,
|