Roman190928 committed on
Commit
60eb7e9
·
verified ·
1 Parent(s): 6cfa53e

Add shard limit setting (1-15) to side panel and wire through crawl config

Browse files
Files changed (1) hide show
  1. app.py +18 -0
app.py CHANGED
@@ -37,6 +37,7 @@ import gradio as gr
37
 
38
  from crawler import (
39
  MAX_SHARD_ROWS,
 
40
  NORMAL_TOTAL_WORKERS,
41
  SUPER_TOTAL_WORKERS,
42
  AsyncCrawler,
@@ -352,6 +353,7 @@ SETTING_HELP_JS = """
352
  ["Theme", "Switch between visual color themes."],
353
  ["Seed URL List (one URL per line)", "Provide crawl entry points. Put one URL per line; duplicates are ignored."],
354
  ["Shard Size Rows", "Rows written per parquet shard before a full shard is emitted."],
 
355
  ["Max Links Per Page", "Maximum discovered links to enqueue from each parsed page."],
356
  ["Request Timeout (seconds)", "HTTP request timeout per URL."],
357
  ["Max Response Bytes", "Maximum response body bytes to read per page."],
@@ -533,6 +535,7 @@ def build_crawler_config(
533
  request_timeout_seconds: float,
534
  max_response_bytes: int,
535
  shard_size_rows: int,
 
536
  enable_hf_upload: bool,
537
  upload_incomplete_shards: bool,
538
  hf_repo_id: str,
@@ -550,6 +553,7 @@ def build_crawler_config(
550
  request_timeout_seconds=float(request_timeout_seconds),
551
  max_response_bytes=int(max_response_bytes),
552
  shard_size_rows=int(shard_size_rows),
 
553
  output_dir=Path(__file__).resolve().parent / "shards",
554
  enable_hf_upload=bool(enable_hf_upload),
555
  upload_incomplete_shards=bool(upload_incomplete_shards),
@@ -769,6 +773,7 @@ def _start_crawl(
769
  request_timeout_seconds: float,
770
  max_response_bytes: int,
771
  shard_size_rows: int,
 
772
  enable_hf_upload: bool,
773
  upload_incomplete_shards: bool,
774
  hf_repo_id: str,
@@ -783,6 +788,7 @@ def _start_crawl(
783
  request_timeout_seconds=request_timeout_seconds,
784
  max_response_bytes=max_response_bytes,
785
  shard_size_rows=shard_size_rows,
 
786
  enable_hf_upload=enable_hf_upload,
787
  upload_incomplete_shards=upload_incomplete_shards,
788
  hf_repo_id=hf_repo_id,
@@ -805,6 +811,7 @@ def start_crawl_standard(
805
  request_timeout_seconds: float,
806
  max_response_bytes: int,
807
  shard_size_rows: int,
 
808
  enable_hf_upload: bool,
809
  upload_incomplete_shards: bool,
810
  hf_repo_id: str,
@@ -819,6 +826,7 @@ def start_crawl_standard(
819
  request_timeout_seconds=request_timeout_seconds,
820
  max_response_bytes=max_response_bytes,
821
  shard_size_rows=shard_size_rows,
 
822
  enable_hf_upload=enable_hf_upload,
823
  upload_incomplete_shards=upload_incomplete_shards,
824
  hf_repo_id=hf_repo_id,
@@ -834,6 +842,7 @@ def start_crawl_super(
834
  request_timeout_seconds: float,
835
  max_response_bytes: int,
836
  shard_size_rows: int,
 
837
  enable_hf_upload: bool,
838
  upload_incomplete_shards: bool,
839
  hf_repo_id: str,
@@ -848,6 +857,7 @@ def start_crawl_super(
848
  request_timeout_seconds=request_timeout_seconds,
849
  max_response_bytes=max_response_bytes,
850
  shard_size_rows=shard_size_rows,
 
851
  enable_hf_upload=enable_hf_upload,
852
  upload_incomplete_shards=upload_incomplete_shards,
853
  hf_repo_id=hf_repo_id,
@@ -947,6 +957,13 @@ def build_ui() -> gr.Blocks:
947
  step=100,
948
  value=min(defaults.shard_size_rows, MAX_SHARD_ROWS),
949
  )
 
 
 
 
 
 
 
950
  max_links_per_page = gr.Slider(
951
  label="Max Links Per Page",
952
  minimum=10,
@@ -1017,6 +1034,7 @@ def build_ui() -> gr.Blocks:
1017
  request_timeout_seconds,
1018
  max_response_bytes,
1019
  shard_size_rows,
 
1020
  enable_hf_upload,
1021
  upload_incomplete_shards,
1022
  hf_repo_id,
 
37
 
38
  from crawler import (
39
  MAX_SHARD_ROWS,
40
+ MAX_SHARDS,
41
  NORMAL_TOTAL_WORKERS,
42
  SUPER_TOTAL_WORKERS,
43
  AsyncCrawler,
 
353
  ["Theme", "Switch between visual color themes."],
354
  ["Seed URL List (one URL per line)", "Provide crawl entry points. Put one URL per line; duplicates are ignored."],
355
  ["Shard Size Rows", "Rows written per parquet shard before a full shard is emitted."],
356
+ ["Shard Limit", "Maximum number of shards to produce for a run (1 to 15)."],
357
  ["Max Links Per Page", "Maximum discovered links to enqueue from each parsed page."],
358
  ["Request Timeout (seconds)", "HTTP request timeout per URL."],
359
  ["Max Response Bytes", "Maximum response body bytes to read per page."],
 
535
  request_timeout_seconds: float,
536
  max_response_bytes: int,
537
  shard_size_rows: int,
538
+ max_shards: int,
539
  enable_hf_upload: bool,
540
  upload_incomplete_shards: bool,
541
  hf_repo_id: str,
 
553
  request_timeout_seconds=float(request_timeout_seconds),
554
  max_response_bytes=int(max_response_bytes),
555
  shard_size_rows=int(shard_size_rows),
556
+ max_shards=int(max_shards),
557
  output_dir=Path(__file__).resolve().parent / "shards",
558
  enable_hf_upload=bool(enable_hf_upload),
559
  upload_incomplete_shards=bool(upload_incomplete_shards),
 
773
  request_timeout_seconds: float,
774
  max_response_bytes: int,
775
  shard_size_rows: int,
776
+ max_shards: int,
777
  enable_hf_upload: bool,
778
  upload_incomplete_shards: bool,
779
  hf_repo_id: str,
 
788
  request_timeout_seconds=request_timeout_seconds,
789
  max_response_bytes=max_response_bytes,
790
  shard_size_rows=shard_size_rows,
791
+ max_shards=max_shards,
792
  enable_hf_upload=enable_hf_upload,
793
  upload_incomplete_shards=upload_incomplete_shards,
794
  hf_repo_id=hf_repo_id,
 
811
  request_timeout_seconds: float,
812
  max_response_bytes: int,
813
  shard_size_rows: int,
814
+ max_shards: int,
815
  enable_hf_upload: bool,
816
  upload_incomplete_shards: bool,
817
  hf_repo_id: str,
 
826
  request_timeout_seconds=request_timeout_seconds,
827
  max_response_bytes=max_response_bytes,
828
  shard_size_rows=shard_size_rows,
829
+ max_shards=max_shards,
830
  enable_hf_upload=enable_hf_upload,
831
  upload_incomplete_shards=upload_incomplete_shards,
832
  hf_repo_id=hf_repo_id,
 
842
  request_timeout_seconds: float,
843
  max_response_bytes: int,
844
  shard_size_rows: int,
845
+ max_shards: int,
846
  enable_hf_upload: bool,
847
  upload_incomplete_shards: bool,
848
  hf_repo_id: str,
 
857
  request_timeout_seconds=request_timeout_seconds,
858
  max_response_bytes=max_response_bytes,
859
  shard_size_rows=shard_size_rows,
860
+ max_shards=max_shards,
861
  enable_hf_upload=enable_hf_upload,
862
  upload_incomplete_shards=upload_incomplete_shards,
863
  hf_repo_id=hf_repo_id,
 
957
  step=100,
958
  value=min(defaults.shard_size_rows, MAX_SHARD_ROWS),
959
  )
960
+ max_shards = gr.Slider(
961
+ label=f"Shard Limit (1-{MAX_SHARDS})",
962
+ minimum=1,
963
+ maximum=MAX_SHARDS,
964
+ step=1,
965
+ value=min(defaults.max_shards, MAX_SHARDS),
966
+ )
967
  max_links_per_page = gr.Slider(
968
  label="Max Links Per Page",
969
  minimum=10,
 
1034
  request_timeout_seconds,
1035
  max_response_bytes,
1036
  shard_size_rows,
1037
+ max_shards,
1038
  enable_hf_upload,
1039
  upload_incomplete_shards,
1040
  hf_repo_id,