Roman190928 committed on
Commit
b18026c
·
verified ·
1 Parent(s): 378a0c0

Fix seed URL input: textbox + accurate stats parsing

Browse files
Files changed (1) hide show
  1. app.py +56 -58
app.py CHANGED
@@ -278,22 +278,13 @@ THEME_JS = """
278
  """
279
 
280
  SEED_WIDGET_JS = """
281
- (seed_rows) => {
282
- const parseRows = (rows) => {
283
- if (!Array.isArray(rows)) return [];
284
- const out = [];
285
- for (const row of rows) {
286
- let value = "";
287
- if (Array.isArray(row)) {
288
- value = String(row[0] ?? "").trim();
289
- } else if (row && typeof row === "object") {
290
- value = String(Object.values(row)[0] ?? "").trim();
291
- } else if (row !== null && row !== undefined) {
292
- value = String(row).trim();
293
- }
294
- if (value) out.push(value);
295
- }
296
- return out;
297
  };
298
 
299
  const dedupe = (values) => {
@@ -323,7 +314,7 @@ SEED_WIDGET_JS = """
323
  .replaceAll('"', "&quot;")
324
  .replaceAll("'", "&#39;")
325
 
326
- const seeds = dedupe(parseRows(seed_rows));
327
  const domainSet = new Set(seeds.map(domainOf).filter(Boolean));
328
  const chips = seeds.length
329
  ? seeds.slice(0, 12).map((url) => `<span class=\"seed-chip\">${escapeHtml(url)}</span>`).join("")
@@ -331,8 +322,9 @@ SEED_WIDGET_JS = """
331
  const overflow = seeds.length > 12
332
  ? `<span class=\"seed-overflow\">+${seeds.length - 12} more</span>`
333
  : "";
 
334
 
335
- return `<div class=\"seed-widget\"><div class=\"seed-stats\"><span><strong>${seeds.length}</strong> seeds</span><span><strong>${domainSet.size}</strong> domains</span><span><strong>${seeds.slice(0, 1).join("").length || 0}</strong> first-url chars</span></div><div class=\"seed-chip-wrap\">${chips}${overflow}</div></div>`;
336
  }
337
  """
338
 
@@ -348,32 +340,40 @@ def safe_queue_size(queue: Any) -> int:
348
  return -1
349
 
350
 
351
- def parse_seed_url_rows(rows: Any) -> list[str]:
352
- if rows is None:
353
  return []
354
 
355
- if isinstance(rows, (list, tuple)):
356
- rows_iterable: list[Any] = list(rows)
357
- elif hasattr(rows, "values"):
 
 
 
358
  try:
359
- rows_iterable = rows.values.tolist() # pandas.DataFrame path
360
  except Exception:
361
  rows_iterable = []
362
  else:
363
- rows_iterable = [rows]
364
 
365
  items: list[str] = []
366
  for row in rows_iterable:
367
- value = ""
368
  if isinstance(row, dict):
369
- value = str(next(iter(row.values()), "") or "").strip()
370
  elif isinstance(row, (list, tuple)):
371
- value = str(row[0] if row else "").strip()
372
- elif row is not None:
373
- value = str(row).strip()
374
-
375
- if value:
376
- items.append(value)
 
 
 
 
 
377
  return items
378
 
379
 
@@ -388,12 +388,12 @@ def unique_preserve_order(values: list[str]) -> list[str]:
388
  return out
389
 
390
 
391
- def collect_seed_urls(seed_urls_table: Any) -> list[str]:
392
- return unique_preserve_order(parse_seed_url_rows(seed_urls_table))
393
 
394
 
395
- def render_seed_widget_html(seed_urls_table: Any) -> str:
396
- seeds = collect_seed_urls(seed_urls_table)
397
  domains = {(urlsplit(u).hostname or "").lower().strip(".") for u in seeds}
398
  domains = {d for d in domains if d}
399
 
@@ -454,7 +454,7 @@ def validate_hf_requirements(enable_hf_upload: bool, hf_repo_id: str, hf_token:
454
 
455
  def build_crawler_config(
456
  *,
457
- seed_urls_table: Any,
458
  max_links_per_page: int,
459
  request_timeout_seconds: float,
460
  max_response_bytes: int,
@@ -469,7 +469,7 @@ def build_crawler_config(
469
  total_workers: int,
470
  ) -> CrawlerConfig:
471
  validate_hf_requirements(enable_hf_upload, hf_repo_id, hf_token)
472
- seed_urls = collect_seed_urls(seed_urls_table)
473
 
474
  return CrawlerConfig(
475
  seed_urls=seed_urls,
@@ -692,7 +692,7 @@ def _format_dashboard_response(
692
  def _start_crawl(
693
  *,
694
  total_workers: int,
695
- seed_urls_table: Any,
696
  max_links_per_page: int,
697
  request_timeout_seconds: float,
698
  max_response_bytes: int,
@@ -707,7 +707,7 @@ def _start_crawl(
707
  ) -> tuple[str, str, str, str]:
708
  try:
709
  config = build_crawler_config(
710
- seed_urls_table=seed_urls_table,
711
  max_links_per_page=max_links_per_page,
712
  request_timeout_seconds=request_timeout_seconds,
713
  max_response_bytes=max_response_bytes,
@@ -730,7 +730,7 @@ def _start_crawl(
730
 
731
 
732
  def start_crawl_standard(
733
- seed_urls_table: Any,
734
  max_links_per_page: int,
735
  request_timeout_seconds: float,
736
  max_response_bytes: int,
@@ -745,7 +745,7 @@ def start_crawl_standard(
745
  ) -> tuple[str, str, str, str]:
746
  return _start_crawl(
747
  total_workers=NORMAL_TOTAL_WORKERS,
748
- seed_urls_table=seed_urls_table,
749
  max_links_per_page=max_links_per_page,
750
  request_timeout_seconds=request_timeout_seconds,
751
  max_response_bytes=max_response_bytes,
@@ -761,7 +761,7 @@ def start_crawl_standard(
761
 
762
 
763
  def start_crawl_super(
764
- seed_urls_table: Any,
765
  max_links_per_page: int,
766
  request_timeout_seconds: float,
767
  max_response_bytes: int,
@@ -776,7 +776,7 @@ def start_crawl_super(
776
  ) -> tuple[str, str, str, str]:
777
  return _start_crawl(
778
  total_workers=SUPER_TOTAL_WORKERS,
779
- seed_urls_table=seed_urls_table,
780
  max_links_per_page=max_links_per_page,
781
  request_timeout_seconds=request_timeout_seconds,
782
  max_response_bytes=max_response_bytes,
@@ -823,7 +823,7 @@ def build_ui() -> gr.Blocks:
823
  "https://www.nasa.gov/",
824
  ]
825
  )
826
- default_seed_rows = [[url] for url in defaults.seed_urls]
827
 
828
  with gr.Blocks(title="DataMuncherLabs AutoWS") as demo:
829
  gr.Markdown("# DataMuncherLabs AutoWS")
@@ -854,18 +854,16 @@ def build_ui() -> gr.Blocks:
854
 
855
  with gr.Row():
856
  with gr.Column(scale=2):
857
- seed_urls_table = gr.Dataframe(
858
- headers=["seed_url"],
859
- datatype=["str"],
860
- type="array",
861
- row_count=(8, "dynamic"),
862
- value=default_seed_rows,
863
  interactive=True,
864
- label="Seed URL List (editable)",
 
865
  )
866
  seed_widget_html = gr.HTML(
867
  label="Seed URL Summary",
868
- value=render_seed_widget_html(default_seed_rows),
869
  )
870
  token_widget_html = gr.HTML(
871
  label="Live Tokenization",
@@ -953,7 +951,7 @@ def build_ui() -> gr.Blocks:
953
  logs_box = gr.Textbox(label="Run Log", lines=12, interactive=False)
954
 
955
  start_inputs = [
956
- seed_urls_table,
957
  max_links_per_page,
958
  request_timeout_seconds,
959
  max_response_bytes,
@@ -995,9 +993,9 @@ def build_ui() -> gr.Blocks:
995
  outputs=[incomplete_shard_flush_seconds],
996
  )
997
 
998
- seed_urls_table.change(
999
  fn=None,
1000
- inputs=[seed_urls_table],
1001
  outputs=[seed_widget_html],
1002
  js=SEED_WIDGET_JS,
1003
  )
@@ -1011,7 +1009,7 @@ def build_ui() -> gr.Blocks:
1011
  )
1012
  demo.load(
1013
  fn=None,
1014
- inputs=[seed_urls_table],
1015
  outputs=[seed_widget_html],
1016
  js=SEED_WIDGET_JS,
1017
  )
 
278
  """
279
 
280
  SEED_WIDGET_JS = """
281
+ (seed_text) => {
282
+ const parseSeedText = (value) => {
283
+ if (typeof value !== "string") return [];
284
+ return value
285
+ .split(/\\r?\\n/)
286
+ .map((line) => line.trim())
287
+ .filter((line) => line.length > 0);
 
 
 
 
 
 
 
 
 
288
  };
289
 
290
  const dedupe = (values) => {
 
314
  .replaceAll('"', "&quot;")
315
  .replaceAll("'", "&#39;");
316
 
317
+ const seeds = dedupe(parseSeedText(seed_text));
318
  const domainSet = new Set(seeds.map(domainOf).filter(Boolean));
319
  const chips = seeds.length
320
  ? seeds.slice(0, 12).map((url) => `<span class=\"seed-chip\">${escapeHtml(url)}</span>`).join("")
 
322
  const overflow = seeds.length > 12
323
  ? `<span class=\"seed-overflow\">+${seeds.length - 12} more</span>`
324
  : "";
325
+ const firstUrlChars = seeds.length ? seeds[0].length : 0;
326
 
327
+ return `<div class=\"seed-widget\"><div class=\"seed-stats\"><span><strong>${seeds.length}</strong> seeds</span><span><strong>${domainSet.size}</strong> domains</span><span><strong>${firstUrlChars}</strong> first-url chars</span></div><div class=\"seed-chip-wrap\">${chips}${overflow}</div></div>`;
328
  }
329
  """
330
 
 
340
  return -1
341
 
342
 
343
def parse_seed_url_rows(seed_urls_input: Any) -> list[str]:
    """Normalize any supported seed-URL widget value into a flat list of URL strings.

    Accepted shapes:
      * ``None`` -> empty list.
      * a multi-line string (the Textbox path: one URL per line).
      * a list/tuple of rows, where each row may be a dict (first value used),
        a list/tuple (first cell used), or a scalar.
      * a mapping (its values are parsed as rows).
      * an object exposing ``.values.tolist()`` (pandas.DataFrame path);
        parse failures fall back to no rows, best-effort.
      * any other scalar, treated as a single row.

    Each extracted cell is split on newlines and stripped; blank entries are
    dropped. Duplicates are NOT removed here — see ``unique_preserve_order``.
    """
    if seed_urls_input is None:
        return []

    # Fast path for the Textbox widget: one URL per line.
    if isinstance(seed_urls_input, str):
        return [line.strip() for line in seed_urls_input.splitlines() if line.strip()]

    if isinstance(seed_urls_input, (list, tuple)):
        rows_iterable: list[Any] = list(seed_urls_input)
    elif isinstance(seed_urls_input, Mapping):
        # Fix: a plain dict used to fall into the hasattr(..., "values")
        # branch below, raise on dict.values.tolist(), and silently yield [].
        rows_iterable = list(seed_urls_input.values())
    elif hasattr(seed_urls_input, "values"):
        try:
            rows_iterable = seed_urls_input.values.tolist()  # pandas.DataFrame path
        except Exception:
            rows_iterable = []  # best-effort: unknown table-like object
    else:
        rows_iterable = [seed_urls_input]

    items: list[str] = []
    for row in rows_iterable:
        # Pull the single cell of interest out of whatever shape the row has.
        if isinstance(row, dict):
            source = next(iter(row.values()), "")
        elif isinstance(row, (list, tuple)):
            source = row[0] if row else ""
        else:
            source = row

        if source is None:
            continue
        # A single cell may itself contain several newline-separated URLs.
        for line in str(source).splitlines():
            value = line.strip()
            if value:
                items.append(value)
    return items
378
 
379
 
 
388
  return out
389
 
390
 
391
def collect_seed_urls(seed_urls_input: Any) -> list[str]:
    """Parse the raw seed-URL widget value and deduplicate it, keeping first-seen order."""
    parsed = parse_seed_url_rows(seed_urls_input)
    return unique_preserve_order(parsed)
393
 
394
 
395
+ def render_seed_widget_html(seed_urls_input: Any) -> str:
396
+ seeds = collect_seed_urls(seed_urls_input)
397
  domains = {(urlsplit(u).hostname or "").lower().strip(".") for u in seeds}
398
  domains = {d for d in domains if d}
399
 
 
454
 
455
  def build_crawler_config(
456
  *,
457
+ seed_urls_input: Any,
458
  max_links_per_page: int,
459
  request_timeout_seconds: float,
460
  max_response_bytes: int,
 
469
  total_workers: int,
470
  ) -> CrawlerConfig:
471
  validate_hf_requirements(enable_hf_upload, hf_repo_id, hf_token)
472
+ seed_urls = collect_seed_urls(seed_urls_input)
473
 
474
  return CrawlerConfig(
475
  seed_urls=seed_urls,
 
692
  def _start_crawl(
693
  *,
694
  total_workers: int,
695
+ seed_urls_input: Any,
696
  max_links_per_page: int,
697
  request_timeout_seconds: float,
698
  max_response_bytes: int,
 
707
  ) -> tuple[str, str, str, str]:
708
  try:
709
  config = build_crawler_config(
710
+ seed_urls_input=seed_urls_input,
711
  max_links_per_page=max_links_per_page,
712
  request_timeout_seconds=request_timeout_seconds,
713
  max_response_bytes=max_response_bytes,
 
730
 
731
 
732
  def start_crawl_standard(
733
+ seed_urls_input: Any,
734
  max_links_per_page: int,
735
  request_timeout_seconds: float,
736
  max_response_bytes: int,
 
745
  ) -> tuple[str, str, str, str]:
746
  return _start_crawl(
747
  total_workers=NORMAL_TOTAL_WORKERS,
748
+ seed_urls_input=seed_urls_input,
749
  max_links_per_page=max_links_per_page,
750
  request_timeout_seconds=request_timeout_seconds,
751
  max_response_bytes=max_response_bytes,
 
761
 
762
 
763
  def start_crawl_super(
764
+ seed_urls_input: Any,
765
  max_links_per_page: int,
766
  request_timeout_seconds: float,
767
  max_response_bytes: int,
 
776
  ) -> tuple[str, str, str, str]:
777
  return _start_crawl(
778
  total_workers=SUPER_TOTAL_WORKERS,
779
+ seed_urls_input=seed_urls_input,
780
  max_links_per_page=max_links_per_page,
781
  request_timeout_seconds=request_timeout_seconds,
782
  max_response_bytes=max_response_bytes,
 
823
  "https://www.nasa.gov/",
824
  ]
825
  )
826
+ default_seed_text = "\n".join(defaults.seed_urls)
827
 
828
  with gr.Blocks(title="DataMuncherLabs AutoWS") as demo:
829
  gr.Markdown("# DataMuncherLabs AutoWS")
 
854
 
855
  with gr.Row():
856
  with gr.Column(scale=2):
857
+ seed_urls_input = gr.Textbox(
858
+ lines=10,
859
+ value=default_seed_text,
 
 
 
860
  interactive=True,
861
+ label="Seed URL List (one URL per line)",
862
+ placeholder="https://example.com",
863
  )
864
  seed_widget_html = gr.HTML(
865
  label="Seed URL Summary",
866
+ value=render_seed_widget_html(default_seed_text),
867
  )
868
  token_widget_html = gr.HTML(
869
  label="Live Tokenization",
 
951
  logs_box = gr.Textbox(label="Run Log", lines=12, interactive=False)
952
 
953
  start_inputs = [
954
+ seed_urls_input,
955
  max_links_per_page,
956
  request_timeout_seconds,
957
  max_response_bytes,
 
993
  outputs=[incomplete_shard_flush_seconds],
994
  )
995
 
996
+ seed_urls_input.change(
997
  fn=None,
998
+ inputs=[seed_urls_input],
999
  outputs=[seed_widget_html],
1000
  js=SEED_WIDGET_JS,
1001
  )
 
1009
  )
1010
  demo.load(
1011
  fn=None,
1012
+ inputs=[seed_urls_input],
1013
  outputs=[seed_widget_html],
1014
  js=SEED_WIDGET_JS,
1015
  )