Spaces:
Running
Running
Fix seed URL input: textbox + accurate stats parsing
Browse files
app.py
CHANGED
|
@@ -278,22 +278,13 @@ THEME_JS = """
|
|
| 278 |
"""
|
| 279 |
|
| 280 |
SEED_WIDGET_JS = """
|
| 281 |
-
(
|
| 282 |
-
const
|
| 283 |
-
if (
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
value = String(row[0] ?? "").trim();
|
| 289 |
-
} else if (row && typeof row === "object") {
|
| 290 |
-
value = String(Object.values(row)[0] ?? "").trim();
|
| 291 |
-
} else if (row !== null && row !== undefined) {
|
| 292 |
-
value = String(row).trim();
|
| 293 |
-
}
|
| 294 |
-
if (value) out.push(value);
|
| 295 |
-
}
|
| 296 |
-
return out;
|
| 297 |
};
|
| 298 |
|
| 299 |
const dedupe = (values) => {
|
|
@@ -323,7 +314,7 @@ SEED_WIDGET_JS = """
|
|
| 323 |
.replaceAll('"', "&quot;")
|
| 324 |
.replaceAll("'", "&#39;");
|
| 325 |
|
| 326 |
-
const seeds = dedupe(
|
| 327 |
const domainSet = new Set(seeds.map(domainOf).filter(Boolean));
|
| 328 |
const chips = seeds.length
|
| 329 |
? seeds.slice(0, 12).map((url) => `<span class=\"seed-chip\">${escapeHtml(url)}</span>`).join("")
|
|
@@ -331,8 +322,9 @@ SEED_WIDGET_JS = """
|
|
| 331 |
const overflow = seeds.length > 12
|
| 332 |
? `<span class=\"seed-overflow\">+${seeds.length - 12} more</span>`
|
| 333 |
: "";
|
|
|
|
| 334 |
|
| 335 |
-
return `<div class=\"seed-widget\"><div class=\"seed-stats\"><span><strong>${seeds.length}</strong> seeds</span><span><strong>${domainSet.size}</strong> domains</span><span><strong>${
|
| 336 |
}
|
| 337 |
"""
|
| 338 |
|
|
@@ -348,32 +340,40 @@ def safe_queue_size(queue: Any) -> int:
|
|
| 348 |
return -1
|
| 349 |
|
| 350 |
|
| 351 |
-
def parse_seed_url_rows(
|
| 352 |
-
if
|
| 353 |
return []
|
| 354 |
|
| 355 |
-
if isinstance(
|
| 356 |
-
|
| 357 |
-
|
|
|
|
|
|
|
|
|
|
| 358 |
try:
|
| 359 |
-
rows_iterable =
|
| 360 |
except Exception:
|
| 361 |
rows_iterable = []
|
| 362 |
else:
|
| 363 |
-
rows_iterable = [
|
| 364 |
|
| 365 |
items: list[str] = []
|
| 366 |
for row in rows_iterable:
|
| 367 |
-
|
| 368 |
if isinstance(row, dict):
|
| 369 |
-
|
| 370 |
elif isinstance(row, (list, tuple)):
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
return items
|
| 378 |
|
| 379 |
|
|
@@ -388,12 +388,12 @@ def unique_preserve_order(values: list[str]) -> list[str]:
|
|
| 388 |
return out
|
| 389 |
|
| 390 |
|
| 391 |
-
def collect_seed_urls(
|
| 392 |
-
return unique_preserve_order(parse_seed_url_rows(
|
| 393 |
|
| 394 |
|
| 395 |
-
def render_seed_widget_html(
|
| 396 |
-
seeds = collect_seed_urls(
|
| 397 |
domains = {(urlsplit(u).hostname or "").lower().strip(".") for u in seeds}
|
| 398 |
domains = {d for d in domains if d}
|
| 399 |
|
|
@@ -454,7 +454,7 @@ def validate_hf_requirements(enable_hf_upload: bool, hf_repo_id: str, hf_token:
|
|
| 454 |
|
| 455 |
def build_crawler_config(
|
| 456 |
*,
|
| 457 |
-
|
| 458 |
max_links_per_page: int,
|
| 459 |
request_timeout_seconds: float,
|
| 460 |
max_response_bytes: int,
|
|
@@ -469,7 +469,7 @@ def build_crawler_config(
|
|
| 469 |
total_workers: int,
|
| 470 |
) -> CrawlerConfig:
|
| 471 |
validate_hf_requirements(enable_hf_upload, hf_repo_id, hf_token)
|
| 472 |
-
seed_urls = collect_seed_urls(
|
| 473 |
|
| 474 |
return CrawlerConfig(
|
| 475 |
seed_urls=seed_urls,
|
|
@@ -692,7 +692,7 @@ def _format_dashboard_response(
|
|
| 692 |
def _start_crawl(
|
| 693 |
*,
|
| 694 |
total_workers: int,
|
| 695 |
-
|
| 696 |
max_links_per_page: int,
|
| 697 |
request_timeout_seconds: float,
|
| 698 |
max_response_bytes: int,
|
|
@@ -707,7 +707,7 @@ def _start_crawl(
|
|
| 707 |
) -> tuple[str, str, str, str]:
|
| 708 |
try:
|
| 709 |
config = build_crawler_config(
|
| 710 |
-
|
| 711 |
max_links_per_page=max_links_per_page,
|
| 712 |
request_timeout_seconds=request_timeout_seconds,
|
| 713 |
max_response_bytes=max_response_bytes,
|
|
@@ -730,7 +730,7 @@ def _start_crawl(
|
|
| 730 |
|
| 731 |
|
| 732 |
def start_crawl_standard(
|
| 733 |
-
|
| 734 |
max_links_per_page: int,
|
| 735 |
request_timeout_seconds: float,
|
| 736 |
max_response_bytes: int,
|
|
@@ -745,7 +745,7 @@ def start_crawl_standard(
|
|
| 745 |
) -> tuple[str, str, str, str]:
|
| 746 |
return _start_crawl(
|
| 747 |
total_workers=NORMAL_TOTAL_WORKERS,
|
| 748 |
-
|
| 749 |
max_links_per_page=max_links_per_page,
|
| 750 |
request_timeout_seconds=request_timeout_seconds,
|
| 751 |
max_response_bytes=max_response_bytes,
|
|
@@ -761,7 +761,7 @@ def start_crawl_standard(
|
|
| 761 |
|
| 762 |
|
| 763 |
def start_crawl_super(
|
| 764 |
-
|
| 765 |
max_links_per_page: int,
|
| 766 |
request_timeout_seconds: float,
|
| 767 |
max_response_bytes: int,
|
|
@@ -776,7 +776,7 @@ def start_crawl_super(
|
|
| 776 |
) -> tuple[str, str, str, str]:
|
| 777 |
return _start_crawl(
|
| 778 |
total_workers=SUPER_TOTAL_WORKERS,
|
| 779 |
-
|
| 780 |
max_links_per_page=max_links_per_page,
|
| 781 |
request_timeout_seconds=request_timeout_seconds,
|
| 782 |
max_response_bytes=max_response_bytes,
|
|
@@ -823,7 +823,7 @@ def build_ui() -> gr.Blocks:
|
|
| 823 |
"https://www.nasa.gov/",
|
| 824 |
]
|
| 825 |
)
|
| 826 |
-
|
| 827 |
|
| 828 |
with gr.Blocks(title="DataMuncherLabs AutoWS") as demo:
|
| 829 |
gr.Markdown("# DataMuncherLabs AutoWS")
|
|
@@ -854,18 +854,16 @@ def build_ui() -> gr.Blocks:
|
|
| 854 |
|
| 855 |
with gr.Row():
|
| 856 |
with gr.Column(scale=2):
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
type="array",
|
| 861 |
-
row_count=(8, "dynamic"),
|
| 862 |
-
value=default_seed_rows,
|
| 863 |
interactive=True,
|
| 864 |
-
label="Seed URL List (
|
|
|
|
| 865 |
)
|
| 866 |
seed_widget_html = gr.HTML(
|
| 867 |
label="Seed URL Summary",
|
| 868 |
-
value=render_seed_widget_html(
|
| 869 |
)
|
| 870 |
token_widget_html = gr.HTML(
|
| 871 |
label="Live Tokenization",
|
|
@@ -953,7 +951,7 @@ def build_ui() -> gr.Blocks:
|
|
| 953 |
logs_box = gr.Textbox(label="Run Log", lines=12, interactive=False)
|
| 954 |
|
| 955 |
start_inputs = [
|
| 956 |
-
|
| 957 |
max_links_per_page,
|
| 958 |
request_timeout_seconds,
|
| 959 |
max_response_bytes,
|
|
@@ -995,9 +993,9 @@ def build_ui() -> gr.Blocks:
|
|
| 995 |
outputs=[incomplete_shard_flush_seconds],
|
| 996 |
)
|
| 997 |
|
| 998 |
-
|
| 999 |
fn=None,
|
| 1000 |
-
inputs=[
|
| 1001 |
outputs=[seed_widget_html],
|
| 1002 |
js=SEED_WIDGET_JS,
|
| 1003 |
)
|
|
@@ -1011,7 +1009,7 @@ def build_ui() -> gr.Blocks:
|
|
| 1011 |
)
|
| 1012 |
demo.load(
|
| 1013 |
fn=None,
|
| 1014 |
-
inputs=[
|
| 1015 |
outputs=[seed_widget_html],
|
| 1016 |
js=SEED_WIDGET_JS,
|
| 1017 |
)
|
|
|
|
| 278 |
"""
|
| 279 |
|
| 280 |
SEED_WIDGET_JS = """
|
| 281 |
+
(seed_text) => {
|
| 282 |
+
const parseSeedText = (value) => {
|
| 283 |
+
if (typeof value !== "string") return [];
|
| 284 |
+
return value
|
| 285 |
+
.split(/\\r?\\n/)
|
| 286 |
+
.map((line) => line.trim())
|
| 287 |
+
.filter((line) => line.length > 0);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
};
|
| 289 |
|
| 290 |
const dedupe = (values) => {
|
|
|
|
| 314 |
.replaceAll('"', "&quot;")
|
| 315 |
.replaceAll("'", "&#39;");
|
| 316 |
|
| 317 |
+
const seeds = dedupe(parseSeedText(seed_text));
|
| 318 |
const domainSet = new Set(seeds.map(domainOf).filter(Boolean));
|
| 319 |
const chips = seeds.length
|
| 320 |
? seeds.slice(0, 12).map((url) => `<span class=\"seed-chip\">${escapeHtml(url)}</span>`).join("")
|
|
|
|
| 322 |
const overflow = seeds.length > 12
|
| 323 |
? `<span class=\"seed-overflow\">+${seeds.length - 12} more</span>`
|
| 324 |
: "";
|
| 325 |
+
const firstUrlChars = seeds.length ? seeds[0].length : 0;
|
| 326 |
|
| 327 |
+
return `<div class=\"seed-widget\"><div class=\"seed-stats\"><span><strong>${seeds.length}</strong> seeds</span><span><strong>${domainSet.size}</strong> domains</span><span><strong>${firstUrlChars}</strong> first-url chars</span></div><div class=\"seed-chip-wrap\">${chips}${overflow}</div></div>`;
|
| 328 |
}
|
| 329 |
"""
|
| 330 |
|
|
|
|
| 340 |
return -1
|
| 341 |
|
| 342 |
|
| 343 |
+
def parse_seed_url_rows(seed_urls_input: Any) -> list[str]:
    """Flatten a seed-URL input into a list of stripped, non-empty lines.

    Accepted shapes:

    * ``None`` -> ``[]``.
    * ``str`` -> split on line breaks, one entry per non-blank line.
    * ``list`` / ``tuple`` -> each element is treated as one row.
    * An object exposing ``.values.tolist()`` (the pandas.DataFrame path) ->
      the rows returned by that call; any failure yields ``[]``.
    * Anything else (including a single mapping) -> treated as one row.

    For every row only the first cell is used: the first value of a dict,
    the first item of a list/tuple, or the row itself otherwise. The cell is
    converted with ``str()`` and may itself contain several lines. ``None``
    and float NaN cells (how an empty DataFrame cell can surface) are
    skipped so they never turn into a literal ``"None"``/``"nan"`` seed.

    Duplicates are NOT removed and input order is preserved.
    """
    if seed_urls_input is None:
        return []

    if isinstance(seed_urls_input, str):
        return [line.strip() for line in seed_urls_input.splitlines() if line.strip()]

    rows_iterable: list[Any]
    if isinstance(seed_urls_input, (list, tuple)):
        rows_iterable = list(seed_urls_input)
    else:
        # Require a callable ``.values.tolist`` instead of a bare
        # ``hasattr(x, "values")``: a plain dict also has ``.values`` (a
        # method without ``tolist``) and used to be silently dropped here.
        to_list = getattr(getattr(seed_urls_input, "values", None), "tolist", None)
        if callable(to_list):
            try:
                converted = to_list()
            except Exception:
                # Deliberate best-effort: an unreadable table yields no seeds.
                converted = []
            # A degenerate container may hand back a scalar rather than a list.
            rows_iterable = converted if isinstance(converted, list) else [converted]
        else:
            rows_iterable = [seed_urls_input]

    items: list[str] = []
    for row in rows_iterable:
        if isinstance(row, dict):
            source = next(iter(row.values()), None)
        elif isinstance(row, (list, tuple)):
            source = row[0] if row else None
        else:
            source = row

        if source is None:
            continue
        # NaN is the only float that is not equal to itself.
        if isinstance(source, float) and source != source:
            continue

        items.extend(
            line.strip() for line in str(source).splitlines() if line.strip()
        )
    return items
|
| 378 |
|
| 379 |
|
|
|
|
| 388 |
return out
|
| 389 |
|
| 390 |
|
| 391 |
+
def collect_seed_urls(seed_urls_input: Any) -> list[str]:
    """Return the parsed seed entries after ``unique_preserve_order``.

    The raw input is first flattened by ``parse_seed_url_rows``; the
    resulting list is then handed to ``unique_preserve_order`` and that
    helper's result is returned unchanged.
    """
    parsed_rows = parse_seed_url_rows(seed_urls_input)
    return unique_preserve_order(parsed_rows)
|
| 393 |
|
| 394 |
|
| 395 |
+
def render_seed_widget_html(seed_urls_input: Any) -> str:
|
| 396 |
+
seeds = collect_seed_urls(seed_urls_input)
|
| 397 |
domains = {(urlsplit(u).hostname or "").lower().strip(".") for u in seeds}
|
| 398 |
domains = {d for d in domains if d}
|
| 399 |
|
|
|
|
| 454 |
|
| 455 |
def build_crawler_config(
|
| 456 |
*,
|
| 457 |
+
seed_urls_input: Any,
|
| 458 |
max_links_per_page: int,
|
| 459 |
request_timeout_seconds: float,
|
| 460 |
max_response_bytes: int,
|
|
|
|
| 469 |
total_workers: int,
|
| 470 |
) -> CrawlerConfig:
|
| 471 |
validate_hf_requirements(enable_hf_upload, hf_repo_id, hf_token)
|
| 472 |
+
seed_urls = collect_seed_urls(seed_urls_input)
|
| 473 |
|
| 474 |
return CrawlerConfig(
|
| 475 |
seed_urls=seed_urls,
|
|
|
|
| 692 |
def _start_crawl(
|
| 693 |
*,
|
| 694 |
total_workers: int,
|
| 695 |
+
seed_urls_input: Any,
|
| 696 |
max_links_per_page: int,
|
| 697 |
request_timeout_seconds: float,
|
| 698 |
max_response_bytes: int,
|
|
|
|
| 707 |
) -> tuple[str, str, str, str]:
|
| 708 |
try:
|
| 709 |
config = build_crawler_config(
|
| 710 |
+
seed_urls_input=seed_urls_input,
|
| 711 |
max_links_per_page=max_links_per_page,
|
| 712 |
request_timeout_seconds=request_timeout_seconds,
|
| 713 |
max_response_bytes=max_response_bytes,
|
|
|
|
| 730 |
|
| 731 |
|
| 732 |
def start_crawl_standard(
|
| 733 |
+
seed_urls_input: Any,
|
| 734 |
max_links_per_page: int,
|
| 735 |
request_timeout_seconds: float,
|
| 736 |
max_response_bytes: int,
|
|
|
|
| 745 |
) -> tuple[str, str, str, str]:
|
| 746 |
return _start_crawl(
|
| 747 |
total_workers=NORMAL_TOTAL_WORKERS,
|
| 748 |
+
seed_urls_input=seed_urls_input,
|
| 749 |
max_links_per_page=max_links_per_page,
|
| 750 |
request_timeout_seconds=request_timeout_seconds,
|
| 751 |
max_response_bytes=max_response_bytes,
|
|
|
|
| 761 |
|
| 762 |
|
| 763 |
def start_crawl_super(
|
| 764 |
+
seed_urls_input: Any,
|
| 765 |
max_links_per_page: int,
|
| 766 |
request_timeout_seconds: float,
|
| 767 |
max_response_bytes: int,
|
|
|
|
| 776 |
) -> tuple[str, str, str, str]:
|
| 777 |
return _start_crawl(
|
| 778 |
total_workers=SUPER_TOTAL_WORKERS,
|
| 779 |
+
seed_urls_input=seed_urls_input,
|
| 780 |
max_links_per_page=max_links_per_page,
|
| 781 |
request_timeout_seconds=request_timeout_seconds,
|
| 782 |
max_response_bytes=max_response_bytes,
|
|
|
|
| 823 |
"https://www.nasa.gov/",
|
| 824 |
]
|
| 825 |
)
|
| 826 |
+
default_seed_text = "\n".join(defaults.seed_urls)
|
| 827 |
|
| 828 |
with gr.Blocks(title="DataMuncherLabs AutoWS") as demo:
|
| 829 |
gr.Markdown("# DataMuncherLabs AutoWS")
|
|
|
|
| 854 |
|
| 855 |
with gr.Row():
|
| 856 |
with gr.Column(scale=2):
|
| 857 |
+
seed_urls_input = gr.Textbox(
|
| 858 |
+
lines=10,
|
| 859 |
+
value=default_seed_text,
|
|
|
|
|
|
|
|
|
|
| 860 |
interactive=True,
|
| 861 |
+
label="Seed URL List (one URL per line)",
|
| 862 |
+
placeholder="https://example.com",
|
| 863 |
)
|
| 864 |
seed_widget_html = gr.HTML(
|
| 865 |
label="Seed URL Summary",
|
| 866 |
+
value=render_seed_widget_html(default_seed_text),
|
| 867 |
)
|
| 868 |
token_widget_html = gr.HTML(
|
| 869 |
label="Live Tokenization",
|
|
|
|
| 951 |
logs_box = gr.Textbox(label="Run Log", lines=12, interactive=False)
|
| 952 |
|
| 953 |
start_inputs = [
|
| 954 |
+
seed_urls_input,
|
| 955 |
max_links_per_page,
|
| 956 |
request_timeout_seconds,
|
| 957 |
max_response_bytes,
|
|
|
|
| 993 |
outputs=[incomplete_shard_flush_seconds],
|
| 994 |
)
|
| 995 |
|
| 996 |
+
seed_urls_input.change(
|
| 997 |
fn=None,
|
| 998 |
+
inputs=[seed_urls_input],
|
| 999 |
outputs=[seed_widget_html],
|
| 1000 |
js=SEED_WIDGET_JS,
|
| 1001 |
)
|
|
|
|
| 1009 |
)
|
| 1010 |
demo.load(
|
| 1011 |
fn=None,
|
| 1012 |
+
inputs=[seed_urls_input],
|
| 1013 |
outputs=[seed_widget_html],
|
| 1014 |
js=SEED_WIDGET_JS,
|
| 1015 |
)
|