#!/usr/bin/env python3
"""Gradio dashboard for the DataMuncherLabs AutoWS async web crawler.

Provides a Blocks UI to configure seed URLs and shard settings, start/stop a
background crawl (run on its own thread + asyncio loop), poll live metrics,
and optionally upload parquet shards to a Hugging Face repo.
"""
from __future__ import annotations

import asyncio
import contextlib
import inspect
import threading
import traceback
from collections import deque
from dataclasses import dataclass
from datetime import datetime, timezone
from html import escape
from pathlib import Path
from typing import Any
from urllib.parse import urlsplit

import huggingface_hub as hf_hub

# Some huggingface_hub releases removed HfFolder while other libraries (e.g.
# certain gradio versions) still reference it. Install a no-op shim so those
# attribute lookups do not crash at import time.
if not hasattr(hf_hub, "HfFolder"):

    class _CompatHfFolder:
        """Minimal stand-in for the removed ``huggingface_hub.HfFolder``."""

        @staticmethod
        def get_token() -> str | None:
            return None

        @staticmethod
        def save_token(token: str) -> None:
            del token
            return None

        @staticmethod
        def delete_token() -> None:
            return None

    hf_hub.HfFolder = _CompatHfFolder  # type: ignore[attr-defined]

import gradio as gr

from crawler import (
    MAX_SHARD_ROWS,
    MAX_SHARDS,
    NORMAL_TOTAL_WORKERS,
    SUPER_TOTAL_WORKERS,
    AsyncCrawler,
    CrawlerConfig,
)

# Theme variables + widget styling. Themes are switched by setting the
# ``data-crawler-theme`` attribute on <html> (see THEME_JS below).
APP_CSS = """
:root { --bg-main: #0a0d12; --bg-surface: #151a22; --bg-panel: #1b2230; --text-main: #f0f4fb; --text-muted: #9aa4b6; --accent: #3bd9ff; --accent-2: #4cffb1; --border: #2f3a50; --shadow: 0 18px 36px rgba(0, 0, 0, 0.45); }
:root[data-crawler-theme="red"] { --bg-main: #17080c; --bg-surface: #250d15; --bg-panel: #341322; --text-main: #f8e8ee; --text-muted: #d5b0c0; --accent: #7a0018; --accent-2: #8e3ff5; --border: #5a2035; }
:root[data-crawler-theme="blue"] { --bg-main: #021116; --bg-surface: #08222c; --bg-panel: #0e2f3b; --text-main: #eaffff; --text-muted: #8fbcc7; --accent: #2fff9d; --accent-2: #13e5ff; --border: #1e5662; }
:root[data-crawler-theme="light"] { --bg-main: #f6f7f9; --bg-surface: #ffffff; --bg-panel: #eceff2; --text-main: #111317; --text-muted: #60666f; --accent: #2a2f37; --accent-2: #868b95; --border: #d0d4db; --shadow: 0 10px 25px rgba(35, 42, 52, 0.16); }
:root[data-crawler-theme="dark"] { --bg-main: #090909; --bg-surface: #141414; --bg-panel: #1d1d1d; --text-main: #f0f0f0; --text-muted: #a8a8a8; --accent: #444444; --accent-2: #686868; --border: #2b2b2b; }
:root[data-crawler-theme="green"] { --bg-main: #08110b; --bg-surface: #0f1d14; --bg-panel: #17301e; --text-main: #e8f8ed; --text-muted: #97bc9f; --accent: #2ea84b; --accent-2: #185f2a; --border: #2a5d36; }
:root[data-crawler-theme="sunset"] { --bg-main: #1c0f0b; --bg-surface: #2f1810; --bg-panel: #422015; --text-main: #ffeede; --text-muted: #d9b59d; --accent: #ff7f3f; --accent-2: #ff4f81; --border: #6e3525; }
:root[data-crawler-theme="ocean"] { --bg-main: #04131d; --bg-surface: #092230; --bg-panel: #0e3144; --text-main: #e8fbff; --text-muted: #9fc3cf; --accent: #2cd9ff; --accent-2: #38ffcb; --border: #1b5062; }
:root[data-crawler-theme="amber"] { --bg-main: #171104; --bg-surface: #2a1d07; --bg-panel: #3a2a0a; --text-main: #fff6dc; --text-muted: #d2bd84; --accent: #ffb300; --accent-2: #ffd54f; --border: #6b4d12; }
:root[data-crawler-theme="graphite"] { --bg-main: #0f1012; --bg-surface: #1a1d21; --bg-panel: #242a30; --text-main: #f1f3f6; --text-muted: #adb5bf; --accent: #8fa0b7; --accent-2: #d7dce2; --border: #3a424c; }
:root[data-crawler-theme="mint"] { --bg-main: #06140f; --bg-surface: #0b2319; --bg-panel: #123125; --text-main: #e8fff4; --text-muted: #a4d8bf; --accent: #3dffb2; --accent-2: #7df9d1; --border: #256347; }
.gradio-container { background: radial-gradient(1200px 550px at 8% 0%, color-mix(in srgb, var(--accent) 18%, transparent), transparent), radial-gradient(900px 600px at 100% 0%, color-mix(in srgb, var(--accent-2) 14%, transparent), transparent), var(--bg-main); color: var(--text-main); }
.gradio-container .block, .gradio-container .form, .gradio-container .gr-box, .gradio-container .panel-wrap { background: color-mix(in srgb, var(--bg-surface) 92%, transparent) !important; border: 1px solid var(--border) !important; box-shadow: var(--shadow); }
.gradio-container h1, .gradio-container h2, .gradio-container h3, .gradio-container p, .gradio-container label, .gradio-container .prose, .gradio-container .prose * { color: var(--text-main) !important; }
.gradio-container input, .gradio-container textarea, .gradio-container select { background: var(--bg-panel) !important; color: var(--text-main) !important; border: 1px solid var(--border) !important; }
.gradio-container button { border: 1px solid var(--border) !important; }
.gradio-container button.primary { background: linear-gradient(135deg, var(--accent), var(--accent-2)) !important; color: #0b0e13 !important; font-weight: 700; }
.seed-widget, .token-widget { display: flex; flex-direction: column; gap: 0.75rem; border: 1px solid var(--border); border-radius: 0.9rem; padding: 0.85rem; background: color-mix(in srgb, var(--bg-panel) 86%, transparent); }
.seed-stats, .token-stats { display: grid; grid-template-columns: repeat(3, minmax(0, 1fr)); gap: 0.6rem; }
.seed-stats > span, .token-stats > span { display: block; padding: 0.55rem; border: 1px solid var(--border); border-radius: 0.6rem; background: color-mix(in srgb, var(--bg-surface) 90%, transparent); color: var(--text-main); font-size: 0.9rem; }
.seed-chip-wrap { display: flex; flex-wrap: wrap; gap: 0.45rem; }
.seed-chip { border: 1px solid var(--border); border-radius: 999px; padding: 0.24rem 0.7rem; color: var(--text-main); background: linear-gradient( 145deg, color-mix(in srgb, var(--accent) 20%, transparent), color-mix(in srgb, var(--accent-2) 15%, transparent) ); font-size: 0.83rem; }
.seed-empty, .seed-overflow, .token-note { color: var(--text-muted); font-size: 0.83rem; padding: 0.24rem 0.3rem; }
.setting-help-q { position: relative; display: inline-flex; align-items: center; justify-content: center; width: 1.05rem; height: 1.05rem; margin-left: 0.42rem; border: 1px solid var(--border); border-radius: 999px; color: var(--text-main); background: color-mix(in srgb, var(--bg-panel) 90%, transparent); font-size: 0.74rem; font-weight: 700; cursor: help; line-height: 1; vertical-align: middle; }
.setting-help-tooltip { position: fixed; left: 0; top: 0; transform: translate(-50%, -100%); min-width: 180px; max-width: 320px; padding: 0.42rem 0.55rem; border: 1px solid var(--border); border-radius: 0.5rem; background: color-mix(in srgb, var(--bg-surface) 98%, transparent); color: var(--text-main); font-size: 0.74rem; font-weight: 500; line-height: 1.25; box-shadow: var(--shadow); opacity: 0; visibility: hidden; transition: opacity 120ms ease; z-index: 10000; pointer-events: none; white-space: normal; }
.setting-help-tooltip.is-visible { opacity: 1; visibility: visible; }
"""

# Applies the selected theme by tagging <html> with data-crawler-theme.
THEME_JS = """
(theme_name) => {
  const theme = theme_name || "dark";
  document.documentElement.setAttribute("data-crawler-theme", theme);
  return [];
}
"""

# Client-side renderer for the seed-URL widget (mirrors render_seed_widget_html).
# NOTE(review): the HTML markup below was reconstructed from the APP_CSS class
# names after the original tags were stripped by extraction — verify against
# the rendered widget.
SEED_WIDGET_JS = """
(seed_text) => {
  const parseSeedText = (value) => {
    if (typeof value !== "string") return [];
    return value
      .split(/\\r?\\n/)
      .map((line) => line.trim())
      .filter((line) => line.length > 0);
  };
  const dedupe = (values) => {
    const seen = new Set();
    const out = [];
    for (const value of values) {
      if (!seen.has(value)) {
        seen.add(value);
        out.push(value);
      }
    }
    return out;
  };
  const domainOf = (value) => {
    try {
      return new URL(value).hostname || "";
    } catch {
      return "";
    }
  };
  const escapeHtml = (value) => String(value)
    .replaceAll("&", "&amp;")
    .replaceAll("<", "&lt;")
    .replaceAll(">", "&gt;")
    .replaceAll('"', "&quot;")
    .replaceAll("'", "&#39;");
  const seeds = dedupe(parseSeedText(seed_text));
  const domainSet = new Set(seeds.map(domainOf).filter(Boolean));
  const chips = seeds.length
    ? seeds.slice(0, 12).map((url) => `<span class="seed-chip">${escapeHtml(url)}</span>`).join("")
    : '<span class="seed-empty">No seed URLs configured yet.</span>';
  const overflow = seeds.length > 12 ? `<span class="seed-overflow">+${seeds.length - 12} more</span>` : "";
  const firstUrlChars = seeds.length ? seeds[0].length : 0;
  return `
    <div class="seed-widget">
      <div class="seed-stats">
        <span>${seeds.length} seeds</span>
        <span>${domainSet.size} domains</span>
        <span>${firstUrlChars} first-url chars</span>
      </div>
      <div class="seed-chip-wrap">${chips}${overflow}</div>
    </div>
  `;
}
"""

# Attaches a "?" help bubble (with a floating tooltip) to known setting labels.
SETTING_HELP_JS = """
() => {
  const helpByPrefix = [
    ["Theme", "Switch between visual color themes."],
    ["Seed URL List (one URL per line)", "Provide crawl entry points. Put one URL per line; duplicates are ignored."],
    ["Shard Size Rows", "Rows written per parquet shard before a full shard is emitted."],
    ["Shard Limit", "Maximum number of shards to produce for a run (1 to 15)."],
    ["Max Links Per Page", "Maximum discovered links to enqueue from each parsed page."],
    ["Request Timeout (seconds)", "HTTP request timeout per URL."],
    ["Max Response Bytes", "Maximum response body bytes to read per page."],
    ["Upload shards to my HF repo", "Enable direct upload of produced shards to your Hugging Face Space repo."],
    ["HF Repo ID", "Target Hugging Face repo in owner/name format."],
    ["HF Token (write permissions)", "Token with write access to the target repo."],
    ["Private HF Repo", "Create the target repo as private if it does not exist."],
    ["HF Path Prefix", "Folder path inside the repo where shards are uploaded."],
    ["Upload incomplete shard buffers", "On crawl finish/stop, flush the current partial shard buffer and upload it too."],
  ];
  const tooltipId = "setting-help-tooltip";
  let tooltip = document.getElementById(tooltipId);
  if (!tooltip) {
    tooltip = document.createElement("div");
    tooltip.id = tooltipId;
    tooltip.className = "setting-help-tooltip";
    document.body.appendChild(tooltip);
  }
  const placeTooltip = (target) => {
    const rect = target.getBoundingClientRect();
    const aboveTop = rect.top - 10;
    tooltip.style.left = `${rect.left + (rect.width / 2)}px`;
    if (aboveTop < 44) {
      tooltip.style.top = `${rect.bottom + 10}px`;
      tooltip.style.transform = "translate(-50%, 0)";
    } else {
      tooltip.style.top = `${rect.top - 10}px`;
      tooltip.style.transform = "translate(-50%, -100%)";
    }
  };
  const showTooltip = (target) => {
    const message = target.getAttribute("data-help") || "";
    if (!message) return;
    tooltip.textContent = message;
    placeTooltip(target);
    tooltip.classList.add("is-visible");
  };
  const hideTooltip = () => {
    tooltip.classList.remove("is-visible");
  };
  const clean = (value) => String(value || "").replace(/\\s+/g, " ").trim();
  const labels = document.querySelectorAll(".gradio-container label");
  for (const label of labels) {
    const text = clean(label.textContent);
    const match = helpByPrefix.find(([prefix]) => text.startsWith(prefix));
    if (!match) continue;
    let q = label.querySelector(".setting-help-q");
    if (!q) {
      q = document.createElement("span");
      q.className = "setting-help-q";
      q.textContent = "?";
      label.appendChild(q);
    }
    q.setAttribute("data-help", match[1]);
    q.setAttribute("aria-label", match[1]);
    q.setAttribute("title", match[1]);
    q.tabIndex = 0;
    if (!q.hasAttribute("data-help-bound")) {
      q.addEventListener("mouseenter", () => showTooltip(q));
      q.addEventListener("mouseleave", hideTooltip);
      q.addEventListener("focus", () => showTooltip(q));
      q.addEventListener("blur", hideTooltip);
      q.setAttribute("data-help-bound", "1");
    }
  }
  return [];
}
"""


def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string (second precision)."""
    return datetime.now(timezone.utc).isoformat(timespec="seconds")


def safe_queue_size(queue: Any) -> int:
    """Return ``queue.qsize()``, or -1 if the queue cannot report a size."""
    try:
        return int(queue.qsize())
    except Exception:
        return -1


def parse_seed_url_rows(seed_urls_input: Any) -> list[str]:
    """Extract seed-URL strings from text, lists/tuples, dict rows, or a DataFrame.

    Accepts the various shapes Gradio components may hand us and returns the
    stripped, non-empty lines in input order (duplicates retained).
    """
    if seed_urls_input is None:
        return []
    if isinstance(seed_urls_input, str):
        return [line.strip() for line in seed_urls_input.splitlines() if line.strip()]
    if isinstance(seed_urls_input, (list, tuple)):
        rows_iterable: list[Any] = list(seed_urls_input)
    elif hasattr(seed_urls_input, "values"):
        try:
            rows_iterable = seed_urls_input.values.tolist()  # pandas.DataFrame path
        except Exception:
            rows_iterable = []
    else:
        rows_iterable = [seed_urls_input]
    items: list[str] = []
    for row in rows_iterable:
        value_sources: list[Any]
        if isinstance(row, dict):
            # Only the first column of a dict-shaped row is treated as the URL.
            value_sources = [next(iter(row.values()), "")]
        elif isinstance(row, (list, tuple)):
            value_sources = [row[0] if row else ""]
        else:
            value_sources = [row]
        for source in value_sources:
            if source is None:
                continue
            for line in str(source).splitlines():
                value = line.strip()
                if value:
                    items.append(value)
    return items


def unique_preserve_order(values: list[str]) -> list[str]:
    """Deduplicate ``values`` while keeping first-seen order."""
    seen: set[str] = set()
    out: list[str] = []
    for value in values:
        if value in seen:
            continue
        seen.add(value)
        out.append(value)
    return out


def collect_seed_urls(seed_urls_input: Any) -> list[str]:
    """Parse and deduplicate seed URLs from any supported input shape."""
    return unique_preserve_order(parse_seed_url_rows(seed_urls_input))


def render_seed_widget_html(seed_urls_input: Any) -> str:
    """Render the seed-URL summary widget as HTML (first 12 seeds as chips).

    NOTE(review): markup reconstructed from APP_CSS class names after the
    original tags were lost — confirm against the intended layout.
    """
    seeds = collect_seed_urls(seed_urls_input)
    domains = {(urlsplit(u).hostname or "").lower().strip(".") for u in seeds}
    domains = {d for d in domains if d}
    chips = [f'<span class="seed-chip">{escape(url)}</span>' for url in seeds[:12]]
    chips_html = (
        "".join(chips)
        if chips
        else '<span class="seed-empty">No seed URLs configured yet.</span>'
    )
    overflow_html = (
        f'<span class="seed-overflow">+{len(seeds) - 12} more</span>'
        if len(seeds) > 12
        else ""
    )
    return (
        '<div class="seed-widget">'
        '<div class="seed-stats">'
        f"<span>{len(seeds)} seeds</span>"
        f"<span>{len(domains)} domains</span>"
        f"<span>{len(seeds[0]) if seeds else 0} first-url chars</span>"
        "</div>"
        f'<div class="seed-chip-wrap">{chips_html}{overflow_html}</div>'
        "</div>"
    )


def render_seed_summary_text(seed_urls_input: Any) -> str:
    """Render a plain-text seed summary (used by the read-only summary textbox)."""
    seeds = collect_seed_urls(seed_urls_input)
    domains = {(urlsplit(u).hostname or "").lower().strip(".") for u in seeds}
    domains = {d for d in domains if d}
    first_url_chars = len(seeds[0]) if seeds else 0
    lines = [
        f"Seeds: {len(seeds)}",
        f"Domains: {len(domains)}",
        f"First URL chars: {first_url_chars}",
        "",
        "Seed URLs:",
    ]
    if seeds:
        lines.extend([f"- {url}" for url in seeds])
    else:
        lines.append("- (none)")
    return "\n".join(lines)


def render_tokenization_widget_html(snapshot: dict[str, Any]) -> str:
    """Render the live tokenization stats widget from a crawler snapshot dict.

    NOTE(review): markup reconstructed from APP_CSS class names after the
    original tags were lost — confirm against the intended layout.
    """
    tokenized_shards = int(snapshot.get("tokenized_shards", 0) or 0)
    tokenized_rows = int(snapshot.get("tokenized_rows", 0) or 0)
    tokenized_tokens = int(snapshot.get("tokenized_tokens", 0) or 0)
    written_shards = int(snapshot.get("written_shards", 0) or 0)
    return (
        '<div class="token-widget">'
        '<div class="token-stats">'
        f"<span>{tokenized_tokens} text tokens</span>"
        f"<span>{tokenized_rows} tokenized rows</span>"
        f"<span>{tokenized_shards}/{written_shards} tokenized shards</span>"
        "</div>"
        '<div class="token-note">Live shard tokenization uses tiktoken on the parquet text column.</div>'
        "</div>"
    )


def render_qvp_widget_md(snapshot: dict[str, Any]) -> str:
    """Render queue/visited/parsed live metrics as Markdown."""
    queue_count = int(snapshot.get("fetch_queue", 0) or 0)
    visited_count = int(snapshot.get("fetch_succeeded", 0) or 0)
    parsed_count = int(snapshot.get("parsed_pages", 0) or 0)
    return (
        "### Live Metrics\n"
        f"- Queue: `{queue_count}`\n"
        f"- Visited: `{visited_count}`\n"
        f"- Parsed: `{parsed_count}`"
    )


def validate_hf_requirements(enable_hf_upload: bool, hf_repo_id: str, hf_token: str) -> None:
    """Raise ``ValueError`` if HF upload is enabled without repo id or token."""
    if not enable_hf_upload:
        return
    if not hf_repo_id.strip():
        raise ValueError("HF repo is required when upload is enabled.")
    if not hf_token.strip():
        raise ValueError("HF token is required when upload is enabled.")


def preflight_validate_start(
    enable_hf_upload: bool,
    hf_repo_id: str,
    hf_token: str,
) -> None:
    """UI-side gate run before starting a crawl; raises ``gr.Error`` to the user."""
    if not bool(enable_hf_upload):
        return
    if not hf_repo_id.strip() or not hf_token.strip():
        raise gr.Error("HF upload is enabled. Enter both HF Repo ID and HF Token first.")


def build_crawler_config(
    *,
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
    total_workers: int,
) -> CrawlerConfig:
    """Validate inputs and assemble a ``CrawlerConfig`` for a run.

    Raises:
        ValueError: when HF upload is enabled but repo id/token are missing.
    """
    validate_hf_requirements(enable_hf_upload, hf_repo_id, hf_token)
    seed_urls = collect_seed_urls(seed_urls_input)
    return CrawlerConfig(
        seed_urls=seed_urls,
        max_links_per_page=int(max_links_per_page),
        request_timeout_seconds=float(request_timeout_seconds),
        max_response_bytes=int(max_response_bytes),
        shard_size_rows=int(shard_size_rows),
        max_shards=int(max_shards),
        output_dir=Path(__file__).resolve().parent / "shards",
        enable_hf_upload=bool(enable_hf_upload),
        upload_incomplete_shards=bool(upload_incomplete_shards),
        hf_repo_id=hf_repo_id.strip(),
        hf_token=hf_token.strip(),
        hf_private_repo=bool(hf_private_repo),
        hf_path_prefix=hf_path_prefix.strip() or "crawl_shards",
        total_workers=int(total_workers),
    )


@dataclass
class RunState:
    """Mutable bookkeeping for the current (or most recent) crawl run."""

    run_id: int = 0
    running: bool = False
    started_at: str = ""
    finished_at: str = ""
    stop_requested: bool = False
    last_error: str = ""


class CrawlerRunManager:
    """Owns the background crawl thread, its asyncio loop, logs, and snapshots.

    All shared state is guarded by ``self._lock``; the crawl itself runs on a
    daemon thread so the UI stays responsive.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._thread: threading.Thread | None = None
        self._loop: asyncio.AbstractEventLoop | None = None
        self._crawler: AsyncCrawler | None = None
        self._state = RunState()
        self._logs: deque[str] = deque(maxlen=600)  # bounded rolling run log
        self._last_snapshot: dict[str, Any] | None = None

    def start(self, config: CrawlerConfig) -> str:
        """Start a new crawl thread unless one is already running."""
        with self._lock:
            if self._thread is not None and self._thread.is_alive():
                return "A crawl is already running. Stop it before starting another one."
            self._state.run_id += 1
            self._state.running = True
            self._state.started_at = utc_now_iso()
            self._state.finished_at = ""
            self._state.stop_requested = False
            self._state.last_error = ""
            self._last_snapshot = None
            self._logs.clear()
            run_id = self._state.run_id
            self._logs.append(
                f"[{utc_now_iso()}] Started run #{run_id} with {config.total_workers} workers "
                f"({config.fetch_workers} fetch / {config.parser_workers} parser)."
            )
            self._thread = threading.Thread(
                target=self._run_crawler,
                args=(run_id, config),
                daemon=True,
                name=f"crawler-run-{run_id}",
            )
            self._thread.start()
            return f"Run #{run_id} started."

    def stop(self) -> str:
        """Request a graceful stop of the active crawl, if any."""
        with self._lock:
            if self._thread is None or not self._thread.is_alive():
                return "No active crawl to stop."
            self._state.stop_requested = True
            crawler = self._crawler
            loop = self._loop
            run_id = self._state.run_id
            self._logs.append(f"[{utc_now_iso()}] Stop requested for run #{run_id}")
            # Prefer signalling on the crawl's own loop thread; fall back to a
            # direct call if the loop is not running.
            if crawler is not None and loop is not None and loop.is_running():
                loop.call_soon_threadsafe(crawler.request_stop, "user_requested_stop")
            elif crawler is not None:
                crawler.request_stop("user_requested_stop")
            return f"Stop signal sent to run #{run_id}."

    def _run_crawler(self, run_id: int, config: CrawlerConfig) -> None:
        """Thread target: run the crawler to completion on a private event loop."""
        loop: asyncio.AbstractEventLoop | None = None
        try:
            crawler = AsyncCrawler(config)
            if hasattr(asyncio, "Runner"):  # Python 3.11+
                with asyncio.Runner() as runner:  # type: ignore[attr-defined]
                    loop = runner.get_loop()
                    with self._lock:
                        # Only publish handles if we are still the current run.
                        if self._state.run_id == run_id:
                            self._crawler = crawler
                            self._loop = loop
                    runner.run(crawler.run())
            else:
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                with self._lock:
                    if self._state.run_id == run_id:
                        self._crawler = crawler
                        self._loop = loop
                loop.run_until_complete(crawler.run())
            final_snapshot = self._snapshot_from_crawler(crawler)
            with self._lock:
                if self._state.run_id == run_id:
                    self._last_snapshot = final_snapshot
                    self._logs.append(f"[{utc_now_iso()}] Run #{run_id} completed")
        except Exception:
            error_text = traceback.format_exc(limit=20)
            with self._lock:
                self._state.last_error = error_text
                self._logs.append(f"[{utc_now_iso()}] Run #{run_id} crashed")
        finally:
            with self._lock:
                if self._state.run_id == run_id:
                    self._state.running = False
                    self._state.finished_at = utc_now_iso()
                    self._crawler = None
                    self._loop = None
            if loop is not None and not loop.is_closed():
                loop.close()
            with contextlib.suppress(Exception):
                asyncio.set_event_loop(None)

    def _snapshot_from_crawler(self, crawler: AsyncCrawler) -> dict[str, Any]:
        """Capture a point-in-time stats snapshot from a live crawler."""
        stats = crawler.stats
        return {
            "timestamp": utc_now_iso(),
            "workers_total": crawler.config.total_workers,
            "workers_split": f"{crawler.config.fetch_workers}/{crawler.config.parser_workers}",
            "stop_reason": crawler.stop_reason or "-",
            "fetch_succeeded": stats.fetch_succeeded,
            "parsed_pages": stats.parsed_pages,
            "written_shards": stats.written_shards,
            "tokenized_shards": stats.tokenized_shards,
            "tokenized_rows": stats.tokenized_rows,
            "tokenized_tokens": stats.tokenized_tokens,
            "fetch_queue": safe_queue_size(crawler.fetch_queue),
            "parse_queue": safe_queue_size(crawler.parse_queue),
            "record_queue": safe_queue_size(crawler.record_queue),
            "stop_event": crawler.stop_event.is_set(),
        }

    def poll(self) -> tuple[str, dict[str, Any], str]:
        """Return (status markdown, latest snapshot, joined log text)."""
        with self._lock:
            crawler = self._crawler
            # Copy the state so rendering happens outside the lock.
            state = RunState(
                run_id=self._state.run_id,
                running=self._state.running,
                started_at=self._state.started_at,
                finished_at=self._state.finished_at,
                stop_requested=self._state.stop_requested,
                last_error=self._state.last_error,
            )
        if crawler is not None:
            snapshot = self._snapshot_from_crawler(crawler)
            with self._lock:
                self._last_snapshot = snapshot
        with self._lock:
            latest = self._last_snapshot or {
                "timestamp": utc_now_iso(),
                "workers_total": 0,
                "workers_split": "-",
                "stop_reason": "-",
                "fetch_succeeded": 0,
                "parsed_pages": 0,
                "written_shards": 0,
                "tokenized_shards": 0,
                "tokenized_rows": 0,
                "tokenized_tokens": 0,
                "fetch_queue": 0,
                "parse_queue": 0,
                "record_queue": 0,
                "stop_event": False,
            }
            logs_text = "\n".join(self._logs)
        status_lines = [
            "### Crawler Status",
            f"- Run ID: `{state.run_id}`",
            f"- Running: `{state.running}`",
            f"- Stop requested: `{state.stop_requested}`",
            f"- Started at (UTC): `{state.started_at or '-'}`",
            f"- Finished at (UTC): `{state.finished_at or '-'}`",
        ]
        if state.last_error:
            status_lines.append("- Last error:")
            status_lines.append("```text")
            status_lines.append(state.last_error.strip())
            status_lines.append("```")
        return "\n".join(status_lines), latest, logs_text


RUN_MANAGER = CrawlerRunManager()


def _format_dashboard_response(
    status: str,
    snapshot: dict[str, Any],
    logs: str,
) -> tuple[str, str, str, str]:
    """Shape a poll result into the four dashboard output components."""
    return (
        status,
        render_qvp_widget_md(snapshot),
        logs,
        render_tokenization_widget_html(snapshot),
    )


def _start_crawl(
    *,
    total_workers: int,
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, str, str, str]:
    """Build the config, kick off a run, and return the refreshed dashboard."""
    try:
        config = build_crawler_config(
            seed_urls_input=seed_urls_input,
            max_links_per_page=max_links_per_page,
            request_timeout_seconds=request_timeout_seconds,
            max_response_bytes=max_response_bytes,
            shard_size_rows=shard_size_rows,
            max_shards=max_shards,
            enable_hf_upload=enable_hf_upload,
            upload_incomplete_shards=upload_incomplete_shards,
            hf_repo_id=hf_repo_id,
            hf_token=hf_token,
            hf_private_repo=hf_private_repo,
            hf_path_prefix=hf_path_prefix,
            total_workers=total_workers,
        )
    except ValueError as exc:
        raise gr.Error(str(exc)) from exc
    message = RUN_MANAGER.start(config)
    status, snapshot, logs = RUN_MANAGER.poll()
    return _format_dashboard_response(f"{status}\n\n{message}", snapshot, logs)


def start_crawl_standard(
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, str, str, str]:
    """Start a crawl with the standard worker count."""
    return _start_crawl(
        total_workers=NORMAL_TOTAL_WORKERS,
        seed_urls_input=seed_urls_input,
        max_links_per_page=max_links_per_page,
        request_timeout_seconds=request_timeout_seconds,
        max_response_bytes=max_response_bytes,
        shard_size_rows=shard_size_rows,
        max_shards=max_shards,
        enable_hf_upload=enable_hf_upload,
        upload_incomplete_shards=upload_incomplete_shards,
        hf_repo_id=hf_repo_id,
        hf_token=hf_token,
        hf_private_repo=hf_private_repo,
        hf_path_prefix=hf_path_prefix,
    )


def start_crawl_super(
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, str, str, str]:
    """Start a crawl with the "super mode" worker count."""
    return _start_crawl(
        total_workers=SUPER_TOTAL_WORKERS,
        seed_urls_input=seed_urls_input,
        max_links_per_page=max_links_per_page,
        request_timeout_seconds=request_timeout_seconds,
        max_response_bytes=max_response_bytes,
        shard_size_rows=shard_size_rows,
        max_shards=max_shards,
        enable_hf_upload=enable_hf_upload,
        upload_incomplete_shards=upload_incomplete_shards,
        hf_repo_id=hf_repo_id,
        hf_token=hf_token,
        hf_private_repo=hf_private_repo,
        hf_path_prefix=hf_path_prefix,
    )


def stop_crawl() -> tuple[str, str, str, str]:
    """Signal the active crawl to stop and return the refreshed dashboard."""
    message = RUN_MANAGER.stop()
    status, snapshot, logs = RUN_MANAGER.poll()
    return _format_dashboard_response(f"{status}\n\n{message}", snapshot, logs)


def poll_dashboard() -> tuple[str, str, str, str]:
    """Fetch the latest status/metrics/logs for the periodic UI refresh."""
    status, snapshot, logs = RUN_MANAGER.poll()
    return _format_dashboard_response(status, snapshot, logs)


def render_seed_widget(seed_urls_input: Any) -> str:
    """Event handler: regenerate the plain-text seed summary."""
    return render_seed_summary_text(seed_urls_input)


def noop_event(*_args: Any) -> None:
    """Server-side no-op used for events that only run client-side JS."""
    return None


def toggle_hf_fields(enable_hf_upload: bool) -> tuple[Any, Any, Any, Any, Any]:
    """Show/hide the five HF upload fields together."""
    update = gr.update(visible=enable_hf_upload)
    return update, update, update, update, update


def build_ui() -> gr.Blocks:
    """Construct the full Gradio Blocks dashboard and wire all events."""
    defaults = CrawlerConfig(
        seed_urls=[
            "https://en.wikipedia.org/wiki/Main_Page",
            "https://docs.python.org/3/",
            "https://developer.mozilla.org/en-US/",
            "https://www.nasa.gov/",
        ]
    )
    default_seed_text = "\n".join(defaults.seed_urls)
    # css= is passed here because gr.Blocks is where Gradio accepts custom CSS;
    # relying on launch(css=...) silently drops it on current Gradio versions.
    with gr.Blocks(title="DataMuncherLabs AutoWS", css=APP_CSS) as demo:
        gr.Markdown("# DataMuncherLabs AutoWS")
        gr.Markdown("Async web crawler dashboard with live parquet text tokenization.")
        with gr.Row():
            theme_name = gr.Dropdown(
                choices=[
                    "red",
                    "blue",
                    "light",
                    "dark",
                    "green",
                    "sunset",
                    "ocean",
                    "amber",
                    "graphite",
                    "mint",
                ],
                value="dark",
                label="Theme",
                interactive=True,
            )
            gr.Markdown(
                "- Standard mode: **12 threads** (`10 fetch`, `2 parse`)\n"
                "- Super mode: **24 threads** (`20 fetch`, `4 parse`)"
            )
        with gr.Row():
            with gr.Column(scale=2):
                seed_urls_input = gr.Textbox(
                    lines=10,
                    value=default_seed_text,
                    interactive=True,
                    label="Seed URL List (one URL per line)",
                    placeholder="https://example.com",
                )
                seed_widget_html = gr.Textbox(
                    label="Seed URL Summary",
                    value=render_seed_summary_text(default_seed_text),
                    lines=10,
                    interactive=False,
                )
                token_widget_html = gr.HTML(
                    label="Live Tokenization",
                    value=render_tokenization_widget_html({}),
                )
            with gr.Column(scale=1):
                shard_size_rows = gr.Slider(
                    label=f"Shard Size Rows (max {MAX_SHARD_ROWS})",
                    minimum=100,
                    maximum=MAX_SHARD_ROWS,
                    step=100,
                    value=min(defaults.shard_size_rows, MAX_SHARD_ROWS),
                )
                max_shards = gr.Slider(
                    label=f"Shard Limit (1-{MAX_SHARDS})",
                    minimum=1,
                    maximum=MAX_SHARDS,
                    step=1,
                    value=min(defaults.max_shards, MAX_SHARDS),
                )
                max_links_per_page = gr.Slider(
                    label="Max Links Per Page",
                    minimum=10,
                    maximum=1000,
                    step=10,
                    value=defaults.max_links_per_page,
                )
                request_timeout_seconds = gr.Slider(
                    label="Request Timeout (seconds)",
                    minimum=3,
                    maximum=60,
                    step=1,
                    value=defaults.request_timeout_seconds,
                )
                max_response_bytes = gr.Slider(
                    label="Max Response Bytes",
                    minimum=500_000,
                    maximum=8_000_000,
                    step=100_000,
                    value=defaults.max_response_bytes,
                )
                with gr.Accordion("Hugging Face Upload", open=False):
                    enable_hf_upload = gr.Checkbox(
                        label="Upload shards to my HF repo",
                        value=False,
                    )
                    hf_repo_id = gr.Textbox(
                        label="HF Repo ID",
                        placeholder="username/dataset-name",
                        visible=False,
                    )
                    hf_token = gr.Textbox(
                        label="HF Token (write permissions)",
                        type="password",
                        placeholder="hf_xxx",
                        visible=False,
                    )
                    hf_private_repo = gr.Checkbox(
                        label="Private HF Repo",
                        value=False,
                        visible=False,
                    )
                    hf_path_prefix = gr.Textbox(
                        label="HF Path Prefix",
                        value="crawl_shards",
                        visible=False,
                    )
                    upload_incomplete_shards = gr.Checkbox(
                        label="Upload incomplete shard buffers",
                        value=False,
                        visible=False,
                    )
        with gr.Row():
            start_button = gr.Button("Start Crawl (12 Threads)", variant="primary")
            super_button = gr.Button("Super Mode (24 Threads)", variant="primary")
            stop_button = gr.Button("Stop Crawl", variant="stop")
            refresh_button = gr.Button("Refresh")
        status_md = gr.Markdown("### Crawler Status\n- Run ID: `0`\n- Running: `False`")
        qvp_md = gr.Markdown("### Live Metrics\n- Queue: `0`\n- Visited: `0`\n- Parsed: `0`")
        logs_box = gr.Textbox(label="Run Log", lines=12, interactive=False)

        start_inputs = [
            seed_urls_input,
            max_links_per_page,
            request_timeout_seconds,
            max_response_bytes,
            shard_size_rows,
            max_shards,
            enable_hf_upload,
            upload_incomplete_shards,
            hf_repo_id,
            hf_token,
            hf_private_repo,
            hf_path_prefix,
        ]
        outputs = [status_md, qvp_md, logs_box, token_widget_html]

        # Preflight runs unqueued so validation errors surface immediately.
        start_button.click(
            preflight_validate_start,
            inputs=[enable_hf_upload, hf_repo_id, hf_token],
            outputs=[],
            queue=False,
        ).then(start_crawl_standard, inputs=start_inputs, outputs=outputs)
        super_button.click(
            preflight_validate_start,
            inputs=[enable_hf_upload, hf_repo_id, hf_token],
            outputs=[],
            queue=False,
        ).then(start_crawl_super, inputs=start_inputs, outputs=outputs)
        stop_button.click(stop_crawl, inputs=[], outputs=outputs)
        refresh_button.click(poll_dashboard, inputs=[], outputs=outputs)
        enable_hf_upload.change(
            toggle_hf_fields,
            inputs=enable_hf_upload,
            outputs=[
                hf_repo_id,
                hf_token,
                hf_private_repo,
                hf_path_prefix,
                upload_incomplete_shards,
            ],
        )
        seed_urls_input.change(
            fn=render_seed_widget,
            inputs=[seed_urls_input],
            outputs=[seed_widget_html],
            queue=False,
        )
        theme_name.change(
            fn=noop_event,
            inputs=[theme_name],
            outputs=[],
            js=THEME_JS,
            queue=False,
        )
        demo.load(fn=noop_event, inputs=[], outputs=[], js=SETTING_HELP_JS, queue=False)
        demo.load(
            fn=noop_event,
            inputs=[],
            outputs=[],
            js='() => { document.documentElement.setAttribute("data-crawler-theme", "dark"); }',
            queue=False,
        )
        demo.load(
            fn=render_seed_widget,
            inputs=[seed_urls_input],
            outputs=[seed_widget_html],
            queue=False,
        )
        demo.load(fn=poll_dashboard, inputs=[], outputs=outputs)
        # Re-run help-bubble attachment after visibility toggles add new labels.
        enable_hf_upload.change(
            fn=noop_event,
            inputs=[enable_hf_upload],
            outputs=[],
            js=SETTING_HELP_JS,
            queue=False,
        )
        upload_incomplete_shards.change(
            fn=noop_event,
            inputs=[upload_incomplete_shards],
            outputs=[],
            js=SETTING_HELP_JS,
            queue=False,
        )
        timer = gr.Timer(value=1.0)
        timer.tick(fn=poll_dashboard, inputs=[], outputs=outputs)
    return demo


demo = build_ui()


def main() -> None:
    """Queue the app and launch it, passing only kwargs this Gradio supports."""
    queued = demo.queue(default_concurrency_limit=32)
    launch_sig = inspect.signature(queued.launch)
    launch_kwargs: dict[str, Any] = {}
    # Kept for very old Gradio versions whose launch() accepted css=/theme=;
    # modern versions take CSS via gr.Blocks (set in build_ui).
    if "css" in launch_sig.parameters:
        launch_kwargs["css"] = APP_CSS
    if "theme" in launch_sig.parameters:
        launch_kwargs["theme"] = gr.themes.Default(primary_hue="green")
    if "ssr_mode" in launch_sig.parameters:
        launch_kwargs["ssr_mode"] = False
    queued.launch(**launch_kwargs)


if __name__ == "__main__":
    main()