# AutoWS / app.py
# Commit 0e53210 (verified): Add preflight popup validation for missing
# HF repo/token before crawl start.
# (Hugging Face Space page header converted to comments so the file parses.)
#!/usr/bin/env python3
from __future__ import annotations
import asyncio
import contextlib
import inspect
import threading
import traceback
from collections import deque
from dataclasses import dataclass
from datetime import datetime, timezone
from html import escape
from pathlib import Path
from typing import Any
from urllib.parse import urlsplit
import huggingface_hub as hf_hub
# Compatibility shim: some huggingface_hub releases removed `HfFolder`.
# Install an inert stand-in so code that reaches for hf_hub.HfFolder
# (presumably gradio or other HF tooling — confirm) does not crash on import.
if not hasattr(hf_hub, "HfFolder"):
    class _CompatHfFolder:
        """Inert drop-in for the removed ``huggingface_hub.HfFolder`` API."""

        @staticmethod
        def get_token() -> str | None:
            # The shim never has a cached token.
            return None

        @staticmethod
        def save_token(token: str) -> None:
            # Accept and discard; nothing is persisted.
            del token
            return None

        @staticmethod
        def delete_token() -> None:
            return None

    hf_hub.HfFolder = _CompatHfFolder  # type: ignore[attr-defined]
import gradio as gr
from crawler import (
MAX_SHARD_ROWS,
MAX_SHARDS,
NORMAL_TOTAL_WORKERS,
SUPER_TOTAL_WORKERS,
AsyncCrawler,
CrawlerConfig,
)
APP_CSS = """
:root {
--bg-main: #0a0d12;
--bg-surface: #151a22;
--bg-panel: #1b2230;
--text-main: #f0f4fb;
--text-muted: #9aa4b6;
--accent: #3bd9ff;
--accent-2: #4cffb1;
--border: #2f3a50;
--shadow: 0 18px 36px rgba(0, 0, 0, 0.45);
}
:root[data-crawler-theme="red"] {
--bg-main: #17080c;
--bg-surface: #250d15;
--bg-panel: #341322;
--text-main: #f8e8ee;
--text-muted: #d5b0c0;
--accent: #7a0018;
--accent-2: #8e3ff5;
--border: #5a2035;
}
:root[data-crawler-theme="blue"] {
--bg-main: #021116;
--bg-surface: #08222c;
--bg-panel: #0e2f3b;
--text-main: #eaffff;
--text-muted: #8fbcc7;
--accent: #2fff9d;
--accent-2: #13e5ff;
--border: #1e5662;
}
:root[data-crawler-theme="light"] {
--bg-main: #f6f7f9;
--bg-surface: #ffffff;
--bg-panel: #eceff2;
--text-main: #111317;
--text-muted: #60666f;
--accent: #2a2f37;
--accent-2: #868b95;
--border: #d0d4db;
--shadow: 0 10px 25px rgba(35, 42, 52, 0.16);
}
:root[data-crawler-theme="dark"] {
--bg-main: #090909;
--bg-surface: #141414;
--bg-panel: #1d1d1d;
--text-main: #f0f0f0;
--text-muted: #a8a8a8;
--accent: #444444;
--accent-2: #686868;
--border: #2b2b2b;
}
:root[data-crawler-theme="green"] {
--bg-main: #08110b;
--bg-surface: #0f1d14;
--bg-panel: #17301e;
--text-main: #e8f8ed;
--text-muted: #97bc9f;
--accent: #2ea84b;
--accent-2: #185f2a;
--border: #2a5d36;
}
:root[data-crawler-theme="sunset"] {
--bg-main: #1c0f0b;
--bg-surface: #2f1810;
--bg-panel: #422015;
--text-main: #ffeede;
--text-muted: #d9b59d;
--accent: #ff7f3f;
--accent-2: #ff4f81;
--border: #6e3525;
}
:root[data-crawler-theme="ocean"] {
--bg-main: #04131d;
--bg-surface: #092230;
--bg-panel: #0e3144;
--text-main: #e8fbff;
--text-muted: #9fc3cf;
--accent: #2cd9ff;
--accent-2: #38ffcb;
--border: #1b5062;
}
:root[data-crawler-theme="amber"] {
--bg-main: #171104;
--bg-surface: #2a1d07;
--bg-panel: #3a2a0a;
--text-main: #fff6dc;
--text-muted: #d2bd84;
--accent: #ffb300;
--accent-2: #ffd54f;
--border: #6b4d12;
}
:root[data-crawler-theme="graphite"] {
--bg-main: #0f1012;
--bg-surface: #1a1d21;
--bg-panel: #242a30;
--text-main: #f1f3f6;
--text-muted: #adb5bf;
--accent: #8fa0b7;
--accent-2: #d7dce2;
--border: #3a424c;
}
:root[data-crawler-theme="mint"] {
--bg-main: #06140f;
--bg-surface: #0b2319;
--bg-panel: #123125;
--text-main: #e8fff4;
--text-muted: #a4d8bf;
--accent: #3dffb2;
--accent-2: #7df9d1;
--border: #256347;
}
.gradio-container {
background:
radial-gradient(1200px 550px at 8% 0%, color-mix(in srgb, var(--accent) 18%, transparent), transparent),
radial-gradient(900px 600px at 100% 0%, color-mix(in srgb, var(--accent-2) 14%, transparent), transparent),
var(--bg-main);
color: var(--text-main);
}
.gradio-container .block,
.gradio-container .form,
.gradio-container .gr-box,
.gradio-container .panel-wrap {
background: color-mix(in srgb, var(--bg-surface) 92%, transparent) !important;
border: 1px solid var(--border) !important;
box-shadow: var(--shadow);
}
.gradio-container h1,
.gradio-container h2,
.gradio-container h3,
.gradio-container p,
.gradio-container label,
.gradio-container .prose,
.gradio-container .prose * {
color: var(--text-main) !important;
}
.gradio-container input,
.gradio-container textarea,
.gradio-container select {
background: var(--bg-panel) !important;
color: var(--text-main) !important;
border: 1px solid var(--border) !important;
}
.gradio-container button {
border: 1px solid var(--border) !important;
}
.gradio-container button.primary {
background: linear-gradient(135deg, var(--accent), var(--accent-2)) !important;
color: #0b0e13 !important;
font-weight: 700;
}
.seed-widget,
.token-widget {
display: flex;
flex-direction: column;
gap: 0.75rem;
border: 1px solid var(--border);
border-radius: 0.9rem;
padding: 0.85rem;
background: color-mix(in srgb, var(--bg-panel) 86%, transparent);
}
.seed-stats,
.token-stats {
display: grid;
grid-template-columns: repeat(3, minmax(0, 1fr));
gap: 0.6rem;
}
.seed-stats > span,
.token-stats > span {
display: block;
padding: 0.55rem;
border: 1px solid var(--border);
border-radius: 0.6rem;
background: color-mix(in srgb, var(--bg-surface) 90%, transparent);
color: var(--text-main);
font-size: 0.9rem;
}
.seed-chip-wrap {
display: flex;
flex-wrap: wrap;
gap: 0.45rem;
}
.seed-chip {
border: 1px solid var(--border);
border-radius: 999px;
padding: 0.24rem 0.7rem;
color: var(--text-main);
background: linear-gradient(
145deg,
color-mix(in srgb, var(--accent) 20%, transparent),
color-mix(in srgb, var(--accent-2) 15%, transparent)
);
font-size: 0.83rem;
}
.seed-empty,
.seed-overflow,
.token-note {
color: var(--text-muted);
font-size: 0.83rem;
padding: 0.24rem 0.3rem;
}
.setting-help-q {
position: relative;
display: inline-flex;
align-items: center;
justify-content: center;
width: 1.05rem;
height: 1.05rem;
margin-left: 0.42rem;
border: 1px solid var(--border);
border-radius: 999px;
color: var(--text-main);
background: color-mix(in srgb, var(--bg-panel) 90%, transparent);
font-size: 0.74rem;
font-weight: 700;
cursor: help;
line-height: 1;
vertical-align: middle;
}
.setting-help-tooltip {
position: fixed;
left: 0;
top: 0;
transform: translate(-50%, -100%);
min-width: 180px;
max-width: 320px;
padding: 0.42rem 0.55rem;
border: 1px solid var(--border);
border-radius: 0.5rem;
background: color-mix(in srgb, var(--bg-surface) 98%, transparent);
color: var(--text-main);
font-size: 0.74rem;
font-weight: 500;
line-height: 1.25;
box-shadow: var(--shadow);
opacity: 0;
visibility: hidden;
transition: opacity 120ms ease;
z-index: 10000;
pointer-events: none;
white-space: normal;
}
.setting-help-tooltip.is-visible {
opacity: 1;
visibility: visible;
}
"""
THEME_JS = """
(theme_name) => {
const theme = theme_name || "dark";
document.documentElement.setAttribute("data-crawler-theme", theme);
return [];
}
"""
# Client-side renderer for the seed summary widget (parse, dedupe, chip HTML).
# NOTE(review): this JS does not appear to be wired to any event in build_ui
# (the Python text summary is used instead) — confirm before removing.
SEED_WIDGET_JS = """
(seed_text) => {
  const parseSeedText = (value) => {
    if (typeof value !== "string") return [];
    return value
      .split(/\\r?\\n/)
      .map((line) => line.trim())
      .filter((line) => line.length > 0);
  };
  const dedupe = (values) => {
    const seen = new Set();
    const out = [];
    for (const value of values) {
      if (!seen.has(value)) {
        seen.add(value);
        out.push(value);
      }
    }
    return out;
  };
  const domainOf = (value) => {
    try {
      return new URL(value).hostname || "";
    } catch {
      return "";
    }
  };
  // BUGFIX: the ampersand pass previously did replaceAll("&", "&") — a no-op
  // that left "&" unescaped in the generated chip HTML. It must map to "&amp;"
  // and run first so later entity replacements are not double-escaped.
  const escapeHtml = (value) => String(value)
    .replaceAll("&", "&amp;")
    .replaceAll("<", "&lt;")
    .replaceAll(">", "&gt;")
    .replaceAll('"', "&quot;")
    .replaceAll("'", "&#39;");
  const seeds = dedupe(parseSeedText(seed_text));
  const domainSet = new Set(seeds.map(domainOf).filter(Boolean));
  const chips = seeds.length
    ? seeds.slice(0, 12).map((url) => `<span class=\"seed-chip\">${escapeHtml(url)}</span>`).join("")
    : '<span class=\"seed-empty\">No seed URLs configured yet.</span>';
  const overflow = seeds.length > 12
    ? `<span class=\"seed-overflow\">+${seeds.length - 12} more</span>`
    : "";
  const firstUrlChars = seeds.length ? seeds[0].length : 0;
  return `<div class=\"seed-widget\"><div class=\"seed-stats\"><span><strong>${seeds.length}</strong> seeds</span><span><strong>${domainSet.size}</strong> domains</span><span><strong>${firstUrlChars}</strong> first-url chars</span></div><div class=\"seed-chip-wrap\">${chips}${overflow}</div></div>`;
}
"""
SETTING_HELP_JS = """
() => {
const helpByPrefix = [
["Theme", "Switch between visual color themes."],
["Seed URL List (one URL per line)", "Provide crawl entry points. Put one URL per line; duplicates are ignored."],
["Shard Size Rows", "Rows written per parquet shard before a full shard is emitted."],
["Shard Limit", "Maximum number of shards to produce for a run (1 to 15)."],
["Max Links Per Page", "Maximum discovered links to enqueue from each parsed page."],
["Request Timeout (seconds)", "HTTP request timeout per URL."],
["Max Response Bytes", "Maximum response body bytes to read per page."],
["Upload shards to my HF repo", "Enable direct upload of produced shards to your Hugging Face Space repo."],
["HF Repo ID", "Target Hugging Face repo in owner/name format."],
["HF Token (write permissions)", "Token with write access to the target repo."],
["Private HF Repo", "Create the target repo as private if it does not exist."],
["HF Path Prefix", "Folder path inside the repo where shards are uploaded."],
["Upload incomplete shard buffers", "On crawl finish/stop, flush the current partial shard buffer and upload it too."],
];
const tooltipId = "setting-help-tooltip";
let tooltip = document.getElementById(tooltipId);
if (!tooltip) {
tooltip = document.createElement("div");
tooltip.id = tooltipId;
tooltip.className = "setting-help-tooltip";
document.body.appendChild(tooltip);
}
const placeTooltip = (target) => {
const rect = target.getBoundingClientRect();
const aboveTop = rect.top - 10;
tooltip.style.left = `${rect.left + (rect.width / 2)}px`;
if (aboveTop < 44) {
tooltip.style.top = `${rect.bottom + 10}px`;
tooltip.style.transform = "translate(-50%, 0)";
} else {
tooltip.style.top = `${rect.top - 10}px`;
tooltip.style.transform = "translate(-50%, -100%)";
}
};
const showTooltip = (target) => {
const message = target.getAttribute("data-help") || "";
if (!message) return;
tooltip.textContent = message;
placeTooltip(target);
tooltip.classList.add("is-visible");
};
const hideTooltip = () => {
tooltip.classList.remove("is-visible");
};
const clean = (value) => String(value || "").replace(/\\s+/g, " ").trim();
const labels = document.querySelectorAll(".gradio-container label");
for (const label of labels) {
const text = clean(label.textContent);
const match = helpByPrefix.find(([prefix]) => text.startsWith(prefix));
if (!match) continue;
let q = label.querySelector(".setting-help-q");
if (!q) {
q = document.createElement("span");
q.className = "setting-help-q";
q.textContent = "?";
label.appendChild(q);
}
q.setAttribute("data-help", match[1]);
q.setAttribute("aria-label", match[1]);
q.setAttribute("title", match[1]);
q.tabIndex = 0;
if (!q.hasAttribute("data-help-bound")) {
q.addEventListener("mouseenter", () => showTooltip(q));
q.addEventListener("mouseleave", hideTooltip);
q.addEventListener("focus", () => showTooltip(q));
q.addEventListener("blur", hideTooltip);
q.setAttribute("data-help-bound", "1");
}
}
return [];
}
"""
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string, truncated to seconds."""
    moment = datetime.now(timezone.utc)
    return moment.isoformat(timespec="seconds")
def safe_queue_size(queue: Any) -> int:
    """Best-effort queue length; returns -1 when qsize() is missing or fails."""
    size = -1
    with contextlib.suppress(Exception):
        size = int(queue.qsize())
    return size
def parse_seed_url_rows(seed_urls_input: Any) -> list[str]:
    """Flatten textbox / list / DataFrame-style seed input into stripped lines.

    Accepts None (-> []), a newline-separated string, a list/tuple of rows,
    or anything exposing ``.values`` (pandas.DataFrame-like). Only the first
    cell of each row is consulted; blank lines are dropped.
    """
    if seed_urls_input is None:
        return []
    if isinstance(seed_urls_input, str):
        return [part.strip() for part in seed_urls_input.splitlines() if part.strip()]
    if isinstance(seed_urls_input, (list, tuple)):
        rows: list[Any] = list(seed_urls_input)
    elif hasattr(seed_urls_input, "values"):
        # pandas.DataFrame path; fall back to no rows on any failure.
        try:
            rows = seed_urls_input.values.tolist()
        except Exception:
            rows = []
    else:
        rows = [seed_urls_input]

    def first_cell(row: Any) -> Any:
        # A row may be a mapping, a sequence, or a bare scalar.
        if isinstance(row, dict):
            return next(iter(row.values()), "")
        if isinstance(row, (list, tuple)):
            return row[0] if row else ""
        return row

    collected: list[str] = []
    for row in rows:
        cell = first_cell(row)
        if cell is None:
            continue
        for line in str(cell).splitlines():
            stripped = line.strip()
            if stripped:
                collected.append(stripped)
    return collected
def unique_preserve_order(values: list[str]) -> list[str]:
    """Drop duplicate strings while keeping first-occurrence order."""
    # dict preserves insertion order and de-duplicates keys in one pass.
    return list(dict.fromkeys(values))
def collect_seed_urls(seed_urls_input: Any) -> list[str]:
    """Parse raw seed input and de-duplicate it, preserving order."""
    parsed_rows = parse_seed_url_rows(seed_urls_input)
    return unique_preserve_order(parsed_rows)
def render_seed_widget_html(seed_urls_input: Any) -> str:
    """Build the HTML seed-summary widget: stats row plus up to 12 URL chips."""
    seeds = collect_seed_urls(seed_urls_input)
    hostnames = {(urlsplit(url).hostname or "").lower().strip(".") for url in seeds}
    hostnames.discard("")
    if seeds:
        chips_html = "".join(
            f'<span class="seed-chip">{escape(url)}</span>' for url in seeds[:12]
        )
    else:
        chips_html = '<span class="seed-empty">No seed URLs configured yet.</span>'
    hidden_count = len(seeds) - 12
    overflow_html = (
        f'<span class="seed-overflow">+{hidden_count} more</span>' if hidden_count > 0 else ""
    )
    first_url_chars = len(seeds[0]) if seeds else 0
    fragments = [
        '<div class="seed-widget">',
        '<div class="seed-stats">',
        f"<span><strong>{len(seeds)}</strong> seeds</span>",
        f"<span><strong>{len(hostnames)}</strong> domains</span>",
        f"<span><strong>{first_url_chars}</strong> first-url chars</span>",
        "</div>",
        f'<div class="seed-chip-wrap">{chips_html}{overflow_html}</div>',
        "</div>",
    ]
    return "".join(fragments)
def render_seed_summary_text(seed_urls_input: Any) -> str:
    """Plain-text seed summary shown in the read-only summary textbox."""
    seeds = collect_seed_urls(seed_urls_input)
    hostnames = {(urlsplit(url).hostname or "").lower().strip(".") for url in seeds}
    hostnames.discard("")
    summary = [
        f"Seeds: {len(seeds)}",
        f"Domains: {len(hostnames)}",
        f"First URL chars: {len(seeds[0]) if seeds else 0}",
        "",
        "Seed URLs:",
    ]
    if seeds:
        summary += [f"- {url}" for url in seeds]
    else:
        summary.append("- (none)")
    return "\n".join(summary)
def render_tokenization_widget_html(snapshot: dict[str, Any]) -> str:
    """Render the live tokenization stats widget from a crawler snapshot dict."""

    def count(key: str) -> int:
        # Missing or None values render as 0.
        return int(snapshot.get(key, 0) or 0)

    tokens_done = count("tokenized_tokens")
    rows_done = count("tokenized_rows")
    shards_done = count("tokenized_shards")
    shards_total = count("written_shards")
    fragments = (
        '<div class="token-widget">',
        '<div class="token-stats">',
        f"<span><strong>{tokens_done}</strong> text tokens</span>",
        f"<span><strong>{rows_done}</strong> tokenized rows</span>",
        f"<span><strong>{shards_done}/{shards_total}</strong> tokenized shards</span>",
        "</div>",
        '<div class="token-note">Live shard tokenization uses tiktoken on the parquet <code>text</code> column.</div>',
        "</div>",
    )
    return "".join(fragments)
def render_qvp_widget_md(snapshot: dict[str, Any]) -> str:
    """Render queue / visited / parsed live metrics as a Markdown bullet list."""

    def metric(key: str) -> int:
        # Treat missing or None entries as zero.
        return int(snapshot.get(key, 0) or 0)

    lines = [
        "### Live Metrics",
        f"- Queue: `{metric('fetch_queue')}`",
        f"- Visited: `{metric('fetch_succeeded')}`",
        f"- Parsed: `{metric('parsed_pages')}`",
    ]
    return "\n".join(lines)
def validate_hf_requirements(enable_hf_upload: bool, hf_repo_id: str, hf_token: str) -> None:
    """Raise ValueError when upload is enabled but the repo id or token is blank."""
    if not enable_hf_upload:
        return
    # Check repo first, then token, so the error message matches the first gap.
    required = (("HF repo", hf_repo_id), ("HF token", hf_token))
    for label, value in required:
        if not value.strip():
            raise ValueError(f"{label} is required when upload is enabled.")
def preflight_validate_start(
    enable_hf_upload: bool,
    hf_repo_id: str,
    hf_token: str,
) -> None:
    """Popup guard run before crawl start: with upload on, both HF fields must be set.

    Raises gr.Error (rendered as a Gradio popup) when either field is blank.
    """
    if not bool(enable_hf_upload):
        return
    both_present = bool(hf_repo_id.strip()) and bool(hf_token.strip())
    if not both_present:
        raise gr.Error("HF upload is enabled. Enter both HF Repo ID and HF Token first.")
def build_crawler_config(
    *,
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
    total_workers: int,
) -> CrawlerConfig:
    """Validate HF settings and assemble a CrawlerConfig from raw UI values.

    Raises ValueError (via validate_hf_requirements) before doing any other
    work when HF upload is enabled but misconfigured.
    """
    validate_hf_requirements(enable_hf_upload, hf_repo_id, hf_token)
    shard_output_dir = Path(__file__).resolve().parent / "shards"
    path_prefix = hf_path_prefix.strip() or "crawl_shards"
    return CrawlerConfig(
        seed_urls=collect_seed_urls(seed_urls_input),
        max_links_per_page=int(max_links_per_page),
        request_timeout_seconds=float(request_timeout_seconds),
        max_response_bytes=int(max_response_bytes),
        shard_size_rows=int(shard_size_rows),
        max_shards=int(max_shards),
        output_dir=shard_output_dir,
        enable_hf_upload=bool(enable_hf_upload),
        upload_incomplete_shards=bool(upload_incomplete_shards),
        hf_repo_id=hf_repo_id.strip(),
        hf_token=hf_token.strip(),
        hf_private_repo=bool(hf_private_repo),
        hf_path_prefix=path_prefix,
        total_workers=int(total_workers),
    )
@dataclass
class RunState:
    """Mutable bookkeeping for the current (or most recent) crawl run."""

    run_id: int = 0  # monotonically increasing counter, bumped on each start
    running: bool = False  # True while the crawl thread is active
    started_at: str = ""  # UTC ISO timestamp; empty before the first run
    finished_at: str = ""  # UTC ISO timestamp; empty while a run is active
    stop_requested: bool = False  # set when the user presses Stop
    last_error: str = ""  # traceback text from the last crash, if any
class CrawlerRunManager:
    """Owns the background crawl thread and exposes thread-safe start/stop/poll.

    One instance is shared by all Gradio handlers; `_lock` guards every piece
    of mutable state. The crawl itself runs an asyncio loop inside a daemon
    thread so the UI thread never blocks.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()  # guards all attributes below
        self._thread: threading.Thread | None = None  # active crawl thread
        self._loop: asyncio.AbstractEventLoop | None = None  # loop inside that thread
        self._crawler: AsyncCrawler | None = None  # live crawler for current run
        self._state = RunState()
        self._logs: deque[str] = deque(maxlen=600)  # bounded in-memory run log
        self._last_snapshot: dict[str, Any] | None = None  # latest stats snapshot

    def start(self, config: CrawlerConfig) -> str:
        """Launch a new crawl thread; refuse while one is still alive.

        Returns a human-readable status message for the dashboard.
        """
        with self._lock:
            if self._thread is not None and self._thread.is_alive():
                return "A crawl is already running. Stop it before starting another one."
            self._state.run_id += 1
            self._state.running = True
            self._state.started_at = utc_now_iso()
            self._state.finished_at = ""
            self._state.stop_requested = False
            self._state.last_error = ""
            self._last_snapshot = None
            self._logs.clear()
            run_id = self._state.run_id
            self._logs.append(
                f"[{utc_now_iso()}] Started run #{run_id} with {config.total_workers} workers "
                f"({config.fetch_workers} fetch / {config.parser_workers} parser)."
            )
            # Daemon thread: an abandoned crawl cannot block interpreter exit.
            self._thread = threading.Thread(
                target=self._run_crawler,
                args=(run_id, config),
                daemon=True,
                name=f"crawler-run-{run_id}",
            )
            self._thread.start()
            return f"Run #{run_id} started."

    def stop(self) -> str:
        """Signal the active crawl to stop; returns a status message."""
        with self._lock:
            if self._thread is None or not self._thread.is_alive():
                return "No active crawl to stop."
            self._state.stop_requested = True
            crawler = self._crawler
            loop = self._loop
            run_id = self._state.run_id
            self._logs.append(f"[{utc_now_iso()}] Stop requested for run #{run_id}")
            # Prefer scheduling the stop on the crawler's own loop; fall back
            # to a direct call if the loop is not running (startup/shutdown race).
            if crawler is not None and loop is not None and loop.is_running():
                loop.call_soon_threadsafe(crawler.request_stop, "user_requested_stop")
            elif crawler is not None:
                crawler.request_stop("user_requested_stop")
            return f"Stop signal sent to run #{run_id}."

    def _run_crawler(self, run_id: int, config: CrawlerConfig) -> None:
        """Thread target: run the crawl to completion and record the outcome."""
        loop: asyncio.AbstractEventLoop | None = None
        try:
            crawler = AsyncCrawler(config)
            if hasattr(asyncio, "Runner"):
                # Python 3.11+ path: asyncio.Runner manages loop lifecycle.
                with asyncio.Runner() as runner:  # type: ignore[attr-defined]
                    loop = runner.get_loop()
                    with self._lock:
                        # Publish crawler/loop only if this is still the current run.
                        if self._state.run_id == run_id:
                            self._crawler = crawler
                            self._loop = loop
                    runner.run(crawler.run())
            else:
                # Legacy path: manage the event loop by hand.
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                with self._lock:
                    if self._state.run_id == run_id:
                        self._crawler = crawler
                        self._loop = loop
                loop.run_until_complete(crawler.run())
            final_snapshot = self._snapshot_from_crawler(crawler)
            with self._lock:
                if self._state.run_id == run_id:
                    self._last_snapshot = final_snapshot
                    self._logs.append(f"[{utc_now_iso()}] Run #{run_id} completed")
        except Exception:
            error_text = traceback.format_exc(limit=20)
            with self._lock:
                self._state.last_error = error_text
                self._logs.append(f"[{utc_now_iso()}] Run #{run_id} crashed")
        finally:
            with self._lock:
                if self._state.run_id == run_id:
                    self._state.running = False
                    self._state.finished_at = utc_now_iso()
                    self._crawler = None
                    self._loop = None
            # Runner already closed its loop; this covers the legacy path.
            if loop is not None and not loop.is_closed():
                loop.close()
            with contextlib.suppress(Exception):
                asyncio.set_event_loop(None)

    def _snapshot_from_crawler(self, crawler: AsyncCrawler) -> dict[str, Any]:
        """Capture a point-in-time stats dict from a live crawler."""
        stats = crawler.stats
        return {
            "timestamp": utc_now_iso(),
            "workers_total": crawler.config.total_workers,
            "workers_split": f"{crawler.config.fetch_workers}/{crawler.config.parser_workers}",
            "stop_reason": crawler.stop_reason or "-",
            "fetch_succeeded": stats.fetch_succeeded,
            "parsed_pages": stats.parsed_pages,
            "written_shards": stats.written_shards,
            "tokenized_shards": stats.tokenized_shards,
            "tokenized_rows": stats.tokenized_rows,
            "tokenized_tokens": stats.tokenized_tokens,
            "fetch_queue": safe_queue_size(crawler.fetch_queue),
            "parse_queue": safe_queue_size(crawler.parse_queue),
            "record_queue": safe_queue_size(crawler.record_queue),
            "stop_event": crawler.stop_event.is_set(),
        }

    def poll(self) -> tuple[str, dict[str, Any], str]:
        """Return (status markdown, latest snapshot dict, joined log text).

        NOTE(review): the lock is released between the state copy, the live
        snapshot, and the log read, so the three pieces may straddle a run
        transition — acceptable for a 1 Hz dashboard refresh.
        """
        with self._lock:
            crawler = self._crawler
            state = RunState(
                run_id=self._state.run_id,
                running=self._state.running,
                started_at=self._state.started_at,
                finished_at=self._state.finished_at,
                stop_requested=self._state.stop_requested,
                last_error=self._state.last_error,
            )
        if crawler is not None:
            # Snapshot outside the lock: it reads crawler internals, not ours.
            snapshot = self._snapshot_from_crawler(crawler)
            with self._lock:
                self._last_snapshot = snapshot
        with self._lock:
            latest = self._last_snapshot or {
                "timestamp": utc_now_iso(),
                "workers_total": 0,
                "workers_split": "-",
                "stop_reason": "-",
                "fetch_succeeded": 0,
                "parsed_pages": 0,
                "written_shards": 0,
                "tokenized_shards": 0,
                "tokenized_rows": 0,
                "tokenized_tokens": 0,
                "fetch_queue": 0,
                "parse_queue": 0,
                "record_queue": 0,
                "stop_event": False,
            }
            logs_text = "\n".join(self._logs)
        status_lines = [
            "### Crawler Status",
            f"- Run ID: `{state.run_id}`",
            f"- Running: `{state.running}`",
            f"- Stop requested: `{state.stop_requested}`",
            f"- Started at (UTC): `{state.started_at or '-'}`",
            f"- Finished at (UTC): `{state.finished_at or '-'}`",
        ]
        if state.last_error:
            status_lines.append("- Last error:")
            status_lines.append("```text")
            status_lines.append(state.last_error.strip())
            status_lines.append("```")
        return "\n".join(status_lines), latest, logs_text
# Single module-level manager shared by every Gradio event handler.
RUN_MANAGER = CrawlerRunManager()
def _format_dashboard_response(
    status: str,
    snapshot: dict[str, Any],
    logs: str,
) -> tuple[str, str, str, str]:
    """Map (status, snapshot, logs) onto the four dashboard output components."""
    metrics_md = render_qvp_widget_md(snapshot)
    token_html = render_tokenization_widget_html(snapshot)
    return status, metrics_md, logs, token_html
def _start_crawl(
    *,
    total_workers: int,
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, str, str, str]:
    """Build a config, start the run, and return refreshed dashboard outputs."""
    try:
        crawl_config = build_crawler_config(
            seed_urls_input=seed_urls_input,
            max_links_per_page=max_links_per_page,
            request_timeout_seconds=request_timeout_seconds,
            max_response_bytes=max_response_bytes,
            shard_size_rows=shard_size_rows,
            max_shards=max_shards,
            enable_hf_upload=enable_hf_upload,
            upload_incomplete_shards=upload_incomplete_shards,
            hf_repo_id=hf_repo_id,
            hf_token=hf_token,
            hf_private_repo=hf_private_repo,
            hf_path_prefix=hf_path_prefix,
            total_workers=total_workers,
        )
    except ValueError as exc:
        # Surface configuration problems as a Gradio popup, not a traceback.
        raise gr.Error(str(exc)) from exc
    start_message = RUN_MANAGER.start(crawl_config)
    status, snapshot, logs = RUN_MANAGER.poll()
    combined_status = f"{status}\n\n{start_message}"
    return _format_dashboard_response(combined_status, snapshot, logs)
def start_crawl_standard(
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, str, str, str]:
    """Start a crawl with the standard worker pool size."""
    crawl_kwargs: dict[str, Any] = dict(
        seed_urls_input=seed_urls_input,
        max_links_per_page=max_links_per_page,
        request_timeout_seconds=request_timeout_seconds,
        max_response_bytes=max_response_bytes,
        shard_size_rows=shard_size_rows,
        max_shards=max_shards,
        enable_hf_upload=enable_hf_upload,
        upload_incomplete_shards=upload_incomplete_shards,
        hf_repo_id=hf_repo_id,
        hf_token=hf_token,
        hf_private_repo=hf_private_repo,
        hf_path_prefix=hf_path_prefix,
    )
    return _start_crawl(total_workers=NORMAL_TOTAL_WORKERS, **crawl_kwargs)
def start_crawl_super(
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, str, str, str]:
    """Start a crawl with the enlarged ("super mode") worker pool size."""
    crawl_kwargs: dict[str, Any] = dict(
        seed_urls_input=seed_urls_input,
        max_links_per_page=max_links_per_page,
        request_timeout_seconds=request_timeout_seconds,
        max_response_bytes=max_response_bytes,
        shard_size_rows=shard_size_rows,
        max_shards=max_shards,
        enable_hf_upload=enable_hf_upload,
        upload_incomplete_shards=upload_incomplete_shards,
        hf_repo_id=hf_repo_id,
        hf_token=hf_token,
        hf_private_repo=hf_private_repo,
        hf_path_prefix=hf_path_prefix,
    )
    return _start_crawl(total_workers=SUPER_TOTAL_WORKERS, **crawl_kwargs)
def stop_crawl() -> tuple[str, str, str, str]:
    """Send a stop signal and return the refreshed dashboard outputs."""
    stop_message = RUN_MANAGER.stop()
    status, snapshot, logs = RUN_MANAGER.poll()
    combined_status = f"{status}\n\n{stop_message}"
    return _format_dashboard_response(combined_status, snapshot, logs)
def poll_dashboard() -> tuple[str, str, str, str]:
    """Fetch the latest status/snapshot/logs for the periodic UI refresh."""
    status_md, stats, log_text = RUN_MANAGER.poll()
    return _format_dashboard_response(status_md, stats, log_text)
def render_seed_widget(seed_urls_input: Any) -> str:
    """Gradio event adapter: seed input -> plain-text summary for the textbox."""
    summary_text = render_seed_summary_text(seed_urls_input)
    return summary_text
def noop_event(*_args: Any) -> None:
    """Placeholder Python handler for events whose real work happens in JS."""
    return
def toggle_hf_fields(enable_hf_upload: bool) -> tuple[Any, Any, Any, Any, Any]:
    """Show or hide the five HF upload fields together."""
    visibility_update = gr.update(visible=enable_hf_upload)
    # One shared update object is reused for all five output components.
    return tuple(visibility_update for _ in range(5))
def build_ui() -> gr.Blocks:
    """Construct the Gradio Blocks UI and wire every event handler.

    Returns the (unlaunched) Blocks app; queueing/launch happens in main().
    """
    # Defaults come from CrawlerConfig so UI values track the crawler module.
    defaults = CrawlerConfig(
        seed_urls=[
            "https://en.wikipedia.org/wiki/Main_Page",
            "https://docs.python.org/3/",
            "https://developer.mozilla.org/en-US/",
            "https://www.nasa.gov/",
        ]
    )
    default_seed_text = "\n".join(defaults.seed_urls)
    with gr.Blocks(title="DataMuncherLabs AutoWS") as demo:
        gr.Markdown("# DataMuncherLabs AutoWS")
        gr.Markdown("Async web crawler dashboard with live parquet text tokenization.")
        with gr.Row():
            theme_name = gr.Dropdown(
                choices=[
                    "red",
                    "blue",
                    "light",
                    "dark",
                    "green",
                    "sunset",
                    "ocean",
                    "amber",
                    "graphite",
                    "mint",
                ],
                value="dark",
                label="Theme",
                interactive=True,
            )
            gr.Markdown(
                "- Standard mode: **12 threads** (`10 fetch`, `2 parse`)\n"
                "- Super mode: **24 threads** (`20 fetch`, `4 parse`)"
            )
        with gr.Row():
            with gr.Column(scale=2):
                seed_urls_input = gr.Textbox(
                    lines=10,
                    value=default_seed_text,
                    interactive=True,
                    label="Seed URL List (one URL per line)",
                    placeholder="https://example.com",
                )
                # Read-only text summary, refreshed on every seed edit.
                seed_widget_html = gr.Textbox(
                    label="Seed URL Summary",
                    value=render_seed_summary_text(default_seed_text),
                    lines=10,
                    interactive=False,
                )
                token_widget_html = gr.HTML(
                    label="Live Tokenization",
                    value=render_tokenization_widget_html({}),
                )
            with gr.Column(scale=1):
                # Sliders are clamped to the crawler module's hard limits.
                shard_size_rows = gr.Slider(
                    label=f"Shard Size Rows (max {MAX_SHARD_ROWS})",
                    minimum=100,
                    maximum=MAX_SHARD_ROWS,
                    step=100,
                    value=min(defaults.shard_size_rows, MAX_SHARD_ROWS),
                )
                max_shards = gr.Slider(
                    label=f"Shard Limit (1-{MAX_SHARDS})",
                    minimum=1,
                    maximum=MAX_SHARDS,
                    step=1,
                    value=min(defaults.max_shards, MAX_SHARDS),
                )
                max_links_per_page = gr.Slider(
                    label="Max Links Per Page",
                    minimum=10,
                    maximum=1000,
                    step=10,
                    value=defaults.max_links_per_page,
                )
                request_timeout_seconds = gr.Slider(
                    label="Request Timeout (seconds)",
                    minimum=3,
                    maximum=60,
                    step=1,
                    value=defaults.request_timeout_seconds,
                )
                max_response_bytes = gr.Slider(
                    label="Max Response Bytes",
                    minimum=500_000,
                    maximum=8_000_000,
                    step=100_000,
                    value=defaults.max_response_bytes,
                )
        with gr.Accordion("Hugging Face Upload", open=False):
            enable_hf_upload = gr.Checkbox(
                label="Upload shards to my HF repo",
                value=False,
            )
            # The five fields below start hidden; toggle_hf_fields reveals them.
            hf_repo_id = gr.Textbox(
                label="HF Repo ID",
                placeholder="username/dataset-name",
                visible=False,
            )
            hf_token = gr.Textbox(
                label="HF Token (write permissions)",
                type="password",
                placeholder="hf_xxx",
                visible=False,
            )
            hf_private_repo = gr.Checkbox(
                label="Private HF Repo",
                value=False,
                visible=False,
            )
            hf_path_prefix = gr.Textbox(
                label="HF Path Prefix",
                value="crawl_shards",
                visible=False,
            )
            upload_incomplete_shards = gr.Checkbox(
                label="Upload incomplete shard buffers",
                value=False,
                visible=False,
            )
        with gr.Row():
            start_button = gr.Button("Start Crawl (12 Threads)", variant="primary")
            super_button = gr.Button("Super Mode (24 Threads)", variant="primary")
            stop_button = gr.Button("Stop Crawl", variant="stop")
            refresh_button = gr.Button("Refresh")
        status_md = gr.Markdown("### Crawler Status\n- Run ID: `0`\n- Running: `False`")
        qvp_md = gr.Markdown("### Live Metrics\n- Queue: `0`\n- Visited: `0`\n- Parsed: `0`")
        logs_box = gr.Textbox(label="Run Log", lines=12, interactive=False)
        # Input order must match the start_crawl_* function signatures.
        start_inputs = [
            seed_urls_input,
            max_links_per_page,
            request_timeout_seconds,
            max_response_bytes,
            shard_size_rows,
            max_shards,
            enable_hf_upload,
            upload_incomplete_shards,
            hf_repo_id,
            hf_token,
            hf_private_repo,
            hf_path_prefix,
        ]
        outputs = [status_md, qvp_md, logs_box, token_widget_html]
        # Preflight popup (unqueued, fast) runs first; the crawl only starts
        # via .then() after the preflight step did not raise.
        start_button.click(
            preflight_validate_start,
            inputs=[enable_hf_upload, hf_repo_id, hf_token],
            outputs=[],
            queue=False,
        ).then(start_crawl_standard, inputs=start_inputs, outputs=outputs)
        super_button.click(
            preflight_validate_start,
            inputs=[enable_hf_upload, hf_repo_id, hf_token],
            outputs=[],
            queue=False,
        ).then(start_crawl_super, inputs=start_inputs, outputs=outputs)
        stop_button.click(stop_crawl, inputs=[], outputs=outputs)
        refresh_button.click(poll_dashboard, inputs=[], outputs=outputs)
        enable_hf_upload.change(
            toggle_hf_fields,
            inputs=enable_hf_upload,
            outputs=[
                hf_repo_id,
                hf_token,
                hf_private_repo,
                hf_path_prefix,
                upload_incomplete_shards,
            ],
        )
        seed_urls_input.change(
            fn=render_seed_widget,
            inputs=[seed_urls_input],
            outputs=[seed_widget_html],
            queue=False,
        )
        # Theme switching is purely client-side; the Python fn is a no-op.
        theme_name.change(
            fn=noop_event,
            inputs=[theme_name],
            outputs=[],
            js=THEME_JS,
            queue=False,
        )
        demo.load(fn=noop_event, inputs=[], outputs=[], js=SETTING_HELP_JS, queue=False)
        # Apply the default theme once on page load.
        demo.load(
            fn=noop_event,
            inputs=[],
            outputs=[],
            js='() => { document.documentElement.setAttribute("data-crawler-theme", "dark"); }',
            queue=False,
        )
        demo.load(
            fn=render_seed_widget,
            inputs=[seed_urls_input],
            outputs=[seed_widget_html],
            queue=False,
        )
        demo.load(fn=poll_dashboard, inputs=[], outputs=outputs)
        # Re-run the help decorator after toggles that may re-render labels.
        enable_hf_upload.change(
            fn=noop_event,
            inputs=[enable_hf_upload],
            outputs=[],
            js=SETTING_HELP_JS,
            queue=False,
        )
        upload_incomplete_shards.change(
            fn=noop_event,
            inputs=[upload_incomplete_shards],
            outputs=[],
            js=SETTING_HELP_JS,
            queue=False,
        )
        # 1 Hz dashboard refresh while the page is open.
        timer = gr.Timer(value=1.0)
        timer.tick(fn=poll_dashboard, inputs=[], outputs=outputs)
    return demo
# Built at import time; NOTE(review): the Spaces runtime presumably imports
# `demo` directly without calling main() — confirm.
demo = build_ui()
def main() -> None:
    """Queue the app and launch it, passing only kwargs this Gradio version accepts.

    Older/newer Gradio releases move `css`/`theme`/`ssr_mode` between
    Blocks() and launch(); probing the launch signature keeps this portable.
    """
    queued = demo.queue(default_concurrency_limit=32)
    supported = inspect.signature(queued.launch).parameters
    # Factories keep construction lazy: the theme object is only built when
    # the running Gradio version actually accepts a `theme` kwarg.
    option_factories = {
        "css": lambda: APP_CSS,
        "theme": lambda: gr.themes.Default(primary_hue="green"),
        "ssr_mode": lambda: False,
    }
    launch_kwargs: dict[str, Any] = {
        name: make() for name, make in option_factories.items() if name in supported
    }
    queued.launch(**launch_kwargs)
if __name__ == "__main__":
    # Script entry point for local runs; hosted runtimes may import `demo` instead.
    main()