# AutoWS / app.py
# Commit 0e53210 (verified): Add preflight popup validation for missing
# HF repo/token before crawl start.
# (Hugging Face Space page header converted to comments so the file parses.)
#!/usr/bin/env python3
from __future__ import annotations
import asyncio
import contextlib
import inspect
import threading
import traceback
from collections import deque
from dataclasses import dataclass
from datetime import datetime, timezone
from html import escape
from pathlib import Path
from typing import Any
from urllib.parse import urlsplit
import huggingface_hub as hf_hub
# Compatibility shim: some huggingface_hub releases removed `HfFolder`.
# Install an inert stand-in so code that reaches for hf_hub.HfFolder
# (presumably gradio or other HF tooling — confirm) does not crash on import.
if not hasattr(hf_hub, "HfFolder"):
    class _CompatHfFolder:
        """Inert drop-in for the removed ``huggingface_hub.HfFolder`` API."""

        @staticmethod
        def get_token() -> str | None:
            # The shim never has a cached token.
            return None

        @staticmethod
        def save_token(token: str) -> None:
            # Accept and discard; nothing is persisted.
            del token
            return None

        @staticmethod
        def delete_token() -> None:
            return None

    hf_hub.HfFolder = _CompatHfFolder  # type: ignore[attr-defined]
import gradio as gr
from crawler import (
MAX_SHARD_ROWS,
MAX_SHARDS,
NORMAL_TOTAL_WORKERS,
SUPER_TOTAL_WORKERS,
AsyncCrawler,
CrawlerConfig,
)
APP_CSS = """
:root {
--bg-main: #0a0d12;
--bg-surface: #151a22;
--bg-panel: #1b2230;
--text-main: #f0f4fb;
--text-muted: #9aa4b6;
--accent: #3bd9ff;
--accent-2: #4cffb1;
--border: #2f3a50;
--shadow: 0 18px 36px rgba(0, 0, 0, 0.45);
}
:root[data-crawler-theme="red"] {
--bg-main: #17080c;
--bg-surface: #250d15;
--bg-panel: #341322;
--text-main: #f8e8ee;
--text-muted: #d5b0c0;
--accent: #7a0018;
--accent-2: #8e3ff5;
--border: #5a2035;
}
:root[data-crawler-theme="blue"] {
--bg-main: #021116;
--bg-surface: #08222c;
--bg-panel: #0e2f3b;
--text-main: #eaffff;
--text-muted: #8fbcc7;
--accent: #2fff9d;
--accent-2: #13e5ff;
--border: #1e5662;
}
:root[data-crawler-theme="light"] {
--bg-main: #f6f7f9;
--bg-surface: #ffffff;
--bg-panel: #eceff2;
--text-main: #111317;
--text-muted: #60666f;
--accent: #2a2f37;
--accent-2: #868b95;
--border: #d0d4db;
--shadow: 0 10px 25px rgba(35, 42, 52, 0.16);
}
:root[data-crawler-theme="dark"] {
--bg-main: #090909;
--bg-surface: #141414;
--bg-panel: #1d1d1d;
--text-main: #f0f0f0;
--text-muted: #a8a8a8;
--accent: #444444;
--accent-2: #686868;
--border: #2b2b2b;
}
:root[data-crawler-theme="green"] {
--bg-main: #08110b;
--bg-surface: #0f1d14;
--bg-panel: #17301e;
--text-main: #e8f8ed;
--text-muted: #97bc9f;
--accent: #2ea84b;
--accent-2: #185f2a;
--border: #2a5d36;
}
:root[data-crawler-theme="sunset"] {
--bg-main: #1c0f0b;
--bg-surface: #2f1810;
--bg-panel: #422015;
--text-main: #ffeede;
--text-muted: #d9b59d;
--accent: #ff7f3f;
--accent-2: #ff4f81;
--border: #6e3525;
}
:root[data-crawler-theme="ocean"] {
--bg-main: #04131d;
--bg-surface: #092230;
--bg-panel: #0e3144;
--text-main: #e8fbff;
--text-muted: #9fc3cf;
--accent: #2cd9ff;
--accent-2: #38ffcb;
--border: #1b5062;
}
:root[data-crawler-theme="amber"] {
--bg-main: #171104;
--bg-surface: #2a1d07;
--bg-panel: #3a2a0a;
--text-main: #fff6dc;
--text-muted: #d2bd84;
--accent: #ffb300;
--accent-2: #ffd54f;
--border: #6b4d12;
}
:root[data-crawler-theme="graphite"] {
--bg-main: #0f1012;
--bg-surface: #1a1d21;
--bg-panel: #242a30;
--text-main: #f1f3f6;
--text-muted: #adb5bf;
--accent: #8fa0b7;
--accent-2: #d7dce2;
--border: #3a424c;
}
:root[data-crawler-theme="mint"] {
--bg-main: #06140f;
--bg-surface: #0b2319;
--bg-panel: #123125;
--text-main: #e8fff4;
--text-muted: #a4d8bf;
--accent: #3dffb2;
--accent-2: #7df9d1;
--border: #256347;
}
.gradio-container {
background:
radial-gradient(1200px 550px at 8% 0%, color-mix(in srgb, var(--accent) 18%, transparent), transparent),
radial-gradient(900px 600px at 100% 0%, color-mix(in srgb, var(--accent-2) 14%, transparent), transparent),
var(--bg-main);
color: var(--text-main);
}
.gradio-container .block,
.gradio-container .form,
.gradio-container .gr-box,
.gradio-container .panel-wrap {
background: color-mix(in srgb, var(--bg-surface) 92%, transparent) !important;
border: 1px solid var(--border) !important;
box-shadow: var(--shadow);
}
.gradio-container h1,
.gradio-container h2,
.gradio-container h3,
.gradio-container p,
.gradio-container label,
.gradio-container .prose,
.gradio-container .prose * {
color: var(--text-main) !important;
}
.gradio-container input,
.gradio-container textarea,
.gradio-container select {
background: var(--bg-panel) !important;
color: var(--text-main) !important;
border: 1px solid var(--border) !important;
}
.gradio-container button {
border: 1px solid var(--border) !important;
}
.gradio-container button.primary {
background: linear-gradient(135deg, var(--accent), var(--accent-2)) !important;
color: #0b0e13 !important;
font-weight: 700;
}
.seed-widget,
.token-widget {
display: flex;
flex-direction: column;
gap: 0.75rem;
border: 1px solid var(--border);
border-radius: 0.9rem;
padding: 0.85rem;
background: color-mix(in srgb, var(--bg-panel) 86%, transparent);
}
.seed-stats,
.token-stats {
display: grid;
grid-template-columns: repeat(3, minmax(0, 1fr));
gap: 0.6rem;
}
.seed-stats > span,
.token-stats > span {
display: block;
padding: 0.55rem;
border: 1px solid var(--border);
border-radius: 0.6rem;
background: color-mix(in srgb, var(--bg-surface) 90%, transparent);
color: var(--text-main);
font-size: 0.9rem;
}
.seed-chip-wrap {
display: flex;
flex-wrap: wrap;
gap: 0.45rem;
}
.seed-chip {
border: 1px solid var(--border);
border-radius: 999px;
padding: 0.24rem 0.7rem;
color: var(--text-main);
background: linear-gradient(
145deg,
color-mix(in srgb, var(--accent) 20%, transparent),
color-mix(in srgb, var(--accent-2) 15%, transparent)
);
font-size: 0.83rem;
}
.seed-empty,
.seed-overflow,
.token-note {
color: var(--text-muted);
font-size: 0.83rem;
padding: 0.24rem 0.3rem;
}
.setting-help-q {
position: relative;
display: inline-flex;
align-items: center;
justify-content: center;
width: 1.05rem;
height: 1.05rem;
margin-left: 0.42rem;
border: 1px solid var(--border);
border-radius: 999px;
color: var(--text-main);
background: color-mix(in srgb, var(--bg-panel) 90%, transparent);
font-size: 0.74rem;
font-weight: 700;
cursor: help;
line-height: 1;
vertical-align: middle;
}
.setting-help-tooltip {
position: fixed;
left: 0;
top: 0;
transform: translate(-50%, -100%);
min-width: 180px;
max-width: 320px;
padding: 0.42rem 0.55rem;
border: 1px solid var(--border);
border-radius: 0.5rem;
background: color-mix(in srgb, var(--bg-surface) 98%, transparent);
color: var(--text-main);
font-size: 0.74rem;
font-weight: 500;
line-height: 1.25;
box-shadow: var(--shadow);
opacity: 0;
visibility: hidden;
transition: opacity 120ms ease;
z-index: 10000;
pointer-events: none;
white-space: normal;
}
.setting-help-tooltip.is-visible {
opacity: 1;
visibility: visible;
}
"""
THEME_JS = """
(theme_name) => {
const theme = theme_name || "dark";
document.documentElement.setAttribute("data-crawler-theme", theme);
return [];
}
"""
# Client-side renderer for the seed summary widget (parse, dedupe, chip HTML).
# NOTE(review): this JS does not appear to be wired to any event in build_ui
# (the Python text summary is used instead) — confirm before removing.
SEED_WIDGET_JS = """
(seed_text) => {
  const parseSeedText = (value) => {
    if (typeof value !== "string") return [];
    return value
      .split(/\\r?\\n/)
      .map((line) => line.trim())
      .filter((line) => line.length > 0);
  };
  const dedupe = (values) => {
    const seen = new Set();
    const out = [];
    for (const value of values) {
      if (!seen.has(value)) {
        seen.add(value);
        out.push(value);
      }
    }
    return out;
  };
  const domainOf = (value) => {
    try {
      return new URL(value).hostname || "";
    } catch {
      return "";
    }
  };
  // BUGFIX: the ampersand pass previously did replaceAll("&", "&") — a no-op
  // that left "&" unescaped in the generated chip HTML. It must map to "&amp;"
  // and run first so later entity replacements are not double-escaped.
  const escapeHtml = (value) => String(value)
    .replaceAll("&", "&amp;")
    .replaceAll("<", "&lt;")
    .replaceAll(">", "&gt;")
    .replaceAll('"', "&quot;")
    .replaceAll("'", "&#39;");
  const seeds = dedupe(parseSeedText(seed_text));
  const domainSet = new Set(seeds.map(domainOf).filter(Boolean));
  const chips = seeds.length
    ? seeds.slice(0, 12).map((url) => `<span class=\"seed-chip\">${escapeHtml(url)}</span>`).join("")
    : '<span class=\"seed-empty\">No seed URLs configured yet.</span>';
  const overflow = seeds.length > 12
    ? `<span class=\"seed-overflow\">+${seeds.length - 12} more</span>`
    : "";
  const firstUrlChars = seeds.length ? seeds[0].length : 0;
  return `<div class=\"seed-widget\"><div class=\"seed-stats\"><span><strong>${seeds.length}</strong> seeds</span><span><strong>${domainSet.size}</strong> domains</span><span><strong>${firstUrlChars}</strong> first-url chars</span></div><div class=\"seed-chip-wrap\">${chips}${overflow}</div></div>`;
}
"""
SETTING_HELP_JS = """
() => {
const helpByPrefix = [
["Theme", "Switch between visual color themes."],
["Seed URL List (one URL per line)", "Provide crawl entry points. Put one URL per line; duplicates are ignored."],
["Shard Size Rows", "Rows written per parquet shard before a full shard is emitted."],
["Shard Limit", "Maximum number of shards to produce for a run (1 to 15)."],
["Max Links Per Page", "Maximum discovered links to enqueue from each parsed page."],
["Request Timeout (seconds)", "HTTP request timeout per URL."],
["Max Response Bytes", "Maximum response body bytes to read per page."],
["Upload shards to my HF repo", "Enable direct upload of produced shards to your Hugging Face Space repo."],
["HF Repo ID", "Target Hugging Face repo in owner/name format."],
["HF Token (write permissions)", "Token with write access to the target repo."],
["Private HF Repo", "Create the target repo as private if it does not exist."],
["HF Path Prefix", "Folder path inside the repo where shards are uploaded."],
["Upload incomplete shard buffers", "On crawl finish/stop, flush the current partial shard buffer and upload it too."],
];
const tooltipId = "setting-help-tooltip";
let tooltip = document.getElementById(tooltipId);
if (!tooltip) {
tooltip = document.createElement("div");
tooltip.id = tooltipId;
tooltip.className = "setting-help-tooltip";
document.body.appendChild(tooltip);
}
const placeTooltip = (target) => {
const rect = target.getBoundingClientRect();
const aboveTop = rect.top - 10;
tooltip.style.left = `${rect.left + (rect.width / 2)}px`;
if (aboveTop < 44) {
tooltip.style.top = `${rect.bottom + 10}px`;
tooltip.style.transform = "translate(-50%, 0)";
} else {
tooltip.style.top = `${rect.top - 10}px`;
tooltip.style.transform = "translate(-50%, -100%)";
}
};
const showTooltip = (target) => {
const message = target.getAttribute("data-help") || "";
if (!message) return;
tooltip.textContent = message;
placeTooltip(target);
tooltip.classList.add("is-visible");
};
const hideTooltip = () => {
tooltip.classList.remove("is-visible");
};
const clean = (value) => String(value || "").replace(/\\s+/g, " ").trim();
const labels = document.querySelectorAll(".gradio-container label");
for (const label of labels) {
const text = clean(label.textContent);
const match = helpByPrefix.find(([prefix]) => text.startsWith(prefix));
if (!match) continue;
let q = label.querySelector(".setting-help-q");
if (!q) {
q = document.createElement("span");
q.className = "setting-help-q";
q.textContent = "?";
label.appendChild(q);
}
q.setAttribute("data-help", match[1]);
q.setAttribute("aria-label", match[1]);
q.setAttribute("title", match[1]);
q.tabIndex = 0;
if (!q.hasAttribute("data-help-bound")) {
q.addEventListener("mouseenter", () => showTooltip(q));
q.addEventListener("mouseleave", hideTooltip);
q.addEventListener("focus", () => showTooltip(q));
q.addEventListener("blur", hideTooltip);
q.setAttribute("data-help-bound", "1");
}
}
return [];
}
"""
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string, truncated to seconds."""
    moment = datetime.now(timezone.utc)
    return moment.isoformat(timespec="seconds")
def safe_queue_size(queue: Any) -> int:
    """Best-effort queue length; returns -1 when qsize() is missing or fails."""
    size = -1
    with contextlib.suppress(Exception):
        size = int(queue.qsize())
    return size
def parse_seed_url_rows(seed_urls_input: Any) -> list[str]:
    """Flatten textbox / list / DataFrame-style seed input into stripped lines.

    Accepts None (-> []), a newline-separated string, a list/tuple of rows,
    or anything exposing ``.values`` (pandas.DataFrame-like). Only the first
    cell of each row is consulted; blank lines are dropped.
    """
    if seed_urls_input is None:
        return []
    if isinstance(seed_urls_input, str):
        return [part.strip() for part in seed_urls_input.splitlines() if part.strip()]
    if isinstance(seed_urls_input, (list, tuple)):
        rows: list[Any] = list(seed_urls_input)
    elif hasattr(seed_urls_input, "values"):
        # pandas.DataFrame path; fall back to no rows on any failure.
        try:
            rows = seed_urls_input.values.tolist()
        except Exception:
            rows = []
    else:
        rows = [seed_urls_input]

    def first_cell(row: Any) -> Any:
        # A row may be a mapping, a sequence, or a bare scalar.
        if isinstance(row, dict):
            return next(iter(row.values()), "")
        if isinstance(row, (list, tuple)):
            return row[0] if row else ""
        return row

    collected: list[str] = []
    for row in rows:
        cell = first_cell(row)
        if cell is None:
            continue
        for line in str(cell).splitlines():
            stripped = line.strip()
            if stripped:
                collected.append(stripped)
    return collected
def unique_preserve_order(values: list[str]) -> list[str]:
    """Drop duplicate strings while keeping first-occurrence order."""
    # dict preserves insertion order and de-duplicates keys in one pass.
    return list(dict.fromkeys(values))
def collect_seed_urls(seed_urls_input: Any) -> list[str]:
    """Parse raw seed input and de-duplicate it, preserving order."""
    parsed_rows = parse_seed_url_rows(seed_urls_input)
    return unique_preserve_order(parsed_rows)
def render_seed_widget_html(seed_urls_input: Any) -> str:
    """Build the HTML seed-summary widget: stats row plus up to 12 URL chips."""
    seeds = collect_seed_urls(seed_urls_input)
    hostnames = {(urlsplit(url).hostname or "").lower().strip(".") for url in seeds}
    hostnames.discard("")
    if seeds:
        chips_html = "".join(
            f'<span class="seed-chip">{escape(url)}</span>' for url in seeds[:12]
        )
    else:
        chips_html = '<span class="seed-empty">No seed URLs configured yet.</span>'
    hidden_count = len(seeds) - 12
    overflow_html = (
        f'<span class="seed-overflow">+{hidden_count} more</span>' if hidden_count > 0 else ""
    )
    first_url_chars = len(seeds[0]) if seeds else 0
    fragments = [
        '<div class="seed-widget">',
        '<div class="seed-stats">',
        f"<span><strong>{len(seeds)}</strong> seeds</span>",
        f"<span><strong>{len(hostnames)}</strong> domains</span>",
        f"<span><strong>{first_url_chars}</strong> first-url chars</span>",
        "</div>",
        f'<div class="seed-chip-wrap">{chips_html}{overflow_html}</div>',
        "</div>",
    ]
    return "".join(fragments)
def render_seed_summary_text(seed_urls_input: Any) -> str:
    """Plain-text seed summary shown in the read-only summary textbox."""
    seeds = collect_seed_urls(seed_urls_input)
    hostnames = {(urlsplit(url).hostname or "").lower().strip(".") for url in seeds}
    hostnames.discard("")
    summary = [
        f"Seeds: {len(seeds)}",
        f"Domains: {len(hostnames)}",
        f"First URL chars: {len(seeds[0]) if seeds else 0}",
        "",
        "Seed URLs:",
    ]
    if seeds:
        summary += [f"- {url}" for url in seeds]
    else:
        summary.append("- (none)")
    return "\n".join(summary)
def render_tokenization_widget_html(snapshot: dict[str, Any]) -> str:
    """Render the live tokenization stats widget from a crawler snapshot dict."""

    def count(key: str) -> int:
        # Missing or None values render as 0.
        return int(snapshot.get(key, 0) or 0)

    tokens_done = count("tokenized_tokens")
    rows_done = count("tokenized_rows")
    shards_done = count("tokenized_shards")
    shards_total = count("written_shards")
    fragments = (
        '<div class="token-widget">',
        '<div class="token-stats">',
        f"<span><strong>{tokens_done}</strong> text tokens</span>",
        f"<span><strong>{rows_done}</strong> tokenized rows</span>",
        f"<span><strong>{shards_done}/{shards_total}</strong> tokenized shards</span>",
        "</div>",
        '<div class="token-note">Live shard tokenization uses tiktoken on the parquet <code>text</code> column.</div>',
        "</div>",
    )
    return "".join(fragments)
def render_qvp_widget_md(snapshot: dict[str, Any]) -> str:
    """Render queue / visited / parsed live metrics as a Markdown bullet list."""

    def metric(key: str) -> int:
        # Treat missing or None entries as zero.
        return int(snapshot.get(key, 0) or 0)

    lines = [
        "### Live Metrics",
        f"- Queue: `{metric('fetch_queue')}`",
        f"- Visited: `{metric('fetch_succeeded')}`",
        f"- Parsed: `{metric('parsed_pages')}`",
    ]
    return "\n".join(lines)
def validate_hf_requirements(enable_hf_upload: bool, hf_repo_id: str, hf_token: str) -> None:
    """Raise ValueError when upload is enabled but the repo id or token is blank."""
    if not enable_hf_upload:
        return
    # Check repo first, then token, so the error message matches the first gap.
    required = (("HF repo", hf_repo_id), ("HF token", hf_token))
    for label, value in required:
        if not value.strip():
            raise ValueError(f"{label} is required when upload is enabled.")
def preflight_validate_start(
    enable_hf_upload: bool,
    hf_repo_id: str,
    hf_token: str,
) -> None:
    """Popup guard run before crawl start: with upload on, both HF fields must be set.

    Raises gr.Error (rendered as a Gradio popup) when either field is blank.
    """
    if not bool(enable_hf_upload):
        return
    both_present = bool(hf_repo_id.strip()) and bool(hf_token.strip())
    if not both_present:
        raise gr.Error("HF upload is enabled. Enter both HF Repo ID and HF Token first.")
def build_crawler_config(
    *,
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
    total_workers: int,
) -> CrawlerConfig:
    """Validate HF settings and assemble a CrawlerConfig from raw UI values.

    Raises ValueError (via validate_hf_requirements) before doing any other
    work when HF upload is enabled but misconfigured.
    """
    validate_hf_requirements(enable_hf_upload, hf_repo_id, hf_token)
    shard_output_dir = Path(__file__).resolve().parent / "shards"
    path_prefix = hf_path_prefix.strip() or "crawl_shards"
    return CrawlerConfig(
        seed_urls=collect_seed_urls(seed_urls_input),
        max_links_per_page=int(max_links_per_page),
        request_timeout_seconds=float(request_timeout_seconds),
        max_response_bytes=int(max_response_bytes),
        shard_size_rows=int(shard_size_rows),
        max_shards=int(max_shards),
        output_dir=shard_output_dir,
        enable_hf_upload=bool(enable_hf_upload),
        upload_incomplete_shards=bool(upload_incomplete_shards),
        hf_repo_id=hf_repo_id.strip(),
        hf_token=hf_token.strip(),
        hf_private_repo=bool(hf_private_repo),
        hf_path_prefix=path_prefix,
        total_workers=int(total_workers),
    )
@dataclass
class RunState:
    """Mutable bookkeeping for the current (or most recent) crawl run."""

    run_id: int = 0  # monotonically increasing counter, bumped on each start
    running: bool = False  # True while the crawl thread is active
    started_at: str = ""  # UTC ISO timestamp; empty before the first run
    finished_at: str = ""  # UTC ISO timestamp; empty while a run is active
    stop_requested: bool = False  # set when the user presses Stop
    last_error: str = ""  # traceback text from the last crash, if any
class CrawlerRunManager:
    """Owns the background crawl thread and exposes thread-safe start/stop/poll.

    One instance is shared by all Gradio handlers; `_lock` guards every piece
    of mutable state. The crawl itself runs an asyncio loop inside a daemon
    thread so the UI thread never blocks.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()  # guards all attributes below
        self._thread: threading.Thread | None = None  # active crawl thread
        self._loop: asyncio.AbstractEventLoop | None = None  # loop inside that thread
        self._crawler: AsyncCrawler | None = None  # live crawler for current run
        self._state = RunState()
        self._logs: deque[str] = deque(maxlen=600)  # bounded in-memory run log
        self._last_snapshot: dict[str, Any] | None = None  # latest stats snapshot

    def start(self, config: CrawlerConfig) -> str:
        """Launch a new crawl thread; refuse while one is still alive.

        Returns a human-readable status message for the dashboard.
        """
        with self._lock:
            if self._thread is not None and self._thread.is_alive():
                return "A crawl is already running. Stop it before starting another one."
            self._state.run_id += 1
            self._state.running = True
            self._state.started_at = utc_now_iso()
            self._state.finished_at = ""
            self._state.stop_requested = False
            self._state.last_error = ""
            self._last_snapshot = None
            self._logs.clear()
            run_id = self._state.run_id
            self._logs.append(
                f"[{utc_now_iso()}] Started run #{run_id} with {config.total_workers} workers "
                f"({config.fetch_workers} fetch / {config.parser_workers} parser)."
            )
            # Daemon thread: an abandoned crawl cannot block interpreter exit.
            self._thread = threading.Thread(
                target=self._run_crawler,
                args=(run_id, config),
                daemon=True,
                name=f"crawler-run-{run_id}",
            )
            self._thread.start()
            return f"Run #{run_id} started."

    def stop(self) -> str:
        """Signal the active crawl to stop; returns a status message."""
        with self._lock:
            if self._thread is None or not self._thread.is_alive():
                return "No active crawl to stop."
            self._state.stop_requested = True
            crawler = self._crawler
            loop = self._loop
            run_id = self._state.run_id
            self._logs.append(f"[{utc_now_iso()}] Stop requested for run #{run_id}")
            # Prefer scheduling the stop on the crawler's own loop; fall back
            # to a direct call if the loop is not running (startup/shutdown race).
            if crawler is not None and loop is not None and loop.is_running():
                loop.call_soon_threadsafe(crawler.request_stop, "user_requested_stop")
            elif crawler is not None:
                crawler.request_stop("user_requested_stop")
            return f"Stop signal sent to run #{run_id}."

    def _run_crawler(self, run_id: int, config: CrawlerConfig) -> None:
        """Thread target: run the crawl to completion and record the outcome."""
        loop: asyncio.AbstractEventLoop | None = None
        try:
            crawler = AsyncCrawler(config)
            if hasattr(asyncio, "Runner"):
                # Python 3.11+ path: asyncio.Runner manages loop lifecycle.
                with asyncio.Runner() as runner:  # type: ignore[attr-defined]
                    loop = runner.get_loop()
                    with self._lock:
                        # Publish crawler/loop only if this is still the current run.
                        if self._state.run_id == run_id:
                            self._crawler = crawler
                            self._loop = loop
                    runner.run(crawler.run())
            else:
                # Legacy path: manage the event loop by hand.
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                with self._lock:
                    if self._state.run_id == run_id:
                        self._crawler = crawler
                        self._loop = loop
                loop.run_until_complete(crawler.run())
            final_snapshot = self._snapshot_from_crawler(crawler)
            with self._lock:
                if self._state.run_id == run_id:
                    self._last_snapshot = final_snapshot
                    self._logs.append(f"[{utc_now_iso()}] Run #{run_id} completed")
        except Exception:
            error_text = traceback.format_exc(limit=20)
            with self._lock:
                self._state.last_error = error_text
                self._logs.append(f"[{utc_now_iso()}] Run #{run_id} crashed")
        finally:
            with self._lock:
                if self._state.run_id == run_id:
                    self._state.running = False
                    self._state.finished_at = utc_now_iso()
                    self._crawler = None
                    self._loop = None
            # Runner already closed its loop; this covers the legacy path.
            if loop is not None and not loop.is_closed():
                loop.close()
            with contextlib.suppress(Exception):
                asyncio.set_event_loop(None)

    def _snapshot_from_crawler(self, crawler: AsyncCrawler) -> dict[str, Any]:
        """Capture a point-in-time stats dict from a live crawler."""
        stats = crawler.stats
        return {
            "timestamp": utc_now_iso(),
            "workers_total": crawler.config.total_workers,
            "workers_split": f"{crawler.config.fetch_workers}/{crawler.config.parser_workers}",
            "stop_reason": crawler.stop_reason or "-",
            "fetch_succeeded": stats.fetch_succeeded,
            "parsed_pages": stats.parsed_pages,
            "written_shards": stats.written_shards,
            "tokenized_shards": stats.tokenized_shards,
            "tokenized_rows": stats.tokenized_rows,
            "tokenized_tokens": stats.tokenized_tokens,
            "fetch_queue": safe_queue_size(crawler.fetch_queue),
            "parse_queue": safe_queue_size(crawler.parse_queue),
            "record_queue": safe_queue_size(crawler.record_queue),
            "stop_event": crawler.stop_event.is_set(),
        }

    def poll(self) -> tuple[str, dict[str, Any], str]:
        """Return (status markdown, latest snapshot dict, joined log text).

        NOTE(review): the lock is released between the state copy, the live
        snapshot, and the log read, so the three pieces may straddle a run
        transition — acceptable for a 1 Hz dashboard refresh.
        """
        with self._lock:
            crawler = self._crawler
            state = RunState(
                run_id=self._state.run_id,
                running=self._state.running,
                started_at=self._state.started_at,
                finished_at=self._state.finished_at,
                stop_requested=self._state.stop_requested,
                last_error=self._state.last_error,
            )
        if crawler is not None:
            # Snapshot outside the lock: it reads crawler internals, not ours.
            snapshot = self._snapshot_from_crawler(crawler)
            with self._lock:
                self._last_snapshot = snapshot
        with self._lock:
            latest = self._last_snapshot or {
                "timestamp": utc_now_iso(),
                "workers_total": 0,
                "workers_split": "-",
                "stop_reason": "-",
                "fetch_succeeded": 0,
                "parsed_pages": 0,
                "written_shards": 0,
                "tokenized_shards": 0,
                "tokenized_rows": 0,
                "tokenized_tokens": 0,
                "fetch_queue": 0,
                "parse_queue": 0,
                "record_queue": 0,
                "stop_event": False,
            }
            logs_text = "\n".join(self._logs)
        status_lines = [
            "### Crawler Status",
            f"- Run ID: `{state.run_id}`",
            f"- Running: `{state.running}`",
            f"- Stop requested: `{state.stop_requested}`",
            f"- Started at (UTC): `{state.started_at or '-'}`",
            f"- Finished at (UTC): `{state.finished_at or '-'}`",
        ]
        if state.last_error:
            status_lines.append("- Last error:")
            status_lines.append("```text")
            status_lines.append(state.last_error.strip())
            status_lines.append("```")
        return "\n".join(status_lines), latest, logs_text
# Single module-level manager shared by every Gradio event handler.
RUN_MANAGER = CrawlerRunManager()
def _format_dashboard_response(
    status: str,
    snapshot: dict[str, Any],
    logs: str,
) -> tuple[str, str, str, str]:
    """Map (status, snapshot, logs) onto the four dashboard output components."""
    metrics_md = render_qvp_widget_md(snapshot)
    token_html = render_tokenization_widget_html(snapshot)
    return status, metrics_md, logs, token_html
def _start_crawl(
    *,
    total_workers: int,
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, str, str, str]:
    """Build a config, start the run, and return refreshed dashboard outputs."""
    try:
        crawl_config = build_crawler_config(
            seed_urls_input=seed_urls_input,
            max_links_per_page=max_links_per_page,
            request_timeout_seconds=request_timeout_seconds,
            max_response_bytes=max_response_bytes,
            shard_size_rows=shard_size_rows,
            max_shards=max_shards,
            enable_hf_upload=enable_hf_upload,
            upload_incomplete_shards=upload_incomplete_shards,
            hf_repo_id=hf_repo_id,
            hf_token=hf_token,
            hf_private_repo=hf_private_repo,
            hf_path_prefix=hf_path_prefix,
            total_workers=total_workers,
        )
    except ValueError as exc:
        # Surface configuration problems as a Gradio popup, not a traceback.
        raise gr.Error(str(exc)) from exc
    start_message = RUN_MANAGER.start(crawl_config)
    status, snapshot, logs = RUN_MANAGER.poll()
    combined_status = f"{status}\n\n{start_message}"
    return _format_dashboard_response(combined_status, snapshot, logs)
def start_crawl_standard(
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, str, str, str]:
    """Start a crawl with the standard worker pool size."""
    crawl_kwargs: dict[str, Any] = dict(
        seed_urls_input=seed_urls_input,
        max_links_per_page=max_links_per_page,
        request_timeout_seconds=request_timeout_seconds,
        max_response_bytes=max_response_bytes,
        shard_size_rows=shard_size_rows,
        max_shards=max_shards,
        enable_hf_upload=enable_hf_upload,
        upload_incomplete_shards=upload_incomplete_shards,
        hf_repo_id=hf_repo_id,
        hf_token=hf_token,
        hf_private_repo=hf_private_repo,
        hf_path_prefix=hf_path_prefix,
    )
    return _start_crawl(total_workers=NORMAL_TOTAL_WORKERS, **crawl_kwargs)
def start_crawl_super(
    seed_urls_input: Any,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    max_shards: int,
    enable_hf_upload: bool,
    upload_incomplete_shards: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, str, str, str]:
    """Start a crawl with the enlarged ("super mode") worker pool size."""
    crawl_kwargs: dict[str, Any] = dict(
        seed_urls_input=seed_urls_input,
        max_links_per_page=max_links_per_page,
        request_timeout_seconds=request_timeout_seconds,
        max_response_bytes=max_response_bytes,
        shard_size_rows=shard_size_rows,
        max_shards=max_shards,
        enable_hf_upload=enable_hf_upload,
        upload_incomplete_shards=upload_incomplete_shards,
        hf_repo_id=hf_repo_id,
        hf_token=hf_token,
        hf_private_repo=hf_private_repo,
        hf_path_prefix=hf_path_prefix,
    )
    return _start_crawl(total_workers=SUPER_TOTAL_WORKERS, **crawl_kwargs)
def stop_crawl() -> tuple[str, str, str, str]:
    """Send a stop signal and return the refreshed dashboard outputs."""
    stop_message = RUN_MANAGER.stop()
    status, snapshot, logs = RUN_MANAGER.poll()
    combined_status = f"{status}\n\n{stop_message}"
    return _format_dashboard_response(combined_status, snapshot, logs)
def poll_dashboard() -> tuple[str, str, str, str]:
    """Fetch the latest status/snapshot/logs for the periodic UI refresh."""
    status_md, stats, log_text = RUN_MANAGER.poll()
    return _format_dashboard_response(status_md, stats, log_text)
def render_seed_widget(seed_urls_input: Any) -> str:
    """Gradio event adapter: seed input -> plain-text summary for the textbox."""
    summary_text = render_seed_summary_text(seed_urls_input)
    return summary_text
def noop_event(*_args: Any) -> None:
    """Placeholder Python handler for events whose real work happens in JS."""
    return
def toggle_hf_fields(enable_hf_upload: bool) -> tuple[Any, Any, Any, Any, Any]:
    """Show or hide the five HF upload fields together."""
    visibility_update = gr.update(visible=enable_hf_upload)
    # One shared update object is reused for all five output components.
    return tuple(visibility_update for _ in range(5))
def build_ui() -> gr.Blocks:
    """Construct the Gradio Blocks UI and wire every event handler.

    Returns the (unlaunched) Blocks app; queueing/launch happens in main().
    """
    # Defaults come from CrawlerConfig so UI values track the crawler module.
    defaults = CrawlerConfig(
        seed_urls=[
            "https://en.wikipedia.org/wiki/Main_Page",
            "https://docs.python.org/3/",
            "https://developer.mozilla.org/en-US/",
            "https://www.nasa.gov/",
        ]
    )
    default_seed_text = "\n".join(defaults.seed_urls)
    with gr.Blocks(title="DataMuncherLabs AutoWS") as demo:
        gr.Markdown("# DataMuncherLabs AutoWS")
        gr.Markdown("Async web crawler dashboard with live parquet text tokenization.")
        with gr.Row():
            theme_name = gr.Dropdown(
                choices=[
                    "red",
                    "blue",
                    "light",
                    "dark",
                    "green",
                    "sunset",
                    "ocean",
                    "amber",
                    "graphite",
                    "mint",
                ],
                value="dark",
                label="Theme",
                interactive=True,
            )
            gr.Markdown(
                "- Standard mode: **12 threads** (`10 fetch`, `2 parse`)\n"
                "- Super mode: **24 threads** (`20 fetch`, `4 parse`)"
            )
        with gr.Row():
            with gr.Column(scale=2):
                seed_urls_input = gr.Textbox(
                    lines=10,
                    value=default_seed_text,
                    interactive=True,
                    label="Seed URL List (one URL per line)",
                    placeholder="https://example.com",
                )
                # Read-only text summary, refreshed on every seed edit.
                seed_widget_html = gr.Textbox(
                    label="Seed URL Summary",
                    value=render_seed_summary_text(default_seed_text),
                    lines=10,
                    interactive=False,
                )
                token_widget_html = gr.HTML(
                    label="Live Tokenization",
                    value=render_tokenization_widget_html({}),
                )
            with gr.Column(scale=1):
                # Sliders are clamped to the crawler module's hard limits.
                shard_size_rows = gr.Slider(
                    label=f"Shard Size Rows (max {MAX_SHARD_ROWS})",
                    minimum=100,
                    maximum=MAX_SHARD_ROWS,
                    step=100,
                    value=min(defaults.shard_size_rows, MAX_SHARD_ROWS),
                )
                max_shards = gr.Slider(
                    label=f"Shard Limit (1-{MAX_SHARDS})",
                    minimum=1,
                    maximum=MAX_SHARDS,
                    step=1,
                    value=min(defaults.max_shards, MAX_SHARDS),
                )
                max_links_per_page = gr.Slider(
                    label="Max Links Per Page",
                    minimum=10,
                    maximum=1000,
                    step=10,
                    value=defaults.max_links_per_page,
                )
                request_timeout_seconds = gr.Slider(
                    label="Request Timeout (seconds)",
                    minimum=3,
                    maximum=60,
                    step=1,
                    value=defaults.request_timeout_seconds,
                )
                max_response_bytes = gr.Slider(
                    label="Max Response Bytes",
                    minimum=500_000,
                    maximum=8_000_000,
                    step=100_000,
                    value=defaults.max_response_bytes,
                )
        with gr.Accordion("Hugging Face Upload", open=False):
            enable_hf_upload = gr.Checkbox(
                label="Upload shards to my HF repo",
                value=False,
            )
            # The five fields below start hidden; toggle_hf_fields reveals them.
            hf_repo_id = gr.Textbox(
                label="HF Repo ID",
                placeholder="username/dataset-name",
                visible=False,
            )
            hf_token = gr.Textbox(
                label="HF Token (write permissions)",
                type="password",
                placeholder="hf_xxx",
                visible=False,
            )
            hf_private_repo = gr.Checkbox(
                label="Private HF Repo",
                value=False,
                visible=False,
            )
            hf_path_prefix = gr.Textbox(
                label="HF Path Prefix",
                value="crawl_shards",
                visible=False,
            )
            upload_incomplete_shards = gr.Checkbox(
                label="Upload incomplete shard buffers",
                value=False,
                visible=False,
            )
        with gr.Row():
            start_button = gr.Button("Start Crawl (12 Threads)", variant="primary")
            super_button = gr.Button("Super Mode (24 Threads)", variant="primary")
            stop_button = gr.Button("Stop Crawl", variant="stop")
            refresh_button = gr.Button("Refresh")
        status_md = gr.Markdown("### Crawler Status\n- Run ID: `0`\n- Running: `False`")
        qvp_md = gr.Markdown("### Live Metrics\n- Queue: `0`\n- Visited: `0`\n- Parsed: `0`")
        logs_box = gr.Textbox(label="Run Log", lines=12, interactive=False)
        # Input order must match the start_crawl_* function signatures.
        start_inputs = [
            seed_urls_input,
            max_links_per_page,
            request_timeout_seconds,
            max_response_bytes,
            shard_size_rows,
            max_shards,
            enable_hf_upload,
            upload_incomplete_shards,
            hf_repo_id,
            hf_token,
            hf_private_repo,
            hf_path_prefix,
        ]
        outputs = [status_md, qvp_md, logs_box, token_widget_html]
        # Preflight popup (unqueued, fast) runs first; the crawl only starts
        # via .then() after the preflight step did not raise.
        start_button.click(
            preflight_validate_start,
            inputs=[enable_hf_upload, hf_repo_id, hf_token],
            outputs=[],
            queue=False,
        ).then(start_crawl_standard, inputs=start_inputs, outputs=outputs)
        super_button.click(
            preflight_validate_start,
            inputs=[enable_hf_upload, hf_repo_id, hf_token],
            outputs=[],
            queue=False,
        ).then(start_crawl_super, inputs=start_inputs, outputs=outputs)
        stop_button.click(stop_crawl, inputs=[], outputs=outputs)
        refresh_button.click(poll_dashboard, inputs=[], outputs=outputs)
        enable_hf_upload.change(
            toggle_hf_fields,
            inputs=enable_hf_upload,
            outputs=[
                hf_repo_id,
                hf_token,
                hf_private_repo,
                hf_path_prefix,
                upload_incomplete_shards,
            ],
        )
        seed_urls_input.change(
            fn=render_seed_widget,
            inputs=[seed_urls_input],
            outputs=[seed_widget_html],
            queue=False,
        )
        # Theme switching is purely client-side; the Python fn is a no-op.
        theme_name.change(
            fn=noop_event,
            inputs=[theme_name],
            outputs=[],
            js=THEME_JS,
            queue=False,
        )
        demo.load(fn=noop_event, inputs=[], outputs=[], js=SETTING_HELP_JS, queue=False)
        # Apply the default theme once on page load.
        demo.load(
            fn=noop_event,
            inputs=[],
            outputs=[],
            js='() => { document.documentElement.setAttribute("data-crawler-theme", "dark"); }',
            queue=False,
        )
        demo.load(
            fn=render_seed_widget,
            inputs=[seed_urls_input],
            outputs=[seed_widget_html],
            queue=False,
        )
        demo.load(fn=poll_dashboard, inputs=[], outputs=outputs)
        # Re-run the help decorator after toggles that may re-render labels.
        enable_hf_upload.change(
            fn=noop_event,
            inputs=[enable_hf_upload],
            outputs=[],
            js=SETTING_HELP_JS,
            queue=False,
        )
        upload_incomplete_shards.change(
            fn=noop_event,
            inputs=[upload_incomplete_shards],
            outputs=[],
            js=SETTING_HELP_JS,
            queue=False,
        )
        # 1 Hz dashboard refresh while the page is open.
        timer = gr.Timer(value=1.0)
        timer.tick(fn=poll_dashboard, inputs=[], outputs=outputs)
    return demo
# Built at import time; NOTE(review): the Spaces runtime presumably imports
# `demo` directly without calling main() — confirm.
demo = build_ui()
def main() -> None:
    """Queue the app and launch it, passing only kwargs this Gradio version accepts.

    Older/newer Gradio releases move `css`/`theme`/`ssr_mode` between
    Blocks() and launch(); probing the launch signature keeps this portable.
    """
    queued = demo.queue(default_concurrency_limit=32)
    supported = inspect.signature(queued.launch).parameters
    # Factories keep construction lazy: the theme object is only built when
    # the running Gradio version actually accepts a `theme` kwarg.
    option_factories = {
        "css": lambda: APP_CSS,
        "theme": lambda: gr.themes.Default(primary_hue="green"),
        "ssr_mode": lambda: False,
    }
    launch_kwargs: dict[str, Any] = {
        name: make() for name, make in option_factories.items() if name in supported
    }
    queued.launch(**launch_kwargs)
if __name__ == "__main__":
    # Script entry point for local runs; hosted runtimes may import `demo` instead.
    main()