Spaces:

DataMuncher-Labs
/

AutoWS

Running

App Files Files Community

Roman190928 committed 16 days ago

Commit

83d2ebb

verified ·

1 Parent(s): f1fd68b

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -943

app.py DELETED Viewed

@@ -1,943 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-import asyncio
-import threading
-import traceback
-from collections import deque
-from dataclasses import dataclass
-from datetime import datetime, timezone
-from html import escape
-from pathlib import Path
-from typing import Any
-from urllib.parse import urlsplit
-import huggingface_hub as hf_hub
-# Compatibility shim: when the installed huggingface_hub has no `HfFolder`
-# attribute, attach a stub class under that name so attribute lookups succeed.
-# NOTE(review): presumably required by the gradio import that follows — confirm.
-if not hasattr(hf_hub, "HfFolder"):
-    class _CompatHfFolder:
-        """Stand-in for `HfFolder` whose token methods store nothing."""
-        @staticmethod
-        def get_token() -> str | None:
-            """Always report that no token is stored."""
-            return None
-        @staticmethod
-        def save_token(token: str) -> None:
-            """Accept and discard *token*; nothing is persisted."""
-            del token
-            return None
-        @staticmethod
-        def delete_token() -> None:
-            """No-op: this stub never holds a token."""
-            return None
-    hf_hub.HfFolder = _CompatHfFolder  # type: ignore[attr-defined]
-import gradio as gr
-from crawler import (
-    MAX_SHARD_ROWS,
-    NORMAL_TOTAL_WORKERS,
-    SUPER_TOTAL_WORKERS,
-    AsyncCrawler,
-    CrawlerConfig,
-)
-# Stylesheet handed to gr.Blocks(css=...) in build_ui.
-# The :root rules declare colour custom properties; each
-# :root[data-crawler-theme="..."] rule overrides them for one theme name
-# (red, blue, light, dark, green). The remaining rules style the Gradio
-# container and the seed/token summary widgets using those properties.
-APP_CSS = """
-:root {
-  --bg-main: #0a0d12;
-  --bg-surface: #151a22;
-  --bg-panel: #1b2230;
-  --text-main: #f0f4fb;
-  --text-muted: #9aa4b6;
-  --accent: #3bd9ff;
-  --accent-2: #4cffb1;
-  --border: #2f3a50;
-  --shadow: 0 18px 36px rgba(0, 0, 0, 0.45);
-}
-:root[data-crawler-theme="red"] {
-  --bg-main: #17080c;
-  --bg-surface: #250d15;
-  --bg-panel: #341322;
-  --text-main: #f8e8ee;
-  --text-muted: #d5b0c0;
-  --accent: #7a0018;
-  --accent-2: #8e3ff5;
-  --border: #5a2035;
-}
-:root[data-crawler-theme="blue"] {
-  --bg-main: #021116;
-  --bg-surface: #08222c;
-  --bg-panel: #0e2f3b;
-  --text-main: #eaffff;
-  --text-muted: #8fbcc7;
-  --accent: #2fff9d;
-  --accent-2: #13e5ff;
-  --border: #1e5662;
-}
-:root[data-crawler-theme="light"] {
-  --bg-main: #f6f7f9;
-  --bg-surface: #ffffff;
-  --bg-panel: #eceff2;
-  --text-main: #111317;
-  --text-muted: #60666f;
-  --accent: #2a2f37;
-  --accent-2: #868b95;
-  --border: #d0d4db;
-  --shadow: 0 10px 25px rgba(35, 42, 52, 0.16);
-}
-:root[data-crawler-theme="dark"] {
-  --bg-main: #090909;
-  --bg-surface: #141414;
-  --bg-panel: #1d1d1d;
-  --text-main: #f0f0f0;
-  --text-muted: #a8a8a8;
-  --accent: #444444;
-  --accent-2: #686868;
-  --border: #2b2b2b;
-}
-:root[data-crawler-theme="green"] {
-  --bg-main: #08110b;
-  --bg-surface: #0f1d14;
-  --bg-panel: #17301e;
-  --text-main: #e8f8ed;
-  --text-muted: #97bc9f;
-  --accent: #2ea84b;
-  --accent-2: #185f2a;
-  --border: #2a5d36;
-}
-.gradio-container {
-  background:
-    radial-gradient(1200px 550px at 8% 0%, color-mix(in srgb, var(--accent) 18%, transparent), transparent),
-    radial-gradient(900px 600px at 100% 0%, color-mix(in srgb, var(--accent-2) 14%, transparent), transparent),
-    var(--bg-main);
-  color: var(--text-main);
-}
-.gradio-container .block,
-.gradio-container .form,
-.gradio-container .gr-box,
-.gradio-container .panel-wrap {
-  background: color-mix(in srgb, var(--bg-surface) 92%, transparent) !important;
-  border: 1px solid var(--border) !important;
-  box-shadow: var(--shadow);
-}
-.gradio-container h1,
-.gradio-container h2,
-.gradio-container h3,
-.gradio-container p,
-.gradio-container label,
-.gradio-container .prose,
-.gradio-container .prose * {
-  color: var(--text-main) !important;
-}
-.gradio-container input,
-.gradio-container textarea,
-.gradio-container select {
-  background: var(--bg-panel) !important;
-  color: var(--text-main) !important;
-  border: 1px solid var(--border) !important;
-}
-.gradio-container button {
-  border: 1px solid var(--border) !important;
-}
-.gradio-container button.primary {
-  background: linear-gradient(135deg, var(--accent), var(--accent-2)) !important;
-  color: #0b0e13 !important;
-  font-weight: 700;
-}
-.seed-widget,
-.token-widget {
-  display: flex;
-  flex-direction: column;
-  gap: 0.75rem;
-  border: 1px solid var(--border);
-  border-radius: 0.9rem;
-  padding: 0.85rem;
-  background: color-mix(in srgb, var(--bg-panel) 86%, transparent);
-}
-.seed-stats,
-.token-stats {
-  display: grid;
-  grid-template-columns: repeat(3, minmax(0, 1fr));
-  gap: 0.6rem;
-}
-.seed-stats > span,
-.token-stats > span {
-  display: block;
-  padding: 0.55rem;
-  border: 1px solid var(--border);
-  border-radius: 0.6rem;
-  background: color-mix(in srgb, var(--bg-surface) 90%, transparent);
-  color: var(--text-main);
-  font-size: 0.9rem;
-}
-.seed-chip-wrap {
-  display: flex;
-  flex-wrap: wrap;
-  gap: 0.45rem;
-}
-.seed-chip {
-  border: 1px solid var(--border);
-  border-radius: 999px;
-  padding: 0.24rem 0.7rem;
-  color: var(--text-main);
-  background: linear-gradient(
-    145deg,
-    color-mix(in srgb, var(--accent) 20%, transparent),
-    color-mix(in srgb, var(--accent-2) 15%, transparent)
-  );
-  font-size: 0.83rem;
-}
-.seed-empty,
-.seed-overflow,
-.token-note {
-  color: var(--text-muted);
-  font-size: 0.83rem;
-  padding: 0.24rem 0.3rem;
-}
-"""
-# Client-side handler wired to the theme dropdown's change event in build_ui.
-# It writes the selected name (or "dark" when the value is falsy) to the
-# document root's data-crawler-theme attribute, which the APP_CSS selectors
-# key on, and returns an empty array because the event has no outputs.
-THEME_JS = """
-(theme_name) => {
-  const theme = theme_name || "dark";
-  document.documentElement.setAttribute("data-crawler-theme", theme);
-  return [];
-}
-"""
-# Client-side renderer for the seed summary widget (used on table change and
-# on page load in build_ui). Given the seed rows it:
-#   - takes the first cell of each row (array, object, or scalar), trimmed,
-#     skipping empty values;
-#   - de-duplicates while keeping first-seen order;
-#   - counts distinct hostnames, treating unparsable URLs as having none;
-#   - returns HTML with the counts, up to 12 escaped URL chips, and a
-#     "+N more" marker when there are more than 12 seeds.
-# The markup uses the same class names as render_seed_widget_html.
-# The \" sequences below are Python escapes that yield plain double quotes
-# in the JavaScript text.
-SEED_WIDGET_JS = """
-(seed_rows) => {
-  const parseRows = (rows) => {
-    if (!Array.isArray(rows)) return [];
-    const out = [];
-    for (const row of rows) {
-      let value = "";
-      if (Array.isArray(row)) {
-        value = String(row[0] ?? "").trim();
-      } else if (row && typeof row === "object") {
-        value = String(Object.values(row)[0] ?? "").trim();
-      } else if (row !== null && row !== undefined) {
-        value = String(row).trim();
-      }
-      if (value) out.push(value);
-    }
-    return out;
-  };
-  const dedupe = (values) => {
-    const seen = new Set();
-    const out = [];
-    for (const value of values) {
-      if (!seen.has(value)) {
-        seen.add(value);
-        out.push(value);
-      }
-    }
-    return out;
-  };
-  const domainOf = (value) => {
-    try {
-      return new URL(value).hostname || "";
-    } catch {
-      return "";
-    }
-  };
-  const escapeHtml = (value) => String(value)
-    .replaceAll("&", "&amp;")
-    .replaceAll("<", "&lt;")
-    .replaceAll(">", "&gt;")
-    .replaceAll('"', "&quot;")
-    .replaceAll("'", "&#39;");
-  const seeds = dedupe(parseRows(seed_rows));
-  const domainSet = new Set(seeds.map(domainOf).filter(Boolean));
-  const chips = seeds.length
-    ? seeds.slice(0, 12).map((url) => `<span class=\"seed-chip\">${escapeHtml(url)}</span>`).join("")
-    : '<span class=\"seed-empty\">No seed URLs configured yet.</span>';
-  const overflow = seeds.length > 12
-    ? `<span class=\"seed-overflow\">+${seeds.length - 12} more</span>`
-    : "";
-  return `<div class=\"seed-widget\"><div class=\"seed-stats\"><span><strong>${seeds.length}</strong> seeds</span><span><strong>${domainSet.size}</strong> domains</span><span><strong>${seeds.slice(0, 1).join("").length || 0}</strong> first-url chars</span></div><div class=\"seed-chip-wrap\">${chips}${overflow}</div></div>`;
-}
-"""
-def utc_now_iso() -> str:
-    """Return the current UTC time as an ISO 8601 string, truncated to seconds."""
-    return datetime.now(timezone.utc).isoformat(timespec="seconds")
-def safe_queue_size(queue: Any) -> int:
-    """Return ``int(queue.qsize())``, or -1 if that call raises any Exception."""
-    try:
-        return int(queue.qsize())
-    except Exception:
-        # Best-effort metric: -1 marks "size unavailable" instead of failing the poll.
-        return -1
-def parse_seed_url_rows(rows: Any) -> list[str]:
-    """Extract one stripped, non-empty string per row of a seed table.
-
-    Each row may be a dict (first value is used), a list/tuple (first element
-    is used), or any other non-None value (stringified as-is). Rows that yield
-    an empty string are skipped. Duplicates are kept; order is preserved.
-
-    NOTE(review): ``if not rows`` raises ValueError for objects whose truth
-    value is ambiguous (e.g. a pandas DataFrame) — confirm what type the
-    Gradio Dataframe component passes to Python callbacks.
-    """
-    items: list[str] = []
-    if not rows:
-        return items
-    for row in rows:
-        value = ""
-        if isinstance(row, dict):
-            first_value = next(iter(row.values()), "")
-            # ``or ""`` turns falsy cells (None, 0, "") into an empty string.
-            value = str(first_value or "").strip()
-        elif isinstance(row, (list, tuple)):
-            first_value = row[0] if row else ""
-            value = str(first_value or "").strip()
-        elif row is not None:
-            value = str(row).strip()
-        if value:
-            items.append(value)
-    return items
-def unique_preserve_order(values: list[str]) -> list[str]:
-    """Return *values* with duplicates removed, keeping first occurrences in order."""
-    seen: set[str] = set()
-    out: list[str] = []
-    for value in values:
-        if value in seen:
-            continue
-        seen.add(value)
-        out.append(value)
-    return out
-def collect_seed_urls(seed_urls_table: Any) -> list[str]:
-    """Parse the seed table into strings and drop duplicates, preserving order."""
-    return unique_preserve_order(parse_seed_url_rows(seed_urls_table))
-def render_seed_widget_html(seed_urls_table: Any) -> str:
-    """Build the seed summary widget HTML on the Python side.
-
-    Shows the number of unique seeds, the number of distinct non-empty
-    hostnames (lower-cased, with leading/trailing dots stripped), the length
-    of the first seed, and up to 12 HTML-escaped URL chips followed by a
-    "+N more" marker when more than 12 seeds exist.
-
-    NOTE(review): ``urlsplit`` can raise ValueError on some malformed URLs
-    (e.g. unbalanced IPv6 brackets) and that is not caught here, whereas the
-    JavaScript counterpart swallows URL parse errors — confirm intended.
-    """
-    seeds = collect_seed_urls(seed_urls_table)
-    domains = {(urlsplit(u).hostname or "").lower().strip(".") for u in seeds}
-    # Drop the empty entry produced by seeds without a parsable hostname.
-    domains = {d for d in domains if d}
-    chips: list[str] = []
-    for url in seeds[:12]:
-        chips.append(f'<span class="seed-chip">{escape(url)}</span>')
-    chips_html = "".join(chips) if chips else '<span class="seed-empty">No seed URLs configured yet.</span>'
-    overflow_html = f'<span class="seed-overflow">+{len(seeds) - 12} more</span>' if len(seeds) > 12 else ""
-    return (
-        '<div class="seed-widget">'
-        '<div class="seed-stats">'
-        f"<span><strong>{len(seeds)}</strong> seeds</span>"
-        f"<span><strong>{len(domains)}</strong> domains</span>"
-        f"<span><strong>{len(seeds[0]) if seeds else 0}</strong> first-url chars</span>"
-        "</div>"
-        f'<div class="seed-chip-wrap">{chips_html}{overflow_html}</div>'
-        "</div>"
-    )
-def render_tokenization_widget_html(snapshot: dict[str, Any]) -> str:
-    """Render the tokenization summary widget from a stats snapshot.
-
-    Reads ``tokenized_shards``, ``tokenized_rows``, ``tokenized_tokens`` and
-    ``written_shards`` from *snapshot*; a missing or falsy value counts as 0,
-    so an empty dict renders all zeros.
-    """
-    tokenized_shards = int(snapshot.get("tokenized_shards", 0) or 0)
-    tokenized_rows = int(snapshot.get("tokenized_rows", 0) or 0)
-    tokenized_tokens = int(snapshot.get("tokenized_tokens", 0) or 0)
-    written_shards = int(snapshot.get("written_shards", 0) or 0)
-    return (
-        '<div class="token-widget">'
-        '<div class="token-stats">'
-        f"<span><strong>{tokenized_tokens}</strong> text tokens</span>"
-        f"<span><strong>{tokenized_rows}</strong> tokenized rows</span>"
-        f"<span><strong>{tokenized_shards}/{written_shards}</strong> tokenized shards</span>"
-        "</div>"
-        '<div class="token-note">Live shard tokenization uses tiktoken on the parquet <code>text</code> column.</div>'
-        "</div>"
-    )
-def render_qvp_widget_md(snapshot: dict[str, Any]) -> str:
-    """Render the "Live Metrics" Markdown block from a stats snapshot.
-
-    Queue is taken from ``fetch_queue``, Visited from ``fetch_succeeded`` and
-    Parsed from ``parsed_pages``; a missing or falsy value counts as 0.
-    """
-    queue_count = int(snapshot.get("fetch_queue", 0) or 0)
-    visited_count = int(snapshot.get("fetch_succeeded", 0) or 0)
-    parsed_count = int(snapshot.get("parsed_pages", 0) or 0)
-    return (
-        "### Live Metrics\n"
-        f"- Queue: `{queue_count}`\n"
-        f"- Visited: `{visited_count}`\n"
-        f"- Parsed: `{parsed_count}`"
-    )
-def validate_hf_requirements(enable_hf_upload: bool, hf_repo_id: str, hf_token: str) -> None:
-    """Check that upload settings are complete when upload is enabled.
-
-    Does nothing when *enable_hf_upload* is false.
-
-    Raises:
-        ValueError: if upload is enabled and the repo ID or the token is
-            empty after stripping whitespace.
-    """
-    if not enable_hf_upload:
-        return
-    if not hf_repo_id.strip():
-        raise ValueError("HF repo is required when upload is enabled.")
-    if not hf_token.strip():
-        raise ValueError("HF token is required when upload is enabled.")
-def build_crawler_config(
-    *,
-    seed_urls_table: Any,
-    max_links_per_page: int,
-    request_timeout_seconds: float,
-    max_response_bytes: int,
-    shard_size_rows: int,
-    enable_hf_upload: bool,
-    hf_repo_id: str,
-    hf_token: str,
-    hf_private_repo: bool,
-    hf_path_prefix: str,
-    total_workers: int,
-) -> CrawlerConfig:
-    """Validate the UI inputs and assemble a ``CrawlerConfig``.
-
-    All arguments are keyword-only. Numeric and boolean inputs are coerced
-    with ``int``/``float``/``bool``; string inputs are stripped. Seed URLs are
-    parsed and de-duplicated from *seed_urls_table*.
-
-    Raises:
-        ValueError: from ``validate_hf_requirements`` when upload is enabled
-            without a repo ID or token.
-    """
-    validate_hf_requirements(enable_hf_upload, hf_repo_id, hf_token)
-    seed_urls = collect_seed_urls(seed_urls_table)
-    return CrawlerConfig(
-        seed_urls=seed_urls,
-        max_links_per_page=int(max_links_per_page),
-        request_timeout_seconds=float(request_timeout_seconds),
-        max_response_bytes=int(max_response_bytes),
-        shard_size_rows=int(shard_size_rows),
-        # Output goes to a "shards" directory next to this source file.
-        output_dir=Path(__file__).resolve().parent / "shards",
-        enable_hf_upload=bool(enable_hf_upload),
-        hf_repo_id=hf_repo_id.strip(),
-        hf_token=hf_token.strip(),
-        hf_private_repo=bool(hf_private_repo),
-        # A blank prefix falls back to "crawl_shards".
-        hf_path_prefix=hf_path_prefix.strip() or "crawl_shards",
-        total_workers=int(total_workers),
-    )
-@dataclass
-class RunState:
-    """Bookkeeping for the current (or most recent) crawl run."""
-    # Counter incremented by CrawlerRunManager.start for every new run.
-    run_id: int = 0
-    # True from start() until the worker thread's cleanup for this run.
-    running: bool = False
-    # UTC ISO timestamps; empty string means "not set".
-    started_at: str = ""
-    finished_at: str = ""
-    # Set by CrawlerRunManager.stop once a stop has been requested.
-    stop_requested: bool = False
-    # Formatted traceback of the last crash, or empty string.
-    last_error: str = ""
-class CrawlerRunManager:
-    """Run one AsyncCrawler at a time on a background thread and expose its state.
-
-    Each run gets its own thread and its own asyncio event loop. Shared fields
-    are read and written while holding ``self._lock``.
-    """
-    def __init__(self) -> None:
-        """Create an idle manager with empty history and logs."""
-        self._lock = threading.Lock()
-        self._thread: threading.Thread | None = None
-        self._loop: asyncio.AbstractEventLoop | None = None
-        self._crawler: AsyncCrawler | None = None
-        self._state = RunState()
-        # Bounded buffers: oldest snapshots / log lines are dropped automatically.
-        self._history: deque[dict[str, Any]] = deque(maxlen=1200)
-        self._logs: deque[str] = deque(maxlen=600)
-        self._last_snapshot: dict[str, Any] | None = None
-    def start(self, config: CrawlerConfig) -> str:
-        """Start a new run with *config* unless one is still alive.
-
-        Resets run state, history, last snapshot and logs, then launches a
-        daemon thread executing ``_run_crawler``.
-
-        Returns:
-            A human-readable message: either a refusal because a crawl is
-            already running, or ``"Run #<id> started."``.
-        """
-        with self._lock:
-            if self._thread is not None and self._thread.is_alive():
-                return "A crawl is already running. Stop it before starting another one."
-            self._state.run_id += 1
-            self._state.running = True
-            self._state.started_at = utc_now_iso()
-            self._state.finished_at = ""
-            self._state.stop_requested = False
-            self._state.last_error = ""
-            self._history.clear()
-            self._last_snapshot = None
-            self._logs.clear()
-            run_id = self._state.run_id
-            self._logs.append(
-                f"[{utc_now_iso()}] Started run #{run_id} with {config.total_workers} workers "
-                f"({config.fetch_workers} fetch / {config.parser_workers} parser)."
-            )
-            self._thread = threading.Thread(
-                target=self._run_crawler,
-                args=(run_id, config),
-                daemon=True,
-                name=f"crawler-run-{run_id}",
-            )
-            self._thread.start()
-        return f"Run #{run_id} started."
-    def stop(self) -> str:
-        """Request that the active run stop.
-
-        Returns:
-            ``"No active crawl to stop."`` when no worker thread is alive,
-            otherwise a confirmation naming the run ID.
-        """
-        with self._lock:
-            if self._thread is None or not self._thread.is_alive():
-                return "No active crawl to stop."
-            self._state.stop_requested = True
-            # Capture references under the lock; the calls below run without it.
-            crawler = self._crawler
-            loop = self._loop
-            run_id = self._state.run_id
-            self._logs.append(f"[{utc_now_iso()}] Stop requested for run #{run_id}")
-        if crawler is not None and loop is not None and loop.is_running():
-            # Schedule the stop on the crawler's own loop from this thread.
-            loop.call_soon_threadsafe(crawler.request_stop, "user_requested_stop")
-        elif crawler is not None:
-            # Loop not running: call request_stop directly from this thread.
-            crawler.request_stop("user_requested_stop")
-        return f"Stop signal sent to run #{run_id}."
-    def _run_crawler(self, run_id: int, config: CrawlerConfig) -> None:
-        """Thread target: run one crawl to completion on a fresh event loop.
-
-        Publishes the crawler and loop on the manager (only while *run_id* is
-        still the current run), records a final snapshot on success, stores a
-        traceback on failure, and always clears the published references and
-        closes the loop.
-        """
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        try:
-            crawler = AsyncCrawler(config)
-            with self._lock:
-                if self._state.run_id == run_id:
-                    self._crawler = crawler
-                    self._loop = loop
-            loop.run_until_complete(crawler.run())
-            final_snapshot = self._snapshot_from_crawler(crawler)
-            with self._lock:
-                if self._state.run_id == run_id:
-                    self._last_snapshot = final_snapshot
-                    self._history.append(final_snapshot)
-                # NOTE(review): this log line is appended regardless of the run_id check.
-                self._logs.append(f"[{utc_now_iso()}] Run #{run_id} completed")
-        except Exception:
-            # Keep at most 20 stack entries of the traceback for display.
-            error_text = traceback.format_exc(limit=20)
-            with self._lock:
-                # NOTE(review): written without the run_id guard used elsewhere — confirm intended.
-                self._state.last_error = error_text
-                self._logs.append(f"[{utc_now_iso()}] Run #{run_id} crashed")
-        finally:
-            with self._lock:
-                if self._state.run_id == run_id:
-                    self._state.running = False
-                    self._state.finished_at = utc_now_iso()
-                    self._crawler = None
-                    self._loop = None
-            loop.close()
-            asyncio.set_event_loop(None)
-    def _snapshot_from_crawler(self, crawler: AsyncCrawler) -> dict[str, Any]:
-        """Copy the crawler's config, stats counters and queue sizes into a plain dict."""
-        stats = crawler.stats
-        return {
-            "timestamp": utc_now_iso(),
-            "workers_total": crawler.config.total_workers,
-            "workers_split": f"{crawler.config.fetch_workers}/{crawler.config.parser_workers}",
-            "stop_reason": crawler.stop_reason or "-",
-            "queued_urls": stats.queued_urls,
-            "fetch_reserved": stats.fetch_reserved,
-            "fetch_succeeded": stats.fetch_succeeded,
-            "fetch_failed": stats.fetch_failed,
-            "parsed_pages": stats.parsed_pages,
-            "parse_failed": stats.parse_failed,
-            "extracted_links": stats.extracted_links,
-            "dropped_urls": stats.dropped_urls,
-            "robots_blocked": stats.robots_blocked,
-            "stored_rows": stats.stored_rows,
-            "written_shards": stats.written_shards,
-            "uploaded_shards": stats.uploaded_shards,
-            "tokenized_shards": stats.tokenized_shards,
-            "tokenized_rows": stats.tokenized_rows,
-            "tokenized_tokens": stats.tokenized_tokens,
-            "active_fetchers": crawler.active_fetchers,
-            "active_parsers": crawler.active_parsers,
-            "fetch_queue": safe_queue_size(crawler.fetch_queue),
-            "parse_queue": safe_queue_size(crawler.parse_queue),
-            "record_queue": safe_queue_size(crawler.record_queue),
-            "stop_event": crawler.stop_event.is_set(),
-        }
-    def poll(self) -> tuple[str, dict[str, Any], list[list[Any]], str]:
-        """Collect the current dashboard data.
-
-        Returns:
-            A 4-tuple of: status Markdown, the latest snapshot dict (a zeroed
-            placeholder if none exists yet), history table rows (newest
-            first, built from at most the last 180 snapshots), and the log
-            lines joined with newlines.
-        """
-        with self._lock:
-            crawler = self._crawler
-            # Copy the state so it can be formatted after the lock is released.
-            state = RunState(
-                run_id=self._state.run_id,
-                running=self._state.running,
-                started_at=self._state.started_at,
-                finished_at=self._state.finished_at,
-                stop_requested=self._state.stop_requested,
-                last_error=self._state.last_error,
-            )
-        if crawler is not None:
-            # The snapshot is taken outside the lock, then stored under it.
-            snapshot = self._snapshot_from_crawler(crawler)
-            with self._lock:
-                self._last_snapshot = snapshot
-                # Timestamps have one-second resolution, so consecutive polls within
-                # the same second do not add a second history entry.
-                if not self._history or self._history[-1]["timestamp"] != snapshot["timestamp"]:
-                    self._history.append(snapshot)
-        with self._lock:
-            # Placeholder with the same keys as _snapshot_from_crawler.
-            latest = self._last_snapshot or {
-                "timestamp": utc_now_iso(),
-                "workers_total": 0,
-                "workers_split": "-",
-                "stop_reason": "-",
-                "queued_urls": 0,
-                "fetch_reserved": 0,
-                "fetch_succeeded": 0,
-                "fetch_failed": 0,
-                "parsed_pages": 0,
-                "parse_failed": 0,
-                "extracted_links": 0,
-                "dropped_urls": 0,
-                "robots_blocked": 0,
-                "stored_rows": 0,
-                "written_shards": 0,
-                "uploaded_shards": 0,
-                "tokenized_shards": 0,
-                "tokenized_rows": 0,
-                "tokenized_tokens": 0,
-                "active_fetchers": 0,
-                "active_parsers": 0,
-                "fetch_queue": 0,
-                "parse_queue": 0,
-                "record_queue": 0,
-                "stop_event": False,
-            }
-            history_copy = list(self._history)
-            logs_text = "\n".join(self._logs)
-        history_rows: list[list[Any]] = []
-        for item in reversed(history_copy[-180:]):
-            history_rows.append(
-                [
-                    item["timestamp"],
-                    item["workers_total"],
-                    item["workers_split"],
-                    item["fetch_reserved"],
-                    item["fetch_succeeded"],
-                    item["parsed_pages"],
-                    item["stored_rows"],
-                    item["written_shards"],
-                    item["uploaded_shards"],
-                    item["tokenized_tokens"],
-                    item["fetch_queue"],
-                    item["parse_queue"],
-                    item["record_queue"],
-                    item["stop_reason"],
-                ]
-            )
-        status_lines = [
-            "### Crawler Status",
-            f"- Run ID: `{state.run_id}`",
-            f"- Running: `{state.running}`",
-            f"- Stop requested: `{state.stop_requested}`",
-            f"- Started at (UTC): `{state.started_at or '-'}`",
-            f"- Finished at (UTC): `{state.finished_at or '-'}`",
-        ]
-        if state.last_error:
-            # Show the stored traceback inside a fenced text block.
-            status_lines.append("- Last error:")
-            status_lines.append("```text")
-            status_lines.append(state.last_error.strip())
-            status_lines.append("```")
-        return "\n".join(status_lines), latest, history_rows, logs_text
-# Module-level manager instance used by the start/stop/poll callbacks below.
-RUN_MANAGER = CrawlerRunManager()
-def _format_dashboard_response(
-    payload: tuple[str, dict[str, Any], list[list[Any]], str]
-) -> tuple[str, str, str, str]:
-    """Turn a ``poll()``-shaped payload into the four dashboard output values.
-
-    Returns, in order: status Markdown, live-metrics Markdown, log text and
-    tokenization widget HTML. The history rows in *payload* are discarded.
-    """
-    status, snapshot, history, logs = payload
-    del history
-    return (
-        status,
-        render_qvp_widget_md(snapshot),
-        logs,
-        render_tokenization_widget_html(snapshot),
-    )
-def _start_crawl(
-    *,
-    total_workers: int,
-    seed_urls_table: Any,
-    max_links_per_page: int,
-    request_timeout_seconds: float,
-    max_response_bytes: int,
-    shard_size_rows: int,
-    enable_hf_upload: bool,
-    hf_repo_id: str,
-    hf_token: str,
-    hf_private_repo: bool,
-    hf_path_prefix: str,
-) -> tuple[str, str, str, str]:
-    """Build a config from the UI inputs, start a run, and return fresh dashboard values.
-
-    The start message from the run manager is appended to the status Markdown.
-
-    Raises:
-        gr.Error: when building the config raises ValueError; the original
-            message is reused.
-    """
-    try:
-        config = build_crawler_config(
-            seed_urls_table=seed_urls_table,
-            max_links_per_page=max_links_per_page,
-            request_timeout_seconds=request_timeout_seconds,
-            max_response_bytes=max_response_bytes,
-            shard_size_rows=shard_size_rows,
-            enable_hf_upload=enable_hf_upload,
-            hf_repo_id=hf_repo_id,
-            hf_token=hf_token,
-            hf_private_repo=hf_private_repo,
-            hf_path_prefix=hf_path_prefix,
-            total_workers=total_workers,
-        )
-    except ValueError as exc:
-        raise gr.Error(str(exc)) from exc
-    message = RUN_MANAGER.start(config)
-    status, snapshot, history, logs = RUN_MANAGER.poll()
-    return _format_dashboard_response((f"{status}\n\n{message}", snapshot, history, logs))
-def start_crawl_standard(
-    seed_urls_table: Any,
-    max_links_per_page: int,
-    request_timeout_seconds: float,
-    max_response_bytes: int,
-    shard_size_rows: int,
-    enable_hf_upload: bool,
-    hf_repo_id: str,
-    hf_token: str,
-    hf_private_repo: bool,
-    hf_path_prefix: str,
-) -> tuple[str, str, str, str]:
-    """Click handler for the standard start button: start a run with NORMAL_TOTAL_WORKERS.
-
-    The positional parameter order matches the ``start_inputs`` list in build_ui.
-    """
-    return _start_crawl(
-        total_workers=NORMAL_TOTAL_WORKERS,
-        seed_urls_table=seed_urls_table,
-        max_links_per_page=max_links_per_page,
-        request_timeout_seconds=request_timeout_seconds,
-        max_response_bytes=max_response_bytes,
-        shard_size_rows=shard_size_rows,
-        enable_hf_upload=enable_hf_upload,
-        hf_repo_id=hf_repo_id,
-        hf_token=hf_token,
-        hf_private_repo=hf_private_repo,
-        hf_path_prefix=hf_path_prefix,
-    )
-def start_crawl_super(
-    seed_urls_table: Any,
-    max_links_per_page: int,
-    request_timeout_seconds: float,
-    max_response_bytes: int,
-    shard_size_rows: int,
-    enable_hf_upload: bool,
-    hf_repo_id: str,
-    hf_token: str,
-    hf_private_repo: bool,
-    hf_path_prefix: str,
-) -> tuple[str, str, str, str]:
-    """Click handler for the super-mode button: start a run with SUPER_TOTAL_WORKERS.
-
-    The positional parameter order matches the ``start_inputs`` list in build_ui.
-    """
-    return _start_crawl(
-        total_workers=SUPER_TOTAL_WORKERS,
-        seed_urls_table=seed_urls_table,
-        max_links_per_page=max_links_per_page,
-        request_timeout_seconds=request_timeout_seconds,
-        max_response_bytes=max_response_bytes,
-        shard_size_rows=shard_size_rows,
-        enable_hf_upload=enable_hf_upload,
-        hf_repo_id=hf_repo_id,
-        hf_token=hf_token,
-        hf_private_repo=hf_private_repo,
-        hf_path_prefix=hf_path_prefix,
-    )
-def stop_crawl() -> tuple[str, str, str, str]:
-    """Ask the run manager to stop, then return dashboard values with its message appended to the status."""
-    message = RUN_MANAGER.stop()
-    status, snapshot, history, logs = RUN_MANAGER.poll()
-    return _format_dashboard_response((f"{status}\n\n{message}", snapshot, history, logs))
-def poll_dashboard() -> tuple[str, str, str, str]:
-    """Return the current dashboard values (status, metrics, logs, token widget)."""
-    return _format_dashboard_response(RUN_MANAGER.poll())
-def toggle_hf_fields(enable_hf_upload: bool) -> tuple[Any, Any, Any, Any]:
-    """Return one visibility update, repeated for each of the four HF upload fields."""
-    update = gr.update(visible=enable_hf_upload)
-    return update, update, update, update
-def build_ui() -> gr.Blocks:
-    """Construct the Gradio Blocks dashboard and wire up all event handlers.
-
-    Returns:
-        The assembled ``gr.Blocks`` app (not yet launched).
-    """
-    # A throwaway config supplies the default seed list and slider defaults.
-    defaults = CrawlerConfig(
-        seed_urls=[
-            "https://en.wikipedia.org/wiki/Main_Page",
-            "https://docs.python.org/3/",
-            "https://developer.mozilla.org/en-US/",
-            "https://www.nasa.gov/",
-        ]
-    )
-    # One single-cell row per seed URL, matching the one-column table below.
-    default_seed_rows = [[url] for url in defaults.seed_urls]
-    with gr.Blocks(
-        title="DataMuncherLabs AutoWS",
-        css=APP_CSS,
-        theme=gr.themes.Default(primary_hue="green"),
-    ) as demo:
-        gr.Markdown("# DataMuncherLabs AutoWS")
-        gr.Markdown("Async web crawler dashboard with live parquet text tokenization.")
-        with gr.Row():
-            theme_name = gr.Dropdown(
-                choices=["red", "blue", "light", "dark", "green"],
-                value="dark",
-                label="Theme",
-                interactive=True,
-            )
-            # NOTE(review): thread counts are hard-coded in this text; presumably they
-            # match NORMAL_TOTAL_WORKERS / SUPER_TOTAL_WORKERS — confirm.
-            gr.Markdown(
-                "- Standard mode: **12 threads** (`10 fetch`, `2 parse`)\n"
-                "- Super mode: **24 threads** (`20 fetch`, `4 parse`)"
-            )
-        with gr.Row():
-            # Left column: editable seed list plus the two summary widgets.
-            with gr.Column(scale=2):
-                seed_urls_table = gr.Dataframe(
-                    headers=["seed_url"],
-                    datatype=["str"],
-                    row_count=(8, "dynamic"),
-                    value=default_seed_rows,
-                    interactive=True,
-                    label="Seed URL List (editable)",
-                )
-                seed_widget_html = gr.HTML(
-                    label="Seed URL Summary",
-                    value=render_seed_widget_html(default_seed_rows),
-                )
-                token_widget_html = gr.HTML(
-                    label="Live Tokenization",
-                    value=render_tokenization_widget_html({}),
-                )
-            # Right column: crawl tuning sliders.
-            with gr.Column(scale=1):
-                shard_size_rows = gr.Slider(
-                    label=f"Shard Size Rows (max {MAX_SHARD_ROWS})",
-                    minimum=100,
-                    maximum=MAX_SHARD_ROWS,
-                    step=100,
-                    # Clamp the default so it never exceeds the slider maximum.
-                    value=min(defaults.shard_size_rows, MAX_SHARD_ROWS),
-                )
-                max_links_per_page = gr.Slider(
-                    label="Max Links Per Page",
-                    minimum=10,
-                    maximum=1000,
-                    step=10,
-                    value=defaults.max_links_per_page,
-                )
-                request_timeout_seconds = gr.Slider(
-                    label="Request Timeout (seconds)",
-                    minimum=3,
-                    maximum=60,
-                    step=1,
-                    value=defaults.request_timeout_seconds,
-                )
-                max_response_bytes = gr.Slider(
-                    label="Max Response Bytes",
-                    minimum=500_000,
-                    maximum=8_000_000,
-                    step=100_000,
-                    value=defaults.max_response_bytes,
-                )
-        # Upload settings start hidden; the checkbox toggles them via toggle_hf_fields.
-        with gr.Accordion("Hugging Face Upload", open=False):
-            enable_hf_upload = gr.Checkbox(
-                label="Upload shards to my HF repo",
-                value=False,
-            )
-            hf_repo_id = gr.Textbox(
-                label="HF Repo ID",
-                placeholder="username/dataset-name",
-                visible=False,
-            )
-            hf_token = gr.Textbox(
-                label="HF Token (write permissions)",
-                type="password",
-                placeholder="hf_xxx",
-                visible=False,
-            )
-            hf_private_repo = gr.Checkbox(
-                label="Private HF Repo",
-                value=False,
-                visible=False,
-            )
-            hf_path_prefix = gr.Textbox(
-                label="HF Path Prefix",
-                value="crawl_shards",
-                visible=False,
-            )
-        with gr.Row():
-            start_button = gr.Button("Start Crawl (12 Threads)", variant="primary")
-            super_button = gr.Button("Super Mode (24 Threads)", variant="primary")
-            stop_button = gr.Button("Stop Crawl", variant="stop")
-            refresh_button = gr.Button("Refresh")
-        status_md = gr.Markdown("### Crawler Status\n- Run ID: `0`\n- Running: `False`")
-        qvp_md = gr.Markdown("### Live Metrics\n- Queue: `0`\n- Visited: `0`\n- Parsed: `0`")
-        logs_box = gr.Textbox(label="Run Log", lines=12, interactive=False)
-        # Order must match the positional parameters of start_crawl_standard/super.
-        start_inputs = [
-            seed_urls_table,
-            max_links_per_page,
-            request_timeout_seconds,
-            max_response_bytes,
-            shard_size_rows,
-            enable_hf_upload,
-            hf_repo_id,
-            hf_token,
-            hf_private_repo,
-            hf_path_prefix,
-        ]
-        # Order must match the 4-tuple returned by _format_dashboard_response.
-        outputs = [status_md, qvp_md, logs_box, token_widget_html]
-        start_button.click(start_crawl_standard, inputs=start_inputs, outputs=outputs)
-        super_button.click(start_crawl_super, inputs=start_inputs, outputs=outputs)
-        stop_button.click(stop_crawl, inputs=[], outputs=outputs)
-        refresh_button.click(poll_dashboard, inputs=[], outputs=outputs)
-        enable_hf_upload.change(
-            toggle_hf_fields,
-            inputs=enable_hf_upload,
-            outputs=[hf_repo_id, hf_token, hf_private_repo, hf_path_prefix],
-        )
-        # fn=None with js=...: the seed summary is re-rendered by SEED_WIDGET_JS only.
-        seed_urls_table.change(
-            fn=None,
-            inputs=[seed_urls_table],
-            outputs=[seed_widget_html],
-            js=SEED_WIDGET_JS,
-        )
-        theme_name.change(fn=None, inputs=theme_name, outputs=[], js=THEME_JS)
-        # On page load: apply the "dark" theme attribute, render the seed summary,
-        # and fetch the current dashboard values once.
-        demo.load(
-            fn=None,
-            inputs=[],
-            outputs=[],
-            js='() => { document.documentElement.setAttribute("data-crawler-theme", "dark"); }',
-        )
-        demo.load(
-            fn=None,
-            inputs=[seed_urls_table],
-            outputs=[seed_widget_html],
-            js=SEED_WIDGET_JS,
-        )
-        demo.load(fn=poll_dashboard, inputs=[], outputs=outputs)
-        # Periodic refresh: each timer tick re-polls the dashboard.
-        timer = gr.Timer(value=1.0)
-        timer.tick(fn=poll_dashboard, inputs=[], outputs=outputs)
-    return demo
-# The UI is built at import time and exposed as a module-level `demo`.
-demo = build_ui()
-def main() -> None:
-    """Enable request queuing (default concurrency limit 32) and launch the app."""
-    demo.queue(default_concurrency_limit=32).launch()
-if __name__ == "__main__":
-    main()