Roman190928 committed on
Commit
ddf0cf6
·
verified ·
1 Parent(s): f599975

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -997
app.py DELETED
@@ -1,997 +0,0 @@
1
- #!/usr/bin/env python3
2
- from __future__ import annotations
3
-
4
- import asyncio
5
- import re
6
- import threading
7
- import traceback
8
- from collections import deque
9
- from dataclasses import dataclass
10
- from datetime import datetime, timezone
11
- from html import escape
12
- from pathlib import Path
13
- from typing import Any
14
-
15
- import huggingface_hub as hf_hub
16
-
17
# Compatibility shim: newer huggingface_hub releases dropped the legacy
# ``HfFolder`` token helper. Presumably some import in this app's dependency
# chain still references ``hf_hub.HfFolder`` — TODO confirm which one — so a
# no-op stand-in is installed when the attribute is missing.
if not hasattr(hf_hub, "HfFolder"):
    class _CompatHfFolder:
        """Minimal no-op replacement for the removed ``HfFolder`` API."""

        @staticmethod
        def get_token() -> str | None:
            # No cached token; callers must fall back to env vars / explicit tokens.
            return None

        @staticmethod
        def save_token(token: str) -> None:
            # Intentionally discards the token instead of persisting it.
            del token
            return None

        @staticmethod
        def delete_token() -> None:
            return None

    hf_hub.HfFolder = _CompatHfFolder  # type: ignore[attr-defined]
33
-
34
- import gradio as gr
35
-
36
- from crawler import (
37
- MAX_SHARD_ROWS,
38
- NORMAL_TOTAL_WORKERS,
39
- SUPER_TOTAL_WORKERS,
40
- AsyncCrawler,
41
- CrawlerConfig,
42
- )
43
-
44
# Custom CSS injected into gr.Blocks(css=...). Theming works via CSS custom
# properties: the base ``:root`` block defines the default palette and each
# ``:root[data-crawler-theme="..."]`` block overrides it; THEME_JS switches
# themes by setting that attribute on <html>. The ``.seed-*`` classes style
# the HTML produced by render_seed_widget_html / SEED_WIDGET_JS.
APP_CSS = """
:root {
  --bg-main: #0a0d12;
  --bg-surface: #151a22;
  --bg-panel: #1b2230;
  --text-main: #f0f4fb;
  --text-muted: #9aa4b6;
  --accent: #3bd9ff;
  --accent-2: #4cffb1;
  --border: #2f3a50;
  --shadow: 0 18px 36px rgba(0, 0, 0, 0.45);
}

:root[data-crawler-theme="red"] {
  --bg-main: #17080c;
  --bg-surface: #250d15;
  --bg-panel: #341322;
  --text-main: #f8e8ee;
  --text-muted: #d5b0c0;
  --accent: #7a0018;
  --accent-2: #8e3ff5;
  --border: #5a2035;
}

:root[data-crawler-theme="blue"] {
  --bg-main: #021116;
  --bg-surface: #08222c;
  --bg-panel: #0e2f3b;
  --text-main: #eaffff;
  --text-muted: #8fbcc7;
  --accent: #2fff9d;
  --accent-2: #13e5ff;
  --border: #1e5662;
}

:root[data-crawler-theme="light"] {
  --bg-main: #f6f7f9;
  --bg-surface: #ffffff;
  --bg-panel: #eceff2;
  --text-main: #111317;
  --text-muted: #60666f;
  --accent: #2a2f37;
  --accent-2: #868b95;
  --border: #d0d4db;
  --shadow: 0 10px 25px rgba(35, 42, 52, 0.16);
}

:root[data-crawler-theme="dark"] {
  --bg-main: #090909;
  --bg-surface: #141414;
  --bg-panel: #1d1d1d;
  --text-main: #f0f0f0;
  --text-muted: #a8a8a8;
  --accent: #444444;
  --accent-2: #686868;
  --border: #2b2b2b;
}

:root[data-crawler-theme="green"] {
  --bg-main: #08110b;
  --bg-surface: #0f1d14;
  --bg-panel: #17301e;
  --text-main: #e8f8ed;
  --text-muted: #97bc9f;
  --accent: #2ea84b;
  --accent-2: #185f2a;
  --border: #2a5d36;
}

.gradio-container {
  background:
    radial-gradient(1200px 550px at 8% 0%, color-mix(in srgb, var(--accent) 18%, transparent), transparent),
    radial-gradient(900px 600px at 100% 0%, color-mix(in srgb, var(--accent-2) 14%, transparent), transparent),
    var(--bg-main);
  color: var(--text-main);
}

.gradio-container .block,
.gradio-container .form,
.gradio-container .gr-box,
.gradio-container .panel-wrap {
  background: color-mix(in srgb, var(--bg-surface) 92%, transparent) !important;
  border: 1px solid var(--border) !important;
  box-shadow: var(--shadow);
}

.gradio-container h1,
.gradio-container h2,
.gradio-container h3,
.gradio-container p,
.gradio-container label,
.gradio-container .prose,
.gradio-container .prose * {
  color: var(--text-main) !important;
}

.gradio-container input,
.gradio-container textarea,
.gradio-container select {
  background: var(--bg-panel) !important;
  color: var(--text-main) !important;
  border: 1px solid var(--border) !important;
}

.gradio-container button {
  border: 1px solid var(--border) !important;
}

.gradio-container button.primary {
  background: linear-gradient(135deg, var(--accent), var(--accent-2)) !important;
  color: #0b0e13 !important;
  font-weight: 700;
}

.seed-widget {
  display: flex;
  flex-direction: column;
  gap: 0.75rem;
  border: 1px solid var(--border);
  border-radius: 0.9rem;
  padding: 0.85rem;
  background: color-mix(in srgb, var(--bg-panel) 86%, transparent);
}

.seed-stats {
  display: grid;
  grid-template-columns: repeat(3, minmax(0, 1fr));
  gap: 0.6rem;
}

.seed-stats > span {
  display: block;
  padding: 0.55rem;
  border: 1px solid var(--border);
  border-radius: 0.6rem;
  background: color-mix(in srgb, var(--bg-surface) 90%, transparent);
  color: var(--text-main);
  font-size: 0.9rem;
}

.seed-chip-wrap {
  display: flex;
  flex-wrap: wrap;
  gap: 0.45rem;
}

.seed-chip {
  border: 1px solid var(--border);
  border-radius: 999px;
  padding: 0.24rem 0.7rem;
  color: var(--text-main);
  background: linear-gradient(
    145deg,
    color-mix(in srgb, var(--accent) 20%, transparent),
    color-mix(in srgb, var(--accent-2) 15%, transparent)
  );
  font-size: 0.83rem;
}

.seed-empty,
.seed-overflow {
  color: var(--text-muted);
  font-size: 0.83rem;
  padding: 0.24rem 0.3rem;
}
"""
210
-
211
# Client-side callback for the theme dropdown: stamps the chosen theme onto
# <html data-crawler-theme="..."> so the APP_CSS variable overrides apply.
# Returns [] because the Gradio event declares no outputs.
THEME_JS = """
(theme_name) => {
    const theme = theme_name || "dark";
    document.documentElement.setAttribute("data-crawler-theme", theme);
    return [];
}
"""
218
-
219
# Client-side mirror of render_seed_widget_html: recomputes the seed-URL
# summary widget on every textbox keystroke / table edit without a server
# round-trip. Keep its output format in sync with the Python renderer.
#
# BUG FIX: escapeHtml previously did `.replaceAll("&", "&")` — a no-op that
# left ampersands unescaped in the generated HTML. It must map "&" to
# "&amp;", and must do so FIRST so the entities produced by the later
# replacements are not double-escaped. This now matches Python's
# html.escape() used in render_seed_widget_html.
SEED_WIDGET_JS = """
(seed_text, seed_rows) => {
    const parseText = (raw) => {
        const value = typeof raw === "string" ? raw : "";
        return value.replaceAll(",", "\\n").split(/\\r?\\n/).map((item) => item.trim()).filter(Boolean);
    };
    const parseRows = (rows) => {
        if (!Array.isArray(rows)) return [];
        const out = [];
        for (const row of rows) {
            let value = "";
            if (Array.isArray(row)) {
                value = String(row[0] ?? "").trim();
            } else if (row && typeof row === "object") {
                value = String(Object.values(row)[0] ?? "").trim();
            } else if (row !== null && row !== undefined) {
                value = String(row).trim();
            }
            if (value) out.push(value);
        }
        return out;
    };
    const dedupe = (values) => {
        const seen = new Set();
        const out = [];
        for (const value of values) {
            if (!seen.has(value)) {
                seen.add(value);
                out.push(value);
            }
        }
        return out;
    };
    const escapeHtml = (value) => String(value)
        .replaceAll("&", "&amp;")
        .replaceAll("<", "&lt;")
        .replaceAll(">", "&gt;")
        .replaceAll('"', "&quot;")
        .replaceAll("'", "&#39;");

    const seeds = dedupe([...parseText(seed_text), ...parseRows(seed_rows)]);
    const canonical = seeds.join("\\n");
    const tokenEstimate = (canonical.match(/[A-Za-z0-9_]+|[^\\s]/g) || []).length;
    const chips = seeds.length
        ? seeds.slice(0, 12).map((url) => `<span class="seed-chip">${escapeHtml(url)}</span>`).join("")
        : '<span class="seed-empty">No seed URLs detected yet.</span>';
    const overflow = seeds.length > 12
        ? `<span class="seed-overflow">+${seeds.length - 12} more</span>`
        : "";

    return `<div class="seed-widget"><div class="seed-stats"><span><strong>${seeds.length}</strong> seeds</span><span><strong>${tokenEstimate}</strong> est. tokens</span><span><strong>${canonical.length}</strong> chars</span></div><div class="seed-chip-wrap">${chips}${overflow}</div></div>`;
}
"""
272
-
273
# Rough tokenizer for the "est. tokens" stat: runs of word characters, or any
# single non-space character. Mirrors the regex literal inside SEED_WIDGET_JS.
TOKEN_RE = re.compile(r"[A-Za-z0-9_]+|[^\s]")
274
-
275
-
276
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string (seconds precision)."""
    now = datetime.now(timezone.utc)
    return now.isoformat(timespec="seconds")
278
-
279
-
280
def parse_multiline_urls(raw: str) -> list[str]:
    """Split *raw* on newlines and commas; return stripped, non-empty entries."""
    # Commas are treated as line separators so "a, b" parses like "a\nb".
    normalized = raw.replace(",", "\n")
    return [entry for line in normalized.splitlines() if (entry := line.strip())]
287
-
288
-
289
def parse_seed_url_rows(rows: Any) -> list[str]:
    """Extract the first cell of each dataframe-style row as a stripped string.

    Accepts rows shaped as dicts (first value), lists/tuples (first element),
    or bare scalars; ``None`` rows and blank cells are skipped.
    """
    if not rows:
        return []

    collected: list[str] = []
    for row in rows:
        if isinstance(row, dict):
            cell = next(iter(row.values()), "")
            text = str(cell or "").strip()
        elif isinstance(row, (list, tuple)):
            cell = row[0] if row else ""
            text = str(cell or "").strip()
        elif row is None:
            # Explicitly skip null rows (the dataframe can emit them).
            continue
        else:
            text = str(row).strip()

        if text:
            collected.append(text)
    return collected
308
-
309
-
310
def unique_preserve_order(values: list[str]) -> list[str]:
    """Drop duplicates from *values* while keeping first-seen order."""
    # dicts preserve insertion order (Python 3.7+), so the keys form a
    # stable, order-preserving dedupe.
    return list(dict.fromkeys(values))
319
-
320
-
321
def collect_seed_urls(seed_urls_raw: str, seed_urls_table: Any) -> list[str]:
    """Merge textbox and table seed URLs into one deduplicated, ordered list."""
    from_textbox = parse_multiline_urls(seed_urls_raw)
    from_table = parse_seed_url_rows(seed_urls_table)
    return unique_preserve_order(from_textbox + from_table)
324
-
325
-
326
def estimate_token_count(text: str) -> int:
    """Rough token estimate: word runs plus individual non-space characters."""
    return sum(1 for _ in TOKEN_RE.finditer(text))
328
-
329
-
330
def render_seed_widget_html(seed_urls_raw: str, seed_urls_table: Any) -> str:
    """Server-side render of the seed-URL summary widget.

    Produces the same markup as SEED_WIDGET_JS so the initial value shown by
    gr.HTML matches what the client-side callback generates on later edits.
    Shows up to 12 URL chips plus an overflow counter.
    """
    seeds = collect_seed_urls(seed_urls_raw, seed_urls_table)
    canonical = "\n".join(seeds)
    token_estimate = estimate_token_count(canonical)

    # At most 12 chips are rendered; URLs are HTML-escaped before embedding.
    chips: list[str] = []
    for url in seeds[:12]:
        chips.append(f'<span class="seed-chip">{escape(url)}</span>')

    if not chips:
        chips_html = '<span class="seed-empty">No seed URLs detected yet.</span>'
    else:
        chips_html = "".join(chips)

    overflow_html = ""
    if len(seeds) > 12:
        overflow_html = f'<span class="seed-overflow">+{len(seeds) - 12} more</span>'

    return (
        '<div class="seed-widget">'
        '<div class="seed-stats">'
        f"<span><strong>{len(seeds)}</strong> seeds</span>"
        f"<span><strong>{token_estimate}</strong> est. tokens</span>"
        f"<span><strong>{len(canonical)}</strong> chars</span>"
        "</div>"
        f'<div class="seed-chip-wrap">{chips_html}{overflow_html}</div>'
        "</div>"
    )
358
-
359
-
360
def safe_queue_size(queue: Any) -> int:
    """Read ``queue.qsize()`` defensively; return -1 on any failure.

    qsize() can be unimplemented on some objects/platforms, so the sentinel
    -1 signals "unknown" to the dashboard rather than crashing the poll.
    """
    try:
        raw = queue.qsize()
        return int(raw)
    except Exception:
        return -1
365
-
366
-
367
def validate_hf_requirements(enable_hf_upload: bool, hf_repo_id: str, hf_token: str) -> None:
    """Raise ValueError unless HF upload is enabled with a non-blank repo and token.

    This app refuses to run without an upload target, so both the toggle and
    the credentials are hard requirements.
    """
    if not enable_hf_upload:
        raise ValueError(
            "HF upload must be enabled to operate. Toggle it on and provide repo/token."
        )

    repo_missing = not hf_repo_id.strip()
    token_missing = not hf_token.strip()
    if repo_missing or token_missing:
        raise ValueError("HF token and HF repo are required to operate.")
374
-
375
-
376
def build_crawler_config(
    *,
    seed_urls_raw: str,
    seed_urls_table: Any = None,
    allowed_domains_raw: str,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    output_dir_raw: str,
    enable_hf_upload: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
    total_workers: int,
) -> CrawlerConfig:
    """Translate raw UI field values into a validated CrawlerConfig.

    Raises:
        ValueError: if HF upload is disabled or the repo/token are blank
            (enforced first via validate_hf_requirements).
    """
    validate_hf_requirements(enable_hf_upload, hf_repo_id, hf_token)

    seed_urls = collect_seed_urls(seed_urls_raw, seed_urls_table)
    # Empty allowed-domains is valid; per the UI placeholder the crawler
    # presumably derives domains from the seeds then — confirm in crawler.py.
    allowed_domains = set(parse_multiline_urls(allowed_domains_raw))

    # Anchor relative output paths at this file's directory, not the CWD.
    output_dir = Path(output_dir_raw.strip()).expanduser()
    if not output_dir.is_absolute():
        output_dir = (Path(__file__).resolve().parent / output_dir).resolve()

    # Numeric fields are coerced because Gradio sliders may deliver floats.
    return CrawlerConfig(
        seed_urls=seed_urls,
        allowed_domains=allowed_domains,
        max_links_per_page=int(max_links_per_page),
        request_timeout_seconds=float(request_timeout_seconds),
        max_response_bytes=int(max_response_bytes),
        shard_size_rows=int(shard_size_rows),
        output_dir=output_dir,
        enable_hf_upload=bool(enable_hf_upload),
        hf_repo_id=hf_repo_id.strip(),
        hf_token=hf_token.strip(),
        hf_private_repo=bool(hf_private_repo),
        hf_path_prefix=hf_path_prefix.strip() or "crawl_shards",
        total_workers=int(total_workers),
    )
417
-
418
-
419
@dataclass
class RunState:
    """Mutable bookkeeping for the lifecycle of a single crawl run."""

    # Monotonically increasing run counter; bumped each time start() succeeds.
    run_id: int = 0
    # True while the background crawler thread is considered active.
    running: bool = False
    # ISO-8601 UTC timestamps; empty string when not yet set.
    started_at: str = ""
    finished_at: str = ""
    # Set when the user requested a stop; the run may still be winding down.
    stop_requested: bool = False
    # Formatted traceback of the last crash; empty when the run is healthy.
    last_error: str = ""
427
-
428
-
429
class CrawlerRunManager:
    """Owns the background crawler thread and exposes thread-safe controls.

    One crawl runs at a time. All mutable state (_state, _history, _logs,
    _crawler, _loop, _last_snapshot) is guarded by ``_lock``; the crawler
    itself runs on a dedicated daemon thread with its own asyncio loop.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._thread: threading.Thread | None = None
        # Event loop owned by the worker thread; only valid while running.
        self._loop: asyncio.AbstractEventLoop | None = None
        self._crawler: AsyncCrawler | None = None
        self._state = RunState()
        # Bounded buffers so a long-lived Space cannot grow without limit.
        self._history: deque[dict[str, Any]] = deque(maxlen=1200)
        self._logs: deque[str] = deque(maxlen=600)
        self._last_snapshot: dict[str, Any] | None = None

    def start(self, config: CrawlerConfig) -> str:
        """Launch a new crawl thread; refuse if one is already alive."""
        with self._lock:
            if self._thread is not None and self._thread.is_alive():
                return "A crawl is already running. Stop it before starting another one."

            # New run: bump the id and reset all per-run bookkeeping.
            self._state.run_id += 1
            self._state.running = True
            self._state.started_at = utc_now_iso()
            self._state.finished_at = ""
            self._state.stop_requested = False
            self._state.last_error = ""
            self._history.clear()
            self._last_snapshot = None
            self._logs.clear()

            run_id = self._state.run_id
            self._logs.append(
                f"[{utc_now_iso()}] Started run #{run_id} with {config.total_workers} workers "
                f"({config.fetch_workers} fetch / {config.parser_workers} parser)."
            )

            # Daemon thread so a hung crawl cannot block interpreter exit.
            self._thread = threading.Thread(
                target=self._run_crawler,
                args=(run_id, config),
                daemon=True,
                name=f"crawler-run-{run_id}",
            )
            self._thread.start()

        return f"Run #{run_id} started."

    def stop(self) -> str:
        """Request a graceful stop of the active run (no-op if none)."""
        with self._lock:
            if self._thread is None or not self._thread.is_alive():
                return "No active crawl to stop."

            self._state.stop_requested = True
            crawler = self._crawler
            loop = self._loop
            run_id = self._state.run_id
            self._logs.append(f"[{utc_now_iso()}] Stop requested for run #{run_id}")

            # Signal the crawler on its own loop thread when possible;
            # fall back to a direct call if the loop is not running yet.
            if crawler is not None and loop is not None and loop.is_running():
                loop.call_soon_threadsafe(crawler.request_stop, "user_requested_stop")
            elif crawler is not None:
                crawler.request_stop("user_requested_stop")

        return f"Stop signal sent to run #{run_id}."

    def _run_crawler(self, run_id: int, config: CrawlerConfig) -> None:
        """Thread target: run one crawl to completion on a private event loop.

        Every state mutation is guarded by ``run_id == self._state.run_id`` so
        a stale thread cannot clobber the bookkeeping of a newer run.
        """
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            crawler = AsyncCrawler(config)
            with self._lock:
                if self._state.run_id == run_id:
                    self._crawler = crawler
                    self._loop = loop

            loop.run_until_complete(crawler.run())
            # Capture the terminal metrics before tearing anything down.
            final_snapshot = self._snapshot_from_crawler(crawler)
            with self._lock:
                if self._state.run_id == run_id:
                    self._last_snapshot = final_snapshot
                    self._history.append(final_snapshot)
                    self._logs.append(f"[{utc_now_iso()}] Run #{run_id} completed")
        except Exception:
            error_text = traceback.format_exc(limit=20)
            with self._lock:
                self._state.last_error = error_text
                self._logs.append(f"[{utc_now_iso()}] Run #{run_id} crashed")
        finally:
            with self._lock:
                if self._state.run_id == run_id:
                    self._state.running = False
                    self._state.finished_at = utc_now_iso()
                    self._crawler = None
                    self._loop = None
            loop.close()
            asyncio.set_event_loop(None)

    def _snapshot_from_crawler(self, crawler: AsyncCrawler) -> dict[str, Any]:
        """Flatten live crawler counters into a plain dict for the dashboard.

        NOTE(review): reads AsyncCrawler attributes (stats, queues, events)
        without the crawler's own synchronization — assumed safe for
        monitoring reads; confirm against crawler.py.
        """
        stats = crawler.stats
        return {
            "timestamp": utc_now_iso(),
            "workers_total": crawler.config.total_workers,
            "workers_split": f"{crawler.config.fetch_workers}/{crawler.config.parser_workers}",
            "stop_reason": crawler.stop_reason or "-",
            "queued_urls": stats.queued_urls,
            "fetch_reserved": stats.fetch_reserved,
            "fetch_succeeded": stats.fetch_succeeded,
            "fetch_failed": stats.fetch_failed,
            "parsed_pages": stats.parsed_pages,
            "parse_failed": stats.parse_failed,
            "extracted_links": stats.extracted_links,
            "dropped_urls": stats.dropped_urls,
            "robots_blocked": stats.robots_blocked,
            "stored_rows": stats.stored_rows,
            "written_shards": stats.written_shards,
            "uploaded_shards": stats.uploaded_shards,
            "active_fetchers": crawler.active_fetchers,
            "active_parsers": crawler.active_parsers,
            "fetch_queue": safe_queue_size(crawler.fetch_queue),
            "parse_queue": safe_queue_size(crawler.parse_queue),
            "record_queue": safe_queue_size(crawler.record_queue),
            "stop_event": crawler.stop_event.is_set(),
        }

    def poll(self) -> tuple[str, dict[str, Any], list[list[Any]], str]:
        """Return (status markdown, latest snapshot, history rows, log text).

        The lock is taken in three short critical sections rather than held
        across the snapshot call, so polling never blocks the crawler thread
        for long.
        """
        with self._lock:
            crawler = self._crawler
            # Copy the state so formatting below happens outside the lock.
            state = RunState(
                run_id=self._state.run_id,
                running=self._state.running,
                started_at=self._state.started_at,
                finished_at=self._state.finished_at,
                stop_requested=self._state.stop_requested,
                last_error=self._state.last_error,
            )

        if crawler is not None:
            snapshot = self._snapshot_from_crawler(crawler)
            with self._lock:
                self._last_snapshot = snapshot
                # Avoid duplicate rows when polled more than once per second.
                if not self._history or self._history[-1]["timestamp"] != snapshot["timestamp"]:
                    self._history.append(snapshot)

        with self._lock:
            # Fall back to an all-zero snapshot before the first run.
            latest = self._last_snapshot or {
                "timestamp": utc_now_iso(),
                "workers_total": 0,
                "workers_split": "-",
                "stop_reason": "-",
                "queued_urls": 0,
                "fetch_reserved": 0,
                "fetch_succeeded": 0,
                "fetch_failed": 0,
                "parsed_pages": 0,
                "parse_failed": 0,
                "extracted_links": 0,
                "dropped_urls": 0,
                "robots_blocked": 0,
                "stored_rows": 0,
                "written_shards": 0,
                "uploaded_shards": 0,
                "active_fetchers": 0,
                "active_parsers": 0,
                "fetch_queue": 0,
                "parse_queue": 0,
                "record_queue": 0,
                "stop_event": False,
            }
            history_copy = list(self._history)
            logs_text = "\n".join(self._logs)

        # Show the newest 180 snapshots, latest first, as table rows.
        history_rows: list[list[Any]] = []
        for item in reversed(history_copy[-180:]):
            history_rows.append(
                [
                    item["timestamp"],
                    item["workers_total"],
                    item["workers_split"],
                    item["fetch_reserved"],
                    item["fetch_succeeded"],
                    item["parsed_pages"],
                    item["robots_blocked"],
                    item["stored_rows"],
                    item["written_shards"],
                    item["uploaded_shards"],
                    item["fetch_queue"],
                    item["parse_queue"],
                    item["record_queue"],
                    item["stop_reason"],
                ]
            )

        status_lines = [
            "### Crawler Status",
            f"- Run ID: `{state.run_id}`",
            f"- Running: `{state.running}`",
            f"- Stop requested: `{state.stop_requested}`",
            f"- Started at (UTC): `{state.started_at or '-'}`",
            f"- Finished at (UTC): `{state.finished_at or '-'}`",
        ]
        if state.last_error:
            status_lines.append("- Last error:")
            status_lines.append("```text")
            status_lines.append(state.last_error.strip())
            status_lines.append("```")

        return "\n".join(status_lines), latest, history_rows, logs_text
631
-
632
-
633
# Single module-level manager shared by every Gradio callback in this app.
RUN_MANAGER = CrawlerRunManager()
634
-
635
-
636
def _start_crawl(
    *,
    total_workers: int,
    seed_urls_raw: str,
    seed_urls_table: Any = None,
    allowed_domains_raw: str,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    output_dir_raw: str,
    enable_hf_upload: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, dict[str, Any], list[list[Any]], str]:
    """Shared start handler: build the config, launch the run, return a poll.

    Validation failures are surfaced as gr.Error so Gradio shows them as a
    toast instead of a traceback.
    """
    try:
        config = build_crawler_config(
            seed_urls_raw=seed_urls_raw,
            seed_urls_table=seed_urls_table,
            allowed_domains_raw=allowed_domains_raw,
            max_links_per_page=max_links_per_page,
            request_timeout_seconds=request_timeout_seconds,
            max_response_bytes=max_response_bytes,
            shard_size_rows=shard_size_rows,
            output_dir_raw=output_dir_raw,
            enable_hf_upload=enable_hf_upload,
            hf_repo_id=hf_repo_id,
            hf_token=hf_token,
            hf_private_repo=hf_private_repo,
            hf_path_prefix=hf_path_prefix,
            total_workers=total_workers,
        )
    except ValueError as exc:
        raise gr.Error(str(exc)) from exc

    # start() returns a human-readable outcome; append it to the status block.
    message = RUN_MANAGER.start(config)
    status, snapshot, history, logs = RUN_MANAGER.poll()
    return f"{status}\n\n{message}", snapshot, history, logs
676
-
677
-
678
def start_crawl_standard(
    seed_urls_raw: str,
    seed_urls_table: Any,
    allowed_domains_raw: str,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    output_dir_raw: str,
    enable_hf_upload: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, dict[str, Any], list[list[Any]], str]:
    """Gradio callback for the normal start button (NORMAL_TOTAL_WORKERS)."""
    return _start_crawl(
        total_workers=NORMAL_TOTAL_WORKERS,
        seed_urls_raw=seed_urls_raw,
        seed_urls_table=seed_urls_table,
        allowed_domains_raw=allowed_domains_raw,
        max_links_per_page=max_links_per_page,
        request_timeout_seconds=request_timeout_seconds,
        max_response_bytes=max_response_bytes,
        shard_size_rows=shard_size_rows,
        output_dir_raw=output_dir_raw,
        enable_hf_upload=enable_hf_upload,
        hf_repo_id=hf_repo_id,
        hf_token=hf_token,
        hf_private_repo=hf_private_repo,
        hf_path_prefix=hf_path_prefix,
    )
709
-
710
-
711
def start_crawl_super(
    seed_urls_raw: str,
    seed_urls_table: Any,
    allowed_domains_raw: str,
    max_links_per_page: int,
    request_timeout_seconds: float,
    max_response_bytes: int,
    shard_size_rows: int,
    output_dir_raw: str,
    enable_hf_upload: bool,
    hf_repo_id: str,
    hf_token: str,
    hf_private_repo: bool,
    hf_path_prefix: str,
) -> tuple[str, dict[str, Any], list[list[Any]], str]:
    """Gradio callback for the "Super Mode" button (SUPER_TOTAL_WORKERS)."""
    return _start_crawl(
        total_workers=SUPER_TOTAL_WORKERS,
        seed_urls_raw=seed_urls_raw,
        seed_urls_table=seed_urls_table,
        allowed_domains_raw=allowed_domains_raw,
        max_links_per_page=max_links_per_page,
        request_timeout_seconds=request_timeout_seconds,
        max_response_bytes=max_response_bytes,
        shard_size_rows=shard_size_rows,
        output_dir_raw=output_dir_raw,
        enable_hf_upload=enable_hf_upload,
        hf_repo_id=hf_repo_id,
        hf_token=hf_token,
        hf_private_repo=hf_private_repo,
        hf_path_prefix=hf_path_prefix,
    )
742
-
743
-
744
def stop_crawl() -> tuple[str, dict[str, Any], list[list[Any]], str]:
    """Gradio callback: request a stop, then return a fresh dashboard poll."""
    message = RUN_MANAGER.stop()
    status, snapshot, history, logs = RUN_MANAGER.poll()
    return f"{status}\n\n{message}", snapshot, history, logs
748
-
749
-
750
def poll_dashboard() -> tuple[str, dict[str, Any], list[list[Any]], str]:
    """Gradio callback: (status markdown, latest snapshot, history rows, logs)."""
    return RUN_MANAGER.poll()
752
-
753
-
754
def toggle_hf_fields(enable_hf_upload: bool) -> tuple[Any, Any, Any, Any]:
    """Show or hide the four HF-upload inputs based on the checkbox state."""
    visibility = gr.update(visible=enable_hf_upload)
    # Same update applies to repo id, token, private-repo, and path prefix.
    return (visibility,) * 4
757
-
758
-
759
def build_ui() -> gr.Blocks:
    """Assemble the full Gradio Blocks app: inputs, dashboard, and wiring.

    Layout: theme picker + intro, seed/config inputs, the (required) HF
    upload accordion, control buttons, then the live dashboard (status,
    snapshot JSON, history table, log box) refreshed by a 1-second timer.
    """
    # Default CrawlerConfig supplies initial values for every input widget.
    defaults = CrawlerConfig(
        seed_urls=[
            "https://en.wikipedia.org/wiki/Main_Page",
            "https://docs.python.org/3/",
            "https://developer.mozilla.org/en-US/",
            "https://www.nasa.gov/",
        ]
    )
    default_seed_rows = [[url] for url in defaults.seed_urls]

    with gr.Blocks(
        title="HF DB Continuous Async Crawler",
        css=APP_CSS,
        theme=gr.themes.Default(primary_hue="green"),
    ) as demo:
        gr.Markdown("# HF DB Continuous Async Crawler")
        gr.Markdown(
            "Polite async crawler with robots.txt compliance, per-domain delay, shard uploads, and strict shard limits."
        )

        with gr.Row():
            theme_name = gr.Dropdown(
                choices=["red", "blue", "light", "dark", "green"],
                value="dark",
                label="Theme",
                interactive=True,
            )
            gr.Markdown(
                "- Normal mode uses **12 threads** (`10 fetch`, `2 parse`).\n"
                "- Super mode uses **24 threads** (`20 fetch`, `4 parse`)."
            )

        with gr.Row():
            with gr.Column(scale=2):
                # Seeds can come from free text AND the editable table below;
                # both are merged + deduplicated at start time.
                seed_urls_raw = gr.Textbox(
                    label="Seed URLs (one per line)",
                    lines=8,
                    value="\n".join(defaults.seed_urls),
                )
                seed_urls_table = gr.Dataframe(
                    headers=["seed_url"],
                    datatype=["str"],
                    row_count=(6, "dynamic"),
                    value=default_seed_rows,
                    interactive=True,
                    label="Seed URL List (editable rows)",
                )
                gr.Markdown(
                    "Seed URLs are merged from textbox + table, deduplicated, then used for crawl start."
                )
                # Initial value rendered server-side; later edits re-render
                # client-side via SEED_WIDGET_JS.
                seed_widget_html = gr.HTML(
                    label="Seed URL Live Summary",
                    value=render_seed_widget_html("\n".join(defaults.seed_urls), default_seed_rows),
                )
                allowed_domains_raw = gr.Textbox(
                    label="Allowed Domains (optional, one per line)",
                    lines=4,
                    placeholder="Leave empty to derive from seed URLs",
                )
                output_dir_raw = gr.Textbox(
                    label="Output Directory",
                    value=str(defaults.output_dir),
                )
            with gr.Column(scale=1):
                shard_size_rows = gr.Slider(
                    label=f"Shard Size Rows (max {MAX_SHARD_ROWS})",
                    minimum=100,
                    maximum=MAX_SHARD_ROWS,
                    step=100,
                    value=min(defaults.shard_size_rows, MAX_SHARD_ROWS),
                )
                max_links_per_page = gr.Slider(
                    label="Max Links Per Page",
                    minimum=10,
                    maximum=1000,
                    step=10,
                    value=defaults.max_links_per_page,
                )
                request_timeout_seconds = gr.Slider(
                    label="Request Timeout (seconds)",
                    minimum=3,
                    maximum=60,
                    step=1,
                    value=defaults.request_timeout_seconds,
                )
                max_response_bytes = gr.Slider(
                    label="Max Response Bytes",
                    minimum=500_000,
                    maximum=8_000_000,
                    step=100_000,
                    value=defaults.max_response_bytes,
                )

        # HF credentials are mandatory to run; the fields start hidden and
        # are revealed by the checkbox (see toggle_hf_fields wiring below).
        with gr.Accordion("Hugging Face Upload (Required)", open=True):
            enable_hf_upload = gr.Checkbox(
                label="Enable HF upload with my token (required to run)",
                value=False,
            )
            hf_repo_id = gr.Textbox(
                label="HF Repo ID",
                placeholder="username/dataset-name",
                visible=False,
            )
            hf_token = gr.Textbox(
                label="HF Token (write permissions required)",
                type="password",
                placeholder="hf_xxx",
                visible=False,
            )
            hf_private_repo = gr.Checkbox(
                label="Private HF Repo",
                value=False,
                visible=False,
            )
            hf_path_prefix = gr.Textbox(
                label="HF Path Prefix",
                value="crawl_shards",
                visible=False,
            )

        with gr.Row():
            start_button = gr.Button("Start Crawl (12 Threads)", variant="primary")
            super_button = gr.Button("Super Mode (24 Threads)", variant="primary")
            stop_button = gr.Button("Stop Crawl", variant="stop")
            refresh_button = gr.Button("Refresh")

        # Dashboard outputs, all refreshed together by poll_dashboard().
        status_md = gr.Markdown("### Crawler Status\n- Run ID: `0`\n- Running: `False`")
        latest_snapshot = gr.JSON(label="Latest Snapshot")
        history_table = gr.Dataframe(
            headers=[
                "timestamp",
                "workers_total",
                "workers_split",
                "fetch_reserved",
                "fetch_succeeded",
                "parsed_pages",
                "robots_blocked",
                "stored_rows",
                "written_shards",
                "uploaded_shards",
                "fetch_queue",
                "parse_queue",
                "record_queue",
                "stop_reason",
            ],
            datatype=[
                "str",
                "number",
                "str",
                "number",
                "number",
                "number",
                "number",
                "number",
                "number",
                "number",
                "number",
                "number",
                "number",
                "str",
            ],
            row_count=(14, "dynamic"),
            interactive=False,
            label="Recent Metrics (latest first)",
        )
        logs_box = gr.Textbox(label="Run Log", lines=12, interactive=False)

        # Input order must match the start_crawl_* callback signatures.
        start_inputs = [
            seed_urls_raw,
            seed_urls_table,
            allowed_domains_raw,
            max_links_per_page,
            request_timeout_seconds,
            max_response_bytes,
            shard_size_rows,
            output_dir_raw,
            enable_hf_upload,
            hf_repo_id,
            hf_token,
            hf_private_repo,
            hf_path_prefix,
        ]
        outputs = [status_md, latest_snapshot, history_table, logs_box]

        start_button.click(start_crawl_standard, inputs=start_inputs, outputs=outputs)
        super_button.click(start_crawl_super, inputs=start_inputs, outputs=outputs)
        stop_button.click(stop_crawl, inputs=[], outputs=outputs)
        refresh_button.click(poll_dashboard, inputs=[], outputs=outputs)

        enable_hf_upload.change(
            toggle_hf_fields,
            inputs=enable_hf_upload,
            outputs=[hf_repo_id, hf_token, hf_private_repo, hf_path_prefix],
        )

        # fn=None + js=... runs entirely client-side (no server round-trip).
        seed_urls_raw.input(
            fn=None,
            inputs=[seed_urls_raw, seed_urls_table],
            outputs=[seed_widget_html],
            js=SEED_WIDGET_JS,
        )
        seed_urls_table.change(
            fn=None,
            inputs=[seed_urls_raw, seed_urls_table],
            outputs=[seed_widget_html],
            js=SEED_WIDGET_JS,
        )

        theme_name.change(fn=None, inputs=theme_name, outputs=[], js=THEME_JS)
        # On page load: apply the default theme, render the seed widget, and
        # do one server-side dashboard poll.
        demo.load(
            fn=None,
            inputs=[],
            outputs=[],
            js='() => { document.documentElement.setAttribute("data-crawler-theme", "dark"); }',
        )
        demo.load(
            fn=None,
            inputs=[seed_urls_raw, seed_urls_table],
            outputs=[seed_widget_html],
            js=SEED_WIDGET_JS,
        )
        demo.load(fn=poll_dashboard, inputs=[], outputs=outputs)

        # Live dashboard refresh once per second.
        timer = gr.Timer(value=1.0)
        timer.tick(fn=poll_dashboard, inputs=[], outputs=outputs)

    return demo
987
-
988
-
989
# Built at import time so hosting environments that look for a module-level
# `demo` (e.g. Hugging Face Spaces / `gradio app.py`) can find it.
demo = build_ui()


def main() -> None:
    """Launch the app with a request queue sized for many concurrent pollers."""
    demo.queue(default_concurrency_limit=32).launch()


if __name__ == "__main__":
    main()