Roman190928 committed
Commit f55f92e · verified · 1 Parent(s): 261efcd

Upload AutoWS app files without plan/readme

README.md DELETED
@@ -1,15 +0,0 @@
- ---
- license: cc-by-nc-nd-4.0
- title: AutoW
- sdk: gradio
- emoji: 📚
- colorFrom: indigo
- colorTo: red
- short_description: Automated Web Scraping!
- sdk_version: 6.5.1
- ---
- # This is the only official space for this project.
- # Do not trust mirrors/forks. This is only native to HuggingFace.
- # This does not log or steal your HF token. I cannot guarantee that forks or mirrors do not.
- # To combat this, I'm using the most restrictive license I know.
- # Licensed under Creative Commons Attribution Non Commercial No Derivatives 4.0
app.py ADDED
@@ -0,0 +1,924 @@
+ #!/usr/bin/env python3
+ from __future__ import annotations
+
+ import asyncio
+ import contextlib
+ import inspect
+ import threading
+ import traceback
+ from dataclasses import dataclass
+ from datetime import datetime, timezone
+ from html import escape
+ from pathlib import Path
+ from typing import Any
+ from urllib.parse import urlsplit
+
+ import huggingface_hub as hf_hub
+
+ if not hasattr(hf_hub, "HfFolder"):
+     class _CompatHfFolder:
+         @staticmethod
+         def get_token() -> str | None:
+             return None
+
+         @staticmethod
+         def save_token(token: str) -> None:
+             del token
+             return None
+
+         @staticmethod
+         def delete_token() -> None:
+             return None
+
+     hf_hub.HfFolder = _CompatHfFolder  # type: ignore[attr-defined]
+
+ import gradio as gr
+
+ from crawler import (
+     MAX_SHARD_ROWS,
+     NORMAL_TOTAL_WORKERS,
+     SUPER_TOTAL_WORKERS,
+     AsyncCrawler,
+     CrawlerConfig,
+ )
+
+ APP_CSS = """
+ :root {
+   --bg-main: #0a0d12;
+   --bg-surface: #151a22;
+   --bg-panel: #1b2230;
+   --text-main: #f0f4fb;
+   --text-muted: #9aa4b6;
+   --accent: #3bd9ff;
+   --accent-2: #4cffb1;
+   --border: #2f3a50;
+   --shadow: 0 18px 36px rgba(0, 0, 0, 0.45);
+ }
+
+ :root[data-crawler-theme="red"] {
+   --bg-main: #17080c;
+   --bg-surface: #250d15;
+   --bg-panel: #341322;
+   --text-main: #f8e8ee;
+   --text-muted: #d5b0c0;
+   --accent: #7a0018;
+   --accent-2: #8e3ff5;
+   --border: #5a2035;
+ }
+
+ :root[data-crawler-theme="blue"] {
+   --bg-main: #021116;
+   --bg-surface: #08222c;
+   --bg-panel: #0e2f3b;
+   --text-main: #eaffff;
+   --text-muted: #8fbcc7;
+   --accent: #2fff9d;
+   --accent-2: #13e5ff;
+   --border: #1e5662;
+ }
+
+ :root[data-crawler-theme="light"] {
+   --bg-main: #f6f7f9;
+   --bg-surface: #ffffff;
+   --bg-panel: #eceff2;
+   --text-main: #111317;
+   --text-muted: #60666f;
+   --accent: #2a2f37;
+   --accent-2: #868b95;
+   --border: #d0d4db;
+   --shadow: 0 10px 25px rgba(35, 42, 52, 0.16);
+ }
+
+ :root[data-crawler-theme="dark"] {
+   --bg-main: #090909;
+   --bg-surface: #141414;
+   --bg-panel: #1d1d1d;
+   --text-main: #f0f0f0;
+   --text-muted: #a8a8a8;
+   --accent: #444444;
+   --accent-2: #686868;
+   --border: #2b2b2b;
+ }
+
+ :root[data-crawler-theme="green"] {
+   --bg-main: #08110b;
+   --bg-surface: #0f1d14;
+   --bg-panel: #17301e;
+   --text-main: #e8f8ed;
+   --text-muted: #97bc9f;
+   --accent: #2ea84b;
+   --accent-2: #185f2a;
+   --border: #2a5d36;
+ }
+
+ .gradio-container {
+   background:
+     radial-gradient(1200px 550px at 8% 0%, color-mix(in srgb, var(--accent) 18%, transparent), transparent),
+     radial-gradient(900px 600px at 100% 0%, color-mix(in srgb, var(--accent-2) 14%, transparent), transparent),
+     var(--bg-main);
+   color: var(--text-main);
+ }
+
+ .gradio-container .block,
+ .gradio-container .form,
+ .gradio-container .gr-box,
+ .gradio-container .panel-wrap {
+   background: color-mix(in srgb, var(--bg-surface) 92%, transparent) !important;
+   border: 1px solid var(--border) !important;
+   box-shadow: var(--shadow);
+ }
+
+ .gradio-container h1,
+ .gradio-container h2,
+ .gradio-container h3,
+ .gradio-container p,
+ .gradio-container label,
+ .gradio-container .prose,
+ .gradio-container .prose * {
+   color: var(--text-main) !important;
+ }
+
+ .gradio-container input,
+ .gradio-container textarea,
+ .gradio-container select {
+   background: var(--bg-panel) !important;
+   color: var(--text-main) !important;
+   border: 1px solid var(--border) !important;
+ }
+
+ .gradio-container button {
+   border: 1px solid var(--border) !important;
+ }
+
+ .gradio-container button.primary {
+   background: linear-gradient(135deg, var(--accent), var(--accent-2)) !important;
+   color: #0b0e13 !important;
+   font-weight: 700;
+ }
+
+ .seed-widget,
+ .token-widget {
+   display: flex;
+   flex-direction: column;
+   gap: 0.75rem;
+   border: 1px solid var(--border);
+   border-radius: 0.9rem;
+   padding: 0.85rem;
+   background: color-mix(in srgb, var(--bg-panel) 86%, transparent);
+ }
+
+ .seed-stats,
+ .token-stats {
+   display: grid;
+   grid-template-columns: repeat(3, minmax(0, 1fr));
+   gap: 0.6rem;
+ }
+
+ .seed-stats > span,
+ .token-stats > span {
+   display: block;
+   padding: 0.55rem;
+   border: 1px solid var(--border);
+   border-radius: 0.6rem;
+   background: color-mix(in srgb, var(--bg-surface) 90%, transparent);
+   color: var(--text-main);
+   font-size: 0.9rem;
+ }
+
+ .seed-chip-wrap {
+   display: flex;
+   flex-wrap: wrap;
+   gap: 0.45rem;
+ }
+
+ .seed-chip {
+   border: 1px solid var(--border);
+   border-radius: 999px;
+   padding: 0.24rem 0.7rem;
+   color: var(--text-main);
+   background: linear-gradient(
+     145deg,
+     color-mix(in srgb, var(--accent) 20%, transparent),
+     color-mix(in srgb, var(--accent-2) 15%, transparent)
+   );
+   font-size: 0.83rem;
+ }
+
+ .seed-empty,
+ .seed-overflow,
+ .token-note {
+   color: var(--text-muted);
+   font-size: 0.83rem;
+   padding: 0.24rem 0.3rem;
+ }
+ """
+
+ THEME_JS = """
+ (theme_name) => {
+   const theme = theme_name || "dark";
+   document.documentElement.setAttribute("data-crawler-theme", theme);
+   return [];
+ }
+ """
+
+ SEED_WIDGET_JS = """
+ (seed_rows) => {
+   const parseRows = (rows) => {
+     if (!Array.isArray(rows)) return [];
+     const out = [];
+     for (const row of rows) {
+       let value = "";
+       if (Array.isArray(row)) {
+         value = String(row[0] ?? "").trim();
+       } else if (row && typeof row === "object") {
+         value = String(Object.values(row)[0] ?? "").trim();
+       } else if (row !== null && row !== undefined) {
+         value = String(row).trim();
+       }
+       if (value) out.push(value);
+     }
+     return out;
+   };
+
+   const dedupe = (values) => {
+     const seen = new Set();
+     const out = [];
+     for (const value of values) {
+       if (!seen.has(value)) {
+         seen.add(value);
+         out.push(value);
+       }
+     }
+     return out;
+   };
+
+   const domainOf = (value) => {
+     try {
+       return new URL(value).hostname || "";
+     } catch {
+       return "";
+     }
+   };
+
+   const escapeHtml = (value) => String(value)
+     .replaceAll("&", "&amp;")
+     .replaceAll("<", "&lt;")
+     .replaceAll(">", "&gt;")
+     .replaceAll('"', "&quot;")
+     .replaceAll("'", "&#39;");
+
+   const seeds = dedupe(parseRows(seed_rows));
+   const domainSet = new Set(seeds.map(domainOf).filter(Boolean));
+   const chips = seeds.length
+     ? seeds.slice(0, 12).map((url) => `<span class=\"seed-chip\">${escapeHtml(url)}</span>`).join("")
+     : '<span class=\"seed-empty\">No seed URLs configured yet.</span>';
+   const overflow = seeds.length > 12
+     ? `<span class=\"seed-overflow\">+${seeds.length - 12} more</span>`
+     : "";
+
+   return `<div class=\"seed-widget\"><div class=\"seed-stats\"><span><strong>${seeds.length}</strong> seeds</span><span><strong>${domainSet.size}</strong> domains</span><span><strong>${seeds.slice(0, 1).join("").length || 0}</strong> first-url chars</span></div><div class=\"seed-chip-wrap\">${chips}${overflow}</div></div>`;
+ }
+ """
+
+
+ def utc_now_iso() -> str:
+     return datetime.now(timezone.utc).isoformat(timespec="seconds")
+
+
+ def safe_queue_size(queue: Any) -> int:
+     try:
+         return int(queue.qsize())
+     except Exception:
+         return -1
+
+
+ def parse_seed_url_rows(rows: Any) -> list[str]:
+     if rows is None:
+         return []
+
+     if isinstance(rows, (list, tuple)):
+         rows_iterable: list[Any] = list(rows)
+     elif hasattr(rows, "values"):
+         try:
+             rows_iterable = rows.values.tolist()  # pandas.DataFrame path
+         except Exception:
+             rows_iterable = []
+     else:
+         rows_iterable = [rows]
+
+     items: list[str] = []
+     for row in rows_iterable:
+         value = ""
+         if isinstance(row, dict):
+             value = str(next(iter(row.values()), "") or "").strip()
+         elif isinstance(row, (list, tuple)):
+             value = str(row[0] if row else "").strip()
+         elif row is not None:
+             value = str(row).strip()
+
+         if value:
+             items.append(value)
+     return items
+
+
+ def unique_preserve_order(values: list[str]) -> list[str]:
+     seen: set[str] = set()
+     out: list[str] = []
+     for value in values:
+         if value in seen:
+             continue
+         seen.add(value)
+         out.append(value)
+     return out
+
+
+ def collect_seed_urls(seed_urls_table: Any) -> list[str]:
+     return unique_preserve_order(parse_seed_url_rows(seed_urls_table))
+
+
+ def render_seed_widget_html(seed_urls_table: Any) -> str:
+     seeds = collect_seed_urls(seed_urls_table)
+     domains = {(urlsplit(u).hostname or "").lower().strip(".") for u in seeds}
+     domains = {d for d in domains if d}
+
+     chips = [f'<span class="seed-chip">{escape(url)}</span>' for url in seeds[:12]]
+     chips_html = "".join(chips) if chips else '<span class="seed-empty">No seed URLs configured yet.</span>'
+     overflow_html = f'<span class="seed-overflow">+{len(seeds) - 12} more</span>' if len(seeds) > 12 else ""
+
+     return (
+         '<div class="seed-widget">'
+         '<div class="seed-stats">'
+         f"<span><strong>{len(seeds)}</strong> seeds</span>"
+         f"<span><strong>{len(domains)}</strong> domains</span>"
+         f"<span><strong>{len(seeds[0]) if seeds else 0}</strong> first-url chars</span>"
+         "</div>"
+         f'<div class="seed-chip-wrap">{chips_html}{overflow_html}</div>'
+         "</div>"
+     )
+
+
+ def render_tokenization_widget_html(snapshot: dict[str, Any]) -> str:
+     tokenized_shards = int(snapshot.get("tokenized_shards", 0) or 0)
+     tokenized_rows = int(snapshot.get("tokenized_rows", 0) or 0)
+     tokenized_tokens = int(snapshot.get("tokenized_tokens", 0) or 0)
+     written_shards = int(snapshot.get("written_shards", 0) or 0)
+
+     return (
+         '<div class="token-widget">'
+         '<div class="token-stats">'
+         f"<span><strong>{tokenized_tokens}</strong> text tokens</span>"
+         f"<span><strong>{tokenized_rows}</strong> tokenized rows</span>"
+         f"<span><strong>{tokenized_shards}/{written_shards}</strong> tokenized shards</span>"
+         "</div>"
+         '<div class="token-note">Live shard tokenization uses tiktoken on the parquet <code>text</code> column.</div>'
+         "</div>"
+     )
+
+
+ def render_qvp_widget_md(snapshot: dict[str, Any]) -> str:
+     queue_count = int(snapshot.get("fetch_queue", 0) or 0)
+     visited_count = int(snapshot.get("fetch_succeeded", 0) or 0)
+     parsed_count = int(snapshot.get("parsed_pages", 0) or 0)
+     return (
+         "### Live Metrics\n"
+         f"- Queue: `{queue_count}`\n"
+         f"- Visited: `{visited_count}`\n"
+         f"- Parsed: `{parsed_count}`"
+     )
+
+
+ def validate_hf_requirements(enable_hf_upload: bool, hf_repo_id: str, hf_token: str) -> None:
+     if not enable_hf_upload:
+         return
+     if not hf_repo_id.strip():
+         raise ValueError("HF repo is required when upload is enabled.")
+     if not hf_token.strip():
+         raise ValueError("HF token is required when upload is enabled.")
+
+
+ def build_crawler_config(
+     *,
+     seed_urls_table: Any,
+     max_links_per_page: int,
+     request_timeout_seconds: float,
+     max_response_bytes: int,
+     shard_size_rows: int,
+     enable_hf_upload: bool,
+     hf_repo_id: str,
+     hf_token: str,
+     hf_private_repo: bool,
+     hf_path_prefix: str,
+     total_workers: int,
+ ) -> CrawlerConfig:
+     validate_hf_requirements(enable_hf_upload, hf_repo_id, hf_token)
+     seed_urls = collect_seed_urls(seed_urls_table)
+
+     return CrawlerConfig(
+         seed_urls=seed_urls,
+         max_links_per_page=int(max_links_per_page),
+         request_timeout_seconds=float(request_timeout_seconds),
+         max_response_bytes=int(max_response_bytes),
+         shard_size_rows=int(shard_size_rows),
+         output_dir=Path(__file__).resolve().parent / "shards",
+         enable_hf_upload=bool(enable_hf_upload),
+         hf_repo_id=hf_repo_id.strip(),
+         hf_token=hf_token.strip(),
+         hf_private_repo=bool(hf_private_repo),
+         hf_path_prefix=hf_path_prefix.strip() or "crawl_shards",
+         total_workers=int(total_workers),
+     )
+
+
+ @dataclass
+ class RunState:
+     run_id: int = 0
+     running: bool = False
+     started_at: str = ""
+     finished_at: str = ""
+     stop_requested: bool = False
+     last_error: str = ""
+
+
+ class CrawlerRunManager:
+     def __init__(self) -> None:
+         self._lock = threading.Lock()
+         self._thread: threading.Thread | None = None
+         self._loop: asyncio.AbstractEventLoop | None = None
+         self._crawler: AsyncCrawler | None = None
+         self._state = RunState()
+         self._logs: list[str] = []
+         self._last_snapshot: dict[str, Any] | None = None
+
+     def start(self, config: CrawlerConfig) -> str:
+         with self._lock:
+             if self._thread is not None and self._thread.is_alive():
+                 return "A crawl is already running. Stop it before starting another one."
+
+             self._state.run_id += 1
+             self._state.running = True
+             self._state.started_at = utc_now_iso()
+             self._state.finished_at = ""
+             self._state.stop_requested = False
+             self._state.last_error = ""
+             self._last_snapshot = None
+             self._logs.clear()
+
+             run_id = self._state.run_id
+             self._logs.append(
+                 f"[{utc_now_iso()}] Started run #{run_id} with {config.total_workers} workers "
+                 f"({config.fetch_workers} fetch / {config.parser_workers} parser)."
+             )
+
+             self._thread = threading.Thread(
+                 target=self._run_crawler,
+                 args=(run_id, config),
+                 daemon=True,
+                 name=f"crawler-run-{run_id}",
+             )
+             self._thread.start()
+
+         return f"Run #{run_id} started."
+
+     def stop(self) -> str:
+         with self._lock:
+             if self._thread is None or not self._thread.is_alive():
+                 return "No active crawl to stop."
+
+             self._state.stop_requested = True
+             crawler = self._crawler
+             loop = self._loop
+             run_id = self._state.run_id
+             self._logs.append(f"[{utc_now_iso()}] Stop requested for run #{run_id}")
+
+         if crawler is not None and loop is not None and loop.is_running():
+             loop.call_soon_threadsafe(crawler.request_stop, "user_requested_stop")
+         elif crawler is not None:
+             crawler.request_stop("user_requested_stop")
+
+         return f"Stop signal sent to run #{run_id}."
+
+     def _run_crawler(self, run_id: int, config: CrawlerConfig) -> None:
+         loop: asyncio.AbstractEventLoop | None = None
+         try:
+             crawler = AsyncCrawler(config)
+             if hasattr(asyncio, "Runner"):
+                 with asyncio.Runner() as runner:  # type: ignore[attr-defined]
+                     loop = runner.get_loop()
+                     with self._lock:
+                         if self._state.run_id == run_id:
+                             self._crawler = crawler
+                             self._loop = loop
+                     runner.run(crawler.run())
+             else:
+                 loop = asyncio.new_event_loop()
+                 asyncio.set_event_loop(loop)
+                 with self._lock:
+                     if self._state.run_id == run_id:
+                         self._crawler = crawler
+                         self._loop = loop
+                 loop.run_until_complete(crawler.run())
+
+             final_snapshot = self._snapshot_from_crawler(crawler)
+             with self._lock:
+                 if self._state.run_id == run_id:
+                     self._last_snapshot = final_snapshot
+                     self._logs.append(f"[{utc_now_iso()}] Run #{run_id} completed")
+         except Exception:
+             error_text = traceback.format_exc(limit=20)
+             with self._lock:
+                 self._state.last_error = error_text
+                 self._logs.append(f"[{utc_now_iso()}] Run #{run_id} crashed")
+         finally:
+             with self._lock:
+                 if self._state.run_id == run_id:
+                     self._state.running = False
+                     self._state.finished_at = utc_now_iso()
+                     self._crawler = None
+                     self._loop = None
+
+             if loop is not None and not loop.is_closed():
+                 loop.close()
+             with contextlib.suppress(Exception):
+                 asyncio.set_event_loop(None)
+
+     def _snapshot_from_crawler(self, crawler: AsyncCrawler) -> dict[str, Any]:
+         stats = crawler.stats
+         return {
+             "timestamp": utc_now_iso(),
+             "workers_total": crawler.config.total_workers,
+             "workers_split": f"{crawler.config.fetch_workers}/{crawler.config.parser_workers}",
+             "stop_reason": crawler.stop_reason or "-",
+             "fetch_succeeded": stats.fetch_succeeded,
+             "parsed_pages": stats.parsed_pages,
+             "written_shards": stats.written_shards,
+             "tokenized_shards": stats.tokenized_shards,
+             "tokenized_rows": stats.tokenized_rows,
+             "tokenized_tokens": stats.tokenized_tokens,
+             "fetch_queue": safe_queue_size(crawler.fetch_queue),
+             "parse_queue": safe_queue_size(crawler.parse_queue),
+             "record_queue": safe_queue_size(crawler.record_queue),
+             "stop_event": crawler.stop_event.is_set(),
+         }
+
+     def poll(self) -> tuple[str, dict[str, Any], str]:
+         with self._lock:
+             crawler = self._crawler
+             state = RunState(
+                 run_id=self._state.run_id,
+                 running=self._state.running,
+                 started_at=self._state.started_at,
+                 finished_at=self._state.finished_at,
+                 stop_requested=self._state.stop_requested,
+                 last_error=self._state.last_error,
+             )
+
+         if crawler is not None:
+             snapshot = self._snapshot_from_crawler(crawler)
+             with self._lock:
+                 self._last_snapshot = snapshot
+
+         with self._lock:
+             latest = self._last_snapshot or {
+                 "timestamp": utc_now_iso(),
+                 "workers_total": 0,
+                 "workers_split": "-",
+                 "stop_reason": "-",
+                 "fetch_succeeded": 0,
+                 "parsed_pages": 0,
+                 "written_shards": 0,
+                 "tokenized_shards": 0,
+                 "tokenized_rows": 0,
+                 "tokenized_tokens": 0,
+                 "fetch_queue": 0,
+                 "parse_queue": 0,
+                 "record_queue": 0,
+                 "stop_event": False,
+             }
+             logs_text = "\n".join(self._logs[-500:])
+
+         status_lines = [
+             "### Crawler Status",
+             f"- Run ID: `{state.run_id}`",
+             f"- Running: `{state.running}`",
+             f"- Stop requested: `{state.stop_requested}`",
+             f"- Started at (UTC): `{state.started_at or '-'}`",
+             f"- Finished at (UTC): `{state.finished_at or '-'}`",
+         ]
+         if state.last_error:
+             status_lines.append("- Last error:")
+             status_lines.append("```text")
+             status_lines.append(state.last_error.strip())
+             status_lines.append("```")
+
+         return "\n".join(status_lines), latest, logs_text
+
+
+ RUN_MANAGER = CrawlerRunManager()
+
+
+ def _format_dashboard_response(
+     status: str,
+     snapshot: dict[str, Any],
+     logs: str,
+ ) -> tuple[str, str, str, str]:
+     return (
+         status,
+         render_qvp_widget_md(snapshot),
+         logs,
+         render_tokenization_widget_html(snapshot),
+     )
+
+
+ def _start_crawl(
+     *,
+     total_workers: int,
+     seed_urls_table: Any,
+     max_links_per_page: int,
+     request_timeout_seconds: float,
+     max_response_bytes: int,
+     shard_size_rows: int,
+     enable_hf_upload: bool,
+     hf_repo_id: str,
+     hf_token: str,
+     hf_private_repo: bool,
+     hf_path_prefix: str,
+ ) -> tuple[str, str, str, str]:
+     try:
+         config = build_crawler_config(
+             seed_urls_table=seed_urls_table,
+             max_links_per_page=max_links_per_page,
+             request_timeout_seconds=request_timeout_seconds,
+             max_response_bytes=max_response_bytes,
+             shard_size_rows=shard_size_rows,
+             enable_hf_upload=enable_hf_upload,
+             hf_repo_id=hf_repo_id,
+             hf_token=hf_token,
+             hf_private_repo=hf_private_repo,
+             hf_path_prefix=hf_path_prefix,
+             total_workers=total_workers,
+         )
+     except ValueError as exc:
+         raise gr.Error(str(exc)) from exc
+
+     message = RUN_MANAGER.start(config)
+     status, snapshot, logs = RUN_MANAGER.poll()
+     return _format_dashboard_response(f"{status}\n\n{message}", snapshot, logs)
+
+
+ def start_crawl_standard(
+     seed_urls_table: Any,
+     max_links_per_page: int,
+     request_timeout_seconds: float,
+     max_response_bytes: int,
+     shard_size_rows: int,
+     enable_hf_upload: bool,
+     hf_repo_id: str,
+     hf_token: str,
+     hf_private_repo: bool,
+     hf_path_prefix: str,
+ ) -> tuple[str, str, str, str]:
+     return _start_crawl(
+         total_workers=NORMAL_TOTAL_WORKERS,
+         seed_urls_table=seed_urls_table,
+         max_links_per_page=max_links_per_page,
+         request_timeout_seconds=request_timeout_seconds,
+         max_response_bytes=max_response_bytes,
+         shard_size_rows=shard_size_rows,
+         enable_hf_upload=enable_hf_upload,
+         hf_repo_id=hf_repo_id,
+         hf_token=hf_token,
+         hf_private_repo=hf_private_repo,
+         hf_path_prefix=hf_path_prefix,
+     )
+
+
+ def start_crawl_super(
+     seed_urls_table: Any,
+     max_links_per_page: int,
+     request_timeout_seconds: float,
+     max_response_bytes: int,
+     shard_size_rows: int,
+     enable_hf_upload: bool,
+     hf_repo_id: str,
+     hf_token: str,
+     hf_private_repo: bool,
+     hf_path_prefix: str,
+ ) -> tuple[str, str, str, str]:
+     return _start_crawl(
+         total_workers=SUPER_TOTAL_WORKERS,
+         seed_urls_table=seed_urls_table,
+         max_links_per_page=max_links_per_page,
+         request_timeout_seconds=request_timeout_seconds,
+         max_response_bytes=max_response_bytes,
+         shard_size_rows=shard_size_rows,
+         enable_hf_upload=enable_hf_upload,
+         hf_repo_id=hf_repo_id,
+         hf_token=hf_token,
+         hf_private_repo=hf_private_repo,
+         hf_path_prefix=hf_path_prefix,
+     )
+
+
+ def stop_crawl() -> tuple[str, str, str, str]:
+     message = RUN_MANAGER.stop()
+     status, snapshot, logs = RUN_MANAGER.poll()
+     return _format_dashboard_response(f"{status}\n\n{message}", snapshot, logs)
+
+
+ def poll_dashboard() -> tuple[str, str, str, str]:
+     status, snapshot, logs = RUN_MANAGER.poll()
+     return _format_dashboard_response(status, snapshot, logs)
+
+
+ def toggle_hf_fields(enable_hf_upload: bool) -> tuple[Any, Any, Any, Any]:
+     update = gr.update(visible=enable_hf_upload)
+     return update, update, update, update
+
+
+ def build_ui() -> gr.Blocks:
+     defaults = CrawlerConfig(
+         seed_urls=[
+             "https://en.wikipedia.org/wiki/Main_Page",
+             "https://docs.python.org/3/",
+             "https://developer.mozilla.org/en-US/",
+             "https://www.nasa.gov/",
+         ]
+     )
+     default_seed_rows = [[url] for url in defaults.seed_urls]
+
+     with gr.Blocks(title="DataMuncherLabs AutoWS") as demo:
+         gr.Markdown("# DataMuncherLabs AutoWS")
+         gr.Markdown("Async web crawler dashboard with live parquet text tokenization.")
+
+         with gr.Row():
+             theme_name = gr.Dropdown(
+                 choices=["red", "blue", "light", "dark", "green"],
+                 value="dark",
+                 label="Theme",
+                 interactive=True,
+             )
+             gr.Markdown(
+                 "- Standard mode: **12 threads** (`10 fetch`, `2 parse`)\n"
+                 "- Super mode: **24 threads** (`20 fetch`, `4 parse`)"
+             )
+
+         with gr.Row():
+             with gr.Column(scale=2):
+                 seed_urls_table = gr.Dataframe(
+                     headers=["seed_url"],
+                     datatype=["str"],
+                     type="array",
+                     row_count=(8, "dynamic"),
+                     value=default_seed_rows,
+                     interactive=True,
+                     label="Seed URL List (editable)",
+                 )
+                 seed_widget_html = gr.HTML(
+                     label="Seed URL Summary",
+                     value=render_seed_widget_html(default_seed_rows),
+                 )
+                 token_widget_html = gr.HTML(
+                     label="Live Tokenization",
+                     value=render_tokenization_widget_html({}),
+                 )
+
+             with gr.Column(scale=1):
+                 shard_size_rows = gr.Slider(
+                     label=f"Shard Size Rows (max {MAX_SHARD_ROWS})",
+                     minimum=100,
+                     maximum=MAX_SHARD_ROWS,
+                     step=100,
+                     value=min(defaults.shard_size_rows, MAX_SHARD_ROWS),
+                 )
+                 max_links_per_page = gr.Slider(
+                     label="Max Links Per Page",
+                     minimum=10,
+                     maximum=1000,
+                     step=10,
+                     value=defaults.max_links_per_page,
+                 )
+                 request_timeout_seconds = gr.Slider(
+                     label="Request Timeout (seconds)",
+                     minimum=3,
+                     maximum=60,
+                     step=1,
+                     value=defaults.request_timeout_seconds,
+                 )
+                 max_response_bytes = gr.Slider(
+                     label="Max Response Bytes",
+                     minimum=500_000,
+                     maximum=8_000_000,
+                     step=100_000,
+                     value=defaults.max_response_bytes,
+                 )
+
+         with gr.Accordion("Hugging Face Upload", open=False):
+             enable_hf_upload = gr.Checkbox(
+                 label="Upload shards to my HF repo",
+                 value=False,
+             )
+             hf_repo_id = gr.Textbox(
+                 label="HF Repo ID",
+                 placeholder="username/dataset-name",
+                 visible=False,
+             )
+             hf_token = gr.Textbox(
+                 label="HF Token (write permissions)",
+                 type="password",
+                 placeholder="hf_xxx",
+                 visible=False,
+             )
+             hf_private_repo = gr.Checkbox(
+                 label="Private HF Repo",
+                 value=False,
+                 visible=False,
+             )
+             hf_path_prefix = gr.Textbox(
+                 label="HF Path Prefix",
+                 value="crawl_shards",
+                 visible=False,
+             )
+
+         with gr.Row():
+             start_button = gr.Button("Start Crawl (12 Threads)", variant="primary")
+             super_button = gr.Button("Super Mode (24 Threads)", variant="primary")
+             stop_button = gr.Button("Stop Crawl", variant="stop")
+             refresh_button = gr.Button("Refresh")
+
+         status_md = gr.Markdown("### Crawler Status\n- Run ID: `0`\n- Running: `False`")
+         qvp_md = gr.Markdown("### Live Metrics\n- Queue: `0`\n- Visited: `0`\n- Parsed: `0`")
+         logs_box = gr.Textbox(label="Run Log", lines=12, interactive=False)
+
+         start_inputs = [
+             seed_urls_table,
+             max_links_per_page,
+             request_timeout_seconds,
+             max_response_bytes,
+             shard_size_rows,
+             enable_hf_upload,
+             hf_repo_id,
+             hf_token,
+             hf_private_repo,
+             hf_path_prefix,
+         ]
+         outputs = [status_md, qvp_md, logs_box, token_widget_html]
+
+         start_button.click(start_crawl_standard, inputs=start_inputs, outputs=outputs)
+         super_button.click(start_crawl_super, inputs=start_inputs, outputs=outputs)
+         stop_button.click(stop_crawl, inputs=[], outputs=outputs)
+         refresh_button.click(poll_dashboard, inputs=[], outputs=outputs)
+
+         enable_hf_upload.change(
+             toggle_hf_fields,
+             inputs=enable_hf_upload,
+             outputs=[hf_repo_id, hf_token, hf_private_repo, hf_path_prefix],
+         )
+
+         seed_urls_table.change(
+             fn=None,
+             inputs=[seed_urls_table],
+             outputs=[seed_widget_html],
+             js=SEED_WIDGET_JS,
+         )
+
+         theme_name.change(fn=None, inputs=theme_name, outputs=[], js=THEME_JS)
+         demo.load(
+             fn=None,
+             inputs=[],
+             outputs=[],
+             js='() => { document.documentElement.setAttribute("data-crawler-theme", "dark"); }',
+         )
+         demo.load(
+             fn=None,
+             inputs=[seed_urls_table],
+             outputs=[seed_widget_html],
+             js=SEED_WIDGET_JS,
+         )
+         demo.load(fn=poll_dashboard, inputs=[], outputs=outputs)
+
+         timer = gr.Timer(value=1.0)
+         timer.tick(fn=poll_dashboard, inputs=[], outputs=outputs)
+
+     return demo
+
+
+ demo = build_ui()
+
+
+ def main() -> None:
+     queued = demo.queue(default_concurrency_limit=32)
+     launch_sig = inspect.signature(queued.launch)
+     launch_kwargs: dict[str, Any] = {}
+
+     if "css" in launch_sig.parameters:
+         launch_kwargs["css"] = APP_CSS
+     if "theme" in launch_sig.parameters:
+         launch_kwargs["theme"] = gr.themes.Default(primary_hue="green")
+     if "ssr_mode" in launch_sig.parameters:
+         launch_kwargs["ssr_mode"] = False
+
+     queued.launch(**launch_kwargs)
+
+
+ if __name__ == "__main__":
+     main()
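
The config plumbing above can be exercised without the UI. A minimal sketch, assuming the full `crawler` package from this commit is importable (note that importing `app` also builds the Gradio Blocks at module level); the row values are illustrative:

```python
# Hypothetical smoke test for app.py's config helpers.
from app import build_crawler_config, collect_seed_urls

rows = [["https://docs.python.org/3/"], ["https://docs.python.org/3/"], [""]]
print(collect_seed_urls(rows))  # duplicates and blanks dropped, order kept

config = build_crawler_config(
    seed_urls_table=rows,
    max_links_per_page=250,
    request_timeout_seconds=18.0,
    max_response_bytes=3_000_000,
    shard_size_rows=10_000,
    enable_hf_upload=False,  # repo id / token are only validated when True
    hf_repo_id="",
    hf_token="",
    hf_private_repo=False,
    hf_path_prefix="",       # empty prefix falls back to "crawl_shards"
    total_workers=12,
)
print(config.fetch_workers, config.parser_workers)  # 10 2
```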
crawler/__init__.py ADDED
@@ -0,0 +1,19 @@
+ from .config import (
+     MAX_SHARD_ROWS,
+     MAX_SHARDS,
+     NORMAL_TOTAL_WORKERS,
+     SUPER_TOTAL_WORKERS,
+     CrawlerConfig,
+     compute_worker_split,
+ )
+ from .engine import AsyncCrawler
+
+ __all__ = [
+     "AsyncCrawler",
+     "CrawlerConfig",
+     "MAX_SHARD_ROWS",
+     "MAX_SHARDS",
+     "NORMAL_TOTAL_WORKERS",
+     "SUPER_TOTAL_WORKERS",
+     "compute_worker_split",
+ ]
crawler/config.py ADDED
@@ -0,0 +1,97 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+ NORMAL_TOTAL_WORKERS = 12
+ SUPER_TOTAL_WORKERS = 24
+ MAX_SHARD_ROWS = 15_000
+ MAX_SHARDS = 10
+
+
+ def validate_total_workers(total_workers: int) -> int:
+     value = int(total_workers)
+     if value not in {NORMAL_TOTAL_WORKERS, SUPER_TOTAL_WORKERS}:
+         raise ValueError(
+             f"total_workers must be {NORMAL_TOTAL_WORKERS} or {SUPER_TOTAL_WORKERS}, got {value}."
+         )
+     return value
+
+
+ def compute_worker_split(total_workers: int) -> tuple[int, int]:
+     total = validate_total_workers(total_workers)
+     fetch_workers = (total * 5) // 6
+     parser_workers = total - fetch_workers
+     if fetch_workers < 1 or parser_workers < 1:
+         raise ValueError(f"Invalid worker split for total_workers={total}.")
+     return fetch_workers, parser_workers
+
+
+ @dataclass
+ class CrawlerConfig:
+     seed_urls: list[str]
+
+     max_links_per_page: int = 250
+     request_timeout_seconds: float = 18.0
+     max_response_bytes: int = 3_000_000
+     user_agent: str = "HFDBContCrawler/1.0 (+https://huggingface.co/datasets)"
+     seen_url_cache_size: int = 2_000_000
+
+     fetch_queue_size: int = 100_000
+     parse_queue_size: int = 25_000
+     record_queue_size: int = 50_000
+     report_every_seconds: float = 5.0
+
+     output_dir: Path = field(
+         default_factory=lambda: Path(__file__).resolve().parents[1] / "shards"
+     )
+     shard_size_rows: int = 10_000
+     max_shards: int = MAX_SHARDS
+     parquet_compression: str = "zstd"
+     parquet_compression_level: int = 9
+
+     enable_hf_upload: bool = False
+     hf_repo_id: str = ""
+     hf_token: str = ""
+     hf_repo_type: str = "dataset"
+     hf_private_repo: bool = False
+     hf_path_prefix: str = "crawl_shards"
+
+     total_workers: int = NORMAL_TOTAL_WORKERS
+     request_delay_global_seconds: float = 0.02
+     request_delay_per_domain_seconds: float = 2.0
+
+     robots_cache_ttl_seconds: float = 3600.0
+     robots_fail_closed: bool = True
+     robots_max_bytes: int = 300_000
+
+     fetch_workers: int = field(init=False)
+     parser_workers: int = field(init=False)
+
+     def __post_init__(self) -> None:
+         self.seed_urls = [u.strip() for u in self.seed_urls if u and u.strip()]
+         if not self.seed_urls:
+             raise ValueError("At least one seed URL is required.")
+
+         self.total_workers = validate_total_workers(self.total_workers)
+         self.fetch_workers, self.parser_workers = compute_worker_split(self.total_workers)
+
+         self.shard_size_rows = int(self.shard_size_rows)
+         if self.shard_size_rows < 1 or self.shard_size_rows > MAX_SHARD_ROWS:
+             raise ValueError(f"shard_size_rows must be between 1 and {MAX_SHARD_ROWS}.")
+
+         self.max_shards = int(self.max_shards)
+         if self.max_shards < 1 or self.max_shards > MAX_SHARDS:
+             raise ValueError(f"max_shards must be between 1 and {MAX_SHARDS}.")
+
+         self.output_dir = Path(self.output_dir).expanduser()
+
+         self.hf_repo_id = self.hf_repo_id.strip()
+         self.hf_token = self.hf_token.strip()
+         self.hf_path_prefix = self.hf_path_prefix.strip() or "crawl_shards"
+
+         if self.enable_hf_upload:
+             if not self.hf_repo_id:
+                 raise ValueError("hf_repo_id is required when enable_hf_upload=True.")
+             if not self.hf_token:
+                 raise ValueError("hf_token is required when enable_hf_upload=True.")
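
The 5/6 split in `compute_worker_split` is easy to sanity-check by hand, and it matches the "10 fetch / 2 parse" and "20 fetch / 4 parse" copy in the UI. A small sketch, assuming the package from this commit is on the path:

```python
from crawler.config import (
    NORMAL_TOTAL_WORKERS,
    SUPER_TOTAL_WORKERS,
    compute_worker_split,
)

# 12 * 5 // 6 = 10 fetchers + 2 parsers; 24 * 5 // 6 = 20 fetchers + 4 parsers.
assert compute_worker_split(NORMAL_TOTAL_WORKERS) == (10, 2)
assert compute_worker_split(SUPER_TOTAL_WORKERS) == (20, 4)

# Any other count is rejected up front by validate_total_workers().
try:
    compute_worker_split(16)
except ValueError as exc:
    print(exc)  # total_workers must be 12 or 24, got 16.
```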
crawler/engine.py ADDED
@@ -0,0 +1,302 @@
+ from __future__ import annotations
+
+ import asyncio
+ import contextlib
+ from collections import deque
+ from typing import Any
+
+ import aiohttp
+
+ from .config import CrawlerConfig
+ from .fetch import fetch_url
+ from .models import CrawlStats, FetchResult
+ from .parse import parse_page
+ from .rate_limit import RequestRateLimiter
+ from .robots import RobotsPolicy
+ from .shards import ParquetShardWriter, ShardLimitReached
+ from .utils import has_binary_extension, normalize_url
+
+
+ class AsyncCrawler:
+     def __init__(self, config: CrawlerConfig):
+         self.config = config
+         self.stats = CrawlStats()
+         self.stop_event = asyncio.Event()
+         self.stop_reason = ""
+
+         self.fetch_queue: asyncio.Queue[str | None] = asyncio.Queue(
+             maxsize=config.fetch_queue_size
+         )
+         self.parse_queue: asyncio.Queue[FetchResult | None] = asyncio.Queue(
+             maxsize=config.parse_queue_size
+         )
+         self.record_queue: asyncio.Queue[dict[str, Any] | None] = asyncio.Queue(
+             maxsize=config.record_queue_size
+         )
+
+         self.seen_urls: set[str] = set()
+         self.seen_order: deque[str] = deque()
+         self.seen_lock = asyncio.Lock()
+         self.counter_lock = asyncio.Lock()
+
+         self.active_fetchers = 0
+         self.active_parsers = 0
+
+         self.writer = ParquetShardWriter(config=config, stats=self.stats)
+         self.rate_limiter: RequestRateLimiter | None = None
+         self.robots_policy: RobotsPolicy | None = None
+
+     async def run(self) -> None:
+         await self.writer.initialize()
+
+         for seed in self.config.seed_urls:
+             await self.try_enqueue(seed)
+
+         connector = aiohttp.TCPConnector(
+             limit=max(200, self.config.fetch_workers * 4),
+             ttl_dns_cache=300,
+         )
+         timeout = aiohttp.ClientTimeout(total=self.config.request_timeout_seconds)
+
+         async with aiohttp.ClientSession(
+             connector=connector,
+             timeout=timeout,
+             headers={"User-Agent": self.config.user_agent},
+         ) as session:
+             self.rate_limiter = RequestRateLimiter(
+                 global_interval_seconds=self.config.request_delay_global_seconds,
+                 per_domain_interval_seconds=self.config.request_delay_per_domain_seconds,
+             )
+             self.robots_policy = RobotsPolicy(
+                 session=session,
+                 user_agent=self.config.user_agent,
+                 cache_ttl_seconds=self.config.robots_cache_ttl_seconds,
+                 fail_closed=self.config.robots_fail_closed,
+                 max_bytes=self.config.robots_max_bytes,
+             )
+
+             fetchers = [
+                 asyncio.create_task(self.fetcher_worker(worker_id=i, session=session))
+                 for i in range(self.config.fetch_workers)
+             ]
+             parsers = [
+                 asyncio.create_task(self.parser_worker(worker_id=i))
+                 for i in range(self.config.parser_workers)
+             ]
+             writer_task = asyncio.create_task(self.writer.consume(self.record_queue))
+             reporter_task = asyncio.create_task(self.progress_reporter())
+
+             try:
+                 await self.wait_until_complete(writer_task)
+                 await self._graceful_shutdown(fetchers, parsers, writer_task)
+             except ShardLimitReached:
+                 self.stop_reason = "shard_cap_reached"
+                 self.stop_event.set()
+                 await self._hard_shutdown(fetchers, parsers, writer_task)
+             finally:
+                 reporter_task.cancel()
+                 with contextlib.suppress(asyncio.CancelledError):
+                     await reporter_task
+
+     def request_stop(self, reason: str = "user_requested_stop") -> None:
+         if not self.stop_reason:
+             self.stop_reason = reason
+         self.stop_event.set()
+
+     async def wait_until_complete(self, writer_task: asyncio.Task[None]) -> None:
+         while True:
+             if writer_task.done():
+                 exc = writer_task.exception()
+                 if exc is not None:
+                     raise exc
+                 return
+
+             if self.stop_event.is_set():
+                 if not self.stop_reason:
+                     self.stop_reason = "stop_event_set"
+                 if self._is_pipeline_idle():
+                     return
+                 await asyncio.sleep(0.2)
+                 continue
+
+             if self._is_pipeline_idle():
+                 self.stop_reason = "frontier_exhausted"
+                 return
+
+             await asyncio.sleep(0.5)
+
+     async def _graceful_shutdown(
+         self,
+         fetchers: list[asyncio.Task[None]],
+         parsers: list[asyncio.Task[None]],
+         writer_task: asyncio.Task[None],
+     ) -> None:
+         for _ in fetchers:
+             await self.fetch_queue.put(None)
+         await asyncio.gather(*fetchers, return_exceptions=True)
+
+         for _ in parsers:
+             await self.parse_queue.put(None)
+         await asyncio.gather(*parsers, return_exceptions=True)
+
+         await self.record_queue.put(None)
+         await writer_task
+
+     async def _hard_shutdown(
+         self,
+         fetchers: list[asyncio.Task[None]],
+         parsers: list[asyncio.Task[None]],
+         writer_task: asyncio.Task[None],
+     ) -> None:
+         for task in fetchers + parsers:
+             task.cancel()
+         await asyncio.gather(*fetchers, *parsers, return_exceptions=True)
+
+         if not writer_task.done():
+             writer_task.cancel()
+         await asyncio.gather(writer_task, return_exceptions=True)
+
+     async def progress_reporter(self) -> None:
+         while True:
+             await asyncio.sleep(self.config.report_every_seconds)
+             print(
+                 "[stats]"
+                 f" workers={self.config.total_workers}"
+                 f" split={self.config.fetch_workers}/{self.config.parser_workers}"
+                 f" queued={self.stats.queued_urls}"
+                 f" fetched={self.stats.fetch_reserved}"
+                 f" fetch_ok={self.stats.fetch_succeeded}"
+                 f" fetch_fail={self.stats.fetch_failed}"
+                 f" parsed={self.stats.parsed_pages}"
+                 f" parse_fail={self.stats.parse_failed}"
+                 f" robots_blocked={self.stats.robots_blocked}"
+                 f" rows={self.stats.stored_rows}"
+                 f" shards={self.stats.written_shards}/{self.config.max_shards}"
+                 f" tok_shards={self.stats.tokenized_shards}"
+                 f" tok_rows={self.stats.tokenized_rows}"
+                 f" tok_total={self.stats.tokenized_tokens}"
+                 f" uploaded={self.stats.uploaded_shards}"
+                 f" fetch_q={self.fetch_queue.qsize()}"
+                 f" parse_q={self.parse_queue.qsize()}"
+                 f" record_q={self.record_queue.qsize()}"
+             )
+
+     async def fetcher_worker(self, worker_id: int, session: aiohttp.ClientSession) -> None:
+         del worker_id
+         assert self.rate_limiter is not None
+         assert self.robots_policy is not None
+
+         while True:
+             url = await self.fetch_queue.get()
+             if url is None:
+                 self.fetch_queue.task_done()
+                 return
+
+             slot_reserved = await self.reserve_fetch_slot()
+             if not slot_reserved:
+                 self.fetch_queue.task_done()
+                 continue
+
+             self.active_fetchers += 1
+             try:
+                 outcome = await fetch_url(
+                     session,
+                     url,
+                     config=self.config,
+                     mark_seen=self._mark_seen,
+                     rate_limiter=self.rate_limiter,
+                     robots_policy=self.robots_policy,
+                 )
+                 if outcome.robots_blocked:
+                     self.stats.robots_blocked += 1
+                 if outcome.result is not None:
+                     self.stats.fetch_succeeded += 1
+                     if outcome.result.html:
+                         await self.parse_queue.put(outcome.result)
+                 else:
+                     self.stats.fetch_failed += 1
+             finally:
+                 self.active_fetchers -= 1
+                 self.fetch_queue.task_done()
+
+     async def parser_worker(self, worker_id: int) -> None:
+         del worker_id
+
+         while True:
+             item = await self.parse_queue.get()
+             if item is None:
+                 self.parse_queue.task_done()
+                 return
+
+             self.active_parsers += 1
+             try:
+                 record, links = parse_page(item)
+                 if record is not None:
+                     await self.record_queue.put(record)
+                     self.stats.parsed_pages += 1
+
+                 extracted = 0
+                 for link in links:
+                     if extracted >= self.config.max_links_per_page:
+                         break
+                     if await self.try_enqueue(link):
+                         extracted += 1
+                 self.stats.extracted_links += extracted
+             except Exception:
+                 self.stats.parse_failed += 1
+             finally:
+                 self.active_parsers -= 1
+                 self.parse_queue.task_done()
+
+     async def reserve_fetch_slot(self) -> bool:
+         async with self.counter_lock:
+             if self.stop_event.is_set():
+                 return False
+             self.stats.fetch_reserved += 1
+             return True
+
+     async def try_enqueue(self, raw_url: str) -> bool:
+         if self.stop_event.is_set():
+             return False
+
+         normalized = normalize_url(raw_url)
+         if not normalized:
+             self.stats.dropped_urls += 1
+             return False
+
+         if has_binary_extension(normalized):
+             self.stats.dropped_urls += 1
+             return False
+
+         async with self.seen_lock:
+             if self.config.seen_url_cache_size > 0 and normalized in self.seen_urls:
+                 return False
+             self._remember_seen_locked(normalized)
+             self.stats.queued_urls += 1
+
+         await self.fetch_queue.put(normalized)
+         return True
+
+     async def _mark_seen(self, url: str) -> None:
+         async with self.seen_lock:
+             self._remember_seen_locked(url)
+
+     def _remember_seen_locked(self, url: str) -> None:
+         if self.config.seen_url_cache_size <= 0:
+             return
+         if url in self.seen_urls:
+             return
+
+         self.seen_urls.add(url)
+         self.seen_order.append(url)
+         while len(self.seen_order) > self.config.seen_url_cache_size:
+             expired = self.seen_order.popleft()
+             self.seen_urls.discard(expired)
+
+     def _is_pipeline_idle(self) -> bool:
+         return (
+             self.fetch_queue.empty()
+             and self.parse_queue.empty()
+             and self.active_fetchers == 0
+             and self.active_parsers == 0
+         )
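
A hedged sketch of driving `AsyncCrawler` headless (no Gradio), assuming the modules this excerpt does not show (`crawler/utils.py`, `crawler/upload.py`) are also present in the repo:

```python
import asyncio

from crawler import AsyncCrawler, CrawlerConfig

async def main() -> None:
    config = CrawlerConfig(seed_urls=["https://docs.python.org/3/"])
    crawler = AsyncCrawler(config)
    # Timebox the run: request_stop() sets stop_event and lets in-flight
    # work drain before wait_until_complete() returns.
    asyncio.get_running_loop().call_later(60, crawler.request_stop, "timebox")
    await crawler.run()
    print(crawler.stop_reason, crawler.stats.written_shards)

asyncio.run(main())
```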
crawler/fetch.py ADDED
@@ -0,0 +1,87 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from datetime import datetime, timezone
+ from typing import Awaitable, Callable
+ from urllib.parse import urlsplit
+
+ import aiohttp
+
+ from .config import CrawlerConfig
+ from .models import FetchResult
+ from .rate_limit import RequestRateLimiter
+ from .robots import RobotsPolicy
+ from .utils import is_html_response, normalize_url
+
+
+ @dataclass
+ class FetchOutcome:
+     result: FetchResult | None
+     robots_blocked: bool = False
+
+
+ async def fetch_url(
+     session: aiohttp.ClientSession,
+     url: str,
+     *,
+     config: CrawlerConfig,
+     mark_seen: Callable[[str], Awaitable[None]],
+     rate_limiter: RequestRateLimiter,
+     robots_policy: RobotsPolicy,
+ ) -> FetchOutcome:
+     fetched_at = datetime.now(timezone.utc).isoformat()
+     requested_domain = (urlsplit(url).hostname or "").lower().strip(".")
+     if not requested_domain:
+         return FetchOutcome(result=None)
+
+     if not await robots_policy.can_fetch(url):
+         return FetchOutcome(result=None, robots_blocked=True)
+
+     await rate_limiter.acquire(requested_domain)
+
+     try:
+         async with session.get(url, allow_redirects=True) as response:
+             content_type = response.headers.get("content-type", "").lower()
+             final_url = normalize_url(str(response.url))
+             if not final_url:
+                 return FetchOutcome(result=None)
+
+             final_domain = (urlsplit(final_url).hostname or "").lower().strip(".")
+             if not final_domain:
+                 return FetchOutcome(result=None)
+
+             if not await robots_policy.can_fetch(final_url):
+                 return FetchOutcome(result=None, robots_blocked=True)
+
+             await mark_seen(final_url)
+
+             if response.status >= 400:
+                 return FetchOutcome(result=None)
+
+             if not is_html_response(content_type, final_url):
+                 return FetchOutcome(
+                     result=FetchResult(
+                         url=final_url,
+                         status=response.status,
+                         fetched_at=fetched_at,
+                         content_type=content_type,
+                         html="",
+                     )
+                 )
+
+             raw = await response.content.read(config.max_response_bytes + 1)
+             if len(raw) > config.max_response_bytes:
+                 raw = raw[: config.max_response_bytes]
+             html = raw.decode(response.charset or "utf-8", errors="ignore")
+
+             return FetchOutcome(
+                 result=FetchResult(
+                     url=final_url,
+                     status=response.status,
+                     fetched_at=fetched_at,
+                     content_type=content_type,
+                     html=html,
+                 )
+             )
+     except Exception:
+         return FetchOutcome(result=None)
crawler/models.py ADDED
@@ -0,0 +1,31 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class FetchResult:
+     url: str
+     status: int
+     fetched_at: str
+     content_type: str
+     html: str
+
+
+ @dataclass
+ class CrawlStats:
+     queued_urls: int = 0
+     fetch_reserved: int = 0
+     fetch_succeeded: int = 0
+     fetch_failed: int = 0
+     parsed_pages: int = 0
+     parse_failed: int = 0
+     extracted_links: int = 0
+     dropped_urls: int = 0
+     robots_blocked: int = 0
+     stored_rows: int = 0
+     written_shards: int = 0
+     uploaded_shards: int = 0
+     tokenized_shards: int = 0
+     tokenized_rows: int = 0
+     tokenized_tokens: int = 0
crawler/parse.py ADDED
@@ -0,0 +1,38 @@
+ from __future__ import annotations
+
+ from typing import Any
+ from urllib.parse import urljoin, urlsplit
+
+ from bs4 import BeautifulSoup
+
+ from .models import FetchResult
+
+
+ def parse_page(item: FetchResult) -> tuple[dict[str, Any] | None, list[str]]:
+     if not item.html:
+         return None, []
+
+     soup = BeautifulSoup(item.html, "lxml")
+
+     for tag in soup(["script", "style", "noscript", "svg", "iframe", "canvas"]):
+         tag.decompose()
+
+     text = soup.get_text(" ", strip=True)
+     if not text:
+         return None, []
+
+     links: list[str] = []
+     for anchor in soup.find_all("a", href=True):
+         href = anchor.get("href", "").strip()
+         if not href:
+             continue
+         links.append(urljoin(item.url, href))
+
+     domain = (urlsplit(item.url).hostname or "").lower().strip(".")
+     record = {
+         "text": text,
+         "url": item.url,
+         "domain": domain,
+         "timestamp": item.fetched_at,
+     }
+     return record, links
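
`parse_page` does no I/O of its own, so it can be exercised directly with a hand-built `FetchResult`; the HTML and URL below are illustrative:

```python
from crawler.models import FetchResult
from crawler.parse import parse_page

page = FetchResult(
    url="https://example.com/a",
    status=200,
    fetched_at="2025-01-01T00:00:00+00:00",
    content_type="text/html",
    html='<p>Hello</p><a href="/b">next</a><script>ignored()</script>',
)
record, links = parse_page(page)
print(record["text"])    # "Hello next" -- <script> is decomposed before get_text
print(record["domain"])  # "example.com"
print(links)             # ["https://example.com/b"] -- hrefs resolved via urljoin
```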
crawler/rate_limit.py ADDED
@@ -0,0 +1,66 @@
+ from __future__ import annotations
+
+ import asyncio
+ import time
+ from typing import Awaitable, Callable
+
+
+ class RequestRateLimiter:
+     def __init__(
+         self,
+         global_interval_seconds: float,
+         per_domain_interval_seconds: float,
+         *,
+         clock: Callable[[], float] | None = None,
+         sleep: Callable[[float], Awaitable[None]] | None = None,
+     ) -> None:
+         self.global_interval_seconds = max(0.0, float(global_interval_seconds))
+         self.per_domain_interval_seconds = max(0.0, float(per_domain_interval_seconds))
+         self._clock = clock or time.monotonic
+         self._sleep = sleep or asyncio.sleep
+
+         self._global_lock = asyncio.Lock()
+         self._global_last: float | None = None
+
+         self._domain_guard = asyncio.Lock()
+         self._domain_locks: dict[str, asyncio.Lock] = {}
+         self._domain_last: dict[str, float] = {}
+
+     async def acquire(self, domain: str) -> None:
+         normalized = domain.lower().strip(".")
+         await self._acquire_global()
+         await self._acquire_domain(normalized)
+
+     async def _acquire_global(self) -> None:
+         if self.global_interval_seconds <= 0:
+             return
+
+         async with self._global_lock:
+             now = self._clock()
+             if self._global_last is not None:
+                 wait = self.global_interval_seconds - (now - self._global_last)
+                 if wait > 0:
+                     await self._sleep(wait)
+             self._global_last = self._clock()
+
+     async def _acquire_domain(self, domain: str) -> None:
+         if not domain or self.per_domain_interval_seconds <= 0:
+             return
+
+         lock = await self._get_domain_lock(domain)
+         async with lock:
+             now = self._clock()
+             last = self._domain_last.get(domain)
+             if last is not None:
+                 wait = self.per_domain_interval_seconds - (now - last)
+                 if wait > 0:
+                     await self._sleep(wait)
+             self._domain_last[domain] = self._clock()
+
+     async def _get_domain_lock(self, domain: str) -> asyncio.Lock:
+         async with self._domain_guard:
+             lock = self._domain_locks.get(domain)
+             if lock is None:
+                 lock = asyncio.Lock()
+                 self._domain_locks[domain] = lock
+             return lock
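
Because the limiter accepts injectable `clock` and `sleep` callables, its back-off can be observed without real waiting; a sketch under that assumption:

```python
import asyncio

from crawler.rate_limit import RequestRateLimiter

async def demo() -> None:
    fake_now = [0.0]
    sleeps: list[float] = []

    async def fake_sleep(seconds: float) -> None:
        sleeps.append(seconds)
        fake_now[0] += seconds  # advance the fake clock instead of waiting

    limiter = RequestRateLimiter(
        global_interval_seconds=0.0,   # disable the global gate for clarity
        per_domain_interval_seconds=2.0,
        clock=lambda: fake_now[0],
        sleep=fake_sleep,
    )
    await limiter.acquire("example.com")  # first hit: no wait recorded
    await limiter.acquire("example.com")  # second hit: 2.0 s back-off
    print(sleeps)  # [2.0]

asyncio.run(demo())
```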
crawler/robots.py ADDED
@@ -0,0 +1,96 @@
+ from __future__ import annotations
+
+ import asyncio
+ import time
+ from dataclasses import dataclass
+ from urllib.parse import urlsplit
+ from urllib.robotparser import RobotFileParser
+
+ import aiohttp
+
+
+ @dataclass
+ class _RobotsCacheEntry:
+     parser: RobotFileParser | None
+     expires_at: float
+
+
+ class RobotsPolicy:
+     def __init__(
+         self,
+         session: aiohttp.ClientSession,
+         user_agent: str,
+         *,
+         cache_ttl_seconds: float = 3600.0,
+         fail_closed: bool = True,
+         max_bytes: int = 300_000,
+     ) -> None:
+         self.session = session
+         self.user_agent = user_agent
+         self.cache_ttl_seconds = max(1.0, float(cache_ttl_seconds))
+         self.fail_closed = bool(fail_closed)
+         self.max_bytes = int(max_bytes)
+
+         self._cache: dict[str, _RobotsCacheEntry] = {}
+         self._cache_lock = asyncio.Lock()
+         self._origin_locks: dict[str, asyncio.Lock] = {}
+
+     async def can_fetch(self, url: str) -> bool:
+         parts = urlsplit(url)
+         host = (parts.hostname or "").lower().strip(".")
+         scheme = parts.scheme.lower()
+         if scheme not in {"http", "https"} or not host:
+             return False
+
+         origin = f"{scheme}://{host}"
+         parser = await self._get_parser(origin)
+         if parser is None:
+             return not self.fail_closed
+         return parser.can_fetch(self.user_agent, url)
+
+     async def _get_parser(self, origin: str) -> RobotFileParser | None:
+         now = time.monotonic()
+         async with self._cache_lock:
+             cached = self._cache.get(origin)
+             if cached and cached.expires_at > now:
+                 return cached.parser
+
+             lock = self._origin_locks.get(origin)
+             if lock is None:
+                 lock = asyncio.Lock()
+                 self._origin_locks[origin] = lock
+
+         async with lock:
+             now = time.monotonic()
+             async with self._cache_lock:
+                 cached = self._cache.get(origin)
+                 if cached and cached.expires_at > now:
+                     return cached.parser
+
+             parser = await self._download_and_parse(origin)
+             async with self._cache_lock:
+                 self._cache[origin] = _RobotsCacheEntry(
+                     parser=parser,
+                     expires_at=time.monotonic() + self.cache_ttl_seconds,
+                 )
+             return parser
+
+     async def _download_and_parse(self, origin: str) -> RobotFileParser | None:
+         robots_url = f"{origin}/robots.txt"
+         try:
+             async with self.session.get(robots_url, allow_redirects=True) as response:
+                 if response.status >= 400:
+                     return None
+
+                 raw = await response.content.read(self.max_bytes + 1)
+                 if len(raw) > self.max_bytes:
+                     raw = raw[: self.max_bytes]
+                 charset = response.charset or "utf-8"
+                 text = raw.decode(charset, errors="ignore")
+         except Exception:
+             return None
+
+         parser = RobotFileParser()
+         parser.set_url(robots_url)
+         parser.parse(text.splitlines())
+         return parser
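
A minimal usage sketch for `RobotsPolicy` (requires network access): the first `can_fetch` per origin downloads and caches that origin's robots.txt, and later calls for the same origin are served from the TTL cache.

```python
import asyncio

import aiohttp

from crawler.robots import RobotsPolicy

async def demo() -> None:
    async with aiohttp.ClientSession() as session:
        policy = RobotsPolicy(session, user_agent="HFDBContCrawler/1.0")
        print(await policy.can_fetch("https://www.python.org/"))
        print(await policy.can_fetch("https://www.python.org/about/"))  # cache hit
        print(await policy.can_fetch("ftp://example.com/"))  # non-http scheme: False

asyncio.run(demo())
```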
crawler/shards.py ADDED
@@ -0,0 +1,128 @@
from __future__ import annotations

import asyncio
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import pyarrow as pa
import pyarrow.parquet as pq

from .config import CrawlerConfig
from .models import CrawlStats
from .tokenizer import LiveShardTokenizer
from .upload import HfShardUploader


class ShardLimitReached(RuntimeError):
    pass


PARQUET_SCHEMA = pa.schema(
    [
        ("text", pa.string()),
        ("url", pa.string()),
        ("domain", pa.string()),
        ("timestamp", pa.string()),
    ]
)


class ParquetShardWriter:
    def __init__(self, config: CrawlerConfig, stats: CrawlStats):
        self.config = config
        self.stats = stats
        self.buffer: list[dict[str, Any]] = []
        self.shard_index = 0
        self.uploader: HfShardUploader | None = None
        self.live_tokenizer = LiveShardTokenizer()

    async def initialize(self) -> None:
        self.config.output_dir.mkdir(parents=True, exist_ok=True)
        if not self.config.enable_hf_upload:
            return

        self.uploader = HfShardUploader(
            repo_id=self.config.hf_repo_id,
            token=self.config.hf_token,
            repo_type=self.config.hf_repo_type,
            private_repo=self.config.hf_private_repo,
            path_prefix=self.config.hf_path_prefix,
        )
        await self.uploader.initialize()

    async def consume(self, record_queue: asyncio.Queue[dict[str, Any] | None]) -> None:
        while True:
            item = await record_queue.get()
            if item is None:
                record_queue.task_done()
                break

            try:
                self.buffer.append(item)
                if len(self.buffer) >= self.config.shard_size_rows:
                    await self.flush()
            finally:
                record_queue.task_done()

        if self.buffer:
            await self.flush()

    async def flush(self) -> None:
        if not self.buffer:
            return

        if self.shard_index >= self.config.max_shards:
            raise ShardLimitReached(f"Reached shard cap of {self.config.max_shards}.")

        rows = self.buffer
        self.buffer = []
        normalized_rows = [
            {
                "text": str(row.get("text", "")),
                "url": str(row.get("url", "")),
                "domain": str(row.get("domain", "")),
                "timestamp": str(row.get("timestamp", "")),
            }
            for row in rows
            if row.get("text")
        ]
        if not normalized_rows:
            return

        timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
        shard_name = f"shard-{timestamp}-{self.shard_index:04d}.parquet"
        shard_path = self.config.output_dir / shard_name

        table = pa.Table.from_pylist(normalized_rows, schema=PARQUET_SCHEMA)
        await asyncio.to_thread(
            pq.write_table,
            table,
            shard_path,
            compression=self.config.parquet_compression,
            compression_level=self.config.parquet_compression_level,
            use_dictionary=True,
        )

        self.shard_index += 1
        self.stats.written_shards = self.shard_index
        self.stats.stored_rows += len(normalized_rows)
        token_rows, token_count = await asyncio.to_thread(
            self.live_tokenizer.tokenize_shard_text, shard_path
        )
        self.stats.tokenized_shards += 1
        self.stats.tokenized_rows += token_rows
        self.stats.tokenized_tokens += token_count

        if self.config.enable_hf_upload:
            ok = await self._upload_and_delete(shard_path, rows=len(normalized_rows))
            if ok:
                self.stats.uploaded_shards += 1

        if self.shard_index >= self.config.max_shards:
            raise ShardLimitReached(f"Reached shard cap of {self.config.max_shards}.")

    async def _upload_and_delete(self, shard_path: Path, rows: int) -> bool:
        if self.uploader is None:
            raise RuntimeError("Uploader not initialized.")
        return await self.uploader.upload_and_delete(shard_path, rows)
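
`ParquetShardWriter.consume` drains a queue of row dicts and treats `None` as the shutdown sentinel; `flush` drops text-less rows, writes a timestamped Parquet shard off the event loop via `asyncio.to_thread`, tokenizes it for live stats, and raises `ShardLimitReached` once `max_shards` is hit. A driver sketch, assuming `config` and `stats` are the `CrawlerConfig`/`CrawlStats` instances built elsewhere in the app:

# Sketch: wiring a producer to the shard writer.
async def run_writer(config: CrawlerConfig, stats: CrawlStats) -> None:
    queue: asyncio.Queue[dict | None] = asyncio.Queue()
    writer = ParquetShardWriter(config, stats)
    await writer.initialize()
    consumer = asyncio.create_task(writer.consume(queue))

    await queue.put(
        {
            "text": "page text",
            "url": "https://example.com/",
            "domain": "example.com",
            "timestamp": "2024-01-01T00:00:00Z",
        }
    )
    await queue.put(None)  # sentinel: flush the remaining buffer and stop

    try:
        await consumer
    except ShardLimitReached:
        pass  # expected once the shard cap is reached
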
crawler/tokenizer.py ADDED
@@ -0,0 +1,26 @@
from __future__ import annotations

from pathlib import Path

import pyarrow.parquet as pq
import tiktoken


class LiveShardTokenizer:
    def __init__(self, encoding_name: str = "cl100k_base") -> None:
        self.encoding = tiktoken.get_encoding(encoding_name)

    def tokenize_shard_text(self, shard_path: Path) -> tuple[int, int]:
        table = pq.read_table(shard_path, columns=["text"])
        if "text" not in table.column_names:
            return 0, 0

        rows = 0
        token_count = 0
        for value in table.column("text").to_pylist():
            if value is None:
                continue
            text = str(value)
            rows += 1
            token_count += len(self.encoding.encode(text, disallowed_special=()))
        return rows, token_count
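
`disallowed_special=()` is the load-bearing detail here: scraped pages can legitimately contain strings like `<|endoftext|>`, and without that argument `tiktoken` would raise on them instead of encoding them as ordinary text. Counting a single shard looks like this (the shard path is a hypothetical example):

from pathlib import Path

tokenizer = LiveShardTokenizer()  # defaults to the cl100k_base encoding
rows, tokens = tokenizer.tokenize_shard_text(
    Path("crawl_output/shard-20240101T000000Z-0000.parquet")  # hypothetical path
)
print(f"{rows} rows, {tokens} tokens")
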
crawler/upload.py ADDED
@@ -0,0 +1,60 @@
from __future__ import annotations

import asyncio
import contextlib
from pathlib import Path

from huggingface_hub import HfApi


class HfShardUploader:
    def __init__(
        self,
        *,
        repo_id: str,
        token: str,
        repo_type: str = "dataset",
        private_repo: bool = False,
        path_prefix: str = "crawl_shards",
    ) -> None:
        self.repo_id = repo_id.strip()
        self.token = token.strip()
        self.repo_type = repo_type
        self.private_repo = bool(private_repo)
        self.path_prefix = path_prefix.strip("/")
        self.api: HfApi | None = None

    async def initialize(self) -> None:
        self.api = HfApi(token=self.token or None)
        await asyncio.to_thread(
            self.api.create_repo,
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            private=self.private_repo,
            exist_ok=True,
        )

    async def upload_and_delete(self, shard_path: Path, rows: int) -> bool:
        if self.api is None:
            raise RuntimeError("Uploader was not initialized.")

        if self.path_prefix:
            path_in_repo = f"{self.path_prefix}/{shard_path.name}"
        else:
            path_in_repo = shard_path.name

        try:
            await asyncio.to_thread(
                self.api.upload_file,
                path_or_fileobj=str(shard_path),
                path_in_repo=path_in_repo,
                repo_id=self.repo_id,
                repo_type=self.repo_type,
                commit_message=f"Add crawl shard {shard_path.name} ({rows} rows)",
            )
        except Exception:
            return False

        with contextlib.suppress(FileNotFoundError):
            shard_path.unlink()
        return True
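
The uploader deliberately deletes a shard only after `upload_file` succeeds, so a failed commit leaves the Parquet file on disk for a retry, and every blocking `huggingface_hub` call goes through `asyncio.to_thread` to keep the event loop responsive. A sketch with placeholder credentials (never hard-code a real token):

# Sketch: repo_id and token are placeholders, not values from the diff.
import asyncio
from pathlib import Path

async def main() -> None:
    uploader = HfShardUploader(
        repo_id="username/autows-crawl",  # placeholder dataset repo
        token="hf_xxx",                   # placeholder; supply a real token at runtime
        private_repo=True,
    )
    await uploader.initialize()           # creates the dataset repo if missing
    ok = await uploader.upload_and_delete(
        Path("crawl_output/shard-0000.parquet"), rows=1000
    )
    print("uploaded:", ok)

asyncio.run(main())
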
crawler/utils.py ADDED
@@ -0,0 +1,124 @@
from __future__ import annotations

import re
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

BINARY_EXTENSIONS = {
    ".7z",
    ".avi",
    ".bin",
    ".bz2",
    ".csv",
    ".doc",
    ".docx",
    ".epub",
    ".gif",
    ".gz",
    ".ico",
    ".jpeg",
    ".jpg",
    ".json",
    ".m4a",
    ".m4v",
    ".mov",
    ".mp3",
    ".mp4",
    ".mpeg",
    ".ogg",
    ".pdf",
    ".png",
    ".ppt",
    ".pptx",
    ".rar",
    ".svg",
    ".tar",
    ".tgz",
    ".tif",
    ".tiff",
    ".wav",
    ".webm",
    ".webp",
    ".xls",
    ".xlsx",
    ".xml",
    ".xz",
    ".zip",
}

TRACKING_QUERY_KEYS = {
    "fbclid",
    "gclid",
    "mc_cid",
    "mc_eid",
    "ref",
    "source",
    "spm",
    "yclid",
}


def normalize_url(raw_url: str) -> str | None:
    try:
        parts = urlsplit(raw_url.strip())
    except ValueError:
        return None

    scheme = parts.scheme.lower()
    if scheme not in {"http", "https"}:
        return None

    host = (parts.hostname or "").lower().strip(".")
    if not host:
        return None

    try:
        port = parts.port
    except ValueError:
        return None

    if (scheme == "http" and port == 80) or (scheme == "https" and port == 443):
        netloc = host
    elif port:
        netloc = f"{host}:{port}"
    else:
        netloc = host

    path = parts.path or "/"
    path = re.sub(r"/{2,}", "/", path)

    query_pairs: list[tuple[str, str]] = []
    for key, value in parse_qsl(parts.query, keep_blank_values=True):
        lowered = key.lower()
        if lowered.startswith("utm_") or lowered in TRACKING_QUERY_KEYS:
            continue
        query_pairs.append((key, value))
    query = urlencode(query_pairs, doseq=True)

    return urlunsplit((scheme, netloc, path, query, ""))


def has_binary_extension(url: str) -> bool:
    path = urlsplit(url).path.lower()
    if not path:
        return False

    dot_index = path.rfind(".")
    if dot_index == -1:
        return False

    return path[dot_index:] in BINARY_EXTENSIONS


def is_html_response(content_type: str, final_url: str) -> bool:
    if has_binary_extension(final_url):
        return False

    if not content_type:
        return True

    lowered = content_type.lower()
    return (
        "text/html" in lowered
        or "application/xhtml+xml" in lowered
        or "text/plain" in lowered
    )
requirements.txt ADDED
@@ -0,0 +1,7 @@
aiohttp>=3.9.0
beautifulsoup4>=4.12.0
gradio>=4.44.0
huggingface_hub>=0.24.0
lxml>=4.9.0
pyarrow>=16.1.0
tiktoken>=0.7.0
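
The lower bounds cover the app's dependencies: aiohttp plus beautifulsoup4/lxml for fetching and parsing, gradio for the UI, pyarrow for shard writing, huggingface_hub for uploads, and tiktoken for live token counts. Installing into a fresh environment:

pip install -r requirements.txt
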