greeta committed
Commit 90649c5 · verified · 1 Parent(s): f89163d

Upload scraper.py

Files changed (1)
  1. scraper.py +974 -0
scraper.py ADDED
@@ -0,0 +1,974 @@
"""
FIPI scraper focused on extracting real tasks instead of generic page text.
"""

from __future__ import annotations

import asyncio
from datetime import datetime
import io
import logging
import math
import os
import re
import ssl
from typing import Dict, Iterable, List, Optional
from urllib.parse import urljoin
import zipfile

from bs4 import BeautifulSoup, Tag
import httpx
import requests

try:
    from pypdf import PdfReader
except ImportError:  # pragma: no cover - optional dependency for HF deploy
    PdfReader = None

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class FIPIScraper:
    """Collects task candidates from the FIPI bank and official demo archives."""

    SUBJECT_CONFIG = {
        "russian": {
            "label": "Русский язык",
            "dynamic_sources": [
                {
                    "kind": "ege_bank",
                    "base_url": "https://ege.fipi.ru/bank",
                    "project_guid": "AF0ED3F2557F8FFC4C06F80B6803FD26",
                    "project_name": "ЕГЭ. Русский язык",
                },
                {
                    "kind": "oge_bank",
                    "base_url": "https://oge.fipi.ru/bank",
                    "project_guid": "2F5EE3B12FE2A0EA40B06BF61A015416",
                    "project_name": "ОГЭ. Русский язык",
                },
            ],
            "official_demo_page": "https://fipi.ru/ege/demoversii-specifikacii-kodifikatory",
            "official_variant_page": "https://fipi.ru/ege/otkrytyy-bank-zadaniy-ege/otkrytyye-varianty-kim-ege",
            "archive_prefixes": ("ru_11_",),
            "variant_prefixes": ("rus_",),
            "title_keywords": ("русский язык",),
        }
    }

    TASK_TYPE_KEYWORDS = {
        "writing": ("сочинение", "эссе", "напишите", "сформулируйте", "прокомментируйте"),
        "test": ("выберите", "укажите", "ответ", "вариант", "расставьте", "определите"),
        "listening": ("аудио", "прослуш", "запись"),
        "reading": ("прочитайте", "текст", "абзац", "предложение"),
    }

    GENERIC_TITLE_PATTERNS = (
        "открытый банк",
        "демоверсии",
        "спецификации",
        "кодификаторы",
        "федеральный институт",
        "фипи",
        "нормативно",
        "документы",
        "варианты ким",
    )

    PDF_TASK_START_PATTERNS = (
        "Прочитайте текст",
        "Самостоятельно подберите",
        "В тексте выделено",
        "Укажите",
        "В одном из",
        "Отредактируйте предложение",
        "Установите соответствие",
        "Расставьте",
        "Определите",
        "Найдите",
        "Подберите",
    )

    PDF_NOISE_PATTERNS = (
        "Инструкция по выполнению работы",
        "Пояснения к демонстрационному варианту",
        "Желаем успеха",
        "Все бланки ЕГЭ заполняются",
        "Баллы, полученные",
        "После завершения работы",
        "В демонстрационном варианте представлены",
        "Часть 1 содержит 26 заданий",
        "На выполнение экзаменационной работы",
        "Ответами к заданиям 1–26 являются",
        "Бланк",
    )

    NOISE_PATTERNS = (
        "федеральный институт педагогических измерений",
        "открытый банк тестовых заданий",
        "открытый банк заданий егэ",
        "открытый банк заданий огэ",
        "подбор заданий",
        "демоверсии, спецификации, кодификаторы",
        "для предметных комиссий",
        "аналитические и методические материалы",
        "видеоконсультации разработчиков ким",
        "скачать",
        "изменения в ким",
    )

    def __init__(self, base_url: str = "https://fipi.ru"):
        self.base_url = base_url.rstrip("/")
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
        }
        # Scrape limits are tunable via environment variables.
        self.page_size = max(1, int(os.getenv("SCRAPER_BANK_PAGE_SIZE", "10")))
        self.max_bank_pages = max(1, int(os.getenv("SCRAPER_MAX_BANK_PAGES", "5")))
        self.max_demo_archives = max(1, int(os.getenv("SCRAPER_MAX_DEMO_ARCHIVES", "2")))
        self.max_demo_tasks = max(1, int(os.getenv("SCRAPER_MAX_DEMO_TASKS", "20")))
        self.min_quality_score = max(1, int(os.getenv("SCRAPER_MIN_QUALITY_SCORE", "45")))

    async def fetch_page(self, url: str) -> Optional[str]:
        response = await self._request("GET", url)
        return response.text if response else None

    async def fetch_bytes(self, url: str) -> Optional[bytes]:
        response = await self._request("GET", url)
        return response.content if response else None

    async def _request(
        self,
        method: str,
        url: str,
        *,
        data: Optional[Dict[str, str]] = None,
    ) -> Optional[httpx.Response]:
        # Certificate verification is deliberately disabled for both HTTP clients.
        ssl_context = ssl.create_default_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE

        async with httpx.AsyncClient(
            headers=self.headers,
            timeout=45.0,
            verify=ssl_context,
            follow_redirects=True,
            trust_env=False,
        ) as client:
            try:
                response = await client.request(method, url, data=data)
                response.raise_for_status()
                return response
            except httpx.HTTPError as e:
                logger.error("Async request failed for %s: %r", url, e)

        # If the async client fails, retry synchronously with requests in a worker thread.
        return await self._request_with_requests_fallback(
            method=method,
            url=url,
            data=data,
        )

    async def _request_with_requests_fallback(
        self,
        *,
        method: str,
        url: str,
        data: Optional[Dict[str, str]] = None,
    ) -> Optional[httpx.Response]:
        def do_request() -> Optional[httpx.Response]:
            session = requests.Session()
            session.trust_env = False

            try:
                response = session.request(
                    method=method,
                    url=url,
                    data=data,
                    headers=self.headers,
                    timeout=45,
                    verify=False,
                    allow_redirects=True,
                )
                response.raise_for_status()

                # Repackage the requests response as an httpx.Response so callers see one type.
                request = httpx.Request(method, url, headers=self.headers)
                return httpx.Response(
                    status_code=response.status_code,
                    headers=response.headers,
                    content=response.content,
                    request=request,
                )
            except requests.RequestException as exc:
                logger.error("Requests fallback failed for %s: %r", url, exc)
                return None
            finally:
                session.close()

        return await asyncio.to_thread(do_request)

    async def scrape_tasks(
        self,
        subject: str = "russian",
        *,
        include_official_archives: bool = True,
    ) -> List[Dict]:
        config = self.SUBJECT_CONFIG.get(subject)
        if not config:
            logger.warning("Unknown subject %s, falling back to russian", subject)
            config = self.SUBJECT_CONFIG["russian"]

        candidates: List[Dict] = []
        candidates.extend(await self.scrape_dynamic_bank(subject))
        if include_official_archives:
            candidates.extend(await self.scrape_official_archives(subject))
        validated = self._dedupe_candidates(self._filter_candidates(candidates))
        logger.info("Accepted %s task candidates after filtering", len(validated))
        return validated

    async def scrape_dynamic_bank(self, subject: str = "russian") -> List[Dict]:
        config = self.SUBJECT_CONFIG.get(subject, self.SUBJECT_CONFIG["russian"])
        tasks: List[Dict] = []

        for source in config["dynamic_sources"]:
            project_guid = source["project_guid"]
            questions_url = f"{source['base_url']}/questions.php"
            total_tasks = None

            for page_index in range(self.max_bank_pages):
                html = await self._fetch_bank_page(
                    questions_url=questions_url,
                    project_guid=project_guid,
                    page_index=page_index,
                )
                if not html:
                    break

                if total_tasks is None:
                    total_tasks = self._extract_total_count(html)
                    if total_tasks:
                        max_pages = math.ceil(total_tasks / self.page_size)
                        logger.info(
                            "Bank %s reports %s tasks, scraping up to %s pages",
                            source["project_name"],
                            total_tasks,
                            min(max_pages, self.max_bank_pages),
                        )

                soup = BeautifulSoup(html, "lxml")
                blocks = soup.select("div.qblock")
                if not blocks:
                    logger.warning(
                        "No qblock nodes found for %s page=%s via primary fetch, retrying POST search",
                        source["project_name"],
                        page_index,
                    )
                    html = await self._fetch_bank_page(
                        questions_url=questions_url,
                        project_guid=project_guid,
                        page_index=page_index,
                        force_post=True,
                    )
                    if not html:
                        break

                    soup = BeautifulSoup(html, "lxml")
                    blocks = soup.select("div.qblock")
                    if not blocks:
                        logger.warning(
                            "No qblock nodes found for %s page=%s after retry",
                            source["project_name"],
                            page_index,
                        )
                        break

                for block in blocks:
                    task = self._parse_bank_question_block(
                        block,
                        project_guid=project_guid,
                        source_name=source["project_name"],
                        questions_url=questions_url,
                    )
                    if task:
                        tasks.append(task)

                if total_tasks is not None and (page_index + 1) * self.page_size >= total_tasks:
                    break

        logger.info("Collected %s candidates from the dynamic bank", len(tasks))
        return tasks

    async def _fetch_bank_page(
        self,
        *,
        questions_url: str,
        project_guid: str,
        page_index: int,
        force_post: bool = False,
    ) -> Optional[str]:
        page_url = (
            f"{questions_url}?proj={project_guid}"
            f"&page={page_index}&pagesize={self.page_size}"
        )

        if not force_post:
            html = await self.fetch_page(page_url)
            if html:
                return html

        return await self._post_bank_page(
            questions_url=questions_url,
            project_guid=project_guid,
            page_index=page_index,
        )

    async def _post_bank_page(
        self,
        *,
        questions_url: str,
        project_guid: str,
        page_index: int,
    ) -> Optional[str]:
        response = await self._request(
            "POST",
            questions_url,
            data={
                "search": "1",
                "pagesize": str(self.page_size),
                "proj": project_guid,
                "page": str(page_index),
            },
        )
        return response.text if response else None

    def _extract_total_count(self, html: str) -> Optional[int]:
        match = re.search(r"setQCount\((\d+)", html)
        return int(match.group(1)) if match else None

    def _parse_bank_question_block(
        self,
        block: Tag,
        *,
        project_guid: str,
        source_name: str,
        questions_url: str,
    ) -> Optional[Dict]:
        prompt_cell = block.select_one("td.cell_0")
        if not prompt_cell:
            return None

        content = self._clean_text(prompt_cell.get_text("\n", strip=True))
        if not content:
            return None

        title = self._build_title_from_content(content, fallback=source_name)
        question_guid = self._extract_block_guid(block)
        variants = self._extract_variants_from_block(block)
        images = self._extract_images(prompt_cell, base_url=questions_url)

        return {
            "title": title,
            "content": content,
            "source_url": f"{questions_url}?proj={project_guid}&qid={question_guid}",
            "task_type": self._detect_task_type(title, content),
            "images": images,
            "variants": variants,
            "scraped_at": datetime.utcnow().isoformat(),
            "source_kind": "dynamic_bank",
            "task_guid": question_guid,
        }

    def _extract_block_guid(self, block: Tag) -> str:
        guid_input = block.select_one("form input[name='guid']")
        if guid_input and guid_input.get("value"):
            return guid_input["value"]
        return block.get("id", "").lstrip("q")

    def _extract_variants_from_block(self, block: Tag) -> List[str]:
        variants: List[str] = []

        for label in block.find_all("label"):
            text = self._clean_text(label.get_text(" ", strip=True))
            if text:
                variants.append(text)

        if not variants:
            for option in block.find_all("option"):
                text = self._clean_text(option.get_text(" ", strip=True))
                if text and text.lower() != "выбор":
                    variants.append(text)

        return variants[:10]

    async def scrape_official_archives(self, subject: str = "russian") -> List[Dict]:
        config = self.SUBJECT_CONFIG.get(subject, self.SUBJECT_CONFIG["russian"])
        archive_links = await self._discover_official_archive_links(config)
        variant_links = await self._discover_official_variant_links(config)
        document_links = self._sort_document_links(archive_links + variant_links)
        tasks: List[Dict] = []

        if not document_links:
            logger.warning("No official archive links found for %s", subject)
            return tasks

        if PdfReader is None:
            logger.warning("pypdf is not installed, skipping official PDF extraction")
            return tasks

        for document_url in document_links[: self.max_demo_archives]:
            document_bytes = await self.fetch_bytes(document_url)
            if not document_bytes:
                continue
            tasks.extend(self._extract_tasks_from_document_bytes(document_bytes, document_url))

        logger.info("Collected %s candidates from official archives", len(tasks))
        return tasks

    async def _discover_official_archive_links(self, config: Dict) -> List[str]:
        html = await self.fetch_page(config["official_demo_page"])
        if not html:
            return []

        soup = BeautifulSoup(html, "lxml")
        prefixes = config["archive_prefixes"]
        archive_links: List[str] = []

        for link in soup.find_all("a", href=True):
            href = link["href"]
            absolute = href if href.startswith("http") else urljoin(config["official_demo_page"], href)
            href_lower = absolute.lower()
            if not href_lower.endswith(".zip"):
                continue
            if any(prefix in href_lower for prefix in prefixes):
                archive_links.append(absolute)

        def sort_key(url: str) -> int:
            match = re.search(r"/(20\d{2})/", url)
            return int(match.group(1)) if match else 0

        archive_links.sort(key=sort_key, reverse=True)
        return archive_links

    async def _discover_official_variant_links(self, config: Dict) -> List[str]:
        variant_page = config.get("official_variant_page")
        if not variant_page:
            return []

        html = await self.fetch_page(variant_page)
        if not html:
            return []

        soup = BeautifulSoup(html, "lxml")
        prefixes = config.get("variant_prefixes", ())
        links: List[str] = []

        for link in soup.find_all("a", href=True):
            href = link["href"]
            absolute = href if href.startswith("http") else urljoin(variant_page, href)
            href_lower = absolute.lower()
            if not href_lower.endswith((".zip", ".pdf")):
                continue
            if "braille" in href_lower:
                continue
            filename = absolute.rsplit("/", 1)[-1].lower()
            if prefixes and not any(filename.startswith(prefix) for prefix in prefixes):
                continue
            links.append(absolute)

        return self._sort_document_links(links)

    def _sort_document_links(self, links: Iterable[str]) -> List[str]:
        def sort_key(url: str) -> tuple[int, str]:
            match = re.search(r"(20\d{2})", url)
            return (int(match.group(1)) if match else 0, url)

        return sorted(set(links), key=sort_key, reverse=True)

    def _extract_tasks_from_document_bytes(self, document_bytes: bytes, document_url: str) -> List[Dict]:
        if document_url.lower().endswith(".zip"):
            return self._extract_tasks_from_archive(document_bytes, document_url)
        if document_url.lower().endswith(".pdf"):
            return self._extract_tasks_from_pdf_document(
                document_bytes,
                document_url=document_url,
                document_name=document_url.rsplit("/", 1)[-1],
            )
        return []

    # NOTE: _extract_tasks_from_archive, _extract_tasks_from_demo_text and _slice_demo_section
    # are each defined twice in this class; the later definitions shadow these earlier ones,
    # so only the versions further below are used at runtime.
    def _extract_tasks_from_archive(self, archive_bytes: bytes, archive_url: str) -> List[Dict]:
        tasks: List[Dict] = []

        try:
            with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
                for member_name in archive.namelist():
                    if not member_name.lower().endswith(".pdf"):
                        continue
                    if "демо" not in member_name.lower() and "demo" not in member_name.lower():
                        continue

                    text = self._extract_text_from_pdf_bytes(archive.read(member_name))
                    if not text:
                        continue

                    year_match = re.search(r"(20\d{2})", archive_url)
                    year = year_match.group(1) if year_match else "unknown"
                    tasks.extend(
                        self._extract_tasks_from_demo_text(
                            text,
                            archive_url=archive_url,
                            document_name=member_name,
                            year=year,
                        )
                    )
        except zipfile.BadZipFile:
            logger.error("Invalid archive %s", archive_url)

        return tasks

    def _extract_text_from_pdf_bytes(self, pdf_bytes: bytes) -> str:
        if PdfReader is None:
            return ""

        try:
            reader = PdfReader(io.BytesIO(pdf_bytes))
        except Exception as e:  # pragma: no cover - parser-dependent
            logger.error("Failed to open PDF: %s", e)
            return ""

        pages: List[str] = []
        for page in reader.pages:
            try:
                page_text = page.extract_text() or ""
            except Exception:  # pragma: no cover - parser-dependent
                page_text = ""
            if page_text:
                pages.append(page_text)

        return self._clean_text("\n".join(pages))

    def _extract_tasks_from_demo_text(
        self,
        text: str,
        *,
        archive_url: str,
        document_name: str,
        year: str,
    ) -> List[Dict]:
        tasks: List[Dict] = []
        if not text:
            return tasks

        bounded_text = text
        if not bounded_text:
            return tasks

        pattern = re.compile(
            r"(?ms)(?:^|\n)(\d{1,2})[\.\)]\s*(.+?)(?=(?:\n\d{1,2}[\.\)])|(?:\nЧасть\s+\d)|\Z)"
        )

        for match in pattern.finditer(bounded_text):
            task_number = int(match.group(1))
            content = self._clean_text(match.group(2))
            if len(content) < 80:
                continue

            title = f"Демоверсия ЕГЭ {year}. Задание {task_number}"
            tasks.append(
                {
                    "title": title,
                    "content": content,
                    "source_url": f"{archive_url}#task-{task_number}",
                    "task_type": self._detect_task_type(title, content),
                    "images": [],
                    "variants": self._extract_variants(content),
                    "scraped_at": datetime.utcnow().isoformat(),
                    "source_kind": "official_demo_pdf",
                    "document_name": document_name,
                    "task_number": task_number,
                }
            )

            if len(tasks) >= self.max_demo_tasks:
                break

        return tasks

    def _slice_demo_section(self, text: str) -> str:
        start = re.search(r"(Часть\s*1|Ответами к заданиям)", text, re.IGNORECASE)
        if not start:
            return text

        end = re.search(r"(Система оценивания|Ключи|Ответы)", text[start.start() :], re.IGNORECASE)
        if not end:
            return text[start.start() :]

        return text[start.start() : start.start() + end.start()]

    def _extract_tasks_from_archive(self, archive_bytes: bytes, archive_url: str) -> List[Dict]:
        tasks: List[Dict] = []

        try:
            with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
                for member_name in archive.namelist():
                    if not member_name.lower().endswith(".pdf"):
                        continue
                    if not self._should_parse_pdf_member(member_name, archive_url):
                        continue
                    tasks.extend(
                        self._extract_tasks_from_pdf_document(
                            archive.read(member_name),
                            document_url=archive_url,
                            document_name=member_name,
                        )
                    )
        except zipfile.BadZipFile:
            logger.error("Invalid archive %s", archive_url)

        return tasks

    def _should_parse_pdf_member(self, member_name: str, document_url: str) -> bool:
        member_lower = member_name.lower()
        if any(token in member_lower for token in ("спец", "кодиф", "критер", "ответ", "аудио")):
            return False
        if "otkrytyye-varianty-kim-ege" in document_url.lower():
            return True
        return "демо" in member_lower or "demo" in member_lower

    def _extract_tasks_from_pdf_document(
        self,
        pdf_bytes: bytes,
        *,
        document_url: str,
        document_name: str,
    ) -> List[Dict]:
        text = self._extract_text_from_pdf_bytes(pdf_bytes)
        if not text:
            return []

        year_match = re.search(r"(20\d{2})", document_url)
        year = year_match.group(1) if year_match else "unknown"
        return self._extract_tasks_from_demo_text(
            text,
            archive_url=document_url,
            document_name=document_name,
            year=year,
            source_kind=self._detect_document_source_kind(document_url),
        )

    def _detect_document_source_kind(self, document_url: str) -> str:
        if "otkrytyye-varianty-kim-ege" in document_url.lower():
            return "official_open_variant_pdf"
        return "official_demo_pdf"

    def _extract_tasks_from_demo_text(
        self,
        text: str,
        *,
        archive_url: str,
        document_name: str,
        year: str,
        source_kind: str = "official_demo_pdf",
    ) -> List[Dict]:
        tasks: List[Dict] = []
        if not text:
            return tasks

        bounded_text = text
        if not bounded_text:
            return tasks

        for raw_block in self._split_pdf_into_task_blocks(bounded_text):
            content = self._cleanup_pdf_task_block(raw_block)
            content = self._trim_to_task_start(content)
            if not self._looks_like_official_task_block(content):
                continue

            task_number = len(tasks) + 1
            document_label = "Открытый вариант ЕГЭ" if source_kind == "official_open_variant_pdf" else "Демоверсия ЕГЭ"
            title = f"{document_label} {year}. Задание {task_number}"
            tasks.append(
                {
                    "title": title,
                    "content": content,
                    "source_url": f"{archive_url}#task-{task_number}",
                    "task_type": self._detect_task_type(title, content),
                    "images": [],
                    "variants": self._extract_variants(content),
                    "scraped_at": datetime.utcnow().isoformat(),
                    "source_kind": source_kind,
                    "document_name": document_name,
                    "task_number": task_number,
                }
            )

            if len(tasks) >= self.max_demo_tasks:
                break

        return tasks

    def _split_pdf_into_task_blocks(self, text: str) -> List[str]:
        answer_pattern = re.compile(r"(?:^|\n)\s*Ответ\s*:\s*[_\.\s]*", re.IGNORECASE)
        blocks: List[str] = []
        last_pos = 0

        for match in answer_pattern.finditer(text):
            block = text[last_pos:match.start()]
            if block.strip():
                blocks.append(block)
            last_pos = match.end()

        return blocks

    def _cleanup_pdf_task_block(self, block: str) -> str:
        lines: List[str] = []
        for raw_line in block.splitlines():
            line = self._clean_text(raw_line)
            if not line:
                continue
            lower = line.lower()
            if line == "&%end_page&%":
                continue
            if re.fullmatch(r"\d{1,2}", line):
                continue
            if re.search(r"\d+\s*/\s*\d+$", line):
                continue
            if lower.startswith(("демонстрационный вариант егэ", "открытый вариант ким егэ", "единый государственный экзамен")):
                continue
            if lower.startswith("© "):
                continue
            lines.append(line)

        return self._clean_text("\n".join(lines))

    def _trim_to_task_start(self, text: str) -> str:
        if not text:
            return text

        starts = [text.find(pattern) for pattern in self.PDF_TASK_START_PATTERNS if text.find(pattern) >= 0]
        if starts:
            return text[min(starts):].strip()
        return text.strip()

    def _looks_like_official_task_block(self, text: str) -> bool:
        if len(text) < 70 or len(text) > 6000:
            return False

        lower = text.lower()
        if any(pattern.lower() in lower for pattern in self.PDF_NOISE_PATTERNS):
            return False

        return any(pattern.lower() in lower for pattern in self.PDF_TASK_START_PATTERNS)

    def _slice_demo_section(self, text: str) -> str:
        start_matches = list(re.finditer(r"(?m)^\s*Часть\s*1\s*$", text, re.IGNORECASE))
        if start_matches:
            start_pos = start_matches[-1].start()
        else:
            fallback = list(re.finditer(r"Ответами к заданиям", text, re.IGNORECASE))
            if not fallback:
                return text
            start_pos = fallback[-1].start()

        end = re.search(
            r"(Часть\s*2|Задание\s*27|Система оценивания|Критерии оценивания|Ключи)",
            text[start_pos:],
            re.IGNORECASE,
        )
        if not end:
            return text[start_pos:]

        return text[start_pos : start_pos + end.start()]

    def parse_task_page(self, html: str, url: str) -> Optional[Dict]:
        if not html:
            return None

        soup = BeautifulSoup(html, "lxml")
        for selector in (
            "div.qblock",
            "article",
            "main article",
            ".field--name-body",
            ".content",
            "main",
            "body",
        ):
            container = soup.select_one(selector)
            if not container:
                continue

            candidate = self._build_candidate_from_container(container, url)
            if candidate:
                return candidate

        return None

    def _build_candidate_from_container(self, container: Tag, url: str) -> Optional[Dict]:
        cloned = BeautifulSoup(str(container), "lxml")
        root = cloned.find()
        if root is None:
            return None

        for element in root.find_all(["script", "style", "nav", "header", "footer", "form", "button", "aside"]):
            element.decompose()

        title_tag = root.find(["h1", "h2", "h3", "strong", "b"])
        title = self._clean_text(title_tag.get_text(" ", strip=True)) if title_tag else ""
        content = self._clean_text(root.get_text("\n", strip=True))
        if not title:
            title = self._build_title_from_content(content, fallback=url)

        images = self._extract_images(root, base_url=url)
        candidate = {
            "title": title,
            "content": content,
            "source_url": url,
            "task_type": self._detect_task_type(title, content),
            "images": images,
            "variants": self._extract_variants(content),
            "scraped_at": datetime.utcnow().isoformat(),
            "source_kind": "generic_html",
        }
        return candidate if self._passes_quality_gate(candidate) else None

    async def scrape_task_by_id(self, task_id: str) -> Optional[Dict]:
        config = self.SUBJECT_CONFIG["russian"]["dynamic_sources"][0]
        html = await self.fetch_page(
            f"{config['base_url']}/questions.php?proj={config['project_guid']}&qid={task_id}"
        )
        if not html:
            return None

        soup = BeautifulSoup(html, "lxml")
        block = soup.select_one("div.qblock")
        if not block:
            return None

        return self._parse_bank_question_block(
            block,
            project_guid=config["project_guid"],
            source_name=config["project_name"],
            questions_url=f"{config['base_url']}/questions.php",
        )

    async def search_tasks(self, query: str) -> List[Dict]:
        query_lower = query.lower().strip()
        tasks = await self.scrape_tasks(subject="russian")
        return [
            task
            for task in tasks
            if query_lower in task.get("title", "").lower()
            or query_lower in task.get("content", "").lower()
        ]

    def _filter_candidates(self, candidates: Iterable[Dict]) -> List[Dict]:
        accepted: List[Dict] = []
        for candidate in candidates:
            if self._passes_quality_gate(candidate):
                accepted.append(candidate)
        return accepted

    def _dedupe_candidates(self, candidates: Iterable[Dict]) -> List[Dict]:
        deduped: List[Dict] = []
        seen_keys = set()

        for candidate in candidates:
            normalized = self._clean_text(candidate.get("content", ""))[:400]
            key = (candidate.get("source_url", ""), normalized)
            if key in seen_keys:
                continue
            seen_keys.add(key)
            deduped.append(candidate)

        return deduped

    def _passes_quality_gate(self, candidate: Dict) -> bool:
        score = self._score_candidate(candidate)
        candidate["quality_score"] = score
        return score >= self.min_quality_score

    def _score_candidate(self, candidate: Dict) -> int:
        title = candidate.get("title", "").lower()
        content = candidate.get("content", "").lower()
        source_kind = candidate.get("source_kind", "")
        length = len(content)

        score = 0

        if source_kind == "dynamic_bank":
            score += 60
        elif source_kind in {"official_demo_pdf", "official_open_variant_pdf"}:
            score += 50
        else:
            score += 10

        if 80 <= length <= 3500:
            score += 15
        elif length > 5000:
            score -= 20
        else:
            score -= 10

        if any(keyword in content for keywords in self.TASK_TYPE_KEYWORDS.values() for keyword in keywords):
            score += 10

        if any(pattern.lower() in content for pattern in self.PDF_TASK_START_PATTERNS):
            score += 10

        if re.search(r"\b\d+\b", content):
            score += 5

        if any(pattern in title for pattern in self.GENERIC_TITLE_PATTERNS):
            score -= 45

        noise_hits = sum(1 for pattern in self.NOISE_PATTERNS if pattern in content[:1200])
        score -= min(noise_hits * 8, 32)

        if content.count("\n") > 80:
            score -= 10

        return score

    def _detect_task_type(self, title: str, content: str) -> str:
        text = f"{title} {content}".lower()

        for task_type, keywords in self.TASK_TYPE_KEYWORDS.items():
            if any(keyword in text for keyword in keywords):
                return task_type

        return "other"

    def _extract_variants(self, content: str) -> List[str]:
        matches = re.findall(r"(?:^|\n)(?:[1-6]|[A-DА-Г])[.)]\s*([^\n]{2,200})", content)
        return [self._clean_text(match) for match in matches[:10]]

    def _extract_images(self, container: Tag, *, base_url: str) -> List[str]:
        images: List[str] = []
        for img in container.find_all("img"):
            src = img.get("src") or img.get("data-src")
            if not src:
                continue
            images.append(src if src.startswith("http") else urljoin(base_url, src))
        return images[:10]

    def _build_title_from_content(self, content: str, fallback: str) -> str:
        first_line = next((line.strip() for line in content.splitlines() if line.strip()), "")
        title = self._clean_text(first_line)
        if not title:
            title = fallback
        return title[:160]

    def _clean_text(self, text: str) -> str:
        text = text.replace("\xa0", " ")
        # Collapse letter-spaced runs like "с л о в о" back into a single word.
        text = re.sub(
            r"\b(?:[A-Za-zА-Яа-яЁё]\s+){2,}[A-Za-zА-Яа-яЁё]\b",
            lambda match: match.group(0).replace(" ", ""),
            text,
        )
        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()
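
For a quick end-to-end check, a minimal driver along these lines should work. It is a sketch rather than part of the upload, and it assumes the file above is importable as scraper from the working directory:

# Illustrative usage sketch; FIPIScraper comes from the scraper.py added in this commit.
import asyncio

from scraper import FIPIScraper


async def main() -> None:
    # Scrape the Russian-language bank plus official archives, then print a small sample.
    fipi = FIPIScraper()
    tasks = await fipi.scrape_tasks(subject="russian")
    for task in tasks[:5]:
        # quality_score is attached by the quality gate before a task is accepted.
        print(task["task_type"], task["quality_score"], task["title"])


if __name__ == "__main__":
    asyncio.run(main())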