greeta committed on
Commit
412e059
·
verified ·
1 Parent(s): e6bac30

Upload 2 files

Browse files
Files changed (2) hide show
  1. requirements.txt +21 -18
  2. scraper.py +871 -0
requirements.txt CHANGED
@@ -1,23 +1,26 @@
1
- # FastAPI
2
- fastapi==0.109.0
3
- uvicorn[standard]==0.27.0
4
 
5
- # HTTP clients
6
- httpx==0.25.2
7
- requests==2.31.0
8
- aiohttp==3.9.1
9
 
10
- # Parsing
11
- beautifulsoup4==4.12.3
12
- lxml==5.1.0
13
 
14
- # Pydantic
15
- pydantic==2.5.3
16
- pydantic-settings==2.1.0
 
 
 
 
 
17
 
18
- # Utilities
19
- python-dotenv==1.0.0
20
- playwright==1.40.0
21
 
22
- # Supabase (закомментировано - вызывает конфликт версий)
23
- # supabase==2.3.4
 
1
+ # ============================================
2
+ # AI Scraper ФИПИ - Минимальные зависимости
3
+ # ============================================
4
 
5
+ # FastAPI и сервер
6
+ fastapi>=0.109.0
7
+ uvicorn[standard]>=0.27.0
 
8
 
9
+ # Supabase (база данных) - базовая версия
10
+ supabase>=2.0.0
 
11
 
12
+ # HTTP клиенты
13
+ httpx>=0.27.0
14
+ requests>=2.32.0
15
+
16
+ # Парсинг
17
+ beautifulsoup4>=4.12.3
18
+ lxml>=5.1.0
19
+ pypdf>=5.0.0
20
 
21
+ # Pydantic
22
+ pydantic>=2.5.3
23
+ pydantic-settings>=2.1.0
24
 
25
+ # Утилиты
26
+ python-dotenv>=1.0.0
scraper.py ADDED
@@ -0,0 +1,871 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FIPI scraper focused on extracting real tasks instead of generic page text.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from datetime import datetime
8
+ import io
9
+ import logging
10
+ import math
11
+ import os
12
+ import re
13
+ import ssl
14
+ from typing import Dict, Iterable, List, Optional
15
+ from urllib.parse import urljoin
16
+ import zipfile
17
+
18
+ from bs4 import BeautifulSoup, Tag
19
+ import httpx
20
+
21
+ try:
22
+ from pypdf import PdfReader
23
+ except ImportError: # pragma: no cover - optional dependency for HF deploy
24
+ PdfReader = None
25
+
26
+ logging.basicConfig(level=logging.INFO)
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
class FIPIScraper:
    """Collects task candidates from the FIPI bank and official demo archives.

    Two acquisition paths are combined:

    * the dynamic question bank at ``ege.fipi.ru`` / ``oge.fipi.ru`` (paged HTML),
    * official demo-variant / open-variant PDF archives linked from ``fipi.ru``.

    Candidates from both paths are normalized into plain dicts, scored by a
    quality heuristic, filtered and deduplicated before being returned.

    NOTE(review): the original file defined ``_extract_tasks_from_archive``,
    ``_extract_tasks_from_demo_text`` and ``_slice_demo_section`` twice; the
    first definitions were dead code shadowed by the later ones.  Only the
    active (later) definitions are kept here.
    """

    # Per-subject scraping configuration: dynamic-bank endpoints plus the
    # static fipi.ru pages that link the official demo/variant archives.
    SUBJECT_CONFIG = {
        "russian": {
            "label": "Русский язык",
            "dynamic_sources": [
                {
                    "kind": "ege_bank",
                    "base_url": "https://ege.fipi.ru/bank",
                    "project_guid": "AF0ED3F2557F8FFC4C06F80B6803FD26",
                    "project_name": "ЕГЭ. Русский язык",
                },
                {
                    "kind": "oge_bank",
                    "base_url": "https://oge.fipi.ru/bank",
                    "project_guid": "2F5EE3B12FE2A0EA40B06BF61A015416",
                    "project_name": "ОГЭ. Русский язык",
                },
            ],
            "official_demo_page": "https://fipi.ru/ege/demoversii-specifikacii-kodifikatory",
            "official_variant_page": "https://fipi.ru/ege/otkrytyy-bank-zadaniy-ege/otkrytyye-varianty-kim-ege",
            "archive_prefixes": ("ru_11_",),
            "variant_prefixes": ("rus_",),
            "title_keywords": ("русский язык",),
        }
    }

    # Keyword heuristics used to classify a task; the first matching bucket
    # (in insertion order) wins — see _detect_task_type.
    TASK_TYPE_KEYWORDS = {
        "writing": ("сочинение", "эссе", "напишите", "сформулируйте", "прокомментируйте"),
        "test": ("выберите", "укажите", "ответ", "вариант", "расставьте", "определите"),
        "listening": ("аудио", "прослуш", "запись"),
        "reading": ("прочитайте", "текст", "абзац", "предложение"),
    }

    # Title fragments that indicate a generic portal page rather than a task.
    GENERIC_TITLE_PATTERNS = (
        "открытый банк",
        "демоверсии",
        "спецификации",
        "кодификаторы",
        "федеральный институт",
        "фипи",
        "нормативно",
        "документы",
        "варианты ким",
    )

    # Phrases that usually open a genuine task statement inside a demo PDF.
    PDF_TASK_START_PATTERNS = (
        "Прочитайте текст",
        "Самостоятельно подберите",
        "В тексте выделено",
        "Укажите",
        "В одном из",
        "Отредактируйте предложение",
        "Установите соответствие",
        "Расставьте",
        "Определите",
        "Найдите",
        "Подберите",
    )

    # Boilerplate phrases from demo PDFs; blocks containing them are rejected.
    PDF_NOISE_PATTERNS = (
        "Инструкция по выполнению работы",
        "Пояснения к демонстрационному варианту",
        "Желаем успеха",
        "Все бланки ЕГЭ заполняются",
        "Баллы, полученные",
        "После завершения работы",
        "В демонстрационном варианте представлены",
        "Часть 1 содержит 26 заданий",
        "На выполнение экзаменационной работы",
        "Ответами к заданиям 1–26 являются",
        "Бланк",
    )

    # Portal boilerplate used to down-score generic HTML captures.
    NOISE_PATTERNS = (
        "федеральный институт педагогических измерений",
        "открытый банк тестовых заданий",
        "открытый банк заданий егэ",
        "открытый банк заданий огэ",
        "подбор заданий",
        "демоверсии, спецификации, кодификаторы",
        "для предметных комиссий",
        "аналитические и методические материалы",
        "видеоконсультации разработчиков ким",
        "скачать",
        "изменения в ким",
    )

    def __init__(self, base_url: str = "https://fipi.ru"):
        """Configure headers and tunables (overridable via SCRAPER_* env vars)."""
        self.base_url = base_url.rstrip("/")
        # Browser-like headers: the FIPI portal rejects obvious bot agents.
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
        }
        # All limits are clamped to >= 1 so misconfiguration cannot disable scraping.
        self.page_size = max(1, int(os.getenv("SCRAPER_BANK_PAGE_SIZE", "10")))
        self.max_bank_pages = max(1, int(os.getenv("SCRAPER_MAX_BANK_PAGES", "5")))
        self.max_demo_archives = max(1, int(os.getenv("SCRAPER_MAX_DEMO_ARCHIVES", "2")))
        self.max_demo_tasks = max(1, int(os.getenv("SCRAPER_MAX_DEMO_TASKS", "20")))
        self.min_quality_score = max(1, int(os.getenv("SCRAPER_MIN_QUALITY_SCORE", "45")))

    async def fetch_page(self, url: str) -> Optional[str]:
        """GET *url* and return the response body as text, or None on failure."""
        response = await self._request("GET", url)
        return response.text if response else None

    async def fetch_bytes(self, url: str) -> Optional[bytes]:
        """GET *url* and return the raw response bytes, or None on failure."""
        response = await self._request("GET", url)
        return response.content if response else None

    async def _request(
        self,
        method: str,
        url: str,
        *,
        data: Optional[Dict[str, str]] = None,
    ) -> Optional[httpx.Response]:
        """Issue one HTTP request; returns None (and logs) on any HTTP error.

        NOTE(review): certificate verification is deliberately disabled —
        presumably because the fipi.ru certificate chain fails validation in
        the deploy environment.  This is a MITM risk; confirm and re-enable
        verification if possible.
        """
        ssl_context = ssl.create_default_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE

        # A fresh client per request keeps the helper self-contained; request
        # volume is low enough that connection pooling is not needed here.
        async with httpx.AsyncClient(
            headers=self.headers,
            timeout=45.0,
            verify=ssl_context,
            follow_redirects=True,
        ) as client:
            try:
                response = await client.request(method, url, data=data)
                response.raise_for_status()
                return response
            except httpx.HTTPError as e:
                logger.error("Request failed for %s: %s", url, e)
                return None

    async def scrape_tasks(self, subject: str = "russian") -> List[Dict]:
        """Scrape both sources for *subject*, then filter and dedupe results."""
        config = self.SUBJECT_CONFIG.get(subject)
        if not config:
            logger.warning("Unknown subject %s, falling back to russian", subject)
            config = self.SUBJECT_CONFIG["russian"]

        candidates: List[Dict] = []
        candidates.extend(await self.scrape_dynamic_bank(subject))
        candidates.extend(await self.scrape_official_archives(subject))
        validated = self._dedupe_candidates(self._filter_candidates(candidates))
        logger.info("Accepted %s task candidates after filtering", len(validated))
        return validated

    async def scrape_dynamic_bank(self, subject: str = "russian") -> List[Dict]:
        """Page through the dynamic question bank(s) and parse task blocks."""
        config = self.SUBJECT_CONFIG.get(subject, self.SUBJECT_CONFIG["russian"])
        tasks: List[Dict] = []

        for source in config["dynamic_sources"]:
            project_guid = source["project_guid"]
            questions_url = f"{source['base_url']}/questions.php"
            total_tasks = None  # learned from the first page, bounds paging

            for page_index in range(self.max_bank_pages):
                page_url = (
                    f"{questions_url}?proj={project_guid}"
                    f"&page={page_index}&pagesize={self.page_size}"
                )
                html = await self.fetch_page(page_url)
                # The bank sometimes requires an initial POST "search" before
                # GET paging works; retry the first page that way.
                if not html and page_index == 0:
                    html = await self._fetch_bank_first_page(questions_url, project_guid)
                if not html:
                    break

                if total_tasks is None:
                    total_tasks = self._extract_total_count(html)
                    if total_tasks:
                        max_pages = math.ceil(total_tasks / self.page_size)
                        logger.info(
                            "Bank %s reports %s tasks, scraping up to %s pages",
                            source["project_name"],
                            total_tasks,
                            min(max_pages, self.max_bank_pages),
                        )

                soup = BeautifulSoup(html, "lxml")
                blocks = soup.select("div.qblock")
                if not blocks:
                    break  # empty page => past the end of the bank

                for block in blocks:
                    task = self._parse_bank_question_block(
                        block,
                        project_guid=project_guid,
                        source_name=source["project_name"],
                        questions_url=questions_url,
                    )
                    if task:
                        tasks.append(task)

                # Stop early once the reported total is covered.
                if total_tasks is not None and (page_index + 1) * self.page_size >= total_tasks:
                    break

        logger.info("Collected %s candidates from the dynamic bank", len(tasks))
        return tasks

    async def _fetch_bank_first_page(self, questions_url: str, project_guid: str) -> Optional[str]:
        """POST the bank's search form to obtain the first results page."""
        response = await self._request(
            "POST",
            questions_url,
            data={
                "search": "1",
                "pagesize": str(self.page_size),
                "proj": project_guid,
            },
        )
        return response.text if response else None

    def _extract_total_count(self, html: str) -> Optional[int]:
        """Pull the task total from the bank's inline `setQCount(N)` JS call."""
        match = re.search(r"setQCount\((\d+)", html)
        return int(match.group(1)) if match else None

    def _parse_bank_question_block(
        self,
        block: Tag,
        *,
        project_guid: str,
        source_name: str,
        questions_url: str,
    ) -> Optional[Dict]:
        """Convert one `div.qblock` element into a task-candidate dict."""
        prompt_cell = block.select_one("td.cell_0")
        if not prompt_cell:
            return None

        content = self._clean_text(prompt_cell.get_text("\n", strip=True))
        if not content:
            return None

        title = self._build_title_from_content(content, fallback=source_name)
        question_guid = self._extract_block_guid(block)
        variants = self._extract_variants_from_block(block)
        images = self._extract_images(prompt_cell, base_url=questions_url)

        return {
            "title": title,
            "content": content,
            "source_url": f"{questions_url}?proj={project_guid}&qid={question_guid}",
            "task_type": self._detect_task_type(title, content),
            "images": images,
            "variants": variants,
            # NOTE(review): utcnow() is deprecated since Python 3.12; kept to
            # preserve the existing naive-ISO timestamp format downstream.
            "scraped_at": datetime.utcnow().isoformat(),
            "source_kind": "dynamic_bank",
            "task_guid": question_guid,
        }

    def _extract_block_guid(self, block: Tag) -> str:
        """Best-effort question GUID: hidden form input, else the block id."""
        guid_input = block.select_one("form input[name='guid']")
        if guid_input and guid_input.get("value"):
            return guid_input["value"]
        return block.get("id", "").lstrip("q")

    def _extract_variants_from_block(self, block: Tag) -> List[str]:
        """Collect answer options from <label>s, falling back to <option>s."""
        variants: List[str] = []

        for label in block.find_all("label"):
            text = self._clean_text(label.get_text(" ", strip=True))
            if text:
                variants.append(text)

        if not variants:
            for option in block.find_all("option"):
                text = self._clean_text(option.get_text(" ", strip=True))
                # Skip the placeholder option of dropdown questions.
                if text and text.lower() != "выбор":
                    variants.append(text)

        return variants[:10]

    async def scrape_official_archives(self, subject: str = "russian") -> List[Dict]:
        """Download official demo/variant documents and mine tasks from them."""
        config = self.SUBJECT_CONFIG.get(subject, self.SUBJECT_CONFIG["russian"])
        archive_links = await self._discover_official_archive_links(config)
        variant_links = await self._discover_official_variant_links(config)
        document_links = self._sort_document_links(archive_links + variant_links)
        tasks: List[Dict] = []

        if not document_links:
            logger.warning("No official archive links found for %s", subject)
            return tasks

        if PdfReader is None:
            logger.warning("pypdf is not installed, skipping official PDF extraction")
            return tasks

        for document_url in document_links[: self.max_demo_archives]:
            document_bytes = await self.fetch_bytes(document_url)
            if not document_bytes:
                continue
            tasks.extend(self._extract_tasks_from_document_bytes(document_bytes, document_url))

        logger.info("Collected %s candidates from official archives", len(tasks))
        return tasks

    async def _discover_official_archive_links(self, config: Dict) -> List[str]:
        """Find subject ZIP archives linked on the demo-versions page."""
        html = await self.fetch_page(config["official_demo_page"])
        if not html:
            return []

        soup = BeautifulSoup(html, "lxml")
        prefixes = config["archive_prefixes"]
        archive_links: List[str] = []

        for link in soup.find_all("a", href=True):
            href = link["href"]
            absolute = href if href.startswith("http") else urljoin(config["official_demo_page"], href)
            href_lower = absolute.lower()
            if not href_lower.endswith(".zip"):
                continue
            if any(prefix in href_lower for prefix in prefixes):
                archive_links.append(absolute)

        # Newest year first, inferred from a /20XX/ path segment.
        def sort_key(url: str) -> int:
            match = re.search(r"/(20\d{2})/", url)
            return int(match.group(1)) if match else 0

        archive_links.sort(key=sort_key, reverse=True)
        return archive_links

    async def _discover_official_variant_links(self, config: Dict) -> List[str]:
        """Find open-variant ZIP/PDF links on the variants page, if configured."""
        variant_page = config.get("official_variant_page")
        if not variant_page:
            return []

        html = await self.fetch_page(variant_page)
        if not html:
            return []

        soup = BeautifulSoup(html, "lxml")
        prefixes = config.get("variant_prefixes", ())
        links: List[str] = []

        for link in soup.find_all("a", href=True):
            href = link["href"]
            absolute = href if href.startswith("http") else urljoin(variant_page, href)
            href_lower = absolute.lower()
            if not href_lower.endswith((".zip", ".pdf")):
                continue
            if "braille" in href_lower:
                continue  # accessibility variants are not parseable as text
            filename = absolute.rsplit("/", 1)[-1].lower()
            if prefixes and not any(filename.startswith(prefix) for prefix in prefixes):
                continue
            links.append(absolute)

        return self._sort_document_links(links)

    def _sort_document_links(self, links: Iterable[str]) -> List[str]:
        """Deduplicate links and order them newest-year-first (ties by URL)."""
        def sort_key(url: str) -> tuple[int, str]:
            match = re.search(r"(20\d{2})", url)
            return (int(match.group(1)) if match else 0, url)

        return sorted(set(links), key=sort_key, reverse=True)

    def _extract_tasks_from_document_bytes(self, document_bytes: bytes, document_url: str) -> List[Dict]:
        """Dispatch on the URL extension: ZIP archive vs. standalone PDF."""
        if document_url.lower().endswith(".zip"):
            return self._extract_tasks_from_archive(document_bytes, document_url)
        if document_url.lower().endswith(".pdf"):
            return self._extract_tasks_from_pdf_document(
                document_bytes,
                document_url=document_url,
                document_name=document_url.rsplit("/", 1)[-1],
            )
        return []

    def _extract_tasks_from_archive(self, archive_bytes: bytes, archive_url: str) -> List[Dict]:
        """Extract tasks from every relevant PDF member of a ZIP archive."""
        tasks: List[Dict] = []

        try:
            with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
                for member_name in archive.namelist():
                    if not member_name.lower().endswith(".pdf"):
                        continue
                    if not self._should_parse_pdf_member(member_name, archive_url):
                        continue
                    tasks.extend(
                        self._extract_tasks_from_pdf_document(
                            archive.read(member_name),
                            document_url=archive_url,
                            document_name=member_name,
                        )
                    )
        except zipfile.BadZipFile:
            logger.error("Invalid archive %s", archive_url)

        return tasks

    def _should_parse_pdf_member(self, member_name: str, document_url: str) -> bool:
        """Skip specs/codifiers/answer-key PDFs; accept demo and open variants."""
        member_lower = member_name.lower()
        if any(token in member_lower for token in ("спец", "кодиф", "критер", "ответ", "аудио")):
            return False
        if "otkrytyye-varianty-kim-ege" in document_url.lower():
            return True  # open-variant archives: every remaining PDF is a variant
        return "демо" in member_lower or "demo" in member_lower

    def _extract_tasks_from_pdf_document(
        self,
        pdf_bytes: bytes,
        *,
        document_url: str,
        document_name: str,
    ) -> List[Dict]:
        """Extract text from one PDF and mine task blocks out of it."""
        text = self._extract_text_from_pdf_bytes(pdf_bytes)
        if not text:
            return []

        year_match = re.search(r"(20\d{2})", document_url)
        year = year_match.group(1) if year_match else "unknown"
        return self._extract_tasks_from_demo_text(
            text,
            archive_url=document_url,
            document_name=document_name,
            year=year,
            source_kind=self._detect_document_source_kind(document_url),
        )

    def _detect_document_source_kind(self, document_url: str) -> str:
        """Classify a document URL as demo-version vs. open-variant PDF."""
        if "otkrytyye-varianty-kim-ege" in document_url.lower():
            return "official_open_variant_pdf"
        return "official_demo_pdf"

    def _extract_text_from_pdf_bytes(self, pdf_bytes: bytes) -> str:
        """Extract and clean all page text from a PDF; '' on any parser error."""
        if PdfReader is None:
            return ""

        try:
            reader = PdfReader(io.BytesIO(pdf_bytes))
        except Exception as e:  # pragma: no cover - parser-dependent
            logger.error("Failed to open PDF: %s", e)
            return ""

        pages: List[str] = []
        for page in reader.pages:
            try:
                page_text = page.extract_text() or ""
            except Exception:  # pragma: no cover - parser-dependent
                page_text = ""
            if page_text:
                pages.append(page_text)

        return self._clean_text("\n".join(pages))

    def _extract_tasks_from_demo_text(
        self,
        text: str,
        *,
        archive_url: str,
        document_name: str,
        year: str,
        source_kind: str = "official_demo_pdf",
    ) -> List[Dict]:
        """Split extracted PDF text on answer markers and keep task-like blocks.

        Tasks are renumbered sequentially (1..N) in discovery order, capped at
        ``max_demo_tasks``.
        """
        tasks: List[Dict] = []
        if not text:
            return tasks

        for raw_block in self._split_pdf_into_task_blocks(text):
            content = self._cleanup_pdf_task_block(raw_block)
            content = self._trim_to_task_start(content)
            if not self._looks_like_official_task_block(content):
                continue

            task_number = len(tasks) + 1
            document_label = "Открытый вариант ЕГЭ" if source_kind == "official_open_variant_pdf" else "Демоверсия ЕГЭ"
            title = f"{document_label} {year}. Задание {task_number}"
            tasks.append(
                {
                    "title": title,
                    "content": content,
                    "source_url": f"{archive_url}#task-{task_number}",
                    "task_type": self._detect_task_type(title, content),
                    "images": [],
                    "variants": self._extract_variants(content),
                    # NOTE(review): utcnow() deprecated; kept for format parity.
                    "scraped_at": datetime.utcnow().isoformat(),
                    "source_kind": source_kind,
                    "document_name": document_name,
                    "task_number": task_number,
                }
            )

            if len(tasks) >= self.max_demo_tasks:
                break

        return tasks

    def _split_pdf_into_task_blocks(self, text: str) -> List[str]:
        """Split text on 'Ответ:' answer lines; each preceding chunk is a block."""
        answer_pattern = re.compile(r"(?:^|\n)\s*Ответ\s*:\s*[_\.\s]*", re.IGNORECASE)
        blocks: List[str] = []
        last_pos = 0

        for match in answer_pattern.finditer(text):
            block = text[last_pos:match.start()]
            if block.strip():
                blocks.append(block)
            last_pos = match.end()

        return blocks

    def _cleanup_pdf_task_block(self, block: str) -> str:
        """Drop page headers/footers, page numbers and layout markers."""
        lines: List[str] = []
        for raw_line in block.splitlines():
            line = self._clean_text(raw_line)
            if not line:
                continue
            lower = line.lower()
            if line == "&%end_page&%":
                continue  # internal page-break marker
            if re.fullmatch(r"\d{1,2}", line):
                continue  # bare task/page number
            if re.search(r"\d+\s*/\s*\d+$", line):
                continue  # "page / total" footer
            if lower.startswith(("демонстрационный вариант егэ", "открытый вариант ким егэ", "единый государственный экзамен")):
                continue  # running page header
            if lower.startswith("© "):
                continue  # copyright footer
            lines.append(line)

        return self._clean_text("\n".join(lines))

    def _trim_to_task_start(self, text: str) -> str:
        """Cut leading noise by jumping to the earliest known task opener."""
        if not text:
            return text

        starts = [text.find(pattern) for pattern in self.PDF_TASK_START_PATTERNS if text.find(pattern) >= 0]
        if starts:
            return text[min(starts):].strip()
        return text.strip()

    def _looks_like_official_task_block(self, text: str) -> bool:
        """Heuristic gate: plausible length, no boilerplate, has a task opener."""
        if len(text) < 70 or len(text) > 6000:
            return False

        lower = text.lower()
        if any(pattern.lower() in lower for pattern in self.PDF_NOISE_PATTERNS):
            return False

        return any(pattern.lower() in lower for pattern in self.PDF_TASK_START_PATTERNS)

    def _slice_demo_section(self, text: str) -> str:
        """Return the 'Часть 1' task section of a demo document.

        NOTE(review): currently unused — ``_extract_tasks_from_demo_text``
        processes the full text; kept as an available pre-filter.
        """
        start_matches = list(re.finditer(r"(?m)^\s*Часть\s*1\s*$", text, re.IGNORECASE))
        if start_matches:
            start_pos = start_matches[-1].start()
        else:
            fallback = list(re.finditer(r"Ответами к заданиям", text, re.IGNORECASE))
            if not fallback:
                return text
            start_pos = fallback[-1].start()

        end = re.search(
            r"(Часть\s*2|Задание\s*27|Система оценивания|Критерии оценивания|Ключи)",
            text[start_pos:],
            re.IGNORECASE,
        )
        if not end:
            return text[start_pos:]

        return text[start_pos : start_pos + end.start()]

    def parse_task_page(self, html: str, url: str) -> Optional[Dict]:
        """Try progressively broader containers until one yields a candidate."""
        if not html:
            return None

        soup = BeautifulSoup(html, "lxml")
        for selector in (
            "div.qblock",
            "article",
            "main article",
            ".field--name-body",
            ".content",
            "main",
            "body",
        ):
            container = soup.select_one(selector)
            if not container:
                continue

            candidate = self._build_candidate_from_container(container, url)
            if candidate:
                return candidate

        return None

    def _build_candidate_from_container(self, container: Tag, url: str) -> Optional[Dict]:
        """Build a quality-gated candidate dict from an arbitrary HTML container."""
        # Re-parse a copy so decompose() cannot mutate the caller's tree.
        cloned = BeautifulSoup(str(container), "lxml")
        root = cloned.find()
        if root is None:
            return None

        for element in root.find_all(["script", "style", "nav", "header", "footer", "form", "button", "aside"]):
            element.decompose()

        title_tag = root.find(["h1", "h2", "h3", "strong", "b"])
        title = self._clean_text(title_tag.get_text(" ", strip=True)) if title_tag else ""
        content = self._clean_text(root.get_text("\n", strip=True))
        if not title:
            title = self._build_title_from_content(content, fallback=url)

        images = self._extract_images(root, base_url=url)
        candidate = {
            "title": title,
            "content": content,
            "source_url": url,
            "task_type": self._detect_task_type(title, content),
            "images": images,
            "variants": self._extract_variants(content),
            # NOTE(review): utcnow() deprecated; kept for format parity.
            "scraped_at": datetime.utcnow().isoformat(),
            "source_kind": "generic_html",
        }
        return candidate if self._passes_quality_gate(candidate) else None

    async def scrape_task_by_id(self, task_id: str) -> Optional[Dict]:
        """Fetch and parse a single bank question by its GUID (EGE source only)."""
        config = self.SUBJECT_CONFIG["russian"]["dynamic_sources"][0]
        html = await self.fetch_page(
            f"{config['base_url']}/questions.php?proj={config['project_guid']}&qid={task_id}"
        )
        if not html:
            return None

        soup = BeautifulSoup(html, "lxml")
        block = soup.select_one("div.qblock")
        if not block:
            return None

        return self._parse_bank_question_block(
            block,
            project_guid=config["project_guid"],
            source_name=config["project_name"],
            questions_url=f"{config['base_url']}/questions.php",
        )

    async def search_tasks(self, query: str) -> List[Dict]:
        """Scrape all tasks and return those whose title/content contain *query*."""
        query_lower = query.lower().strip()
        tasks = await self.scrape_tasks(subject="russian")
        return [
            task
            for task in tasks
            if query_lower in task.get("title", "").lower()
            or query_lower in task.get("content", "").lower()
        ]

    def _filter_candidates(self, candidates: Iterable[Dict]) -> List[Dict]:
        """Keep only candidates that pass the quality gate."""
        accepted: List[Dict] = []
        for candidate in candidates:
            if self._passes_quality_gate(candidate):
                accepted.append(candidate)
        return accepted

    def _dedupe_candidates(self, candidates: Iterable[Dict]) -> List[Dict]:
        """Drop duplicates keyed by (source_url, first 400 chars of content)."""
        deduped: List[Dict] = []
        seen_keys = set()

        for candidate in candidates:
            normalized = self._clean_text(candidate.get("content", ""))[:400]
            key = (candidate.get("source_url", ""), normalized)
            if key in seen_keys:
                continue
            seen_keys.add(key)
            deduped.append(candidate)

        return deduped

    def _passes_quality_gate(self, candidate: Dict) -> bool:
        """Score the candidate (stored under 'quality_score') and apply the threshold."""
        score = self._score_candidate(candidate)
        candidate["quality_score"] = score
        return score >= self.min_quality_score

    def _score_candidate(self, candidate: Dict) -> int:
        """Heuristic quality score: source trust + length + task-ness - noise."""
        title = candidate.get("title", "").lower()
        content = candidate.get("content", "").lower()
        source_kind = candidate.get("source_kind", "")
        length = len(content)

        score = 0

        # Source trust: the bank and official PDFs are near-certain tasks.
        if source_kind == "dynamic_bank":
            score += 60
        elif source_kind in {"official_demo_pdf", "official_open_variant_pdf"}:
            score += 50
        else:
            score += 10

        # Length band typical for a single task statement.
        if 80 <= length <= 3500:
            score += 15
        elif length > 5000:
            score -= 20
        else:
            score -= 10

        if any(keyword in content for keywords in self.TASK_TYPE_KEYWORDS.values() for keyword in keywords):
            score += 10

        if any(pattern.lower() in content for pattern in self.PDF_TASK_START_PATTERNS):
            score += 10

        if re.search(r"\b\d+\b", content):
            score += 5

        if any(pattern in title for pattern in self.GENERIC_TITLE_PATTERNS):
            score -= 45

        # Portal boilerplate near the top of the text, capped at -32.
        noise_hits = sum(1 for pattern in self.NOISE_PATTERNS if pattern in content[:1200])
        score -= min(noise_hits * 8, 32)

        if content.count("\n") > 80:
            score -= 10

        return score

    def _detect_task_type(self, title: str, content: str) -> str:
        """Classify via keyword buckets; first matching bucket wins, else 'other'."""
        text = f"{title} {content}".lower()

        for task_type, keywords in self.TASK_TYPE_KEYWORDS.items():
            if any(keyword in text for keyword in keywords):
                return task_type

        return "other"

    def _extract_variants(self, content: str) -> List[str]:
        """Pull up to 10 numbered/lettered answer options out of plain text."""
        matches = re.findall(r"(?:^|\n)(?:[1-6]|[A-DА-Г])[.)]\s*([^\n]{2,200})", content)
        return [self._clean_text(match) for match in matches[:10]]

    def _extract_images(self, container: Tag, *, base_url: str) -> List[str]:
        """Collect up to 10 absolute image URLs from the container."""
        images: List[str] = []
        for img in container.find_all("img"):
            src = img.get("src") or img.get("data-src")
            if not src:
                continue
            images.append(src if src.startswith("http") else urljoin(base_url, src))
        return images[:10]

    def _build_title_from_content(self, content: str, fallback: str) -> str:
        """Use the first non-empty line (capped at 160 chars) as a title."""
        first_line = next((line.strip() for line in content.splitlines() if line.strip()), "")
        title = self._clean_text(first_line)
        if not title:
            title = fallback
        return title[:160]

    def _clean_text(self, text: str) -> str:
        """Normalize scraped text: NBSPs, letter-spaced words, runs of whitespace."""
        text = text.replace("\xa0", " ")
        # PDF extraction sometimes letter-spaces words ("П р и м е р"); collapse
        # any run of 3+ single letters separated by whitespace.
        text = re.sub(
            r"\b(?:[A-Za-zА-Яа-яЁё]\s+){2,}[A-Za-zА-Яа-яЁё]\b",
            lambda match: match.group(0).replace(" ", ""),
            text,
        )
        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()