greeta committed on
Commit bcb901c · verified · 1 Parent(s): 8b8d4b0

Delete scraper.py

Files changed (1)
  1. scraper.py +0 -877
scraper.py DELETED
@@ -1,877 +0,0 @@
- """
- FIPI scraper focused on extracting real tasks instead of generic page text.
- """
-
- from __future__ import annotations
-
- from datetime import datetime
- import io
- import logging
- import math
- import os
- import re
- import ssl
- from typing import Dict, Iterable, List, Optional
- from urllib.parse import urljoin
- import zipfile
-
- from bs4 import BeautifulSoup, Tag
- import httpx
-
- try:
-     from pypdf import PdfReader
- except ImportError:  # pragma: no cover - optional dependency for HF deploy
-     PdfReader = None
-
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
-
- class FIPIScraper:
-     """Collects task candidates from the FIPI bank and official demo archives."""
-
-     SUBJECT_CONFIG = {
-         "russian": {
-             "label": "Русский язык",
-             "dynamic_sources": [
-                 {
-                     "kind": "ege_bank",
-                     "base_url": "https://ege.fipi.ru/bank",
-                     "project_guid": "AF0ED3F2557F8FFC4C06F80B6803FD26",
-                     "project_name": "ЕГЭ. Русский язык",
-                 },
-                 {
-                     "kind": "oge_bank",
-                     "base_url": "https://oge.fipi.ru/bank",
-                     "project_guid": "2F5EE3B12FE2A0EA40B06BF61A015416",
-                     "project_name": "ОГЭ. Русский язык",
-                 },
-             ],
-             "official_demo_page": "https://fipi.ru/ege/demoversii-specifikacii-kodifikatory",
-             "official_variant_page": "https://fipi.ru/ege/otkrytyy-bank-zadaniy-ege/otkrytyye-varianty-kim-ege",
-             "archive_prefixes": ("ru_11_",),
-             "variant_prefixes": ("rus_",),
-             "title_keywords": ("русский язык",),
-         }
-     }
-
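-     # Heuristic keyword tables: TASK_TYPE_KEYWORDS drives _detect_task_type,
-     # and the *_PATTERNS tuples feed the task-likeness and quality checks.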
-     TASK_TYPE_KEYWORDS = {
-         "writing": ("сочинение", "эссе", "напишите", "сформулируйте", "прокомментируйте"),
-         "test": ("выберите", "укажите", "ответ", "вариант", "расставьте", "определите"),
-         "listening": ("аудио", "прослуш", "запись"),
-         "reading": ("прочитайте", "текст", "абзац", "предложение"),
-     }
-
-     GENERIC_TITLE_PATTERNS = (
-         "открытый банк",
-         "демоверсии",
-         "спецификации",
-         "кодификаторы",
-         "федеральный институт",
-         "фипи",
-         "нормативно",
-         "документы",
-         "варианты ким",
-     )
-
-     PDF_TASK_START_PATTERNS = (
-         "Прочитайте текст",
-         "Самостоятельно подберите",
-         "В тексте выделено",
-         "Укажите",
-         "В одном из",
-         "Отредактируйте предложение",
-         "Установите соответствие",
-         "Расставьте",
-         "Определите",
-         "Найдите",
-         "Подберите",
-     )
-
-     PDF_NOISE_PATTERNS = (
-         "Инструкция по выполнению работы",
-         "Пояснения к демонстрационному варианту",
-         "Желаем успеха",
-         "Все бланки ЕГЭ заполняются",
-         "Баллы, полученные",
-         "После завершения работы",
-         "В демонстрационном варианте представлены",
-         "Часть 1 содержит 26 заданий",
-         "На выполнение экзаменационной работы",
-         "Ответами к заданиям 1–26 являются",
-         "Бланк",
-     )
-
-     NOISE_PATTERNS = (
-         "федеральный институт педагогических измерений",
-         "открытый банк тестовых заданий",
-         "открытый банк заданий егэ",
-         "открытый банк заданий огэ",
-         "подбор заданий",
-         "демоверсии, спецификации, кодификаторы",
-         "для предметных комиссий",
-         "аналитические и методические материалы",
-         "видеоконсультации разработчиков ким",
-         "скачать",
-         "изменения в ким",
-     )
-
-     def __init__(self, base_url: str = "https://fipi.ru"):
-         self.base_url = base_url.rstrip("/")
-         self.headers = {
-             "User-Agent": (
-                 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                 "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-             ),
-             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-             "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
-         }
-         self.page_size = max(1, int(os.getenv("SCRAPER_BANK_PAGE_SIZE", "10")))
-         self.max_bank_pages = max(1, int(os.getenv("SCRAPER_MAX_BANK_PAGES", "5")))
-         self.max_demo_archives = max(1, int(os.getenv("SCRAPER_MAX_DEMO_ARCHIVES", "2")))
-         self.max_demo_tasks = max(1, int(os.getenv("SCRAPER_MAX_DEMO_TASKS", "20")))
-         self.min_quality_score = max(1, int(os.getenv("SCRAPER_MIN_QUALITY_SCORE", "45")))
-
-     async def fetch_page(self, url: str) -> Optional[str]:
-         response = await self._request("GET", url)
-         return response.text if response else None
-
-     async def fetch_bytes(self, url: str) -> Optional[bytes]:
-         response = await self._request("GET", url)
-         return response.content if response else None
-
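-     # A fresh AsyncClient per request keeps the scraper stateless; TLS
-     # verification is disabled, presumably because the fipi.ru hosts fail
-     # default certificate validation.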
-     async def _request(
-         self,
-         method: str,
-         url: str,
-         *,
-         data: Optional[Dict[str, str]] = None,
-     ) -> Optional[httpx.Response]:
-         ssl_context = ssl.create_default_context()
-         ssl_context.check_hostname = False
-         ssl_context.verify_mode = ssl.CERT_NONE
-
-         async with httpx.AsyncClient(
-             headers=self.headers,
-             timeout=45.0,
-             verify=ssl_context,
-             follow_redirects=True,
-         ) as client:
-             try:
-                 response = await client.request(method, url, data=data)
-                 response.raise_for_status()
-                 return response
-             except httpx.HTTPError as e:
-                 logger.error("Request failed for %s: %s", url, e)
-                 return None
-
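-     # Entry point: merge candidates from the dynamic question bank and the
-     # official demo/variant archives, then quality-filter and dedupe them.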
-     async def scrape_tasks(
-         self,
-         subject: str = "russian",
-         *,
-         include_official_archives: bool = True,
-     ) -> List[Dict]:
-         config = self.SUBJECT_CONFIG.get(subject)
-         if not config:
-             logger.warning("Unknown subject %s, falling back to russian", subject)
-             config = self.SUBJECT_CONFIG["russian"]
-
-         candidates: List[Dict] = []
-         candidates.extend(await self.scrape_dynamic_bank(subject))
-         if include_official_archives:
-             candidates.extend(await self.scrape_official_archives(subject))
-         validated = self._dedupe_candidates(self._filter_candidates(candidates))
-         logger.info("Accepted %s task candidates after filtering", len(validated))
-         return validated
-
-     async def scrape_dynamic_bank(self, subject: str = "russian") -> List[Dict]:
-         config = self.SUBJECT_CONFIG.get(subject, self.SUBJECT_CONFIG["russian"])
-         tasks: List[Dict] = []
-
-         for source in config["dynamic_sources"]:
-             project_guid = source["project_guid"]
-             questions_url = f"{source['base_url']}/questions.php"
-             total_tasks = None
-
-             for page_index in range(self.max_bank_pages):
-                 page_url = (
-                     f"{questions_url}?proj={project_guid}"
-                     f"&page={page_index}&pagesize={self.page_size}"
-                 )
-                 html = await self.fetch_page(page_url)
-                 if not html and page_index == 0:
-                     html = await self._fetch_bank_first_page(questions_url, project_guid)
-                 if not html:
-                     break
-
-                 if total_tasks is None:
-                     total_tasks = self._extract_total_count(html)
-                     if total_tasks:
-                         max_pages = math.ceil(total_tasks / self.page_size)
-                         logger.info(
-                             "Bank %s reports %s tasks, scraping up to %s pages",
-                             source["project_name"],
-                             total_tasks,
-                             min(max_pages, self.max_bank_pages),
-                         )
-
-                 soup = BeautifulSoup(html, "lxml")
-                 blocks = soup.select("div.qblock")
-                 if not blocks:
-                     break
-
-                 for block in blocks:
-                     task = self._parse_bank_question_block(
-                         block,
-                         project_guid=project_guid,
-                         source_name=source["project_name"],
-                         questions_url=questions_url,
-                     )
-                     if task:
-                         tasks.append(task)
-
-                 if total_tasks is not None and (page_index + 1) * self.page_size >= total_tasks:
-                     break
-
-         logger.info("Collected %s candidates from the dynamic bank", len(tasks))
-         return tasks
-
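-     # Fallback for page 0: the bank sometimes renders results only for the
-     # search POST, not for a plain GET.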
-     async def _fetch_bank_first_page(self, questions_url: str, project_guid: str) -> Optional[str]:
-         response = await self._request(
-             "POST",
-             questions_url,
-             data={
-                 "search": "1",
-                 "pagesize": str(self.page_size),
-                 "proj": project_guid,
-             },
-         )
-         return response.text if response else None
-
-     def _extract_total_count(self, html: str) -> Optional[int]:
-         match = re.search(r"setQCount\((\d+)", html)
-         return int(match.group(1)) if match else None
-
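-     # A bank question lives in div.qblock: the prompt sits in td.cell_0 and
-     # the question GUID comes from a hidden form input when present.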
-     def _parse_bank_question_block(
-         self,
-         block: Tag,
-         *,
-         project_guid: str,
-         source_name: str,
-         questions_url: str,
-     ) -> Optional[Dict]:
-         prompt_cell = block.select_one("td.cell_0")
-         if not prompt_cell:
-             return None
-
-         content = self._clean_text(prompt_cell.get_text("\n", strip=True))
-         if not content:
-             return None
-
-         title = self._build_title_from_content(content, fallback=source_name)
-         question_guid = self._extract_block_guid(block)
-         variants = self._extract_variants_from_block(block)
-         images = self._extract_images(prompt_cell, base_url=questions_url)
-
-         return {
-             "title": title,
-             "content": content,
-             "source_url": f"{questions_url}?proj={project_guid}&qid={question_guid}",
-             "task_type": self._detect_task_type(title, content),
-             "images": images,
-             "variants": variants,
-             "scraped_at": datetime.utcnow().isoformat(),
-             "source_kind": "dynamic_bank",
-             "task_guid": question_guid,
-         }
-
-     def _extract_block_guid(self, block: Tag) -> str:
-         guid_input = block.select_one("form input[name='guid']")
-         if guid_input and guid_input.get("value"):
-             return guid_input["value"]
-         return block.get("id", "").lstrip("q")
-
-     def _extract_variants_from_block(self, block: Tag) -> List[str]:
-         variants: List[str] = []
-
-         for label in block.find_all("label"):
-             text = self._clean_text(label.get_text(" ", strip=True))
-             if text:
-                 variants.append(text)
-
-         if not variants:
-             for option in block.find_all("option"):
-                 text = self._clean_text(option.get_text(" ", strip=True))
-                 if text and text.lower() != "выбор":
-                     variants.append(text)
-
-         return variants[:10]
-
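-     # Official archives: demo-version ZIPs and open-variant PDFs linked from
-     # fipi.ru; extraction is skipped gracefully when pypdf is unavailable.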
-     async def scrape_official_archives(self, subject: str = "russian") -> List[Dict]:
-         config = self.SUBJECT_CONFIG.get(subject, self.SUBJECT_CONFIG["russian"])
-         archive_links = await self._discover_official_archive_links(config)
-         variant_links = await self._discover_official_variant_links(config)
-         document_links = self._sort_document_links(archive_links + variant_links)
-         tasks: List[Dict] = []
-
-         if not document_links:
-             logger.warning("No official archive links found for %s", subject)
-             return tasks
-
-         if PdfReader is None:
-             logger.warning("pypdf is not installed, skipping official PDF extraction")
-             return tasks
-
-         for document_url in document_links[: self.max_demo_archives]:
-             document_bytes = await self.fetch_bytes(document_url)
-             if not document_bytes:
-                 continue
-             tasks.extend(self._extract_tasks_from_document_bytes(document_bytes, document_url))
-
-         logger.info("Collected %s candidates from official archives", len(tasks))
-         return tasks
-
-     async def _discover_official_archive_links(self, config: Dict) -> List[str]:
-         html = await self.fetch_page(config["official_demo_page"])
-         if not html:
-             return []
-
-         soup = BeautifulSoup(html, "lxml")
-         prefixes = config["archive_prefixes"]
-         archive_links: List[str] = []
-
-         for link in soup.find_all("a", href=True):
-             href = link["href"]
-             absolute = href if href.startswith("http") else urljoin(config["official_demo_page"], href)
-             href_lower = absolute.lower()
-             if not href_lower.endswith(".zip"):
-                 continue
-             if any(prefix in href_lower for prefix in prefixes):
-                 archive_links.append(absolute)
-
-         def sort_key(url: str) -> int:
-             match = re.search(r"/(20\d{2})/", url)
-             return int(match.group(1)) if match else 0
-
-         archive_links.sort(key=sort_key, reverse=True)
-         return archive_links
-
-     async def _discover_official_variant_links(self, config: Dict) -> List[str]:
-         variant_page = config.get("official_variant_page")
-         if not variant_page:
-             return []
-
-         html = await self.fetch_page(variant_page)
-         if not html:
-             return []
-
-         soup = BeautifulSoup(html, "lxml")
-         prefixes = config.get("variant_prefixes", ())
-         links: List[str] = []
-
-         for link in soup.find_all("a", href=True):
-             href = link["href"]
-             absolute = href if href.startswith("http") else urljoin(variant_page, href)
-             href_lower = absolute.lower()
-             if not href_lower.endswith((".zip", ".pdf")):
-                 continue
-             if "braille" in href_lower:
-                 continue
-             filename = absolute.rsplit("/", 1)[-1].lower()
-             if prefixes and not any(filename.startswith(prefix) for prefix in prefixes):
-                 continue
-             links.append(absolute)
-
-         return self._sort_document_links(links)
-
-     def _sort_document_links(self, links: Iterable[str]) -> List[str]:
-         def sort_key(url: str) -> tuple[int, str]:
-             match = re.search(r"(20\d{2})", url)
-             return (int(match.group(1)) if match else 0, url)
-
-         return sorted(set(links), key=sort_key, reverse=True)
-
-     def _extract_tasks_from_document_bytes(self, document_bytes: bytes, document_url: str) -> List[Dict]:
-         if document_url.lower().endswith(".zip"):
-             return self._extract_tasks_from_archive(document_bytes, document_url)
-         if document_url.lower().endswith(".pdf"):
-             return self._extract_tasks_from_pdf_document(
-                 document_bytes,
-                 document_url=document_url,
-                 document_name=document_url.rsplit("/", 1)[-1],
-             )
-         return []
-
-     def _extract_text_from_pdf_bytes(self, pdf_bytes: bytes) -> str:
-         if PdfReader is None:
-             return ""
-
-         try:
-             reader = PdfReader(io.BytesIO(pdf_bytes))
-         except Exception as e:  # pragma: no cover - parser-dependent
-             logger.error("Failed to open PDF: %s", e)
-             return ""
-
-         pages: List[str] = []
-         for page in reader.pages:
-             try:
-                 page_text = page.extract_text() or ""
-             except Exception:  # pragma: no cover - parser-dependent
-                 page_text = ""
-             if page_text:
-                 pages.append(page_text)
-
-         return self._clean_text("\n".join(pages))
-
-     def _extract_tasks_from_archive(self, archive_bytes: bytes, archive_url: str) -> List[Dict]:
-         tasks: List[Dict] = []
-
-         try:
-             with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
-                 for member_name in archive.namelist():
-                     if not member_name.lower().endswith(".pdf"):
-                         continue
-                     if not self._should_parse_pdf_member(member_name, archive_url):
-                         continue
-                     tasks.extend(
-                         self._extract_tasks_from_pdf_document(
-                             archive.read(member_name),
-                             document_url=archive_url,
-                             document_name=member_name,
-                         )
-                     )
-         except zipfile.BadZipFile:
-             logger.error("Invalid archive %s", archive_url)
-
-         return tasks
-
-     def _should_parse_pdf_member(self, member_name: str, document_url: str) -> bool:
-         member_lower = member_name.lower()
-         if any(token in member_lower for token in ("спец", "кодиф", "критер", "ответ", "аудио")):
-             return False
-         if "otkrytyye-varianty-kim-ege" in document_url.lower():
-             return True
-         return "демо" in member_lower or "demo" in member_lower
-
-     def _extract_tasks_from_pdf_document(
-         self,
-         pdf_bytes: bytes,
-         *,
-         document_url: str,
-         document_name: str,
-     ) -> List[Dict]:
-         text = self._extract_text_from_pdf_bytes(pdf_bytes)
-         if not text:
-             return []
-
-         year_match = re.search(r"(20\d{2})", document_url)
-         year = year_match.group(1) if year_match else "unknown"
-         return self._extract_tasks_from_demo_text(
-             text,
-             archive_url=document_url,
-             document_name=document_name,
-             year=year,
-             source_kind=self._detect_document_source_kind(document_url),
-         )
-
-     def _detect_document_source_kind(self, document_url: str) -> str:
-         if "otkrytyye-varianty-kim-ege" in document_url.lower():
-             return "official_open_variant_pdf"
-         return "official_demo_pdf"
-
-     def _extract_tasks_from_demo_text(
-         self,
-         text: str,
-         *,
-         archive_url: str,
-         document_name: str,
-         year: str,
-         source_kind: str = "official_demo_pdf",
-     ) -> List[Dict]:
-         tasks: List[Dict] = []
-         if not text:
-             return tasks
-
-         # Narrow the text to the task section so that instructions and answer
-         # keys are not parsed as tasks.
-         bounded_text = self._slice_demo_section(text)
-         if not bounded_text:
-             return tasks
-
-         for raw_block in self._split_pdf_into_task_blocks(bounded_text):
-             content = self._cleanup_pdf_task_block(raw_block)
-             content = self._trim_to_task_start(content)
-             if not self._looks_like_official_task_block(content):
-                 continue
-
-             task_number = len(tasks) + 1
-             document_label = "Открытый вариант ЕГЭ" if source_kind == "official_open_variant_pdf" else "Демоверсия ЕГЭ"
-             title = f"{document_label} {year}. Задание {task_number}"
-             tasks.append(
-                 {
-                     "title": title,
-                     "content": content,
-                     "source_url": f"{archive_url}#task-{task_number}",
-                     "task_type": self._detect_task_type(title, content),
-                     "images": [],
-                     "variants": self._extract_variants(content),
-                     "scraped_at": datetime.utcnow().isoformat(),
-                     "source_kind": source_kind,
-                     "document_name": document_name,
-                     "task_number": task_number,
-                 }
-             )
-
-             if len(tasks) >= self.max_demo_tasks:
-                 break
-
-         return tasks
-
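-     # Blocks are split on the "Ответ:" line that closes each short-answer
-     # task; task numbers are then reassigned sequentially by the caller.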
-     def _split_pdf_into_task_blocks(self, text: str) -> List[str]:
-         answer_pattern = re.compile(r"(?:^|\n)\s*Ответ\s*:\s*[_\.\s]*", re.IGNORECASE)
-         blocks: List[str] = []
-         last_pos = 0
-
-         for match in answer_pattern.finditer(text):
-             block = text[last_pos:match.start()]
-             if block.strip():
-                 blocks.append(block)
-             last_pos = match.end()
-
-         return blocks
-
-     def _cleanup_pdf_task_block(self, block: str) -> str:
-         lines: List[str] = []
-         for raw_line in block.splitlines():
-             line = self._clean_text(raw_line)
-             if not line:
-                 continue
-             lower = line.lower()
-             if line == "&%end_page&%":
-                 continue
-             if re.fullmatch(r"\d{1,2}", line):
-                 continue
-             if re.search(r"\d+\s*/\s*\d+$", line):
-                 continue
-             if lower.startswith(("демонстрационный вариант егэ", "открытый вариант ким егэ", "единый государственный экзамен")):
-                 continue
-             if lower.startswith("© "):
-                 continue
-             lines.append(line)
-
-         return self._clean_text("\n".join(lines))
-
-     def _trim_to_task_start(self, text: str) -> str:
-         if not text:
-             return text
-
-         starts = [text.find(pattern) for pattern in self.PDF_TASK_START_PATTERNS if text.find(pattern) >= 0]
-         if starts:
-             return text[min(starts):].strip()
-         return text.strip()
-
-     def _looks_like_official_task_block(self, text: str) -> bool:
-         if len(text) < 70 or len(text) > 6000:
-             return False
-
-         lower = text.lower()
-         if any(pattern.lower() in lower for pattern in self.PDF_NOISE_PATTERNS):
-             return False
-
-         return any(pattern.lower() in lower for pattern in self.PDF_TASK_START_PATTERNS)
-
-     def _slice_demo_section(self, text: str) -> str:
-         start_matches = list(re.finditer(r"(?m)^\s*Часть\s*1\s*$", text, re.IGNORECASE))
-         if start_matches:
-             start_pos = start_matches[-1].start()
-         else:
-             fallback = list(re.finditer(r"Ответами к заданиям", text, re.IGNORECASE))
-             if not fallback:
-                 return text
-             start_pos = fallback[-1].start()
-
-         end = re.search(
-             r"(Часть\s*2|Задание\s*27|Система оценивания|Критерии оценивания|Ключи)",
-             text[start_pos:],
-             re.IGNORECASE,
-         )
-         if not end:
-             return text[start_pos:]
-
-         return text[start_pos : start_pos + end.start()]
-
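-     # Generic HTML fallback: try progressively broader containers until one
-     # yields a candidate that passes the quality gate.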
-     def parse_task_page(self, html: str, url: str) -> Optional[Dict]:
-         if not html:
-             return None
-
-         soup = BeautifulSoup(html, "lxml")
-         for selector in (
-             "div.qblock",
-             "article",
-             "main article",
-             ".field--name-body",
-             ".content",
-             "main",
-             "body",
-         ):
-             container = soup.select_one(selector)
-             if not container:
-                 continue
-
-             candidate = self._build_candidate_from_container(container, url)
-             if candidate:
-                 return candidate
-
-         return None
-
-     def _build_candidate_from_container(self, container: Tag, url: str) -> Optional[Dict]:
-         cloned = BeautifulSoup(str(container), "lxml")
-         root = cloned.find()
-         if root is None:
-             return None
-
-         for element in root.find_all(["script", "style", "nav", "header", "footer", "form", "button", "aside"]):
-             element.decompose()
-
-         title_tag = root.find(["h1", "h2", "h3", "strong", "b"])
-         title = self._clean_text(title_tag.get_text(" ", strip=True)) if title_tag else ""
-         content = self._clean_text(root.get_text("\n", strip=True))
-         if not title:
-             title = self._build_title_from_content(content, fallback=url)
-
-         images = self._extract_images(root, base_url=url)
-         candidate = {
-             "title": title,
-             "content": content,
-             "source_url": url,
-             "task_type": self._detect_task_type(title, content),
-             "images": images,
-             "variants": self._extract_variants(content),
-             "scraped_at": datetime.utcnow().isoformat(),
-             "source_kind": "generic_html",
-         }
-         return candidate if self._passes_quality_gate(candidate) else None
-
-     async def scrape_task_by_id(self, task_id: str) -> Optional[Dict]:
-         config = self.SUBJECT_CONFIG["russian"]["dynamic_sources"][0]
-         html = await self.fetch_page(
-             f"{config['base_url']}/questions.php?proj={config['project_guid']}&qid={task_id}"
-         )
-         if not html:
-             return None
-
-         soup = BeautifulSoup(html, "lxml")
-         block = soup.select_one("div.qblock")
-         if not block:
-             return None
-
-         return self._parse_bank_question_block(
-             block,
-             project_guid=config["project_guid"],
-             source_name=config["project_name"],
-             questions_url=f"{config['base_url']}/questions.php",
-         )
-
-     async def search_tasks(self, query: str) -> List[Dict]:
-         query_lower = query.lower().strip()
-         tasks = await self.scrape_tasks(subject="russian")
-         return [
-             task
-             for task in tasks
-             if query_lower in task.get("title", "").lower()
-             or query_lower in task.get("content", "").lower()
-         ]
-
-     def _filter_candidates(self, candidates: Iterable[Dict]) -> List[Dict]:
-         accepted: List[Dict] = []
-         for candidate in candidates:
-             if self._passes_quality_gate(candidate):
-                 accepted.append(candidate)
-         return accepted
-
-     def _dedupe_candidates(self, candidates: Iterable[Dict]) -> List[Dict]:
-         deduped: List[Dict] = []
-         seen_keys = set()
-
-         for candidate in candidates:
-             normalized = self._clean_text(candidate.get("content", ""))[:400]
-             key = (candidate.get("source_url", ""), normalized)
-             if key in seen_keys:
-                 continue
-             seen_keys.add(key)
-             deduped.append(candidate)
-
-         return deduped
-
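-     # Scoring: trusted sources start high, generic HTML starts low; length,
-     # task-like phrasing, and navigation noise adjust the total, and anything
-     # below min_quality_score is rejected.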
-     def _passes_quality_gate(self, candidate: Dict) -> bool:
-         score = self._score_candidate(candidate)
-         candidate["quality_score"] = score
-         return score >= self.min_quality_score
-
-     def _score_candidate(self, candidate: Dict) -> int:
-         title = candidate.get("title", "").lower()
-         content = candidate.get("content", "").lower()
-         source_kind = candidate.get("source_kind", "")
-         length = len(content)
-
-         score = 0
-
-         if source_kind == "dynamic_bank":
-             score += 60
-         elif source_kind in {"official_demo_pdf", "official_open_variant_pdf"}:
-             score += 50
-         else:
-             score += 10
-
-         if 80 <= length <= 3500:
-             score += 15
-         elif length > 5000:
-             score -= 20
-         else:
-             score -= 10
-
-         if any(keyword in content for keywords in self.TASK_TYPE_KEYWORDS.values() for keyword in keywords):
-             score += 10
-
-         if any(pattern.lower() in content for pattern in self.PDF_TASK_START_PATTERNS):
-             score += 10
-
-         if re.search(r"\b\d+\b", content):
-             score += 5
-
-         if any(pattern in title for pattern in self.GENERIC_TITLE_PATTERNS):
-             score -= 45
-
-         noise_hits = sum(1 for pattern in self.NOISE_PATTERNS if pattern in content[:1200])
-         score -= min(noise_hits * 8, 32)
-
-         if content.count("\n") > 80:
-             score -= 10
-
-         return score
-
-     def _detect_task_type(self, title: str, content: str) -> str:
-         text = f"{title} {content}".lower()
-
-         for task_type, keywords in self.TASK_TYPE_KEYWORDS.items():
-             if any(keyword in text for keyword in keywords):
-                 return task_type
-
-         return "other"
-
-     def _extract_variants(self, content: str) -> List[str]:
-         matches = re.findall(r"(?:^|\n)(?:[1-6]|[A-DА-Г])[.)]\s*([^\n]{2,200})", content)
-         return [self._clean_text(match) for match in matches[:10]]
-
-     def _extract_images(self, container: Tag, *, base_url: str) -> List[str]:
-         images: List[str] = []
-         for img in container.find_all("img"):
-             src = img.get("src") or img.get("data-src")
-             if not src:
-                 continue
-             images.append(src if src.startswith("http") else urljoin(base_url, src))
-         return images[:10]
-
-     def _build_title_from_content(self, content: str, fallback: str) -> str:
-         first_line = next((line.strip() for line in content.splitlines() if line.strip()), "")
-         title = self._clean_text(first_line)
-         if not title:
-             title = fallback
-         return title[:160]
-
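-     # The letter-joining regex collapses runs of single spaced-out letters
-     # (a common artifact of PDF text extraction) back into whole words.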
-     def _clean_text(self, text: str) -> str:
-         text = text.replace("\xa0", " ")
-         text = re.sub(
-             r"\b(?:[A-Za-zА-Яа-яЁё]\s+){2,}[A-Za-zА-Яа-яЁё]\b",
-             lambda match: match.group(0).replace(" ", ""),
-             text,
-         )
-         text = re.sub(r"[ \t]+", " ", text)
-         text = re.sub(r"\n{3,}", "\n\n", text)
-         return text.strip()