greeta committed on
Commit b651663 · verified · 1 Parent(s): bcb901c

Upload 3 files

Files changed (3)
  1. app.py +518 -0
  2. models.py +94 -0
  3. scraper.py +928 -0
app.py ADDED
@@ -0,0 +1,518 @@
"""
FastAPI app for the FIPI scraper service.
"""

from __future__ import annotations

from datetime import datetime
import logging
import os
from pathlib import Path
import re
import ssl
from typing import Dict, List, Optional
from urllib.parse import parse_qs, urlparse

from bs4 import BeautifulSoup
from dotenv import load_dotenv
from fastapi import BackgroundTasks, FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import httpx

from models import (
    CheckAnswerRequest,
    CheckAnswerResponse,
    ErrorResponse,
    HealthResponse,
    ScrapeRequest,
    ScrapeResponse,
    StatsResponse,
    TaskResponse,
)
from scraper import FIPIScraper

BASE_DIR = Path(__file__).resolve().parent
load_dotenv(BASE_DIR / ".env")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

SUPABASE_AVAILABLE = False
SupabaseClient = None

try:
    from supabase_client import SupabaseClient

    SUPABASE_AVAILABLE = True
except ImportError as exc:
    logger.warning("Supabase client import failed: %s", exc)
except Exception as exc:  # pragma: no cover - startup guard
    logger.warning("Supabase client init failed: %s", exc)


app = FastAPI(
    title="AI Scraper FIPI",
    description="Collects, stores and validates FIPI tasks.",
    version="1.1.1-proof-20260317",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

scraper: Optional[FIPIScraper] = None
supabase_client: Optional[SupabaseClient] = None
last_auto_refresh_at: Optional[datetime] = None
refresh_in_progress = False


@app.on_event("startup")
async def startup_event() -> None:
    global scraper, supabase_client

    scraper = FIPIScraper(base_url=os.getenv("FIPI_BASE_URL", "https://fipi.ru"))
    logger.info("FIPIScraper initialized")

    if not SUPABASE_AVAILABLE:
        logger.info("Supabase disabled")
        return

    supabase_url = os.getenv("SUPABASE_URL")
    supabase_key = os.getenv("SUPABASE_SERVICE_KEY")
    if not supabase_url or not supabase_key:
        logger.warning("SUPABASE_URL or SUPABASE_SERVICE_KEY missing")
        return

    try:
        client = SupabaseClient(url=supabase_url, key=supabase_key)
        if await client.is_available():
            supabase_client = client
            logger.info("Supabase client connected")
        else:
            logger.error("Supabase is unavailable")
    except Exception as exc:  # pragma: no cover - startup guard
        logger.error("Supabase startup error: %s", exc)


def _require_supabase() -> SupabaseClient:
    if not supabase_client:
        raise HTTPException(status_code=503, detail="Supabase is not configured")
    return supabase_client


def _require_scraper() -> FIPIScraper:
    if not scraper:
        raise HTTPException(status_code=503, detail="Scraper is not configured")
    return scraper


def _can_check_answer(task: Dict) -> bool:
    if task.get("source_kind") == "dynamic_bank" and task.get("task_guid"):
        return True

    source_url = task.get("source_url", "")
    parsed = urlparse(source_url)
    query = parse_qs(parsed.query)
    project_guid = (query.get("proj") or [None])[0]
    question_id = (query.get("qid") or [None])[0]

    return parsed.path.endswith("/questions.php") and bool(project_guid and (question_id or task.get("task_guid")))


def _serialize_task(task: Dict) -> Dict:
    payload = dict(task)
    payload["can_check_answer"] = _can_check_answer(task)
    return payload


async def _persist_tasks(tasks: List[Dict]) -> Dict[str, int]:
    client = _require_supabase()
    saved = 0
    duplicates = 0
    for task in tasks:
        result = await client.insert_task(task)
        if result:
            saved += 1
        else:
            duplicates += 1
    return {"saved": saved, "duplicates": duplicates}


async def _collect_tasks(subject: str = "russian", *, include_official_archives: bool = True) -> List[Dict]:
    service = _require_scraper()
    if include_official_archives:
        return await service.scrape_tasks(subject=subject, include_official_archives=True)
    return await service.scrape_dynamic_bank(subject=subject)


async def _refresh_tasks(subject: str = "russian", *, include_official_archives: bool = True) -> Dict[str, int]:
    scraped_tasks = await _collect_tasks(
        subject=subject,
        include_official_archives=include_official_archives,
    )
    persisted = await _persist_tasks(scraped_tasks)
    return {
        "scraped": len(scraped_tasks),
        "saved": persisted["saved"],
        "duplicates": persisted["duplicates"],
    }


def _needs_task_refresh(tasks: List[Dict]) -> bool:
    if not tasks:
        return True

    dynamic_count = sum(1 for task in tasks if task.get("source_kind") == "dynamic_bank")
    checkable_count = sum(1 for task in tasks if _can_check_answer(task))
    minimum_total = max(10, int(os.getenv("SCRAPER_MIN_READY_TASKS", "40")))

    if dynamic_count == 0 or checkable_count == 0:
        return True

    return len(tasks) < minimum_total


def _needs_dynamic_bank_refresh(tasks: List[Dict]) -> bool:
    if not tasks:
        return True

    dynamic_count = sum(1 for task in tasks if task.get("source_kind") == "dynamic_bank")
    checkable_count = sum(1 for task in tasks if _can_check_answer(task))
    return dynamic_count == 0 or checkable_count == 0


def _is_refresh_running() -> bool:
    return refresh_in_progress


async def _run_refresh(subject: str = "russian", include_official_archives: bool = True) -> None:
    global refresh_in_progress

    try:
        refresh_in_progress = True
        refreshed = await _refresh_tasks(
            subject=subject,
            include_official_archives=include_official_archives,
        )
        logger.info(
            "Background refresh finished: scraped=%s saved=%s duplicates=%s include_archives=%s",
            refreshed["scraped"],
            refreshed["saved"],
            refreshed["duplicates"],
            include_official_archives,
        )
    except Exception as exc:  # pragma: no cover - background guard
        logger.error("Background refresh failed: %s", exc)
    finally:
        refresh_in_progress = False


def _schedule_refresh(
    background_tasks: BackgroundTasks,
    subject: str = "russian",
    *,
    include_official_archives: bool = True,
) -> bool:
    global last_auto_refresh_at

    if _is_refresh_running():
        return False

    last_auto_refresh_at = datetime.utcnow()
    background_tasks.add_task(_run_refresh, subject, include_official_archives)
    return True


async def _ensure_tasks_available(background_tasks: BackgroundTasks, subject: str = "russian") -> List[Dict]:
    global last_auto_refresh_at

    client = _require_supabase()
    existing = await client.get_all_tasks()
    if existing and not _needs_task_refresh(existing):
        return existing

    if not existing:
        logger.info("Tasks table is empty, running initial dynamic scrape")
        last_auto_refresh_at = datetime.utcnow()
        await _refresh_tasks(subject=subject, include_official_archives=False)
        refreshed = await client.get_all_tasks()
        if refreshed and _schedule_refresh(background_tasks, subject, include_official_archives=True):
            logger.info("Scheduled full refresh after initial dynamic scrape")
        return refreshed or existing

    if _needs_dynamic_bank_refresh(existing):
        if _is_refresh_running():
            logger.info("Dynamic bank refresh is already running, returning existing tasks")
            return existing

        if _schedule_refresh(background_tasks, subject, include_official_archives=False):
            logger.info("Tasks are missing dynamic/checkable entries, scheduled targeted dynamic refresh")
        else:
            logger.info("Unable to schedule targeted dynamic refresh, returning existing tasks")
        return existing

    cooldown_minutes = max(1, int(os.getenv("SCRAPER_AUTO_REFRESH_COOLDOWN_MINUTES", "30")))
    if last_auto_refresh_at and (datetime.utcnow() - last_auto_refresh_at).total_seconds() < cooldown_minutes * 60:
        logger.info("Skipping auto refresh because cooldown is active")
        return existing

    if _schedule_refresh(background_tasks, subject, include_official_archives=True):
        logger.info("Existing tasks are stale or incomplete, scheduled background refresh")
    else:
        logger.info("Refresh is already running, returning existing tasks")

    return existing


def _normalize_answer(answer: str) -> str:
    return re.sub(r"\s+", "", answer.strip()).upper()


async def _resolve_task_guid(task: Dict) -> Optional[str]:
    if task.get("task_guid"):
        return task["task_guid"]

    source_url = task.get("source_url", "")
    if not _can_check_answer(task):
        return None

    html = await _require_scraper().fetch_page(source_url)
    if not html:
        return None

    soup = BeautifulSoup(html, "lxml")
    guid_input = soup.select_one("form[id^='checkform'] input[name='guid']")
    return guid_input.get("value") if guid_input and guid_input.get("value") else None


async def _check_task_answer(task: Dict, answer: str) -> CheckAnswerResponse:
    if not _can_check_answer(task):
        raise HTTPException(status_code=400, detail="This task does not support answer checking")

    normalized = _normalize_answer(answer)
    if not normalized:
        raise HTTPException(status_code=400, detail="Answer is empty")

    parsed = urlparse(task["source_url"])
    query = parse_qs(parsed.query)
    project_guid = (query.get("proj") or [None])[0]
    task_guid = await _resolve_task_guid(task)

    if not project_guid or not task_guid:
        raise HTTPException(status_code=500, detail="Unable to resolve FIPI task metadata")

    solve_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path.rsplit('/', 1)[0]}/solve.php"
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

    async with httpx.AsyncClient(
        headers=_require_scraper().headers,
        timeout=45.0,
        verify=ssl_context,
        follow_redirects=True,
    ) as client:
        page_response = await client.get(task["source_url"])
        page_response.raise_for_status()
        response = await client.post(
            solve_url,
            data={
                "guid": task_guid,
                "answer": normalized,
                "ajax": "1",
                "proj": project_guid,
            },
            headers={"Referer": task["source_url"]},
        )
        response.raise_for_status()

    if not response:
        raise HTTPException(status_code=502, detail="FIPI answer check failed")

    status_code = response.text.strip()
    status_map = {
        "0": ("not_solved", False, "Не решено"),
        "1": ("solved", True, "Решено"),
        "2": ("incorrect", False, "Неверно"),
        "3": ("correct", True, "Верно"),
    }
    if status_code not in status_map:
        raise HTTPException(status_code=502, detail=f"Unexpected FIPI response: {status_code}")

    status_label, is_correct, message = status_map[status_code]
    return CheckAnswerResponse(
        success=True,
        is_correct=is_correct,
        status_code=status_label,
        status_label=message,
        submitted_answer=answer,
        normalized_answer=normalized,
        message=message,
    )


@app.get("/api/health", response_model=HealthResponse)
async def health_check() -> HealthResponse:
    services = {
        "api": True,
        "scraper": scraper is not None,
        "supabase": False,
    }

    if supabase_client:
        try:
            services["supabase"] = await supabase_client.is_available()
        except Exception:
            services["supabase"] = False

    all_critical_ok = services["api"] and services["scraper"]
    if all_critical_ok and all(services.values()):
        status = "healthy"
    elif all_critical_ok:
        status = "degraded"
    else:
        status = "unhealthy"

    return HealthResponse(status=status, timestamp=datetime.utcnow(), services=services)


@app.get("/api/tasks", response_model=List[TaskResponse])
async def get_all_tasks(background_tasks: BackgroundTasks) -> List[TaskResponse]:
    tasks = await _ensure_tasks_available(background_tasks)
    return [TaskResponse(**_serialize_task(task)) for task in tasks]


@app.get("/api/tasks/latest", response_model=List[TaskResponse])
async def get_latest_tasks(limit: int = 10) -> List[TaskResponse]:
    tasks = await _require_supabase().get_latest_tasks(limit=limit)
    return [TaskResponse(**_serialize_task(task)) for task in tasks]


# The static /api/tasks/search route must be registered before the dynamic
# /api/tasks/{task_id} route; otherwise FastAPI matches "search" as a task id
# and the endpoint can never be reached.
@app.get("/api/tasks/search", response_model=List[TaskResponse])
async def search_tasks(q: str) -> List[TaskResponse]:
    tasks = await _require_supabase().search_tasks(q)
    return [TaskResponse(**_serialize_task(task)) for task in tasks]


@app.get("/api/tasks/{task_id}", response_model=TaskResponse)
async def get_task(task_id: int) -> TaskResponse:
    task = await _require_supabase().get_task_by_id(task_id)
    if not task:
        raise HTTPException(status_code=404, detail="Task not found")
    return TaskResponse(**_serialize_task(task))


@app.post("/api/tasks/{task_id}/check-answer", response_model=CheckAnswerResponse)
async def check_task_answer(task_id: int, request: CheckAnswerRequest) -> CheckAnswerResponse:
    task = await _require_supabase().get_task_by_id(task_id)
    if not task:
        raise HTTPException(status_code=404, detail="Task not found")
    return await _check_task_answer(task, request.answer)


@app.get("/api/tasks/type/{task_type}", response_model=List[TaskResponse])
async def get_tasks_by_type(task_type: str) -> List[TaskResponse]:
    tasks = await _require_supabase().get_tasks_by_type(task_type)
    return [TaskResponse(**_serialize_task(task)) for task in tasks]


@app.post("/api/scrape", response_model=ScrapeResponse)
async def scrape_tasks(request: ScrapeRequest) -> ScrapeResponse:
    client = _require_supabase()
    service = _require_scraper()

    try:
        tasks_scraped = 0
        tasks_saved = 0
        duplicates_skipped = 0

        if request.urls:
            for url in request.urls:
                html = await service.fetch_page(url)
                task = service.parse_task_page(html, url) if html else None
                if not task:
                    continue
                tasks_scraped += 1
                result = await client.insert_task(task)
                if result:
                    tasks_saved += 1
                else:
                    duplicates_skipped += 1
        elif request.query:
            tasks = await service.search_tasks(request.query)
            tasks_scraped = len(tasks)
            persisted = await _persist_tasks(tasks)
            tasks_saved = persisted["saved"]
            duplicates_skipped = persisted["duplicates"]
        else:
            if request.full_refresh:
                tasks = await service.scrape_tasks(
                    subject=request.subject or "russian",
                    include_official_archives=True,
                )
            else:
                tasks = await service.scrape_dynamic_bank(subject=request.subject or "russian")
            tasks_scraped = len(tasks)
            persisted = await _persist_tasks(tasks)
            tasks_saved = persisted["saved"]
            duplicates_skipped = persisted["duplicates"]

        return ScrapeResponse(
            success=True,
            tasks_scraped=tasks_scraped,
            tasks_saved=tasks_saved,
            duplicates_skipped=duplicates_skipped,
            message=(
                f"Processed {tasks_scraped} tasks. "
                f"Saved: {tasks_saved}, duplicates: {duplicates_skipped}"
            ),
        )
    except HTTPException:
        raise
    except Exception as exc:  # pragma: no cover - endpoint guard
        logger.error("Scrape error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Scrape error: {exc}")


@app.get("/api/stats", response_model=StatsResponse)
async def get_stats() -> StatsResponse:
    client = _require_supabase()
    stats = await client.get_stats()
    latest = await client.get_latest_tasks(limit=1)
    last_scrape = latest[0].get("scraped_at") if latest else None
    return StatsResponse(
        total_tasks=stats.get("total", 0),
        by_type=stats.get("by_type", {}),
        last_scrape=last_scrape,
    )


@app.get("/", tags=["root"])
async def root() -> Dict[str, str]:
    return {
        "message": "AI Scraper FIPI API proof-20260317",
        "version": "1.1.1-proof-20260317",
        "docs": "/docs",
    }


@app.exception_handler(Exception)
async def global_exception_handler(request, exc) -> JSONResponse:
    logger.error("Unhandled exception: %s", exc)
    payload = ErrorResponse(
        error="Internal Server Error",
        detail=str(exc),
        timestamp=datetime.utcnow(),
    )
    return JSONResponse(status_code=500, content=payload.model_dump(mode="json"))


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
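For orientation, a minimal client sketch (not part of the commit itself) that exercises the endpoints above; it assumes the service is running locally via python app.py on port 8000, as in the __main__ block:

import asyncio

import httpx


async def main() -> None:
    async with httpx.AsyncClient(base_url="http://localhost:8000", timeout=60.0) as client:
        # Liveness probe: "healthy", "degraded" or "unhealthy".
        print((await client.get("/api/health")).json()["status"])

        # Listing tasks requires Supabase to be configured (503 otherwise).
        response = await client.get("/api/tasks")
        if response.status_code != 200:
            return
        checkable = [t for t in response.json() if t.get("can_check_answer") and t.get("id")]
        if checkable:
            # Submit an (arbitrary) answer for the first checkable task.
            result = await client.post(
                f"/api/tasks/{checkable[0]['id']}/check-answer",
                json={"answer": "пример"},
            )
            print(result.json())


asyncio.run(main())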
models.py ADDED
@@ -0,0 +1,94 @@
"""
Pydantic models for the scraper API.
"""

from datetime import datetime
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field


class TaskInput(BaseModel):
    title: str = Field(..., description="Task title")
    content: str = Field(..., description="Task body")
    source_url: str = Field(..., description="Source URL")
    task_type: Optional[str] = Field(None, description="Task type")
    images: Optional[List[str]] = Field(default_factory=list, description="Task images")
    variants: Optional[List[str]] = Field(default_factory=list, description="Answer variants")


class TaskResponse(BaseModel):
    id: Optional[int] = None
    title: str
    content: str
    source_url: str
    task_type: Optional[str] = None
    images: Optional[List[str]] = None
    variants: Optional[List[str]] = None
    task_number: Optional[int] = None
    source_kind: Optional[str] = None
    task_guid: Optional[str] = None
    can_check_answer: bool = False
    scraped_at: Optional[datetime] = None
    rubert_analysis: Optional[Dict[str, Any]] = None


class ScrapeRequest(BaseModel):
    subject: Optional[str] = Field("russian", description="Subject code")
    urls: Optional[List[str]] = Field(default_factory=list, description="Explicit URLs to scrape")
    query: Optional[str] = Field(None, description="Search query")
    full_refresh: bool = Field(
        False,
        description="When true, also scrape official archives and open-variant PDFs",
    )


class ScrapeResponse(BaseModel):
    success: bool
    tasks_scraped: int
    tasks_saved: int
    duplicates_skipped: int
    message: str


class CheckAnswerRequest(BaseModel):
    answer: str = Field(..., min_length=1, description="Submitted answer")


class CheckAnswerResponse(BaseModel):
    success: bool
    is_correct: bool
    status_code: str
    status_label: str
    submitted_answer: str
    normalized_answer: str
    message: str


class AnalysisRequest(BaseModel):
    text: str = Field(..., description="Text to analyze")


class AnalysisResponse(BaseModel):
    category: str
    keywords: List[str]
    confidence: float
    embedding: Optional[List[float]] = None


class HealthResponse(BaseModel):
    status: str
    timestamp: datetime
    services: Dict[str, bool]


class StatsResponse(BaseModel):
    total_tasks: int
    by_type: Dict[str, int]
    last_scrape: Optional[datetime] = None


class ErrorResponse(BaseModel):
    error: str
    detail: Optional[str] = None
    timestamp: datetime = Field(default_factory=datetime.utcnow)
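A quick sketch of how these models behave (not part of the commit; assumes Pydantic v2, which matches the model_dump(mode="json") call in app.py):

from pydantic import ValidationError

from models import CheckAnswerRequest, ErrorResponse

# min_length=1 rejects an empty answer before the endpoint body runs.
try:
    CheckAnswerRequest(answer="")
except ValidationError as err:
    print(err.error_count(), "validation error")

# ErrorResponse stamps a timestamp via default_factory when none is given.
payload = ErrorResponse(error="Internal Server Error").model_dump(mode="json")
print(sorted(payload))  # ['detail', 'error', 'timestamp']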
scraper.py ADDED
@@ -0,0 +1,928 @@
"""
FIPI scraper focused on extracting real tasks instead of generic page text.
"""

from __future__ import annotations

from datetime import datetime
import io
import logging
import math
import os
import re
import ssl
from typing import Dict, Iterable, List, Optional
from urllib.parse import urljoin
import zipfile

from bs4 import BeautifulSoup, Tag
import httpx

try:
    from pypdf import PdfReader
except ImportError:  # pragma: no cover - optional dependency for HF deploy
    PdfReader = None

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class FIPIScraper:
    """Collects task candidates from the FIPI bank and official demo archives."""

    SUBJECT_CONFIG = {
        "russian": {
            "label": "Русский язык",
            "dynamic_sources": [
                {
                    "kind": "ege_bank",
                    "base_url": "https://ege.fipi.ru/bank",
                    "project_guid": "AF0ED3F2557F8FFC4C06F80B6803FD26",
                    "project_name": "ЕГЭ. Русский язык",
                },
                {
                    "kind": "oge_bank",
                    "base_url": "https://oge.fipi.ru/bank",
                    "project_guid": "2F5EE3B12FE2A0EA40B06BF61A015416",
                    "project_name": "ОГЭ. Русский язык",
                },
            ],
            "official_demo_page": "https://fipi.ru/ege/demoversii-specifikacii-kodifikatory",
            "official_variant_page": "https://fipi.ru/ege/otkrytyy-bank-zadaniy-ege/otkrytyye-varianty-kim-ege",
            "archive_prefixes": ("ru_11_",),
            "variant_prefixes": ("rus_",),
            "title_keywords": ("русский язык",),
        }
    }

    TASK_TYPE_KEYWORDS = {
        "writing": ("сочинение", "эссе", "напишите", "сформулируйте", "прокомментируйте"),
        "test": ("выберите", "укажите", "ответ", "вариант", "расставьте", "определите"),
        "listening": ("аудио", "прослуш", "запись"),
        "reading": ("прочитайте", "текст", "абзац", "предложение"),
    }

    GENERIC_TITLE_PATTERNS = (
        "открытый банк",
        "демоверсии",
        "спецификации",
        "кодификаторы",
        "федеральный институт",
        "фипи",
        "нормативно",
        "документы",
        "варианты ким",
    )

    PDF_TASK_START_PATTERNS = (
        "Прочитайте текст",
        "Самостоятельно подберите",
        "В тексте выделено",
        "Укажите",
        "В одном из",
        "Отредактируйте предложение",
        "Установите соответствие",
        "Расставьте",
        "Определите",
        "Найдите",
        "Подберите",
    )

    PDF_NOISE_PATTERNS = (
        "Инструкция по выполнению работы",
        "Пояснения к демонстрационному варианту",
        "Желаем успеха",
        "Все бланки ЕГЭ заполняются",
        "Баллы, полученные",
        "После завершения работы",
        "В демонстрационном варианте представлены",
        "Часть 1 содержит 26 заданий",
        "На выполнение экзаменационной работы",
        "Ответами к заданиям 1–26 являются",
        "Бланк",
    )

    NOISE_PATTERNS = (
        "федеральный институт педагогических измерений",
        "открытый банк тестовых заданий",
        "открытый банк заданий егэ",
        "открытый банк заданий огэ",
        "подбор заданий",
        "демоверсии, спецификации, кодификаторы",
        "для предметных комиссий",
        "аналитические и методические материалы",
        "видеоконсультации разработчиков ким",
        "скачать",
        "изменения в ким",
    )

    def __init__(self, base_url: str = "https://fipi.ru"):
        self.base_url = base_url.rstrip("/")
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
        }
        self.page_size = max(1, int(os.getenv("SCRAPER_BANK_PAGE_SIZE", "10")))
        self.max_bank_pages = max(1, int(os.getenv("SCRAPER_MAX_BANK_PAGES", "5")))
        self.max_demo_archives = max(1, int(os.getenv("SCRAPER_MAX_DEMO_ARCHIVES", "2")))
        self.max_demo_tasks = max(1, int(os.getenv("SCRAPER_MAX_DEMO_TASKS", "20")))
        self.min_quality_score = max(1, int(os.getenv("SCRAPER_MIN_QUALITY_SCORE", "45")))

    async def fetch_page(self, url: str) -> Optional[str]:
        response = await self._request("GET", url)
        return response.text if response else None

    async def fetch_bytes(self, url: str) -> Optional[bytes]:
        response = await self._request("GET", url)
        return response.content if response else None

    async def _request(
        self,
        method: str,
        url: str,
        *,
        data: Optional[Dict[str, str]] = None,
    ) -> Optional[httpx.Response]:
        ssl_context = ssl.create_default_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE

        async with httpx.AsyncClient(
            headers=self.headers,
            timeout=45.0,
            verify=ssl_context,
            follow_redirects=True,
        ) as client:
            try:
                response = await client.request(method, url, data=data)
                response.raise_for_status()
                return response
            except httpx.HTTPError as e:
                logger.error("Request failed for %s: %s", url, e)
                return None

    async def scrape_tasks(
        self,
        subject: str = "russian",
        *,
        include_official_archives: bool = True,
    ) -> List[Dict]:
        config = self.SUBJECT_CONFIG.get(subject)
        if not config:
            logger.warning("Unknown subject %s, falling back to russian", subject)
            config = self.SUBJECT_CONFIG["russian"]

        candidates: List[Dict] = []
        candidates.extend(await self.scrape_dynamic_bank(subject))
        if include_official_archives:
            candidates.extend(await self.scrape_official_archives(subject))
        validated = self._dedupe_candidates(self._filter_candidates(candidates))
        logger.info("Accepted %s task candidates after filtering", len(validated))
        return validated

    async def scrape_dynamic_bank(self, subject: str = "russian") -> List[Dict]:
        config = self.SUBJECT_CONFIG.get(subject, self.SUBJECT_CONFIG["russian"])
        tasks: List[Dict] = []

        for source in config["dynamic_sources"]:
            project_guid = source["project_guid"]
            questions_url = f"{source['base_url']}/questions.php"
            total_tasks = None

            for page_index in range(self.max_bank_pages):
                html = await self._fetch_bank_page(
                    questions_url=questions_url,
                    project_guid=project_guid,
                    page_index=page_index,
                )
                if not html:
                    break

                if total_tasks is None:
                    total_tasks = self._extract_total_count(html)
                    if total_tasks:
                        max_pages = math.ceil(total_tasks / self.page_size)
                        logger.info(
                            "Bank %s reports %s tasks, scraping up to %s pages",
                            source["project_name"],
                            total_tasks,
                            min(max_pages, self.max_bank_pages),
                        )

                soup = BeautifulSoup(html, "lxml")
                blocks = soup.select("div.qblock")
                if not blocks:
                    logger.warning(
                        "No qblock nodes found for %s page=%s via primary fetch, retrying POST search",
                        source["project_name"],
                        page_index,
                    )
                    html = await self._fetch_bank_page(
                        questions_url=questions_url,
                        project_guid=project_guid,
                        page_index=page_index,
                        force_post=True,
                    )
                    if not html:
                        break

                    soup = BeautifulSoup(html, "lxml")
                    blocks = soup.select("div.qblock")
                    if not blocks:
                        logger.warning(
                            "No qblock nodes found for %s page=%s after retry",
                            source["project_name"],
                            page_index,
                        )
                        break

                for block in blocks:
                    task = self._parse_bank_question_block(
                        block,
                        project_guid=project_guid,
                        source_name=source["project_name"],
                        questions_url=questions_url,
                    )
                    if task:
                        tasks.append(task)

                if total_tasks is not None and (page_index + 1) * self.page_size >= total_tasks:
                    break

        logger.info("Collected %s candidates from the dynamic bank", len(tasks))
        return tasks

    async def _fetch_bank_page(
        self,
        *,
        questions_url: str,
        project_guid: str,
        page_index: int,
        force_post: bool = False,
    ) -> Optional[str]:
        page_url = (
            f"{questions_url}?proj={project_guid}"
            f"&page={page_index}&pagesize={self.page_size}"
        )

        if not force_post:
            html = await self.fetch_page(page_url)
            if html:
                return html

        return await self._post_bank_page(
            questions_url=questions_url,
            project_guid=project_guid,
            page_index=page_index,
        )

    async def _post_bank_page(
        self,
        *,
        questions_url: str,
        project_guid: str,
        page_index: int,
    ) -> Optional[str]:
        response = await self._request(
            "POST",
            questions_url,
            data={
                "search": "1",
                "pagesize": str(self.page_size),
                "proj": project_guid,
                "page": str(page_index),
            },
        )
        return response.text if response else None

    def _extract_total_count(self, html: str) -> Optional[int]:
        match = re.search(r"setQCount\((\d+)", html)
        return int(match.group(1)) if match else None

    def _parse_bank_question_block(
        self,
        block: Tag,
        *,
        project_guid: str,
        source_name: str,
        questions_url: str,
    ) -> Optional[Dict]:
        prompt_cell = block.select_one("td.cell_0")
        if not prompt_cell:
            return None

        content = self._clean_text(prompt_cell.get_text("\n", strip=True))
        if not content:
            return None

        title = self._build_title_from_content(content, fallback=source_name)
        question_guid = self._extract_block_guid(block)
        variants = self._extract_variants_from_block(block)
        images = self._extract_images(prompt_cell, base_url=questions_url)

        return {
            "title": title,
            "content": content,
            "source_url": f"{questions_url}?proj={project_guid}&qid={question_guid}",
            "task_type": self._detect_task_type(title, content),
            "images": images,
            "variants": variants,
            "scraped_at": datetime.utcnow().isoformat(),
            "source_kind": "dynamic_bank",
            "task_guid": question_guid,
        }

    def _extract_block_guid(self, block: Tag) -> str:
        guid_input = block.select_one("form input[name='guid']")
        if guid_input and guid_input.get("value"):
            return guid_input["value"]
        return block.get("id", "").lstrip("q")

    def _extract_variants_from_block(self, block: Tag) -> List[str]:
        variants: List[str] = []

        for label in block.find_all("label"):
            text = self._clean_text(label.get_text(" ", strip=True))
            if text:
                variants.append(text)

        if not variants:
            for option in block.find_all("option"):
                text = self._clean_text(option.get_text(" ", strip=True))
                if text and text.lower() != "выбор":
                    variants.append(text)

        return variants[:10]

    async def scrape_official_archives(self, subject: str = "russian") -> List[Dict]:
        config = self.SUBJECT_CONFIG.get(subject, self.SUBJECT_CONFIG["russian"])
        archive_links = await self._discover_official_archive_links(config)
        variant_links = await self._discover_official_variant_links(config)
        document_links = self._sort_document_links(archive_links + variant_links)
        tasks: List[Dict] = []

        if not document_links:
            logger.warning("No official archive links found for %s", subject)
            return tasks

        if PdfReader is None:
            logger.warning("pypdf is not installed, skipping official PDF extraction")
            return tasks

        for document_url in document_links[: self.max_demo_archives]:
            document_bytes = await self.fetch_bytes(document_url)
            if not document_bytes:
                continue
            tasks.extend(self._extract_tasks_from_document_bytes(document_bytes, document_url))

        logger.info("Collected %s candidates from official archives", len(tasks))
        return tasks

    async def _discover_official_archive_links(self, config: Dict) -> List[str]:
        html = await self.fetch_page(config["official_demo_page"])
        if not html:
            return []

        soup = BeautifulSoup(html, "lxml")
        prefixes = config["archive_prefixes"]
        archive_links: List[str] = []

        for link in soup.find_all("a", href=True):
            href = link["href"]
            absolute = href if href.startswith("http") else urljoin(config["official_demo_page"], href)
            href_lower = absolute.lower()
            if not href_lower.endswith(".zip"):
                continue
            if any(prefix in href_lower for prefix in prefixes):
                archive_links.append(absolute)

        def sort_key(url: str) -> int:
            match = re.search(r"/(20\d{2})/", url)
            return int(match.group(1)) if match else 0

        archive_links.sort(key=sort_key, reverse=True)
        return archive_links

    async def _discover_official_variant_links(self, config: Dict) -> List[str]:
        variant_page = config.get("official_variant_page")
        if not variant_page:
            return []

        html = await self.fetch_page(variant_page)
        if not html:
            return []

        soup = BeautifulSoup(html, "lxml")
        prefixes = config.get("variant_prefixes", ())
        links: List[str] = []

        for link in soup.find_all("a", href=True):
            href = link["href"]
            absolute = href if href.startswith("http") else urljoin(variant_page, href)
            href_lower = absolute.lower()
            if not href_lower.endswith((".zip", ".pdf")):
                continue
            if "braille" in href_lower:
                continue
            filename = absolute.rsplit("/", 1)[-1].lower()
            if prefixes and not any(filename.startswith(prefix) for prefix in prefixes):
                continue
            links.append(absolute)

        return self._sort_document_links(links)

    def _sort_document_links(self, links: Iterable[str]) -> List[str]:
        def sort_key(url: str) -> tuple[int, str]:
            match = re.search(r"(20\d{2})", url)
            return (int(match.group(1)) if match else 0, url)

        return sorted(set(links), key=sort_key, reverse=True)

    def _extract_tasks_from_document_bytes(self, document_bytes: bytes, document_url: str) -> List[Dict]:
        if document_url.lower().endswith(".zip"):
            return self._extract_tasks_from_archive(document_bytes, document_url)
        if document_url.lower().endswith(".pdf"):
            return self._extract_tasks_from_pdf_document(
                document_bytes,
                document_url=document_url,
                document_name=document_url.rsplit("/", 1)[-1],
            )
        return []

    def _extract_tasks_from_archive(self, archive_bytes: bytes, archive_url: str) -> List[Dict]:
        tasks: List[Dict] = []

        try:
            with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
                for member_name in archive.namelist():
                    if not member_name.lower().endswith(".pdf"):
                        continue
                    if not self._should_parse_pdf_member(member_name, archive_url):
                        continue
                    tasks.extend(
                        self._extract_tasks_from_pdf_document(
                            archive.read(member_name),
                            document_url=archive_url,
                            document_name=member_name,
                        )
                    )
        except zipfile.BadZipFile:
            logger.error("Invalid archive %s", archive_url)

        return tasks

    def _extract_text_from_pdf_bytes(self, pdf_bytes: bytes) -> str:
        if PdfReader is None:
            return ""

        try:
            reader = PdfReader(io.BytesIO(pdf_bytes))
        except Exception as e:  # pragma: no cover - parser-dependent
            logger.error("Failed to open PDF: %s", e)
            return ""

        pages: List[str] = []
        for page in reader.pages:
            try:
                page_text = page.extract_text() or ""
            except Exception:  # pragma: no cover - parser-dependent
                page_text = ""
            if page_text:
                pages.append(page_text)

        return self._clean_text("\n".join(pages))

    def _should_parse_pdf_member(self, member_name: str, document_url: str) -> bool:
        member_lower = member_name.lower()
        if any(token in member_lower for token in ("спец", "кодиф", "критер", "ответ", "аудио")):
            return False
        if "otkrytyye-varianty-kim-ege" in document_url.lower():
            return True
        return "демо" in member_lower or "demo" in member_lower

    def _extract_tasks_from_pdf_document(
        self,
        pdf_bytes: bytes,
        *,
        document_url: str,
        document_name: str,
    ) -> List[Dict]:
        text = self._extract_text_from_pdf_bytes(pdf_bytes)
        if not text:
            return []

        year_match = re.search(r"(20\d{2})", document_url)
        year = year_match.group(1) if year_match else "unknown"
        return self._extract_tasks_from_demo_text(
            text,
            archive_url=document_url,
            document_name=document_name,
            year=year,
            source_kind=self._detect_document_source_kind(document_url),
        )

    def _detect_document_source_kind(self, document_url: str) -> str:
        if "otkrytyye-varianty-kim-ege" in document_url.lower():
            return "official_open_variant_pdf"
        return "official_demo_pdf"

    def _extract_tasks_from_demo_text(
        self,
        text: str,
        *,
        archive_url: str,
        document_name: str,
        year: str,
        source_kind: str = "official_demo_pdf",
    ) -> List[Dict]:
        tasks: List[Dict] = []
        if not text:
            return tasks

        for raw_block in self._split_pdf_into_task_blocks(text):
            content = self._cleanup_pdf_task_block(raw_block)
            content = self._trim_to_task_start(content)
            if not self._looks_like_official_task_block(content):
                continue

            task_number = len(tasks) + 1
            document_label = "Открытый вариант ЕГЭ" if source_kind == "official_open_variant_pdf" else "Демоверсия ЕГЭ"
            title = f"{document_label} {year}. Задание {task_number}"
            tasks.append(
                {
                    "title": title,
                    "content": content,
                    "source_url": f"{archive_url}#task-{task_number}",
                    "task_type": self._detect_task_type(title, content),
                    "images": [],
                    "variants": self._extract_variants(content),
                    "scraped_at": datetime.utcnow().isoformat(),
                    "source_kind": source_kind,
                    "document_name": document_name,
                    "task_number": task_number,
                }
            )

            if len(tasks) >= self.max_demo_tasks:
                break

        return tasks

    def _split_pdf_into_task_blocks(self, text: str) -> List[str]:
        answer_pattern = re.compile(r"(?:^|\n)\s*Ответ\s*:\s*[_\.\s]*", re.IGNORECASE)
        blocks: List[str] = []
        last_pos = 0

        for match in answer_pattern.finditer(text):
            block = text[last_pos:match.start()]
            if block.strip():
                blocks.append(block)
            last_pos = match.end()

        return blocks

    def _cleanup_pdf_task_block(self, block: str) -> str:
        lines: List[str] = []
        for raw_line in block.splitlines():
            line = self._clean_text(raw_line)
            if not line:
                continue
            lower = line.lower()
            if line == "&%end_page&%":
                continue
            if re.fullmatch(r"\d{1,2}", line):
                continue
            if re.search(r"\d+\s*/\s*\d+$", line):
                continue
            if lower.startswith(("демонстрационный вариант егэ", "открытый вариант ким егэ", "единый государственный экзамен")):
                continue
            if lower.startswith("© "):
                continue
            lines.append(line)

        return self._clean_text("\n".join(lines))

    def _trim_to_task_start(self, text: str) -> str:
        if not text:
            return text

        starts = [text.find(pattern) for pattern in self.PDF_TASK_START_PATTERNS if text.find(pattern) >= 0]
        if starts:
            return text[min(starts):].strip()
        return text.strip()

    def _looks_like_official_task_block(self, text: str) -> bool:
        if len(text) < 70 or len(text) > 6000:
            return False

        lower = text.lower()
        if any(pattern.lower() in lower for pattern in self.PDF_NOISE_PATTERNS):
            return False

        return any(pattern.lower() in lower for pattern in self.PDF_TASK_START_PATTERNS)

    def _slice_demo_section(self, text: str) -> str:
        start_matches = list(re.finditer(r"(?m)^\s*Часть\s*1\s*$", text, re.IGNORECASE))
        if start_matches:
            start_pos = start_matches[-1].start()
        else:
            fallback = list(re.finditer(r"Ответами к заданиям", text, re.IGNORECASE))
            if not fallback:
                return text
            start_pos = fallback[-1].start()

        end = re.search(
            r"(Часть\s*2|Задание\s*27|Система оценивания|Критерии оценивания|Ключи)",
            text[start_pos:],
            re.IGNORECASE,
        )
        if not end:
            return text[start_pos:]

        return text[start_pos : start_pos + end.start()]

    def parse_task_page(self, html: str, url: str) -> Optional[Dict]:
        if not html:
            return None

        soup = BeautifulSoup(html, "lxml")
        for selector in (
            "div.qblock",
            "article",
            "main article",
            ".field--name-body",
            ".content",
            "main",
            "body",
        ):
            container = soup.select_one(selector)
            if not container:
                continue

            candidate = self._build_candidate_from_container(container, url)
            if candidate:
                return candidate

        return None

    def _build_candidate_from_container(self, container: Tag, url: str) -> Optional[Dict]:
        cloned = BeautifulSoup(str(container), "lxml")
        root = cloned.find()
        if root is None:
            return None

        for element in root.find_all(["script", "style", "nav", "header", "footer", "form", "button", "aside"]):
            element.decompose()

        title_tag = root.find(["h1", "h2", "h3", "strong", "b"])
        title = self._clean_text(title_tag.get_text(" ", strip=True)) if title_tag else ""
        content = self._clean_text(root.get_text("\n", strip=True))
        if not title:
            title = self._build_title_from_content(content, fallback=url)

        images = self._extract_images(root, base_url=url)
        candidate = {
            "title": title,
            "content": content,
            "source_url": url,
            "task_type": self._detect_task_type(title, content),
            "images": images,
            "variants": self._extract_variants(content),
            "scraped_at": datetime.utcnow().isoformat(),
            "source_kind": "generic_html",
        }
        return candidate if self._passes_quality_gate(candidate) else None

    async def scrape_task_by_id(self, task_id: str) -> Optional[Dict]:
        config = self.SUBJECT_CONFIG["russian"]["dynamic_sources"][0]
        html = await self.fetch_page(
            f"{config['base_url']}/questions.php?proj={config['project_guid']}&qid={task_id}"
        )
        if not html:
            return None

        soup = BeautifulSoup(html, "lxml")
        block = soup.select_one("div.qblock")
        if not block:
            return None

        return self._parse_bank_question_block(
            block,
            project_guid=config["project_guid"],
            source_name=config["project_name"],
            questions_url=f"{config['base_url']}/questions.php",
        )

    async def search_tasks(self, query: str) -> List[Dict]:
        query_lower = query.lower().strip()
        tasks = await self.scrape_tasks(subject="russian")
        return [
            task
            for task in tasks
            if query_lower in task.get("title", "").lower()
            or query_lower in task.get("content", "").lower()
        ]

    def _filter_candidates(self, candidates: Iterable[Dict]) -> List[Dict]:
        accepted: List[Dict] = []
        for candidate in candidates:
            if self._passes_quality_gate(candidate):
                accepted.append(candidate)
        return accepted

    def _dedupe_candidates(self, candidates: Iterable[Dict]) -> List[Dict]:
        deduped: List[Dict] = []
        seen_keys = set()

        for candidate in candidates:
            normalized = self._clean_text(candidate.get("content", ""))[:400]
            key = (candidate.get("source_url", ""), normalized)
            if key in seen_keys:
                continue
            seen_keys.add(key)
            deduped.append(candidate)

        return deduped

    def _passes_quality_gate(self, candidate: Dict) -> bool:
        score = self._score_candidate(candidate)
        candidate["quality_score"] = score
        return score >= self.min_quality_score

    def _score_candidate(self, candidate: Dict) -> int:
        title = candidate.get("title", "").lower()
        content = candidate.get("content", "").lower()
        source_kind = candidate.get("source_kind", "")
        length = len(content)

        score = 0

        if source_kind == "dynamic_bank":
            score += 60
        elif source_kind in {"official_demo_pdf", "official_open_variant_pdf"}:
            score += 50
        else:
            score += 10

        if 80 <= length <= 3500:
            score += 15
        elif length > 5000:
            score -= 20
        else:
            score -= 10

        if any(keyword in content for keywords in self.TASK_TYPE_KEYWORDS.values() for keyword in keywords):
            score += 10

        if any(pattern.lower() in content for pattern in self.PDF_TASK_START_PATTERNS):
            score += 10

        if re.search(r"\b\d+\b", content):
            score += 5

        if any(pattern in title for pattern in self.GENERIC_TITLE_PATTERNS):
            score -= 45

        noise_hits = sum(1 for pattern in self.NOISE_PATTERNS if pattern in content[:1200])
        score -= min(noise_hits * 8, 32)

        if content.count("\n") > 80:
            score -= 10

        return score

    def _detect_task_type(self, title: str, content: str) -> str:
        text = f"{title} {content}".lower()

        for task_type, keywords in self.TASK_TYPE_KEYWORDS.items():
            if any(keyword in text for keyword in keywords):
                return task_type

        return "other"

    def _extract_variants(self, content: str) -> List[str]:
        matches = re.findall(r"(?:^|\n)(?:[1-6]|[A-DА-Г])[.)]\s*([^\n]{2,200})", content)
        return [self._clean_text(match) for match in matches[:10]]

    def _extract_images(self, container: Tag, *, base_url: str) -> List[str]:
        images: List[str] = []
        for img in container.find_all("img"):
            src = img.get("src") or img.get("data-src")
            if not src:
                continue
            images.append(src if src.startswith("http") else urljoin(base_url, src))
        return images[:10]

    def _build_title_from_content(self, content: str, fallback: str) -> str:
        first_line = next((line.strip() for line in content.splitlines() if line.strip()), "")
        title = self._clean_text(first_line)
        if not title:
            title = fallback
        return title[:160]

    def _clean_text(self, text: str) -> str:
        text = text.replace("\xa0", " ")
        # Collapse letter-spaced words ("с л о в о") that PDF extraction produces.
        text = re.sub(
            r"\b(?:[A-Za-zА-Яа-яЁё]\s+){2,}[A-Za-zА-Яа-яЁё]\b",
            lambda match: match.group(0).replace(" ", ""),
            text,
        )
        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()
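As a quick smoke test, the scraper can also be driven standalone (a sketch, not part of the commit; it performs live requests against fipi.ru, so network access is assumed):

import asyncio

from scraper import FIPIScraper


async def main() -> None:
    scraper = FIPIScraper()
    # Dynamic-bank pass only; scrape_tasks() would also download demo archives.
    tasks = await scraper.scrape_dynamic_bank(subject="russian")
    print(f"collected {len(tasks)} candidates")
    for task in tasks[:3]:
        print("-", task["title"][:80], f"[{task['task_type']}]")


asyncio.run(main())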