greeta commited on
Commit
bc0e1a3
·
verified ·
1 Parent(s): 8de6346

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +519 -0
app.py ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI app for the FIPI scraper service.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from datetime import datetime
8
+ import logging
9
+ import os
10
+ from pathlib import Path
11
+ import re
12
+ import ssl
13
+ from typing import Dict, List, Optional
14
+ from urllib.parse import parse_qs, urlparse
15
+
16
+ from bs4 import BeautifulSoup
17
+ from dotenv import load_dotenv
18
+ from fastapi import BackgroundTasks, FastAPI, HTTPException
19
+ from fastapi.middleware.cors import CORSMiddleware
20
+ from fastapi.responses import JSONResponse
21
+ import httpx
22
+
23
+ from models import (
24
+ CheckAnswerRequest,
25
+ CheckAnswerResponse,
26
+ ErrorResponse,
27
+ HealthResponse,
28
+ ScrapeRequest,
29
+ ScrapeResponse,
30
+ StatsResponse,
31
+ TaskResponse,
32
+ )
33
+ from scraper import FIPIScraper
34
+
35
# Resolve paths relative to this file so the service works from any CWD.
BASE_DIR = Path(__file__).resolve().parent
load_dotenv(BASE_DIR / ".env")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Supabase persistence is optional: when the supabase_client module cannot be
# imported (or fails during import), the service keeps running without it and
# endpoints that need it answer 503 via _require_supabase().
SUPABASE_AVAILABLE = False
SupabaseClient = None

try:
    from supabase_client import SupabaseClient

    SUPABASE_AVAILABLE = True
except ImportError as exc:
    logger.warning("Supabase client import failed: %s", exc)
except Exception as exc:  # pragma: no cover - startup guard
    logger.warning("Supabase client init failed: %s", exc)
52
+
53
+
54
app = FastAPI(
    title="AI Scraper FIPI",
    description="Collects, stores and validates FIPI tasks.",
    version="1.1.1-proof-20260317",
)

# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# disallowed by the CORS spec in browsers — confirm whether credentialed
# requests are actually needed, or pin the origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mutable module-level service state, populated by startup_event().
scraper: Optional[FIPIScraper] = None
supabase_client: Optional[SupabaseClient] = None
# Timestamp (naive UTC) of the last automatically scheduled refresh; used to
# enforce the auto-refresh cooldown in _ensure_tasks_available().
last_auto_refresh_at: Optional[datetime] = None
# True while a background refresh runs; guards against double scheduling.
refresh_in_progress = False
72
+
73
+
74
@app.on_event("startup")
async def startup_event() -> None:
    """Initialize the scraper and, when configured, the Supabase client.

    The scraper is always created; the Supabase client is only published to
    the module-level ``supabase_client`` after a successful availability
    probe, so endpoints see either a working client or ``None``.
    """
    global scraper, supabase_client

    scraper = FIPIScraper(base_url=os.getenv("FIPI_BASE_URL", "https://fipi.ru"))
    logger.info("FIPIScraper initialized")

    # Skip Supabase wiring entirely when the module failed to import.
    if not SUPABASE_AVAILABLE:
        logger.info("Supabase disabled")
        return

    supabase_url = os.getenv("SUPABASE_URL")
    supabase_key = os.getenv("SUPABASE_SERVICE_KEY")
    if not supabase_url or not supabase_key:
        logger.warning("SUPABASE_URL or SUPABASE_SERVICE_KEY missing")
        return

    try:
        client = SupabaseClient(url=supabase_url, key=supabase_key)
        # Publish the client globally only after the availability check passes.
        if await client.is_available():
            supabase_client = client
            logger.info("Supabase client connected")
        else:
            logger.error("Supabase is unavailable")
    except Exception as exc:  # pragma: no cover - startup guard
        logger.error("Supabase startup error: %s", exc)
100
+
101
+
102
def _require_supabase() -> SupabaseClient:
    """Return the module-level Supabase client, or abort with HTTP 503."""
    client = supabase_client
    if not client:
        raise HTTPException(status_code=503, detail="Supabase is not configured")
    return client
106
+
107
+
108
def _require_scraper() -> FIPIScraper:
    """Return the module-level scraper, or abort with HTTP 503."""
    service = scraper
    if not service:
        raise HTTPException(status_code=503, detail="Scraper is not configured")
    return service
112
+
113
+
114
+ def _can_check_answer(task: Dict) -> bool:
115
+ if task.get("source_kind") == "dynamic_bank" and task.get("task_guid"):
116
+ return True
117
+
118
+ source_url = task.get("source_url", "")
119
+ parsed = urlparse(source_url)
120
+ query = parse_qs(parsed.query)
121
+ project_guid = (query.get("proj") or [None])[0]
122
+ question_id = (query.get("qid") or [None])[0]
123
+
124
+ return parsed.path.endswith("/questions.php") and bool(project_guid and (question_id or task.get("task_guid")))
125
+
126
+
127
def _serialize_task(task: Dict) -> Dict:
    """Copy *task* and annotate it with the answer-check capability flag."""
    return {**task, "can_check_answer": _can_check_answer(task)}
131
+
132
+
133
async def _persist_tasks(tasks: List[Dict]) -> Dict[str, int]:
    """Insert *tasks* into Supabase, counting inserts vs. skipped duplicates.

    Returns a mapping with ``saved`` and ``duplicates`` counters.
    """
    client = _require_supabase()
    counters = {"saved": 0, "duplicates": 0}
    for task in tasks:
        # insert_task returns a falsy value when the row already exists.
        outcome = await client.insert_task(task)
        counters["saved" if outcome else "duplicates"] += 1
    return counters
144
+
145
+
146
async def _collect_tasks(subject: str = "russian", *, include_official_archives: bool = True) -> List[Dict]:
    """Scrape tasks for *subject*, optionally covering official archives."""
    service = _require_scraper()
    if not include_official_archives:
        # Fast path: only the dynamic bank is crawled.
        return await service.scrape_dynamic_bank(subject=subject)
    return await service.scrape_tasks(subject=subject, include_official_archives=True)
151
+
152
+
153
async def _refresh_tasks(subject: str = "russian", *, include_official_archives: bool = True) -> Dict[str, int]:
    """Run one scrape-and-persist cycle.

    Returns counters: ``scraped`` (total collected), ``saved`` and
    ``duplicates`` (persistence outcome).
    """
    collected = await _collect_tasks(
        subject=subject,
        include_official_archives=include_official_archives,
    )
    outcome = await _persist_tasks(collected)
    return {"scraped": len(collected), **outcome}
164
+
165
+
166
def _needs_task_refresh(tasks: List[Dict]) -> bool:
    """Decide whether the stored task set is empty, incomplete, or too small.

    A refresh is needed when there are no tasks at all, no dynamic-bank or
    answer-checkable entries, or fewer tasks than the configured minimum
    (``SCRAPER_MIN_READY_TASKS``, floor 10).
    """
    if not tasks:
        return True

    has_dynamic = any(task.get("source_kind") == "dynamic_bank" for task in tasks)
    has_checkable = any(_can_check_answer(task) for task in tasks)
    required = max(10, int(os.getenv("SCRAPER_MIN_READY_TASKS", "40")))

    if not has_dynamic or not has_checkable:
        return True

    return len(tasks) < required
178
+
179
+
180
def _needs_dynamic_bank_refresh(tasks: List[Dict]) -> bool:
    """True when *tasks* lacks any dynamic-bank or answer-checkable entry."""
    if not tasks:
        return True

    has_dynamic = any(task.get("source_kind") == "dynamic_bank" for task in tasks)
    has_checkable = any(_can_check_answer(task) for task in tasks)
    return not (has_dynamic and has_checkable)
187
+
188
+
189
def _is_refresh_running() -> bool:
    # Snapshot of the module-level flag toggled by _run_refresh().
    return refresh_in_progress
191
+
192
+
193
async def _run_refresh(subject: str = "russian", include_official_archives: bool = True) -> None:
    """Background worker: execute one refresh cycle and log the outcome.

    Sets the module-level ``refresh_in_progress`` flag for the duration so
    other callers do not schedule a second concurrent refresh.
    """
    global refresh_in_progress

    refresh_in_progress = True
    try:
        stats = await _refresh_tasks(
            subject=subject,
            include_official_archives=include_official_archives,
        )
        logger.info(
            "Background refresh finished: scraped=%s saved=%s duplicates=%s include_archives=%s",
            stats["scraped"],
            stats["saved"],
            stats["duplicates"],
            include_official_archives,
        )
    except Exception as exc:  # pragma: no cover - background guard
        logger.error("Background refresh failed: %s", exc)
    finally:
        refresh_in_progress = False
213
+
214
+
215
def _schedule_refresh(
    background_tasks: BackgroundTasks,
    subject: str = "russian",
    *,
    include_official_archives: bool = True,
) -> bool:
    """Queue a background refresh; return False when one is already active."""
    global last_auto_refresh_at

    if _is_refresh_running():
        return False

    # Stamp the scheduling time so the auto-refresh cooldown can be enforced.
    last_auto_refresh_at = datetime.utcnow()
    background_tasks.add_task(_run_refresh, subject, include_official_archives)
    return True
229
+
230
+
231
async def _ensure_tasks_available(background_tasks: BackgroundTasks, subject: str = "russian") -> List[Dict]:
    """Return stored tasks, triggering scrapes when the store looks unhealthy.

    Strategy, in order:
    1. Healthy store -> return as-is.
    2. Empty store -> run a blocking dynamic-bank scrape now, then schedule a
       full background refresh.
    3. Missing dynamic/checkable entries -> schedule a targeted dynamic
       refresh in the background.
    4. Otherwise (too few tasks) -> schedule a full refresh, subject to the
       cooldown configured via SCRAPER_AUTO_REFRESH_COOLDOWN_MINUTES.
    Always returns whatever task list is currently available.
    """
    global last_auto_refresh_at

    client = _require_supabase()
    existing = await client.get_all_tasks()
    if existing and not _needs_task_refresh(existing):
        return existing

    if not existing:
        # Blocking scrape: the caller would otherwise get an empty response.
        logger.info("Tasks table is empty, running initial dynamic scrape")
        last_auto_refresh_at = datetime.utcnow()
        await _refresh_tasks(subject=subject, include_official_archives=False)
        refreshed = await client.get_all_tasks()
        if refreshed and _schedule_refresh(background_tasks, subject, include_official_archives=True):
            logger.info("Scheduled full refresh after initial dynamic scrape")
        return refreshed or existing

    if _needs_dynamic_bank_refresh(existing):
        if _is_refresh_running():
            logger.info("Dynamic bank refresh is already running, returning existing tasks")
            return existing

        if _schedule_refresh(background_tasks, subject, include_official_archives=False):
            logger.info("Tasks are missing dynamic/checkable entries, scheduled targeted dynamic refresh")
        else:
            logger.info("Unable to schedule targeted dynamic refresh, returning existing tasks")
        return existing

    # Throttle full refreshes so repeated reads don't hammer the source site.
    cooldown_minutes = max(1, int(os.getenv("SCRAPER_AUTO_REFRESH_COOLDOWN_MINUTES", "30")))
    if last_auto_refresh_at and (datetime.utcnow() - last_auto_refresh_at).total_seconds() < cooldown_minutes * 60:
        logger.info("Skipping auto refresh because cooldown is active")
        return existing

    if _schedule_refresh(background_tasks, subject, include_official_archives=True):
        logger.info("Existing tasks are stale or incomplete, scheduled background refresh")
    else:
        logger.info("Refresh is already running, returning existing tasks")

    return existing
270
+
271
+
272
+ def _normalize_answer(answer: str) -> str:
273
+ return re.sub(r"\s+", "", answer.strip()).upper()
274
+
275
+
276
async def _resolve_task_guid(task: Dict) -> Optional[str]:
    """Return the FIPI GUID for *task*, scraping its page when not stored.

    Returns None when the task is not checkable, the page cannot be fetched,
    or the page carries no GUID field.
    """
    stored = task.get("task_guid")
    if stored:
        return stored

    if not _can_check_answer(task):
        return None

    page = await _require_scraper().fetch_page(task.get("source_url", ""))
    if not page:
        return None

    # The GUID travels in a hidden input of the answer-check form.
    field = BeautifulSoup(page, "lxml").select_one("form[id^='checkform'] input[name='guid']")
    if not field:
        return None
    return field.get("value") or None
291
+
292
+
293
async def _check_task_answer(task: Dict, answer: str) -> CheckAnswerResponse:
    """Submit *answer* to FIPI's solve endpoint and map its reply.

    Raises HTTPException 400 for unchackable tasks or empty answers, 500 when
    the project/task GUIDs cannot be resolved, and 502 on unexpected replies.
    """
    if not _can_check_answer(task):
        raise HTTPException(status_code=400, detail="This task does not support answer checking")

    normalized = _normalize_answer(answer)
    if not normalized:
        raise HTTPException(status_code=400, detail="Answer is empty")

    parsed = urlparse(task["source_url"])
    query = parse_qs(parsed.query)
    project_guid = (query.get("proj") or [None])[0]
    task_guid = await _resolve_task_guid(task)

    if not project_guid or not task_guid:
        raise HTTPException(status_code=500, detail="Unable to resolve FIPI task metadata")

    # solve.php lives next to questions.php in the same directory.
    solve_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path.rsplit('/', 1)[0]}/solve.php"
    # SECURITY NOTE(review): TLS verification is fully disabled here
    # (CERT_NONE, no hostname check) — presumably to cope with fipi.ru's
    # certificate chain; confirm this is intentional before wider deployment.
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

    async with httpx.AsyncClient(
        headers=_require_scraper().headers,
        timeout=45.0,
        verify=ssl_context,
        follow_redirects=True,
        trust_env=False,
    ) as client:
        # Prime the session by loading the task page before posting the answer.
        page_response = await client.get(task["source_url"])
        page_response.raise_for_status()
        response = await client.post(
            solve_url,
            data={
                "guid": task_guid,
                "answer": normalized,
                "ajax": "1",
                "proj": project_guid,
            },
            headers={"Referer": task["source_url"]},
        )
        response.raise_for_status()

    # NOTE(review): raise_for_status() above already rejected error statuses,
    # so this branch looks unreachable — confirm httpx.Response truthiness.
    if not response:
        raise HTTPException(status_code=502, detail="FIPI answer check failed")

    # FIPI replies with a single digit status code in the body.
    status_code = response.text.strip()
    status_map = {
        "0": ("not_solved", False, "Не решено"),
        "1": ("solved", True, "Решено"),
        "2": ("incorrect", False, "Неверно"),
        "3": ("correct", True, "Верно"),
    }
    if status_code not in status_map:
        raise HTTPException(status_code=502, detail=f"Unexpected FIPI response: {status_code}")

    status_label, is_correct, message = status_map[status_code]
    return CheckAnswerResponse(
        success=True,
        is_correct=is_correct,
        status_code=status_label,
        status_label=message,
        submitted_answer=answer,
        normalized_answer=normalized,
        message=message,
    )
358
+
359
+
360
@app.get("/api/health", response_model=HealthResponse)
async def health_check() -> HealthResponse:
    """Report liveness of the API, the scraper, and Supabase.

    Status is "healthy" when everything is up, "degraded" when only the
    optional Supabase connection is down, "unhealthy" otherwise.
    """
    supabase_ok = False
    if supabase_client:
        try:
            supabase_ok = await supabase_client.is_available()
        except Exception:
            supabase_ok = False

    services = {
        "api": True,
        "scraper": scraper is not None,
        "supabase": supabase_ok,
    }

    critical_ok = services["api"] and services["scraper"]
    if not critical_ok:
        status = "unhealthy"
    elif all(services.values()):
        status = "healthy"
    else:
        status = "degraded"

    return HealthResponse(status=status, timestamp=datetime.utcnow(), services=services)
383
+
384
+
385
@app.get("/api/tasks", response_model=List[TaskResponse])
async def get_all_tasks(background_tasks: BackgroundTasks) -> List[TaskResponse]:
    """List every stored task, auto-refreshing the bank when it looks stale."""
    stored = await _ensure_tasks_available(background_tasks)
    return [TaskResponse(**_serialize_task(item)) for item in stored]
389
+
390
+
391
@app.get("/api/tasks/latest", response_model=List[TaskResponse])
async def get_latest_tasks(limit: int = 10) -> List[TaskResponse]:
    """Return the *limit* most recently scraped tasks."""
    recent = await _require_supabase().get_latest_tasks(limit=limit)
    return [TaskResponse(**_serialize_task(item)) for item in recent]
395
+
396
+
397
@app.get("/api/tasks/{task_id}", response_model=TaskResponse)
async def get_task(task_id: int) -> TaskResponse:
    """Fetch a single task by its numeric id; 404 when it does not exist."""
    record = await _require_supabase().get_task_by_id(task_id)
    if not record:
        raise HTTPException(status_code=404, detail="Task not found")
    return TaskResponse(**_serialize_task(record))
403
+
404
+
405
@app.post("/api/tasks/{task_id}/check-answer", response_model=CheckAnswerResponse)
async def check_task_answer(task_id: int, request: CheckAnswerRequest) -> CheckAnswerResponse:
    """Validate a submitted answer for the task directly against FIPI."""
    record = await _require_supabase().get_task_by_id(task_id)
    if not record:
        raise HTTPException(status_code=404, detail="Task not found")
    return await _check_task_answer(record, request.answer)
411
+
412
+
413
@app.get("/api/tasks/type/{task_type}", response_model=List[TaskResponse])
async def get_tasks_by_type(task_type: str) -> List[TaskResponse]:
    """List all stored tasks of the given type."""
    matching = await _require_supabase().get_tasks_by_type(task_type)
    return [TaskResponse(**_serialize_task(item)) for item in matching]
417
+
418
+
419
@app.get("/api/tasks/search", response_model=List[TaskResponse])
async def search_tasks(q: str) -> List[TaskResponse]:
    """Full-text search over stored tasks via Supabase."""
    # NOTE(review): this route is registered AFTER /api/tasks/{task_id}, so
    # FastAPI matches "search" against the int task_id parameter first and
    # answers 422 — move this handler above get_task to make it reachable.
    tasks = await _require_supabase().search_tasks(q)
    return [TaskResponse(**_serialize_task(task)) for task in tasks]
423
+
424
+
425
@app.post("/api/scrape", response_model=ScrapeResponse)
async def scrape_tasks(request: ScrapeRequest) -> ScrapeResponse:
    """Run a scrape on demand and persist the results.

    Mode is chosen from the request, in priority order:
    explicit ``urls`` -> scrape each page individually;
    ``query`` -> search-based scrape;
    otherwise a subject scrape (full archives when ``full_refresh`` is set,
    dynamic bank only when not).
    """
    client = _require_supabase()
    service = _require_scraper()

    try:
        tasks_scraped = 0
        tasks_saved = 0
        duplicates_skipped = 0

        if request.urls:
            for url in request.urls:
                html = await service.fetch_page(url)
                # Pages that fail to fetch or parse are skipped silently.
                task = service.parse_task_page(html, url) if html else None
                if not task:
                    continue
                tasks_scraped += 1
                result = await client.insert_task(task)
                if result:
                    tasks_saved += 1
                else:
                    duplicates_skipped += 1
        elif request.query:
            tasks = await service.search_tasks(request.query)
            tasks_scraped = len(tasks)
            persisted = await _persist_tasks(tasks)
            tasks_saved = persisted["saved"]
            duplicates_skipped = persisted["duplicates"]
        else:
            if request.full_refresh:
                tasks = await service.scrape_tasks(
                    subject=request.subject or "russian",
                    include_official_archives=True,
                )
            else:
                tasks = await service.scrape_dynamic_bank(subject=request.subject or "russian")
            tasks_scraped = len(tasks)
            persisted = await _persist_tasks(tasks)
            tasks_saved = persisted["saved"]
            duplicates_skipped = persisted["duplicates"]

        return ScrapeResponse(
            success=True,
            tasks_scraped=tasks_scraped,
            tasks_saved=tasks_saved,
            duplicates_skipped=duplicates_skipped,
            message=(
                f"Processed {tasks_scraped} tasks. "
                f"Saved: {tasks_saved}, duplicates: {duplicates_skipped}"
            ),
        )
    except HTTPException:
        # Let deliberate HTTP errors (e.g. 503 from _require_*) pass through.
        raise
    except Exception as exc:  # pragma: no cover - endpoint guard
        logger.error("Scrape error: %s", exc)
        raise HTTPException(status_code=500, detail=f"Scrape error: {exc}")
481
+
482
+
483
@app.get("/api/stats", response_model=StatsResponse)
async def get_stats() -> StatsResponse:
    """Summarize stored tasks: total, per-type counts, last scrape time."""
    client = _require_supabase()
    stats = await client.get_stats()
    newest = await client.get_latest_tasks(limit=1)
    return StatsResponse(
        total_tasks=stats.get("total", 0),
        by_type=stats.get("by_type", {}),
        last_scrape=newest[0].get("scraped_at") if newest else None,
    )
494
+
495
+
496
@app.get("/", tags=["root"])
async def root() -> Dict[str, str]:
    """Service banner: name, version, and docs location."""
    return dict(
        message="AI Scraper FIPI API proof-20260317",
        version="1.1.1-proof-20260317",
        docs="/docs",
    )
503
+
504
+
505
@app.exception_handler(Exception)
async def global_exception_handler(request, exc) -> JSONResponse:
    """Last-resort handler: log any uncaught error and answer JSON 500."""
    logger.error("Unhandled exception: %s", exc)
    body = ErrorResponse(
        error="Internal Server Error",
        detail=str(exc),
        timestamp=datetime.utcnow(),
    ).model_dump(mode="json")
    return JSONResponse(status_code=500, content=body)
514
+
515
+
516
# Local development entry point; in production the app is served by an
# external ASGI server instead.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)