greeta commited on
Commit
8de6346
·
verified ·
1 Parent(s): b651663

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -518
app.py DELETED
@@ -1,518 +0,0 @@
1
- """
2
- FastAPI app for the FIPI scraper service.
3
- """
4
-
5
- from __future__ import annotations
6
-
7
- from datetime import datetime
8
- import logging
9
- import os
10
- from pathlib import Path
11
- import re
12
- import ssl
13
- from typing import Dict, List, Optional
14
- from urllib.parse import parse_qs, urlparse
15
-
16
- from bs4 import BeautifulSoup
17
- from dotenv import load_dotenv
18
- from fastapi import BackgroundTasks, FastAPI, HTTPException
19
- from fastapi.middleware.cors import CORSMiddleware
20
- from fastapi.responses import JSONResponse
21
- import httpx
22
-
23
- from models import (
24
- CheckAnswerRequest,
25
- CheckAnswerResponse,
26
- ErrorResponse,
27
- HealthResponse,
28
- ScrapeRequest,
29
- ScrapeResponse,
30
- StatsResponse,
31
- TaskResponse,
32
- )
33
- from scraper import FIPIScraper
34
-
35
# Resolve paths relative to this file so the app works regardless of CWD.
BASE_DIR = Path(__file__).resolve().parent
load_dotenv(BASE_DIR / ".env")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Optional Supabase support: the app degrades gracefully when the client
# module (or one of its dependencies) is missing or fails at import time.
SUPABASE_AVAILABLE = False
SupabaseClient = None

try:
    from supabase_client import SupabaseClient

    SUPABASE_AVAILABLE = True
except ImportError as exc:
    logger.warning("Supabase client import failed: %s", exc)
except Exception as exc:  # pragma: no cover - startup guard
    logger.warning("Supabase client init failed: %s", exc)
52
-
53
-
54
app = FastAPI(
    title="AI Scraper FIPI",
    description="Collects, stores and validates FIPI tasks.",
    version="1.1.1-proof-20260317",
)

# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# contradictory for browsers (the CORS spec forbids credentialed requests
# against a wildcard origin) -- confirm whether credentials are actually
# needed, or pin the allowed origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Process-wide mutable state; populated in startup_event() and by the
# background refresh helpers below.
scraper: Optional[FIPIScraper] = None
supabase_client: Optional[SupabaseClient] = None
last_auto_refresh_at: Optional[datetime] = None  # naive UTC timestamp
refresh_in_progress = False
72
-
73
-
74
- @app.on_event("startup")
75
- async def startup_event() -> None:
76
- global scraper, supabase_client
77
-
78
- scraper = FIPIScraper(base_url=os.getenv("FIPI_BASE_URL", "https://fipi.ru"))
79
- logger.info("FIPIScraper initialized")
80
-
81
- if not SUPABASE_AVAILABLE:
82
- logger.info("Supabase disabled")
83
- return
84
-
85
- supabase_url = os.getenv("SUPABASE_URL")
86
- supabase_key = os.getenv("SUPABASE_SERVICE_KEY")
87
- if not supabase_url or not supabase_key:
88
- logger.warning("SUPABASE_URL or SUPABASE_SERVICE_KEY missing")
89
- return
90
-
91
- try:
92
- client = SupabaseClient(url=supabase_url, key=supabase_key)
93
- if await client.is_available():
94
- supabase_client = client
95
- logger.info("Supabase client connected")
96
- else:
97
- logger.error("Supabase is unavailable")
98
- except Exception as exc: # pragma: no cover - startup guard
99
- logger.error("Supabase startup error: %s", exc)
100
-
101
-
102
def _require_supabase() -> SupabaseClient:
    """Return the shared Supabase client, or fail with 503 if unconfigured."""
    client = supabase_client
    if client:
        return client
    raise HTTPException(status_code=503, detail="Supabase is not configured")
106
-
107
-
108
def _require_scraper() -> FIPIScraper:
    """Return the shared scraper instance, or fail with 503 if unconfigured."""
    service = scraper
    if service:
        return service
    raise HTTPException(status_code=503, detail="Scraper is not configured")
112
-
113
-
114
- def _can_check_answer(task: Dict) -> bool:
115
- if task.get("source_kind") == "dynamic_bank" and task.get("task_guid"):
116
- return True
117
-
118
- source_url = task.get("source_url", "")
119
- parsed = urlparse(source_url)
120
- query = parse_qs(parsed.query)
121
- project_guid = (query.get("proj") or [None])[0]
122
- question_id = (query.get("qid") or [None])[0]
123
-
124
- return parsed.path.endswith("/questions.php") and bool(project_guid and (question_id or task.get("task_guid")))
125
-
126
-
127
def _serialize_task(task: Dict) -> Dict:
    """Copy *task* and attach the computed ``can_check_answer`` flag."""
    return {**task, "can_check_answer": _can_check_answer(task)}
131
-
132
-
133
async def _persist_tasks(tasks: List[Dict]) -> Dict[str, int]:
    """Insert *tasks* into Supabase, counting new rows versus duplicates.

    ``insert_task`` returning a falsy value is treated as "already stored".
    """
    client = _require_supabase()
    counters = {"saved": 0, "duplicates": 0}
    for item in tasks:
        bucket = "saved" if await client.insert_task(item) else "duplicates"
        counters[bucket] += 1
    return counters
144
-
145
-
146
async def _collect_tasks(subject: str = "russian", *, include_official_archives: bool = True) -> List[Dict]:
    """Scrape tasks for *subject*, optionally crawling the official archives."""
    service = _require_scraper()
    if not include_official_archives:
        # Cheaper targeted pass: only the dynamic question bank.
        return await service.scrape_dynamic_bank(subject=subject)
    return await service.scrape_tasks(subject=subject, include_official_archives=True)
151
-
152
-
153
async def _refresh_tasks(subject: str = "russian", *, include_official_archives: bool = True) -> Dict[str, int]:
    """Scrape and persist tasks; return scraped/saved/duplicates counters."""
    tasks = await _collect_tasks(
        subject=subject,
        include_official_archives=include_official_archives,
    )
    persisted = await _persist_tasks(tasks)
    # _persist_tasks already provides "saved" and "duplicates".
    return {"scraped": len(tasks), **persisted}
164
-
165
-
166
def _needs_task_refresh(tasks: List[Dict]) -> bool:
    """Decide whether the stored task set is too small or incomplete.

    A refresh is needed when the set is empty, contains no dynamic-bank
    entries, contains no checkable entries, or falls below the
    SCRAPER_MIN_READY_TASKS threshold (floored at 10, default 40).
    """
    if not tasks:
        return True

    has_dynamic = any(item.get("source_kind") == "dynamic_bank" for item in tasks)
    has_checkable = any(_can_check_answer(item) for item in tasks)
    minimum_total = max(10, int(os.getenv("SCRAPER_MIN_READY_TASKS", "40")))

    if not (has_dynamic and has_checkable):
        return True

    return len(tasks) < minimum_total
178
-
179
-
180
def _needs_dynamic_bank_refresh(tasks: List[Dict]) -> bool:
    """True when the set is empty or lacks dynamic-bank / checkable entries."""
    if not tasks:
        return True

    has_dynamic = any(item.get("source_kind") == "dynamic_bank" for item in tasks)
    has_checkable = any(_can_check_answer(item) for item in tasks)
    return not (has_dynamic and has_checkable)
187
-
188
-
189
def _is_refresh_running() -> bool:
    # Snapshot of the module-level flag toggled by _run_refresh().
    return refresh_in_progress
191
-
192
-
193
async def _run_refresh(subject: str = "russian", include_official_archives: bool = True) -> None:
    """Background-task entry point: run one refresh cycle and log the result.

    The module-level ``refresh_in_progress`` flag guards against concurrent
    refreshes and is always cleared on exit, even when the cycle fails.
    """
    global refresh_in_progress

    try:
        refresh_in_progress = True
        stats = await _refresh_tasks(
            subject=subject,
            include_official_archives=include_official_archives,
        )
        logger.info(
            "Background refresh finished: scraped=%s saved=%s duplicates=%s include_archives=%s",
            stats["scraped"],
            stats["saved"],
            stats["duplicates"],
            include_official_archives,
        )
    except Exception as exc:  # pragma: no cover - background guard
        logger.error("Background refresh failed: %s", exc)
    finally:
        refresh_in_progress = False
213
-
214
-
215
def _schedule_refresh(
    background_tasks: BackgroundTasks,
    subject: str = "russian",
    *,
    include_official_archives: bool = True,
) -> bool:
    """Queue a background refresh unless one is already running.

    Returns True when a refresh was scheduled; also stamps
    ``last_auto_refresh_at`` so the auto-refresh cooldown starts now.
    """
    global last_auto_refresh_at

    already_running = _is_refresh_running()
    if not already_running:
        last_auto_refresh_at = datetime.utcnow()
        background_tasks.add_task(_run_refresh, subject, include_official_archives)
    return not already_running
229
-
230
-
231
async def _ensure_tasks_available(background_tasks: BackgroundTasks, subject: str = "russian") -> List[Dict]:
    """Return the stored tasks, scheduling refreshes to heal an unhealthy set.

    Policy, in order:
      1. A healthy, sufficiently large set is returned as-is.
      2. An empty table triggers a synchronous dynamic-bank scrape (so the
         caller gets data now) plus a scheduled full refresh.
      3. A set missing dynamic/checkable entries gets a targeted background
         dynamic-bank refresh; existing tasks are returned immediately.
      4. Otherwise a full background refresh is scheduled, subject to the
         SCRAPER_AUTO_REFRESH_COOLDOWN_MINUTES cooldown (default 30 min).
    """
    global last_auto_refresh_at

    client = _require_supabase()
    existing = await client.get_all_tasks()
    if existing and not _needs_task_refresh(existing):
        return existing

    if not existing:
        logger.info("Tasks table is empty, running initial dynamic scrape")
        # Stamp the cooldown before the synchronous scrape so a follow-up
        # request cannot immediately trigger another one.
        last_auto_refresh_at = datetime.utcnow()
        await _refresh_tasks(subject=subject, include_official_archives=False)
        refreshed = await client.get_all_tasks()
        if refreshed and _schedule_refresh(background_tasks, subject, include_official_archives=True):
            logger.info("Scheduled full refresh after initial dynamic scrape")
        return refreshed or existing

    if _needs_dynamic_bank_refresh(existing):
        if _is_refresh_running():
            logger.info("Dynamic bank refresh is already running, returning existing tasks")
            return existing

        if _schedule_refresh(background_tasks, subject, include_official_archives=False):
            logger.info("Tasks are missing dynamic/checkable entries, scheduled targeted dynamic refresh")
        else:
            logger.info("Unable to schedule targeted dynamic refresh, returning existing tasks")
        return existing

    # Full refresh path: rate-limited by a naive-UTC timestamp comparison.
    cooldown_minutes = max(1, int(os.getenv("SCRAPER_AUTO_REFRESH_COOLDOWN_MINUTES", "30")))
    if last_auto_refresh_at and (datetime.utcnow() - last_auto_refresh_at).total_seconds() < cooldown_minutes * 60:
        logger.info("Skipping auto refresh because cooldown is active")
        return existing

    if _schedule_refresh(background_tasks, subject, include_official_archives=True):
        logger.info("Existing tasks are stale or incomplete, scheduled background refresh")
    else:
        logger.info("Refresh is already running, returning existing tasks")

    return existing
270
-
271
-
272
- def _normalize_answer(answer: str) -> str:
273
- return re.sub(r"\s+", "", answer.strip()).upper()
274
-
275
-
276
async def _resolve_task_guid(task: Dict) -> Optional[str]:
    """Return the FIPI GUID for *task*, scraping its page when not stored.

    Falls back to fetching the task page and reading the hidden ``guid``
    input of the check form; returns None when the GUID cannot be resolved.
    """
    stored = task.get("task_guid")
    if stored:
        return stored

    if not _can_check_answer(task):
        return None

    page_html = await _require_scraper().fetch_page(task.get("source_url", ""))
    if not page_html:
        return None

    document = BeautifulSoup(page_html, "lxml")
    field = document.select_one("form[id^='checkform'] input[name='guid']")
    if field is None:
        return None
    # An empty value attribute is as useless as a missing one.
    return field.get("value") or None
291
-
292
-
293
async def _check_task_answer(task: Dict, answer: str) -> CheckAnswerResponse:
    """Submit *answer* to FIPI's solve.php endpoint and map the verdict.

    Flow: validate the task is checkable, normalize the answer, resolve the
    project and task GUIDs, GET the task page (to establish the session),
    then POST the answer and translate FIPI's one-character status reply.

    Raises:
        HTTPException 400 for an uncheckable task or empty answer,
        500 when GUID metadata cannot be resolved,
        502 on an unexpected FIPI reply.
    """
    if not _can_check_answer(task):
        raise HTTPException(status_code=400, detail="This task does not support answer checking")

    normalized = _normalize_answer(answer)
    if not normalized:
        raise HTTPException(status_code=400, detail="Answer is empty")

    parsed = urlparse(task["source_url"])
    query = parse_qs(parsed.query)
    project_guid = (query.get("proj") or [None])[0]
    task_guid = await _resolve_task_guid(task)

    if not project_guid or not task_guid:
        raise HTTPException(status_code=500, detail="Unable to resolve FIPI task metadata")

    # solve.php lives next to questions.php in the same directory.
    solve_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path.rsplit('/', 1)[0]}/solve.php"
    # SECURITY NOTE(review): TLS certificate verification is disabled here,
    # presumably because fipi.ru serves a non-standard chain -- confirm, and
    # prefer pinning the CA over CERT_NONE if possible.
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

    async with httpx.AsyncClient(
        headers=_require_scraper().headers,
        timeout=45.0,
        verify=ssl_context,
        follow_redirects=True,
    ) as client:
        # Prime cookies/session state before posting the answer.
        page_response = await client.get(task["source_url"])
        page_response.raise_for_status()
        response = await client.post(
            solve_url,
            data={
                "guid": task_guid,
                "answer": normalized,
                "ajax": "1",
                "proj": project_guid,
            },
            headers={"Referer": task["source_url"]},
        )
        response.raise_for_status()

    # NOTE(review): after raise_for_status() this check looks unreachable
    # (httpx Response truthiness tracks response success) -- confirm intent.
    if not response:
        raise HTTPException(status_code=502, detail="FIPI answer check failed")

    # FIPI replies with a bare digit encoding the verdict.
    status_code = response.text.strip()
    status_map = {
        "0": ("not_solved", False, "Не решено"),
        "1": ("solved", True, "Решено"),
        "2": ("incorrect", False, "Неверно"),
        "3": ("correct", True, "Верно"),
    }
    if status_code not in status_map:
        raise HTTPException(status_code=502, detail=f"Unexpected FIPI response: {status_code}")

    status_label, is_correct, message = status_map[status_code]
    return CheckAnswerResponse(
        success=True,
        is_correct=is_correct,
        status_code=status_label,
        status_label=message,
        submitted_answer=answer,
        normalized_answer=normalized,
        message=message,
    )
357
-
358
-
359
- @app.get("/api/health", response_model=HealthResponse)
360
- async def health_check() -> HealthResponse:
361
- services = {
362
- "api": True,
363
- "scraper": scraper is not None,
364
- "supabase": False,
365
- }
366
-
367
- if supabase_client:
368
- try:
369
- services["supabase"] = await supabase_client.is_available()
370
- except Exception:
371
- services["supabase"] = False
372
-
373
- all_critical_ok = services["api"] and services["scraper"]
374
- if all_critical_ok and all(services.values()):
375
- status = "healthy"
376
- elif all_critical_ok:
377
- status = "degraded"
378
- else:
379
- status = "unhealthy"
380
-
381
- return HealthResponse(status=status, timestamp=datetime.utcnow(), services=services)
382
-
383
-
384
- @app.get("/api/tasks", response_model=List[TaskResponse])
385
- async def get_all_tasks(background_tasks: BackgroundTasks) -> List[TaskResponse]:
386
- tasks = await _ensure_tasks_available(background_tasks)
387
- return [TaskResponse(**_serialize_task(task)) for task in tasks]
388
-
389
-
390
- @app.get("/api/tasks/latest", response_model=List[TaskResponse])
391
- async def get_latest_tasks(limit: int = 10) -> List[TaskResponse]:
392
- tasks = await _require_supabase().get_latest_tasks(limit=limit)
393
- return [TaskResponse(**_serialize_task(task)) for task in tasks]
394
-
395
-
396
- @app.get("/api/tasks/{task_id}", response_model=TaskResponse)
397
- async def get_task(task_id: int) -> TaskResponse:
398
- task = await _require_supabase().get_task_by_id(task_id)
399
- if not task:
400
- raise HTTPException(status_code=404, detail="Task not found")
401
- return TaskResponse(**_serialize_task(task))
402
-
403
-
404
- @app.post("/api/tasks/{task_id}/check-answer", response_model=CheckAnswerResponse)
405
- async def check_task_answer(task_id: int, request: CheckAnswerRequest) -> CheckAnswerResponse:
406
- task = await _require_supabase().get_task_by_id(task_id)
407
- if not task:
408
- raise HTTPException(status_code=404, detail="Task not found")
409
- return await _check_task_answer(task, request.answer)
410
-
411
-
412
- @app.get("/api/tasks/type/{task_type}", response_model=List[TaskResponse])
413
- async def get_tasks_by_type(task_type: str) -> List[TaskResponse]:
414
- tasks = await _require_supabase().get_tasks_by_type(task_type)
415
- return [TaskResponse(**_serialize_task(task)) for task in tasks]
416
-
417
-
418
- @app.get("/api/tasks/search", response_model=List[TaskResponse])
419
- async def search_tasks(q: str) -> List[TaskResponse]:
420
- tasks = await _require_supabase().search_tasks(q)
421
- return [TaskResponse(**_serialize_task(task)) for task in tasks]
422
-
423
-
424
- @app.post("/api/scrape", response_model=ScrapeResponse)
425
- async def scrape_tasks(request: ScrapeRequest) -> ScrapeResponse:
426
- client = _require_supabase()
427
- service = _require_scraper()
428
-
429
- try:
430
- tasks_scraped = 0
431
- tasks_saved = 0
432
- duplicates_skipped = 0
433
-
434
- if request.urls:
435
- for url in request.urls:
436
- html = await service.fetch_page(url)
437
- task = service.parse_task_page(html, url) if html else None
438
- if not task:
439
- continue
440
- tasks_scraped += 1
441
- result = await client.insert_task(task)
442
- if result:
443
- tasks_saved += 1
444
- else:
445
- duplicates_skipped += 1
446
- elif request.query:
447
- tasks = await service.search_tasks(request.query)
448
- tasks_scraped = len(tasks)
449
- persisted = await _persist_tasks(tasks)
450
- tasks_saved = persisted["saved"]
451
- duplicates_skipped = persisted["duplicates"]
452
- else:
453
- if request.full_refresh:
454
- tasks = await service.scrape_tasks(
455
- subject=request.subject or "russian",
456
- include_official_archives=True,
457
- )
458
- else:
459
- tasks = await service.scrape_dynamic_bank(subject=request.subject or "russian")
460
- tasks_scraped = len(tasks)
461
- persisted = await _persist_tasks(tasks)
462
- tasks_saved = persisted["saved"]
463
- duplicates_skipped = persisted["duplicates"]
464
-
465
- return ScrapeResponse(
466
- success=True,
467
- tasks_scraped=tasks_scraped,
468
- tasks_saved=tasks_saved,
469
- duplicates_skipped=duplicates_skipped,
470
- message=(
471
- f"Processed {tasks_scraped} tasks. "
472
- f"Saved: {tasks_saved}, duplicates: {duplicates_skipped}"
473
- ),
474
- )
475
- except HTTPException:
476
- raise
477
- except Exception as exc: # pragma: no cover - endpoint guard
478
- logger.error("Scrape error: %s", exc)
479
- raise HTTPException(status_code=500, detail=f"Scrape error: {exc}")
480
-
481
-
482
- @app.get("/api/stats", response_model=StatsResponse)
483
- async def get_stats() -> StatsResponse:
484
- client = _require_supabase()
485
- stats = await client.get_stats()
486
- latest = await client.get_latest_tasks(limit=1)
487
- last_scrape = latest[0].get("scraped_at") if latest else None
488
- return StatsResponse(
489
- total_tasks=stats.get("total", 0),
490
- by_type=stats.get("by_type", {}),
491
- last_scrape=last_scrape,
492
- )
493
-
494
-
495
- @app.get("/", tags=["root"])
496
- async def root() -> Dict[str, str]:
497
- return {
498
- "message": "AI Scraper FIPI API proof-20260317",
499
- "version": "1.1.1-proof-20260317",
500
- "docs": "/docs",
501
- }
502
-
503
-
504
@app.exception_handler(Exception)
async def global_exception_handler(request, exc) -> JSONResponse:
    """Last-resort handler: log the error and return a uniform 500 body."""
    logger.error("Unhandled exception: %s", exc)
    body = ErrorResponse(
        error="Internal Server Error",
        detail=str(exc),
        timestamp=datetime.utcnow(),
    ).model_dump(mode="json")
    return JSONResponse(status_code=500, content=body)
513
-
514
-
515
- if __name__ == "__main__":
516
- import uvicorn
517
-
518
- uvicorn.run(app, host="0.0.0.0", port=8000)