llm-ready-data / app /api /v1 /scraper.py
light-infer-chat's picture
ok
f02b0c0
Raw
History Blame Contribute Delete
2.09 kB
from __future__ import annotations
import time
from typing import Optional
from fastapi import APIRouter, Depends
from app.api.deps import require_auth
from app.core.logger import get_logger
from app.models.schemas import (
ScrapeHealthResponse,
ScrapeRequest,
ScrapeResponse,
)
from app.services.scraper_service import ScraperService
router = APIRouter()
_logger = get_logger(__name__)
def get_scraper_service() -> ScraperService:
    """Dependency provider that builds a new ``ScraperService`` on each call."""
    service = ScraperService()
    return service
@router.get(
    "/scrape/health",
    response_model=ScrapeHealthResponse,
    summary="Check Scrapling framework health and available fetchers",
)
async def scrape_health(
    token: str = Depends(require_auth),
    svc: ScraperService = Depends(get_scraper_service),
) -> ScrapeHealthResponse:
    """Report the scraper backend's health.

    Awaits ``svc.health()`` and maps its result onto a
    ``ScrapeHealthResponse``. Missing keys fall back to ``"unknown"`` for the
    version and an empty list for the available fetchers.

    Args:
        token: Value produced by the ``require_auth`` dependency; not used
            in the body.
        svc: Scraper service supplied by ``get_scraper_service``.

    Returns:
        A response with ``success=True`` and ``framework="Scrapling"``.
    """
    _logger.info("Scrape health check")

    status = await svc.health()
    reported_version = status.get("version", "unknown")
    fetchers = status.get("available", [])

    return ScrapeHealthResponse(
        success=True,
        framework="Scrapling",
        version=reported_version,
        fetchers_available=fetchers,
    )
@router.post(
    "/scrape/extract",
    response_model=ScrapeResponse,
    summary="Extract structured data from any webpage using CSS/XPath selectors",
)
async def scrape_extract(
    body: ScrapeRequest,
    token: str = Depends(require_auth),
    svc: ScraperService = Depends(get_scraper_service),
) -> ScrapeResponse:
    """Run an extraction through the scraper service and time it.

    The request's rules are dumped to plain dicts and passed to
    ``svc.extract`` together with the URL, fetcher type value, proxy and
    ``network_idle`` flag. An ``"error"`` entry in the service result is
    removed from the payload and reported in the response's ``error`` field.

    Args:
        body: Extraction request (URL, fetcher type, rules, proxy,
            network-idle flag).
        token: Value produced by the ``require_auth`` dependency; not used
            in the body.
        svc: Scraper service supplied by ``get_scraper_service``.

    Returns:
        A ``ScrapeResponse`` whose ``success`` is true only when the service
        result carried no non-``None`` ``"error"`` entry, and whose
        ``time_ms`` is the elapsed wall time rounded to two decimals.
    """
    _logger.info(
        "Scrape extract: url=%s fetcher=%s rules=%d",
        body.url,
        body.fetcher_type,
        len(body.rules),
    )

    started = time.perf_counter()
    serialized_rules = [rule.model_dump() for rule in body.rules]
    fetcher_name = body.fetcher_type.value
    payload = await svc.extract(
        url=body.url,
        fetcher_type=fetcher_name,
        rules=serialized_rules,
        proxy=body.proxy,
        network_idle=body.network_idle,
    )
    elapsed_seconds = time.perf_counter() - started
    elapsed_ms = round(elapsed_seconds * 1000, 2)

    # Strip the error marker out of the payload so it is reported separately.
    failure = payload.pop("error", None)
    succeeded = failure is None

    return ScrapeResponse(
        success=succeeded,
        time_ms=elapsed_ms,
        url=body.url,
        data=payload,
        fetcher=fetcher_name,
        error=failure,
    )