import logging
from typing import Dict

from fastapi import FastAPI, HTTPException
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from pydantic import BaseModel
|
|
|
|
async def scraper(link: str) -> Dict:
    """Load *link* in headless Chromium and extract its text and asset URLs.

    Args:
        link: Address handed unchanged to ``page.goto``.

    Returns:
        On success, a dict with three keys: ``page_text`` (inner text of the
        ``body`` element), ``script_sources`` (the ``src`` property of every
        ``script[src]`` element) and ``link_sources`` (the ``href`` property
        of every ``link[href]`` element).
        If navigation does not complete within 15 seconds, a dict with the
        single key ``error`` is returned instead of raising.

    Raises:
        Exception: Any Playwright error other than the navigation timeout
            propagates to the caller; the browser is still closed first.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # One finally covers every exit path (early return, success, or an
        # unexpected error during extraction) so the browser is never left
        # open by this function.
        try:
            context = await browser.new_context()
            page = await context.new_page()

            try:
                await page.goto(link, timeout=15000)
            except PlaywrightTimeoutError:
                return {"error": "Timeout while loading the page."}

            page_text = await page.locator("body").inner_text()

            script_sources = await page.eval_on_selector_all(
                "script[src]", "elements => elements.map(e => e.src)"
            )

            link_sources = await page.eval_on_selector_all(
                "link[href]", "elements => elements.map(e => e.href)"
            )
        finally:
            await browser.close()

        return {
            "page_text": page_text,
            "script_sources": script_sources,
            "link_sources": link_sources,
        }
|
|
|
|
# Application instance; the route handlers in this module register on it.
app = FastAPI()
|
|
class ScrapeRequest(BaseModel):
    """Request body accepted by the ``/scrape`` endpoint."""

    # Address of the page to scrape. Declared as a plain ``str``, so no
    # URL-format validation is applied by this model.
    url: str
|
|
@app.post("/scrape")
async def scrape_endpoint(request: ScrapeRequest):
    """Scrape the URL in the request body and return the scraper's result.

    Args:
        request: Parsed request body carrying the target ``url``.

    Returns:
        Whatever ``scraper`` returns for ``request.url``, passed through
        unchanged.

    Raises:
        HTTPException: Status 500, with the original exception's message as
            ``detail``, when ``scraper`` raises.
    """
    # NOTE(security): request.url is forwarded to the browser as-is. If this
    # endpoint is reachable by untrusted clients, consider restricting the
    # allowed schemes/hosts (SSRF) -- TODO confirm the deployment's exposure.
    try:
        data = await scraper(request.url)
    except Exception as e:
        # Top-level boundary: record the traceback server-side, then surface
        # a 500 chained to the original error.
        logging.getLogger(__name__).exception(
            "Scrape failed for %r", request.url
        )
        raise HTTPException(status_code=500, detail=str(e)) from e
    return data
|
|
|
|