from fastapi import FastAPI, Request, HTTPException, Body
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
import re
from collections.abc import Mapping, Sequence

app = FastAPI(title="Crawl4AI API")

# Matches an http(s) URL up to the first whitespace, quote, '>' or ')'.
URL_RE = re.compile(r"https?://[^\s\"'>)]+", re.IGNORECASE)


def find_url_anywhere(obj):
    """Recursively search *obj* (str / mapping / sequence) for the first URL.

    Strings are scanned with URL_RE; mappings are checked on well-known keys
    ("url", "link", "q", "input") before descending into all values;
    non-string sequences are searched element by element.

    Returns the first URL string found, or None.
    """
    if isinstance(obj, str):
        m = URL_RE.search(obj)
        return m.group(0) if m else None
    if isinstance(obj, Mapping):
        # Prefer conventional key names before a full recursive scan.
        for k in ("url", "link", "q", "input"):
            v = obj.get(k)
            if isinstance(v, str) and v.startswith("http"):
                return v
        for v in obj.values():
            u = find_url_anywhere(v)
            if u:
                return u
    if isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)):
        for it in obj:
            u = find_url_anywhere(it)
            if u:
                return u
    return None


@app.get("/healthz")
def health():
    """Liveness probe; always reports ok."""
    return {"status": "ok"}


@app.get("/crawl")
async def crawl_get(url: str | None = None):
    """Crawl the page given by the ``url`` query parameter.

    Raises 400 when the parameter is missing.
    """
    if not url:
        raise HTTPException(status_code=400, detail="Provide ?url=https://...")
    return await do_crawl(url)


@app.post("/crawl")
async def crawl_post(
    request: Request,
    payload: dict | None = Body(None, example={"url": "https://example.com"}),
):
    """Crawl a URL extracted from wherever the client put it.

    Lookup order: query string, JSON body (searched recursively), form
    data, then the raw request body scanned with URL_RE. Raises 400 when
    no URL can be found anywhere.
    """
    url = request.query_params.get("url")
    if not url and isinstance(payload, dict):
        url = find_url_anywhere(payload)
    if not url:
        # Best-effort: form parsing fails on non-form bodies; ignore and
        # fall through to the raw-body scan.
        try:
            form = await request.form()
            url = find_url_anywhere(dict(form))
        except Exception:
            pass
    if not url:
        raw = await request.body()
        url = find_url_anywhere(raw.decode("utf-8", errors="ignore"))
    if not url:
        raise HTTPException(
            status_code=400,
            detail="No URL found. Send {'url':'https://...'}",
        )
    return await do_crawl(url)


async def do_crawl(url: str):
    """Run crawl4ai on *url* and return a dual-shape response.

    The payload carries both top-level ``text``/``markdown`` fields (this
    API's own shape) and a ``results`` list (the shape the Dify
    marketplace plugin expects). Raises 500 on any crawl failure.
    """
    try:
        cfg = CrawlerRunConfig()  # compatible with the installed crawl4ai version
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url, config=cfg)

        # Attributes may be absent depending on crawl4ai version; fall back
        # gracefully: markdown preferred, then cleaned text, then "".
        text_val = getattr(result, "cleaned_text", None)
        md_val = getattr(result, "markdown", None)
        content = md_val or text_val or ""

        # Response compatible with both this API and the Dify plugin:
        return {
            "url": url,
            "status": "ok",
            "text": text_val,
            "markdown": md_val,
            # The marketplace plugin reads results[0].content
            "results": [
                {
                    "url": url,
                    "content": content,
                }
            ],
            "success": True,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Crawl error: {e}")