DarmacSEO committed on
Commit
40488af
·
verified ·
1 Parent(s): 11b634d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -29
app.py CHANGED
@@ -7,7 +7,6 @@ app = FastAPI(title="Crawl4AI API")
7
  URL_RE = re.compile(r"https?://[^\s\"'>)]+", re.IGNORECASE)
8
 
9
  def find_url_anywhere(obj):
10
- """Znajdź pierwszy URL w obiekcie JSON, form-data albo tekście."""
11
  if isinstance(obj, str):
12
  m = URL_RE.search(obj)
13
  return m.group(0) if m else None
@@ -18,23 +17,19 @@ def find_url_anywhere(obj):
18
  return v
19
  for v in obj.values():
20
  u = find_url_anywhere(v)
21
- if u:
22
- return u
23
  if isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)):
24
  for it in obj:
25
  u = find_url_anywhere(it)
26
- if u:
27
- return u
28
  return None
29
 
30
  @app.get("/healthz")
31
  def health():
32
- """Prosty test czy apka działa."""
33
  return {"status": "ok"}
34
 
35
  @app.get("/crawl")
36
  async def crawl_get(url: str | None = None):
37
- """GET /crawl?url=https://..."""
38
  if not url:
39
  raise HTTPException(status_code=400, detail="Provide ?url=https://...")
40
  return await do_crawl(url)
@@ -44,56 +39,48 @@ async def crawl_post(
44
  request: Request,
45
  payload: dict | None = Body(None, example={"url": "https://example.com"})
46
  ):
47
- """POST /crawl z dowolnym formatem wejściowym (JSON, form-data, raw)."""
48
  url = request.query_params.get("url")
49
-
50
- # JSON body
51
  if not url and isinstance(payload, dict):
52
  url = find_url_anywhere(payload)
53
-
54
- # form-data / x-www-form-urlencoded
55
  if not url:
56
  try:
57
  form = await request.form()
58
  url = find_url_anywhere(dict(form))
59
  except Exception:
60
  pass
61
-
62
- # raw body (np. text/plain)
63
  if not url:
64
  raw = await request.body()
65
  url = find_url_anywhere(raw.decode("utf-8", errors="ignore"))
66
-
67
  if not url:
68
- raise HTTPException(
69
- status_code=400,
70
- detail="No URL found. Send JSON {'url':'https://...'} or use GET /crawl?url=..."
71
- )
72
 
73
  return await do_crawl(url)
74
 
75
  async def do_crawl(url: str):
76
- """Właściwe pobranie treści przez Crawl4AI."""
77
  try:
78
- cfg = CrawlerRunConfig() # bez 'js' zgodne z 0.7.x
79
  async with AsyncWebCrawler() as crawler:
80
  result = await crawler.arun(url=url, config=cfg)
81
 
82
  text_val = getattr(result, "cleaned_text", None)
83
- md_val = getattr(result, "markdown", None)
 
84
 
85
- # 🔥 ZWRACAMY WSZYSTKO + KLUCZ 'results' dla Dify
86
  return {
87
  "url": url,
88
  "status": "ok",
89
  "text": text_val,
90
  "markdown": md_val,
91
- "results": { # 👈 kompatybilność z Dify Marketplace
92
- "url": url,
93
- "text": text_val,
94
- "markdown": md_val
95
- }
 
 
 
96
  }
97
 
98
  except Exception as e:
99
- raise HTTPException(status_code=500, detail=f"Crawl error: {e}")
 
7
  URL_RE = re.compile(r"https?://[^\s\"'>)]+", re.IGNORECASE)
8
 
9
  def find_url_anywhere(obj):
 
10
  if isinstance(obj, str):
11
  m = URL_RE.search(obj)
12
  return m.group(0) if m else None
 
17
  return v
18
  for v in obj.values():
19
  u = find_url_anywhere(v)
20
+ if u: return u
 
21
  if isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)):
22
  for it in obj:
23
  u = find_url_anywhere(it)
24
+ if u: return u
 
25
  return None
26
 
27
@app.get("/healthz")
def health():
    """Liveness probe: report that the service is up and responding."""
    status_payload = dict(status="ok")
    return status_payload
30
 
31
  @app.get("/crawl")
32
  async def crawl_get(url: str | None = None):
 
33
  if not url:
34
  raise HTTPException(status_code=400, detail="Provide ?url=https://...")
35
  return await do_crawl(url)
 
39
  request: Request,
40
  payload: dict | None = Body(None, example={"url": "https://example.com"})
41
  ):
 
42
  url = request.query_params.get("url")
 
 
43
  if not url and isinstance(payload, dict):
44
  url = find_url_anywhere(payload)
 
 
45
  if not url:
46
  try:
47
  form = await request.form()
48
  url = find_url_anywhere(dict(form))
49
  except Exception:
50
  pass
 
 
51
  if not url:
52
  raw = await request.body()
53
  url = find_url_anywhere(raw.decode("utf-8", errors="ignore"))
 
54
  if not url:
55
+ raise HTTPException(status_code=400, detail="No URL found. Send {'url':'https://...'}")
 
 
 
56
 
57
  return await do_crawl(url)
58
 
59
async def do_crawl(url: str):
    """Fetch *url* with Crawl4AI and return a JSON-serialisable payload.

    The response carries both the flat fields this API has always exposed
    (``text``, ``markdown``) and a ``results`` list shaped for the Dify
    Marketplace plugin, which reads ``results[0].content``.

    Raises:
        HTTPException: 500 wrapping any crawler failure.
    """
    try:
        # Bare config — no extra options, so it stays compatible with the
        # installed crawl4ai version.
        cfg = CrawlerRunConfig()
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url, config=cfg)

        text_val = getattr(result, "cleaned_text", None)
        md_val = getattr(result, "markdown", None)
        # Prefer markdown, fall back to plain text, never None.
        content = md_val or text_val or ""

        return {
            "url": url,
            "status": "ok",
            "text": text_val,
            "markdown": md_val,
            # The Dify Marketplace plugin reads results[0].content.
            "results": [
                {
                    "url": url,
                    "content": content,
                },
            ],
            "success": True,
        }

    except Exception as e:
        # Chain the original error so the 500 keeps its underlying cause
        # in the traceback (raise ... from e).
        raise HTTPException(status_code=500, detail=f"Crawl error: {e}") from e