phhttps committed on
Commit
88a6677
·
1 Parent(s): e31151b

fix: use direct Firecrawl API and add health diagnostics

Browse files
Files changed (2) hide show
  1. api.py +12 -0
  2. patchright_airbnb_scraper.py +28 -55
api.py CHANGED
@@ -21,6 +21,18 @@ app.add_middleware(
21
 
22
  agent = HollandVacationAgent()
23
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  @app.get("/")
25
  async def serve_frontend():
26
  return FileResponse("frontend_dashboard.html")
 
21
 
22
  agent = HollandVacationAgent()
23
 
24
+ @app.get("/health")
25
+ async def health_check():
26
+ return {
27
+ "status": "online",
28
+ "keys_found": {
29
+ "OPENWEATHER_API_KEY": bool(os.getenv("OPENWEATHER_API_KEY")),
30
+ "FIRECRAWL_API_KEY": bool(os.getenv("FIRECRAWL_API_KEY")),
31
+ "AGENT_BROWSER_SESSION": bool(os.getenv("AGENT_BROWSER_SESSION"))
32
+ },
33
+ "python_version": os.sys.version
34
+ }
35
+
36
  @app.get("/")
37
  async def serve_frontend():
38
  return FileResponse("frontend_dashboard.html")
patchright_airbnb_scraper.py CHANGED
@@ -1,6 +1,7 @@
1
  import asyncio
2
  import re
3
  import os
 
4
  from typing import List, Dict
5
  from bs4 import BeautifulSoup
6
  from urllib.parse import quote
@@ -12,12 +13,6 @@ try:
12
  except ImportError:
13
  PATCHRIGHT_AVAILABLE = False
14
 
15
- try:
16
- from firecrawl import FirecrawlApp
17
- FIRECRAWL_AVAILABLE = True
18
- except ImportError:
19
- FIRECRAWL_AVAILABLE = False
20
-
21
  class PatchrightAirbnbScraper:
22
  def __init__(self):
23
  self.playwright = None
@@ -37,21 +32,28 @@ class PatchrightAirbnbScraper:
37
  if self.playwright: await self.playwright.stop()
38
 
39
  async def _search_via_firecrawl(self, url: str, region: str, nights: int) -> List[Dict]:
40
- if not (FIRECRAWL_AVAILABLE and self.firecrawl_key):
41
- print(" [Firecrawl] Key oder Library fehlt.")
 
42
  return []
43
 
44
- print(f" [Firecrawl] API-Scraping für: {region}")
45
  try:
46
- app = FirecrawlApp(api_key=self.firecrawl_key)
47
- # Die korrekte Methode ist app.scrape_url
48
- response = app.scrape_url(url, params={'formats': ['html']})
49
- # Falls Firecrawl ein Dict zurückgibt
50
- html_content = response.get('html') if isinstance(response, dict) else getattr(response, 'html', None)
51
- if html_content:
52
- return self._parse_content(html_content, region, nights)
 
 
 
 
 
 
53
  except Exception as e:
54
- print(f" [Firecrawl] API Fehler: {e}")
55
  return []
56
 
57
  async def search_airbnb(self, region: str, checkin: str, checkout: str, adults: int = 4) -> List[Dict]:
@@ -60,34 +62,24 @@ class PatchrightAirbnbScraper:
60
  nights = max(1, (d2 - d1).days)
61
  url = f"https://www.airbnb.com/s/{quote(region)}/homes?checkin={checkin}&checkout={checkout}&adults={adults}"
62
 
63
- # 1. VERSUCH: Patchright
64
  deals = await self._run_patchright(url, region, nights)
65
-
66
- # 2. VERSUCH: Firecrawl Fallback
67
  if not deals:
68
- print(f" [Hybrid] Patchright leer. Nutze Firecrawl für {region}...")
69
  deals = await self._search_via_firecrawl(url, region, nights)
70
-
71
  return deals
72
 
73
  async def _run_patchright(self, url: str, region: str, nights: int) -> List[Dict]:
74
  if not PATCHRIGHT_AVAILABLE: return []
75
  try:
76
  if not self.browser: await self.launch()
77
- context = await self.browser.new_context(
78
- viewport={'width': 1280, 'height': 800},
79
- user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
80
- )
81
  page = await context.new_page()
82
  await page.goto(url, wait_until='domcontentloaded', timeout=40000)
83
- try:
84
- await page.wait_for_selector('[data-testid="card-container"]', timeout=15000)
85
  except: pass
86
-
87
  for _ in range(2):
88
  await page.mouse.wheel(0, 800)
89
  await asyncio.sleep(1)
90
-
91
  content = await page.content()
92
  deals = self._parse_content(content, region, nights)
93
  await context.close()
@@ -102,28 +94,13 @@ class PatchrightAirbnbScraper:
102
  cards = soup.find_all('div', {'data-testid': 'card-container'})
103
  for card in cards:
104
  try:
105
- name_elem = card.find('div', {'data-testid': 'listing-card-title'})
106
- name = name_elem.get_text(strip=True) if name_elem else "Airbnb"
107
-
108
  card_text = card.get_text(separator=' ')
109
- price_matches = re.findall(r'(?:€\s*([\d\.,]+)|([\d\.,]+)\s*€)', card_text)
110
- extracted_vals = []
111
- for m in price_matches:
112
- val_str = m[0] or m[1]
113
- val = int(val_str.replace('.', '').replace(',', ''))
114
- extracted_vals.append(val)
115
-
116
- price_per_night = 0
117
- if extracted_vals:
118
- min_val = min(extracted_vals)
119
- max_val = max(extracted_vals)
120
- if len(extracted_vals) > 1 and max_val > min_val * 2:
121
- price_per_night = min_val
122
- else:
123
- price_per_night = round(max_val / nights) if max_val > 250 else max_val
124
 
125
- if price_per_night <= 0: continue
126
-
127
  image_url = ""
128
  for img in card.find_all('img'):
129
  src = img.get('src', '') or img.get('data-src', '')
@@ -132,11 +109,7 @@ class PatchrightAirbnbScraper:
132
  break
133
 
134
  link = card.find('a', href=True)
135
- url = ""
136
- if link and '/rooms/' in link['href']:
137
- room_id = re.search(r'/rooms/(\d+)', link['href'])
138
- if room_id: url = f"https://www.airbnb.com/rooms/{room_id.group(1)}"
139
-
140
  if not url: continue
141
 
142
  deals.append({
 
1
  import asyncio
2
  import re
3
  import os
4
+ import httpx
5
  from typing import List, Dict
6
  from bs4 import BeautifulSoup
7
  from urllib.parse import quote
 
13
  except ImportError:
14
  PATCHRIGHT_AVAILABLE = False
15
 
 
 
 
 
 
 
16
  class PatchrightAirbnbScraper:
17
  def __init__(self):
18
  self.playwright = None
 
32
  if self.playwright: await self.playwright.stop()
33
 
34
  async def _search_via_firecrawl(self, url: str, region: str, nights: int) -> List[Dict]:
35
+ """Direct REST API call to Firecrawl (bypass library issues)"""
36
+ if not self.firecrawl_key:
37
+ print(" [Firecrawl] Kein API Key vorhanden.")
38
  return []
39
 
40
+ print(f" [Firecrawl] Suche für {region}...")
41
  try:
42
+ async with httpx.AsyncClient(timeout=60.0) as client:
43
+ response = await client.post(
44
+ "https://api.firecrawl.dev/v1/scrape",
45
+ headers={"Authorization": f"Bearer {self.firecrawl_key}", "Content-Type": "application/json"},
46
+ json={"url": url, "formats": ["html"], "waitFor": 5000}
47
+ )
48
+ if response.status_code == 200:
49
+ data = response.json()
50
+ html_content = data.get('data', {}).get('html')
51
+ if html_content:
52
+ return self._parse_content(html_content, region, nights)
53
+ else:
54
+ print(f" [Firecrawl] API Fehler: {response.status_code} - {response.text}")
55
  except Exception as e:
56
+ print(f" [Firecrawl] Fehler: {e}")
57
  return []
58
 
59
  async def search_airbnb(self, region: str, checkin: str, checkout: str, adults: int = 4) -> List[Dict]:
 
62
  nights = max(1, (d2 - d1).days)
63
  url = f"https://www.airbnb.com/s/{quote(region)}/homes?checkin={checkin}&checkout={checkout}&adults={adults}"
64
 
 
65
  deals = await self._run_patchright(url, region, nights)
 
 
66
  if not deals:
67
+ print(f" [Hybrid] Airbnb Patchright leer. Nutze Firecrawl...")
68
  deals = await self._search_via_firecrawl(url, region, nights)
 
69
  return deals
70
 
71
  async def _run_patchright(self, url: str, region: str, nights: int) -> List[Dict]:
72
  if not PATCHRIGHT_AVAILABLE: return []
73
  try:
74
  if not self.browser: await self.launch()
75
+ context = await self.browser.new_context(viewport={'width': 1280, 'height': 800}, user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
 
 
 
76
  page = await context.new_page()
77
  await page.goto(url, wait_until='domcontentloaded', timeout=40000)
78
+ try: await page.wait_for_selector('[data-testid="card-container"]', timeout=15000)
 
79
  except: pass
 
80
  for _ in range(2):
81
  await page.mouse.wheel(0, 800)
82
  await asyncio.sleep(1)
 
83
  content = await page.content()
84
  deals = self._parse_content(content, region, nights)
85
  await context.close()
 
94
  cards = soup.find_all('div', {'data-testid': 'card-container'})
95
  for card in cards:
96
  try:
97
+ name = (card.find('div', {'data-testid': 'listing-card-title'}) or soup.new_tag('div')).get_text(strip=True)
 
 
98
  card_text = card.get_text(separator=' ')
99
+ prices = [int(p.replace('.', '').replace(',', '')) for p in re.findall(r'€\s*([\d\.,]+)', card_text)]
100
+ if not prices: continue
101
+ max_val = max(prices)
102
+ price_per_night = round(max_val / nights) if max_val > 250 else max_val
 
 
 
 
 
 
 
 
 
 
 
103
 
 
 
104
  image_url = ""
105
  for img in card.find_all('img'):
106
  src = img.get('src', '') or img.get('data-src', '')
 
109
  break
110
 
111
  link = card.find('a', href=True)
112
+ url = f"https://www.airbnb.com{link['href']}".split('?')[0] if link and '/rooms/' in link['href'] else ""
 
 
 
 
113
  if not url: continue
114
 
115
  deals.append({