phhttps committed on
Commit
0e200a9
·
1 Parent(s): 55b53a1

fix: airbnb scraper now uses robust block-parsing and price_max filter

Browse files
Files changed (1) hide show
  1. patchright_airbnb_scraper.py +115 -62
patchright_airbnb_scraper.py CHANGED
@@ -15,7 +15,9 @@ class PatchrightAirbnbScraper:
15
  d1 = datetime.strptime(checkin, "%Y-%m-%d")
16
  d2 = datetime.strptime(checkout, "%Y-%m-%d")
17
  nights = max(1, (d2 - d1).days)
18
- url = f"https://www.airbnb.com/s/{quote(region)}/homes?checkin={checkin}&checkout={checkout}&adults={adults}"
 
 
19
 
20
  try:
21
  async with httpx.AsyncClient(timeout=90.0) as client:
@@ -30,81 +32,132 @@ class PatchrightAirbnbScraper:
30
  except Exception: pass
31
  return []
32
 
33
- def _parse_markdown(self, text: str, region: str, nights: int) -> List[Dict]:
34
  deals = []
35
- # Airbnb links are often structured as [Name](url) or just raw URL
36
- room_links = re.findall(r'https://www\.airbnb\.com/rooms/(\d+)', text)
37
- seen_ids = set()
 
38
 
39
- for room_id in room_links:
40
- if room_id in seen_ids: continue
41
- seen_ids.add(room_id)
 
 
 
 
 
 
 
 
 
 
42
 
43
- pos = text.find(room_id)
44
- context = text[max(0, pos-600):pos+600]
 
 
45
 
46
- # 1. Rating & Reviews
47
- rating = 4.8
48
- reviews = 20
49
- rate_match = re.search(r'([\d\.,]+)\s*star|Rating\s*([\d\.,]+)', context, re.I)
50
- if rate_match:
51
- r_val = rate_match.group(1) or rate_match.group(2)
52
- try: rating = float(r_val.replace(',', '.'))
53
- except: pass
54
 
55
- rev_match = re.search(r'(\d+)\s*reviews|(\d+)\s*Bewertungen', context, re.I)
56
- if rev_match:
57
- try: reviews = int(rev_match.group(1) or rev_match.group(2))
58
- except: pass
 
 
 
 
 
 
59
 
60
- # 2. Name / Titel (Suche nach Fettschrift oder Überschriften vor dem Link)
 
61
  name = "[DEBUG: NAME FEHLT]"
62
- name_match = re.search(r'[\*\#]{2,}\s*([^\*\n\#]{10,60})', context)
63
- if name_match:
64
- name = name_match.group(1).strip()
65
- else:
66
- # Fallback: Link-Text
67
- link_text_match = re.search(r'\[([^\]]{10,60})\]\(https://www\.airbnb\.com/rooms/' + room_id, text)
68
- if link_text_match:
69
- name = link_text_match.group(1).strip()
 
 
 
 
 
 
 
 
 
 
70
 
71
- # 3. Preis-Präzision
 
 
 
 
 
 
72
  price_per_night = 0
73
- # Wir suchen nach Preisen im Kontext
74
- price_candidates = re.findall(r'[\$€£]\s*([\d\.,]+)', context)
75
- if price_candidates:
76
- # Wir nehmen den kleinsten Wert als Nachtpreis (oder berechnen ihn aus dem Gesamtpreis)
77
- numeric_prices = []
78
- for p in price_candidates:
79
- try:
80
- val = int("".join(re.findall(r'\d+', p)))
81
- if val > 10: numeric_prices.append(val)
 
 
 
 
 
 
 
 
 
82
  except: pass
83
 
84
- if numeric_prices:
85
- # Wenn "total" oder "Gesamt" im Kontext steht, ist der größte Preis vermutlich der Gesamtpreis
86
- is_total = any(kw in context.lower() for kw in ["total", "gesamt", "summe"])
87
- if is_total:
88
- total_val = max(numeric_prices)
89
- price_per_night = round(total_val / nights)
90
  else:
91
- # Sonst nehmen wir den plausibelsten Wert (unter 300)
92
- small_prices = [p for p in numeric_prices if p < 500]
93
- price_per_night = min(small_prices) if small_prices else min(numeric_prices)
94
-
95
- if price_per_night == 0: price_per_night = 0 # Markierung für Debug
96
 
97
- # 4. Bild-URL
98
- image_url = ""
99
- img_match = re.search(r'https://a0\.muscache\.com/im/pictures/[^\s\)\?\!]+', context)
100
- if img_match: image_url = img_match.group(0).split('?')[0] + "?im_w=720"
 
 
 
 
 
 
 
 
 
101
 
102
- deals.append({
103
- "name": name, "location": region, "price_per_night": price_per_night,
104
- "rating": rating, "reviews": reviews, "pet_friendly": True,
105
- "source": "airbnb (cloud)", "url": f"https://www.airbnb.com/rooms/{room_id}",
106
- "image_url": image_url
107
- })
 
 
 
 
 
 
 
 
108
  return deals
109
 
110
  SmartAirbnbScraper = PatchrightAirbnbScraper
 
15
  d1 = datetime.strptime(checkin, "%Y-%m-%d")
16
  d2 = datetime.strptime(checkout, "%Y-%m-%d")
17
  nights = max(1, (d2 - d1).days)
18
+ # Add price_max to filter out luxury villas and ensure better budget fit
19
+ # The cap is hard-coded to 600 in the URL below (presumably a per-night maximum; confirm against Airbnb's price_max semantics); no budget_max value is interpolated here
20
+ url = f"https://www.airbnb.com/s/{quote(region)}/homes?checkin={checkin}&checkout={checkout}&adults={adults}&price_max=600"
21
 
22
  try:
23
  async with httpx.AsyncClient(timeout=90.0) as client:
 
32
  except Exception: pass
33
  return []
34
 
35
+ def _parse_markdown(self, text: str, region: str, searched_nights: int) -> List[Dict]:
36
  deals = []
37
+ # 1. Identify all Room IDs and their positions
38
+ # format: https://www.airbnb.com/rooms/123456...
39
+ id_pattern = re.compile(r'rooms/(\d+)')
40
+ matches = [(m.group(1), m.start()) for m in id_pattern.finditer(text)]
41
 
42
+ # Deduplicate while preserving order of first appearance
43
+ seen = set()
44
+ unique_matches = []
45
+ for rid, pos in matches:
46
+ if rid not in seen:
47
+ seen.add(rid)
48
+ unique_matches.append((rid, pos))
49
+
50
+ for i, (room_id, start_pos) in enumerate(unique_matches):
51
+ # Define the text block for this listing
52
+ # Start: from the first mention of this ID
53
+ # End: until the start of the next ID (or reasonable limit)
54
+ end_pos = unique_matches[i+1][1] if i + 1 < len(unique_matches) else len(text)
55
 
56
+ # Limit block size to avoid processing huge chunks if IDs are far apart
57
+ # But typically the text follows the images
58
+ block_len = min(end_pos - start_pos, 4000)
59
+ block = text[start_pos:start_pos + block_len]
60
 
61
+ # --- PARSING LOGIC ---
 
 
 
 
 
 
 
62
 
63
+ # 1. Image
64
+ image_url = ""
65
+ # Look for the image associated with this ID in the block (or just before)
66
+ # Actually, the block starts at the URL in the markdown link: [![]()]...
67
+ # We want the image *inside* the markdown link that contains the room_id
68
+ # Re-scan the original text slightly before the start_pos to catch the image bracket
69
+ # Simpler: scan a window of 300 characters on either side of start_pos in the original text for markdown image syntax
70
+ img_match = re.search(r'!\[.*?\]\((https://[^)]+)\)', text[max(0, start_pos-300):start_pos+300])
71
+ if img_match:
72
+ image_url = img_match.group(1).split('?')[0] + "?im_w=720"
73
 
74
+ # 2. Name
75
+ # Strategy: Look for "Apartment in...", "Home in..." and take the next line
76
  name = "[DEBUG: NAME FEHLT]"
77
+
78
+ # Common prefixes in Airbnb listings
79
+ type_match = re.search(r'(Apartment|Home|Condo|Villa|House|Guest suite|Cottage|Loft) in [A-Za-z\s]+', block)
80
+ if type_match:
81
+ # The title is usually the line AFTER the type description
82
+ # Split block by lines and find the index
83
+ lines = block.split('\n')
84
+ for idx, line in enumerate(lines):
85
+ if type_match.group(0) in line:
86
+ # Check the immediately following line only (it is accepted when, after stripping, it is longer than 3 characters)
87
+ if idx + 1 < len(lines):
88
+ potential_name = lines[idx+1].strip()
89
+ if potential_name and len(potential_name) > 3:
90
+ name = potential_name
91
+ break
92
+ # Sometimes it's the same line?
93
+ if name == "[DEBUG: NAME FEHLT]":
94
+ name = line.replace(type_match.group(0), "").strip()
95
 
96
+ if name == "[DEBUG: NAME FEHLT]" or len(name) < 5:
97
+ # Fallback: Look for "Guest favorite" and take line after?
98
+ # Or use the first generic text line
99
+ lines = [l.strip() for l in block.split('\n') if len(l.strip()) > 10 and "rooms/" not in l and "Review" not in l]
100
+ if lines: name = lines[0] # Very rough fallback
101
+
102
+ # 3. Price
103
  price_per_night = 0
104
+ # Search for "$1,350 ... for 5 nights" pattern
105
+ # Matches: $1,234 or €1.234
106
+ price_block_match = re.search(r'([\$\€\£])\s*([\d,\.]+).*?for\s+(\d+)\s+nights', block, re.DOTALL | re.IGNORECASE)
107
+
108
+ if price_block_match:
109
+ currency, amount_str, nights_found = price_block_match.groups()
110
+ amount = int(re.sub(r'[^\d]', '', amount_str))
111
+ nights_found = int(nights_found)
112
+ if nights_found > 0:
113
+ price_per_night = round(amount / nights_found)
114
+ else:
115
+ # Fallback: Find any price and assume it is nightly if low, or total if high
116
+ prices = re.findall(r'[\$\€\£]\s*([\d,\.]+)', block)
117
+ valid_prices = []
118
+ for p in prices:
119
+ try:
120
+ v = int(re.sub(r'[^\d]', '', p))
121
+ valid_prices.append(v)
122
  except: pass
123
 
124
+ if valid_prices:
125
+ # Take the smallest price found as the best guess
126
+ best_guess = min(valid_prices)
127
+ # If the best guess is super high (e.g. > 1000), treat as total
128
+ if best_guess > 1000:
129
+ price_per_night = round(best_guess / searched_nights)
130
  else:
131
+ price_per_night = best_guess
 
 
 
 
132
 
133
+ # 4. Rating / Reviews
134
+ rating = 4.8
135
+ reviews = 20
136
+ # "4.32 out of 5 average rating, 141 reviews"
137
+ rating_match = re.search(r'([\d\.]+)\s*out of 5', block)
138
+ if rating_match:
139
+ try: rating = float(rating_match.group(1))
140
+ except: pass
141
+
142
+ rev_match = re.search(r'(\d+)\s*reviews', block)
143
+ if rev_match:
144
+ try: reviews = int(rev_match.group(1))
145
+ except: pass
146
 
147
+ # Add to list
148
+ if price_per_night > 0:
149
+ deals.append({
150
+ "name": name,
151
+ "location": region,
152
+ "price_per_night": price_per_night,
153
+ "rating": rating,
154
+ "reviews": reviews,
155
+ "pet_friendly": True,
156
+ "source": "airbnb (cloud)",
157
+ "url": f"https://www.airbnb.com/rooms/{room_id}",
158
+ "image_url": image_url
159
+ })
160
+
161
  return deals
162
 
163
  SmartAirbnbScraper = PatchrightAirbnbScraper