Spaces:
Sleeping
Sleeping
phhttps committed on
Commit ·
0e200a9
1
Parent(s): 55b53a1
fix: airbnb scraper now uses robust block-parsing and price_max filter
Browse files- patchright_airbnb_scraper.py +115 -62
patchright_airbnb_scraper.py
CHANGED
|
@@ -15,7 +15,9 @@ class PatchrightAirbnbScraper:
|
|
| 15 |
d1 = datetime.strptime(checkin, "%Y-%m-%d")
|
| 16 |
d2 = datetime.strptime(checkout, "%Y-%m-%d")
|
| 17 |
nights = max(1, (d2 - d1).days)
|
| 18 |
-
|
|
|
|
|
|
|
| 19 |
|
| 20 |
try:
|
| 21 |
async with httpx.AsyncClient(timeout=90.0) as client:
|
|
@@ -30,81 +32,132 @@ class PatchrightAirbnbScraper:
|
|
| 30 |
except Exception: pass
|
| 31 |
return []
|
| 32 |
|
| 33 |
-
def _parse_markdown(self, text: str, region: str,
|
| 34 |
deals = []
|
| 35 |
-
#
|
| 36 |
-
|
| 37 |
-
|
|
|
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
| 45 |
|
| 46 |
-
#
|
| 47 |
-
rating = 4.8
|
| 48 |
-
reviews = 20
|
| 49 |
-
rate_match = re.search(r'([\d\.,]+)\s*star|Rating\s*([\d\.,]+)', context, re.I)
|
| 50 |
-
if rate_match:
|
| 51 |
-
r_val = rate_match.group(1) or rate_match.group(2)
|
| 52 |
-
try: rating = float(r_val.replace(',', '.'))
|
| 53 |
-
except: pass
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
-
# 2. Name
|
|
|
|
| 61 |
name = "[DEBUG: NAME FEHLT]"
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
#
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
price_per_night = 0
|
| 73 |
-
#
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
except: pass
|
| 83 |
|
| 84 |
-
if
|
| 85 |
-
#
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
price_per_night = round(
|
| 90 |
else:
|
| 91 |
-
|
| 92 |
-
small_prices = [p for p in numeric_prices if p < 500]
|
| 93 |
-
price_per_night = min(small_prices) if small_prices else min(numeric_prices)
|
| 94 |
-
|
| 95 |
-
if price_per_night == 0: price_per_night = 0 # Markierung für Debug
|
| 96 |
|
| 97 |
-
# 4.
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
return deals
|
| 109 |
|
| 110 |
SmartAirbnbScraper = PatchrightAirbnbScraper
|
|
|
|
| 15 |
d1 = datetime.strptime(checkin, "%Y-%m-%d")
|
| 16 |
d2 = datetime.strptime(checkout, "%Y-%m-%d")
|
| 17 |
nights = max(1, (d2 - d1).days)
|
| 18 |
+
# Add price_max to filter out luxury villas and ensure better budget fit
|
| 19 |
+
# NOTE: the nightly cap is hard-coded to price_max=600 in the URL below; swap in the caller's budget_max here if that value is available (TODO confirm it is passed to this method).
|
| 20 |
+
url = f"https://www.airbnb.com/s/{quote(region)}/homes?checkin={checkin}&checkout={checkout}&adults={adults}&price_max=600"
|
| 21 |
|
| 22 |
try:
|
| 23 |
async with httpx.AsyncClient(timeout=90.0) as client:
|
|
|
|
| 32 |
except Exception: pass
|
| 33 |
return []
|
| 34 |
|
| 35 |
+
def _parse_markdown(self, text: str, region: str, searched_nights: int) -> List[Dict]:
    """Extract listing deals from the markdown rendering of a search page.

    The text is cut into one block per distinct ``rooms/<id>`` link: a block
    runs from the first mention of an id to the first mention of the next
    id (capped at 4000 characters). Each block is then mined for an image,
    a title, a nightly price, a rating and a review count.

    Args:
        text: Markdown text of the search results page.
        region: Search region; copied verbatim into each deal's ``location``.
        searched_nights: Length of the searched stay. Used to convert a lone
            total price (> 1000) into a nightly price; values below 1 are
            treated as 1 so the division can never fail.

    Returns:
        One dict per listing for which a positive nightly price was found,
        with the keys ``name``, ``location``, ``price_per_night``,
        ``rating``, ``reviews``, ``pet_friendly``, ``source``, ``url`` and
        ``image_url``. ``rating`` and ``reviews`` fall back to the
        placeholders 4.8 and 20 when the block does not state them.
    """
    missing_name = "[DEBUG: NAME FEHLT]"
    stay_nights = max(1, searched_nights)

    room_link = re.compile(r'rooms/(\d+)')
    image_md = re.compile(r'!\[.*?\]\((https://[^)]+)\)')
    # Applied line by line and limited to letters, spaces, "'" and "-", so
    # the match can never run over a line break into the title line.
    listing_type = re.compile(
        r"(?:Apartment|Home|Condo|Villa|House|Guest suite|Cottage|Loft) in "
        r"[^\W\d_](?:[^\W\d_]|[ \t'-])*"
    )
    # The tempered gap refuses to cross another currency symbol, so the
    # amount closest to "for N nights" is used (not a struck-through or
    # unrelated price that happens to appear earlier in the block).
    stay_price = re.compile(
        r'([$€£])\s*([\d.,]+)(?:(?![$€£]).)*?for\s+(\d+)\s+nights?',
        re.DOTALL | re.IGNORECASE,
    )
    any_price = re.compile(r'[$€£]\s*([\d.,]+)')
    rating_re = re.compile(r'(\d+(?:\.\d+)?)\s*out of 5')
    reviews_re = re.compile(r'(\d[\d.,]*)\s*reviews?\b')

    def to_amount(raw):
        """Turn '1,350', '1.350', '1,350.50' or '89.99' into a float (None if no digits)."""
        cleaned = raw.strip('.,')
        if not cleaned:
            return None
        cut = max(cleaned.rfind(','), cleaned.rfind('.'))
        # One or two digits after the last separator => decimal fraction;
        # anything else means every separator is a thousands separator.
        if cut != -1 and len(cleaned) - cut - 1 in (1, 2):
            whole = re.sub(r'\D', '', cleaned[:cut]) or '0'
            return float(f"{whole}.{cleaned[cut + 1:]}")
        return float(re.sub(r'\D', '', cleaned))

    # First position of every distinct room id, in order of appearance
    # (dicts keep insertion order).
    first_seen = {}
    for hit in room_link.finditer(text):
        first_seen.setdefault(hit.group(1), hit.start())
    listings = list(first_seen.items())

    deals = []
    for i, (room_id, start_pos) in enumerate(listings):
        end_pos = listings[i + 1][1] if i + 1 < len(listings) else len(text)
        block = text[start_pos:min(end_pos, start_pos + 4000)]
        lines = block.split('\n')

        # 1. Image: the markdown link wrapping the room URL puts its image
        # just before the URL, so prefer the closest image that starts
        # before the id; only then fall back to the first one after it.
        image_url = ""
        window_start = max(0, start_pos - 300)
        offset = start_pos - window_start
        hits = list(image_md.finditer(text[window_start:start_pos + 300]))
        preceding = [h for h in hits if h.start() < offset]
        chosen = preceding[-1] if preceding else (hits[0] if hits else None)
        if chosen:
            image_url = chosen.group(1).split('?')[0] + "?im_w=720"

        # 2. Name: the title normally sits on the next non-empty line after
        # the "<Type> in <Place>" line; otherwise use the rest of that line.
        name = missing_name
        for idx, line in enumerate(lines):
            type_hit = listing_type.search(line)
            if not type_hit:
                continue
            following = next(
                (ln.strip() for ln in lines[idx + 1:] if ln.strip()), ""
            )
            if len(following) > 3:
                name = following
            else:
                name = line.replace(type_hit.group(0), "").strip()
            break

        if name == missing_name or len(name) < 5:
            # Very rough fallback: first reasonably long line that is not a
            # room link or review text.
            candidates = [
                ln.strip() for ln in lines
                if len(ln.strip()) > 10 and "rooms/" not in ln and "Review" not in ln
            ]
            if candidates:
                name = candidates[0]

        # 3. Price: prefer an explicit "<amount> ... for N nights" total.
        price_per_night = 0
        stay_hit = stay_price.search(block)
        if stay_hit:
            total = to_amount(stay_hit.group(2))
            nights_found = int(stay_hit.group(3))
            if total is not None and nights_found > 0:
                price_per_night = round(total / nights_found)

        if price_per_night == 0:
            # Fallback: take the lowest price in the block; treat it as a
            # total for the searched stay when it is implausibly high for a
            # single night.
            amounts = [
                amount
                for amount in map(to_amount, any_price.findall(block))
                if amount is not None
            ]
            if amounts:
                lowest = min(amounts)
                if lowest > 1000:
                    price_per_night = round(lowest / stay_nights)
                else:
                    price_per_night = round(lowest)

        # 4. Rating / reviews, e.g. "4.32 out of 5 average rating, 141 reviews".
        rating = 4.8
        rating_hit = rating_re.search(block)
        if rating_hit:
            rating = float(rating_hit.group(1))

        reviews = 20
        reviews_hit = reviews_re.search(block)
        if reviews_hit:
            # Drop thousands separators ("1,141" -> 1141).
            reviews = int(re.sub(r'\D', '', reviews_hit.group(1)))

        # Listings without a usable price are dropped.
        if price_per_night > 0:
            deals.append({
                "name": name,
                "location": region,
                "price_per_night": price_per_night,
                "rating": rating,
                "reviews": reviews,
                "pet_friendly": True,
                "source": "airbnb (cloud)",
                "url": f"https://www.airbnb.com/rooms/{room_id}",
                "image_url": image_url
            })

    return deals
|
| 162 |
|
| 163 |
# Alias: SmartAirbnbScraper is bound to the same class object as
# PatchrightAirbnbScraper (presumably so code importing the other name keeps
# working; confirm against callers).
SmartAirbnbScraper = PatchrightAirbnbScraper
|