NZProperty / fetch_property_links.py
NZLouislu's picture
Add code for fetching property data for Auckland, NZ.
3624bf2
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
def fetch_property_links(main_url, page=1, max_retries=3, timeout=30, delay=2):
    """Fetch one listing page and collect the property links found on it.

    Args:
        main_url: URL of the first listing page.
        page: Page number. Values up to 1 request ``main_url`` unchanged;
            larger values add a ``page`` query parameter.
        max_retries: Passed as ``total`` to urllib3's ``Retry``, which is
            also configured to retry responses with status 500, 502, 503
            or 504.
        timeout: Timeout in seconds handed to ``session.get``.
        delay: Seconds to sleep after the request, whether it succeeded or
            failed, to avoid overloading the server.

    Returns:
        A ``(property_links, titles)`` tuple of two parallel lists: the
        absolute URL of every matching anchor and its ``title`` attribute
        (an empty string when the anchor has none). Both lists are empty
        when the request fails or the response status is not 200.
    """
    property_links = []
    titles = []

    if page > 1:
        # Use "&" when main_url already carries a query string so the
        # resulting URL stays well-formed.
        separator = "&" if "?" in main_url else "?"
        url = f"{main_url}{separator}page={page}"
    else:
        url = main_url
    print(f"Fetching page {page}...")

    # Retry policy shared by both URL schemes.
    retries = Retry(
        total=max_retries,
        backoff_factor=0.1,
        status_forcelist=[500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retries)

    try:
        # The context manager closes the session (and its pooled
        # connections) as soon as the response body has been downloaded.
        with requests.Session() as session:
            session.mount('https://', adapter)
            session.mount('http://', adapter)
            response = session.get(url, timeout=timeout)

        # Raises HTTPError for 4xx/5xx; other non-200 statuses fall
        # through to the else branch below.
        response.raise_for_status()

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find every property-card anchor on the page.
            for link in soup.find_all('a', class_='PropertyCard_PropertyCardLink__icVIl'):
                href = link.get('href')
                if not href:
                    # An anchor without a target cannot yield a link; skip
                    # it instead of raising KeyError and losing the page.
                    continue
                # urljoin keeps root-relative hrefs identical to plain
                # concatenation and also copes with absolute hrefs.
                property_links.append(urljoin("https://propertyvalue.co.nz", href))
                # Keep titles aligned with property_links even when the
                # title attribute is absent.
                titles.append(link.get('title', ''))

            print(f"\nFound {len(property_links)} properties on page {page}:")
        else:
            print(f"Unexpected status code {response.status_code} for URL: {url}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {page}: {e}")
    finally:
        # Throttle between requests regardless of the outcome.
        time.sleep(delay)

    return property_links, titles
# Usage example: fetch the first listing page for one suburb and report
# how many property links were found on it.
if __name__ == "__main__":
    start_url = "https://propertyvalue.co.nz/wellington/wellington-city/khandallah-6035/200020"
    fetched = fetch_property_links(start_url)
    found_links = fetched[0]
    print(f"Total properties found: {len(found_links)}")