Spaces:
Paused
Paused
Add code for fetch property data of Auckland NZ.
Browse files- .gitignore +7 -0
- config/config_test.py +141 -0
- config/redis_config.py +35 -0
- config/supabase_config.py +163 -0
- fetch_property_details.py +147 -0
- fetch_property_links.py +54 -0
- main.py +79 -0
- properties.py +131 -0
- property_history.py +66 -0
- real_estate.py +163 -0
- requirements.txt +9 -0
- utils/data_processing.py +8 -0
.gitignore
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/config/__pycache__
|
| 2 |
+
/__pycache__
|
| 3 |
+
/.venv
|
| 4 |
+
.vscode
|
| 5 |
+
/.idx/
|
| 6 |
+
.env
|
| 7 |
+
shell.nix
|
config/config_test.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from supabase_config import create_supabase_client
|
| 4 |
+
from redis_config import create_redis_client
|
| 5 |
+
from upstash_redis import Redis
|
| 6 |
+
# from redis import Redis
|
| 7 |
+
|
| 8 |
+
def test_redis_connection():
    """Smoke-test the Redis connection; True only when a round-trip works.

    Previously the function printed "Redis connection failed." but still
    returned True when the test key could not be read back.
    """
    try:
        # Connect via environment variables (Upstash REST URL/token)
        redis_client = Redis.from_env()

        # Round-trip a test key to prove both write and read work
        redis_client.set('test_key', 'test_value')
        value = redis_client.get('test_key')
        if value:
            print("Redis connection successful, test key inserted.")
        else:
            print("Redis connection failed.")
        # Reflect the actual outcome instead of always returning True
        return bool(value)
    except Exception as e:
        print(f"Error connecting to Redis: {e}")
        print("Redis test failed.")
        return False
| 29 |
+
|
| 30 |
+
def test_supabase_connection():
    """Connect to Supabase and read one row from 'properties'; True on success."""
    try:
        client = create_supabase_client()
        print("Successfully connected to Supabase.")

        # One-row probe of the 'properties' table
        result = client.from_('properties').select('*').limit(1).execute()

        if result.data:
            print("Successfully fetched data from 'properties' table:", result.data)
        else:
            print("Failed to fetch data from 'properties' table or table is empty.")
        return True
    except Exception as e:
        print(f"Error connecting to Supabase: {e}")
        print("Supabase test failed.")
        return False
| 49 |
+
|
| 50 |
+
def insert_property(supabase_client, property_data):
    """Insert one property row; return the generated id, or None on failure."""
    try:
        # Let the database generate the primary key
        property_data.pop('id', None)

        # Normalise monetary display strings before insert
        for field in ('last_sold_price', 'capital_value', 'land_value', 'improvement_value'):
            if field in property_data and property_data[field]:
                property_data[field] = clean_price(property_data[field])

        response = supabase_client.table('properties').insert(property_data).execute()

        # Guard-clause style error handling
        if hasattr(response, 'error') and response.error:
            print(f"Failed to insert property: {response.error}")
            return None
        if not response.data:
            print("Failed to insert property: No data returned")
            return None

        print(f"Property inserted: {property_data['address']}")
        return response.data[0]['id']

    except Exception as e:
        print(f"Error inserting property: {str(e)}")
        return None
| 77 |
+
|
| 78 |
+
# Helper function to clean price values
|
| 79 |
+
# Helper function to clean price values
def clean_price(price_str):
    """Convert a display price like '$1,280,000' to a float.

    None passes through and already-numeric values are returned unchanged;
    previously a numeric input raised an uncaught AttributeError (only
    ValueError was handled). This now matches supabase_config.clean_price.
    Returns None when the string cannot be parsed.
    """
    if price_str is None:
        return None
    if isinstance(price_str, (int, float)):
        return price_str
    try:
        # Remove dollar sign and commas, then convert to float
        return float(price_str.replace('$', '').replace(',', '').strip())
    except ValueError:
        return None  # unparseable price -> None
| 86 |
+
|
| 87 |
+
# Test inserting a property into the database
|
| 88 |
+
def test_insert_property():
|
| 89 |
+
# Create a test property with a sample price that contains special characters
|
| 90 |
+
test_property_data = {
|
| 91 |
+
'address': '15 Agra Crescent, Khandallah, Wellington, 6035',
|
| 92 |
+
'suburb': 'Khandallah',
|
| 93 |
+
'city': 'Wellington',
|
| 94 |
+
'postcode': '6035',
|
| 95 |
+
'year_built': 1985,
|
| 96 |
+
'bedrooms': 3,
|
| 97 |
+
'bathrooms': 2,
|
| 98 |
+
'car_spaces': 2,
|
| 99 |
+
'floor_size': '150 sqm',
|
| 100 |
+
'land_area': '500 sqm',
|
| 101 |
+
'last_sold_price': '$1,280,000', # This needs cleaning
|
| 102 |
+
'last_sold_date': '2023-08-01',
|
| 103 |
+
'capital_value': '$1,000,000', # This needs cleaning
|
| 104 |
+
'land_value': '$800,000', # This needs cleaning
|
| 105 |
+
'improvement_value': '$200,000', # This needs cleaning
|
| 106 |
+
'has_rental_history': False,
|
| 107 |
+
'is_currently_rented': False,
|
| 108 |
+
'status': 'For Sale' # Adding status as per SQL definition
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
try:
|
| 112 |
+
# Create a Supabase client
|
| 113 |
+
supabase_client = create_supabase_client()
|
| 114 |
+
|
| 115 |
+
# Attempt to insert the property
|
| 116 |
+
property_id = insert_property(supabase_client, test_property_data)
|
| 117 |
+
if property_id:
|
| 118 |
+
print(f"Inserted property with generated ID: {property_id}")
|
| 119 |
+
|
| 120 |
+
except Exception as e:
|
| 121 |
+
print(f"Error during test insertion: {e}")
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def main():
    """Run the Redis, Supabase and insert smoke tests in sequence."""
    # Redis check runs first but does not gate the overall result
    redis_ok = test_redis_connection()

    print("\nTesting Supabase connection now...")
    supabase_ok = test_supabase_connection()

    print("\nTesting Supabase property insertion...")
    test_insert_property()

    # Only the Supabase result decides the summary line
    if supabase_ok:
        print("\nAll tests passed!")
    else:
        print("\nOne or more tests failed. Please check the above messages.")

if __name__ == '__main__':
    main()
|
config/redis_config.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# redis_config.py
|
| 2 |
+
import os
|
| 3 |
+
# from redis import Redis
|
| 4 |
+
from upstash_redis import Redis # Import Redis instead of UpstashRedis
|
| 5 |
+
|
| 6 |
+
# Redis connection configuration
|
| 7 |
+
# Redis connection configuration
def create_redis_client():
    """Build an Upstash Redis client from environment variables."""
    client = Redis.from_env()
    return client
| 11 |
+
|
| 12 |
+
# Check if a property address exists in Redis
|
| 13 |
+
# Check if a property address exists in Redis
def check_property_in_redis(redis_client, address):
    """True when the address key is already stored in Redis."""
    stored = redis_client.get(address)
    return stored is not None
| 16 |
+
|
| 17 |
+
# Add a property address to Redis after insertion
|
| 18 |
+
# Add a property address to Redis after insertion
def add_property_to_redis(redis_client, address):
    """Record the address so later runs can skip it (value is a dummy 1)."""
    redis_client.set(address, 1)
| 20 |
+
|
| 21 |
+
# Check if a real estate property address exists in Redis
|
| 22 |
+
# Check if a real estate property address exists in Redis
def check_real_estate_in_redis(redis_client, address):
    """True when this sale listing's key ("real" + address) is in Redis."""
    key = "real" + address
    return redis_client.get(key) is not None
| 24 |
+
|
| 25 |
+
# Add a real estate property address to Redis after insertion
|
| 26 |
+
# Add a real estate property address to Redis after insertion
def add_real_estate_to_redis(redis_client, address):
    """Record a sale listing under its "real"-prefixed key."""
    key = "real" + address
    redis_client.set(key, 1)
| 28 |
+
|
| 29 |
+
# Check if a real estate rent property address exists in Redis
|
| 30 |
+
# Check if a real estate rent property address exists in Redis
def check_real_estate_rent_in_redis(redis_client, address):
    """True when this rent listing's key is already in Redis.

    NOTE(review): uses the same "real" prefix as the sale-listing helpers,
    so rent and sale listings share one key namespace — confirm this
    collision is intended (a distinct "rent" prefix may be meant).
    """
    key = "real" + address
    return redis_client.get(key) is not None
| 32 |
+
|
| 33 |
+
# Add a real estate property rent address to Redis after insertion
|
| 34 |
+
# Add a real estate property rent address to Redis after insertion
def add_real_estate_rent_to_redis(redis_client, address):
    """Record a rent listing under its "real"-prefixed key.

    NOTE(review): same "real" prefix as the sale-listing helpers — rent and
    sale keys collide; confirm this is intended.
    """
    key = "real" + address
    redis_client.set(key, 1)
config/supabase_config.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# supabase_config.py
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
import os
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
from supabase import create_client, Client
|
| 6 |
+
|
| 7 |
+
# 连接到 Supabase
|
| 8 |
+
load_dotenv() # 默认加载根目录的 .env 文件
|
| 9 |
+
|
| 10 |
+
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
| 11 |
+
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
|
| 12 |
+
|
| 13 |
+
# Create a Supabase client
|
| 14 |
+
# Create a Supabase client
def create_supabase_client() -> Client:
    """Return a Supabase client, failing fast when credentials are absent."""
    if SUPABASE_URL and SUPABASE_KEY:
        return create_client(SUPABASE_URL, SUPABASE_KEY)
    raise ValueError("Supabase URL and API key must be provided")
| 18 |
+
|
| 19 |
+
# Insert property details into the properties table
|
| 20 |
+
# Insert property details into the properties table
def insert_property(supabase_client, property_data):
    """Insert one property row and return its generated id.

    Pops any caller-supplied 'id' so the database assigns one, normalises
    the monetary fields via clean_price(), and inserts into 'properties'.
    Returns the new row's id on success, or None on any failure.
    """
    try:
        # Remove the 'id' so it can be auto-generated by the database
        property_data.pop('id', None)

        # Normalise every monetary field in one pass (mirrors config_test.py)
        for field in ('last_sold_price', 'capital_value', 'land_value', 'improvement_value'):
            if property_data.get(field):
                property_data[field] = clean_price(property_data[field])

        response = supabase_client.table('properties').insert(property_data).execute()

        # Newer supabase-py APIResponse objects have no `.error` attribute —
        # accessing it unconditionally raised AttributeError and masked the
        # real outcome. Guard with hasattr and also treat empty data as failure.
        if hasattr(response, 'error') and response.error:
            print(f"Failed to insert property: {response.error}")
            return None
        if not response.data:
            print("Failed to insert property: No data returned")
            return None

        print(f"Property inserted: {property_data['address']}")
        return response.data[0]['id']  # Return the property ID

    except Exception as e:
        print(f"Error inserting property: {str(e)}")
        return None
| 52 |
+
|
| 53 |
+
def clean_price(price_str):
    """Parse a '$1,234'-style string to float; pass None/numerics through."""
    if price_str is None:
        return None
    if isinstance(price_str, (int, float)):
        return price_str
    stripped = price_str.replace('$', '').replace(',', '').strip()
    try:
        return float(stripped)
    except ValueError:
        return None
| 62 |
+
|
| 63 |
+
def clean_property_data(property_data):
    """Normalise every monetary field in-place via clean_price(); returns the dict."""
    for field in ('last_sold_price', 'capital_value', 'land_value', 'improvement_value'):
        if field in property_data:
            property_data[field] = clean_price(property_data[field])
    return property_data
| 69 |
+
|
| 70 |
+
def parse_date(date_str):
    """Parse one of the site's date formats into a datetime.date, else None."""
    if not date_str:
        return None
    # Try each known listing-site format in order
    for fmt in ('%d %b %Y', '%Y', '%b %Y', '%d/%m/%Y', '%Y-%m-%d'):
        try:
            return datetime.strptime(date_str, fmt).date()
        except ValueError:
            pass
    print(f"Warning: Unable to parse date '{date_str}'")
    return None
| 81 |
+
|
| 82 |
+
def format_date_for_json(date_obj):
    """ISO-8601 string for a date object, passing None through unchanged."""
    return None if date_obj is None else date_obj.isoformat()
| 86 |
+
|
| 87 |
+
def insert_property_and_history(property_data, history_data):
    """Insert a property row, then its history events.

    Duplicate properties (unique-constraint violations) are skipped with a
    notice; history rows with unparseable dates are skipped individually.
    """
    supabase = create_supabase_client()

    # Clean before the try: previously this ran inside it, so an exception
    # raised during cleaning left cleaned_property_data unbound and the
    # except handler itself crashed with NameError.
    cleaned_property_data = clean_property_data(property_data)

    try:
        response = supabase.table('properties').insert(cleaned_property_data).execute()

        if response.data:
            property_id = response.data[0]['id']
            print(f"✅ Property inserted successfully. ID: {property_id}")
        else:
            print(f"⚠️ Failed to insert property. Maybe already exists. URL: {cleaned_property_data.get('property_url')}")
            return  # nothing to attach history to

    except Exception as e:
        error_str = str(e).lower()
        if "duplicate key" in error_str or "unique constraint" in error_str:
            print(f"🔁 Duplicate property skipped (URL: {cleaned_property_data.get('property_url')})")
        else:
            print(f"❌ Unexpected error during property insert: {e}")
        return

    # Insert the history events, one row per event
    if history_data and isinstance(history_data, list):
        for event in history_data:
            history_entry = {
                'property_id': property_id,
                'event_description': event.get('event_description', ''),
                'event_date': format_date_for_json(parse_date(event.get('event_date'))),
                'interval_since_last_event': event.get('event_interval', '')
            }
            if history_entry['event_date'] is not None:
                try:
                    history_response = supabase.table('property_history').insert(history_entry).execute()
                    if not history_response.data:
                        print(f"⚠️ Failed to insert history: {event}")
                except Exception as e:
                    print(f"❌ Error inserting history: {str(e)}")
                    print(f"⏩ Skipped history entry: {event}")
            else:
                print(f"🕒 Skipped invalid date entry: {event}")
        print("📜 Property history insertion completed.")
    else:
        print("ℹ️ No history data to insert.")
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def insert_real_estate(address, status):
    """Insert one row into the real_estate table (best-effort, logs only)."""
    try:
        client = create_supabase_client()
        row = {
            "address": address,
            "status": status
        }
        result = client.table('real_estate').insert(row).execute()
        if result.data:
            print(f"Inserted {address} into Supabase successfully.")
        else:
            print(f"Failed to insert {address} into Supabase.")
    except Exception as e:
        print(f"Error inserting {address} into Supabase: {str(e)}")
| 149 |
+
|
| 150 |
+
def insert_real_estate_rent(address, status):
    """Insert one row into the real_estate_rent table (best-effort, logs only)."""
    try:
        client = create_supabase_client()
        row = {
            "address": address,
            "status": status
        }
        result = client.table('real_estate_rent').insert(row).execute()
        if result.data:
            print(f"Inserted {address} into Supabase successfully.")
        else:
            print(f"Failed to insert {address} into Supabase.")
    except Exception as e:
        print(f"Error inserting {address} into Supabase: {str(e)}")
fetch_property_details.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
|
| 4 |
+
# 固定的城市和 suburb
|
| 5 |
+
CITY = "Porirua City"
|
| 6 |
+
SUBURB = "Aotea"
|
| 7 |
+
|
| 8 |
+
# Step 2: Fetch details for each property
|
| 9 |
+
def fetch_property_details(property_url, title):
|
| 10 |
+
print(f"\nFetching details for {property_url}...")
|
| 11 |
+
response = requests.get(property_url)
|
| 12 |
+
|
| 13 |
+
if response.status_code == 200:
|
| 14 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
| 15 |
+
|
| 16 |
+
# Extracting property details
|
| 17 |
+
address_line1 = title.split(',')[0].strip() # 从 title 中提取 address_line1
|
| 18 |
+
address_line2 = soup.find('span', {'testid': 'addressLine2'}).get_text(strip=True) if soup.find('span', {'testid': 'addressLine2'}) else 'N/A'
|
| 19 |
+
postcode = title.split(',')[-1].strip() # 获取最后一个逗号后的邮政编码
|
| 20 |
+
|
| 21 |
+
# Combine address_line1 and address_line2 into a single address field
|
| 22 |
+
address = f"{address_line1}, {address_line2}"
|
| 23 |
+
|
| 24 |
+
suburb = SUBURB
|
| 25 |
+
city = CITY
|
| 26 |
+
|
| 27 |
+
try:
|
| 28 |
+
year_built = int(soup.find('div', {'testid': 'yearBuiltValue'}).get_text(strip=True))
|
| 29 |
+
except (AttributeError, ValueError):
|
| 30 |
+
year_built = None
|
| 31 |
+
|
| 32 |
+
try:
|
| 33 |
+
bedrooms = int(soup.find('span', {'testid': 'bed'}).get_text(strip=True))
|
| 34 |
+
except (AttributeError, ValueError):
|
| 35 |
+
bedrooms = None
|
| 36 |
+
|
| 37 |
+
try:
|
| 38 |
+
bathrooms = int(soup.find('span', {'testid': 'bath'}).get_text(strip=True))
|
| 39 |
+
except (AttributeError, ValueError):
|
| 40 |
+
bathrooms = None
|
| 41 |
+
|
| 42 |
+
try:
|
| 43 |
+
car_spaces = int(soup.find('span', {'testid': 'car'}).get_text(strip=True))
|
| 44 |
+
except (AttributeError, ValueError):
|
| 45 |
+
car_spaces = None
|
| 46 |
+
|
| 47 |
+
try:
|
| 48 |
+
floor_size = soup.find('span', class_='floor PropertyAttributes_attribute__3bkWm').get_text(strip=True)
|
| 49 |
+
except AttributeError:
|
| 50 |
+
floor_size = 'N/A'
|
| 51 |
+
|
| 52 |
+
try:
|
| 53 |
+
land_area = soup.find('span', class_='land PropertyAttributes_attribute__3bkWm').get_text(strip=True)
|
| 54 |
+
except AttributeError:
|
| 55 |
+
land_area = 'N/A'
|
| 56 |
+
|
| 57 |
+
last_sold_price, last_sold_date = parse_sold_details(soup)
|
| 58 |
+
|
| 59 |
+
capital_value = extract_value(soup, 'Capital Value')
|
| 60 |
+
land_value = extract_value(soup, 'Land Value')
|
| 61 |
+
improvement_value = extract_value(soup, 'Improvement Value')
|
| 62 |
+
|
| 63 |
+
# Fetch rental history (this is where rental status is determined)
|
| 64 |
+
rental_history = fetch_rental_history(soup)
|
| 65 |
+
|
| 66 |
+
# Storing and printing the property data
|
| 67 |
+
property_data = {
|
| 68 |
+
'property_url': property_url,
|
| 69 |
+
'address': address,
|
| 70 |
+
'suburb': suburb,
|
| 71 |
+
'city': city,
|
| 72 |
+
'postcode': postcode,
|
| 73 |
+
'year_built': year_built,
|
| 74 |
+
'bedrooms': bedrooms,
|
| 75 |
+
'bathrooms': bathrooms,
|
| 76 |
+
'car_spaces': car_spaces,
|
| 77 |
+
'floor_size': floor_size,
|
| 78 |
+
'land_area': land_area,
|
| 79 |
+
'last_sold_price': last_sold_price,
|
| 80 |
+
'last_sold_date': last_sold_date,
|
| 81 |
+
'capital_value': capital_value,
|
| 82 |
+
'land_value': land_value,
|
| 83 |
+
'improvement_value': improvement_value,
|
| 84 |
+
'rental_history': rental_history['history'],
|
| 85 |
+
'has_rental_history': rental_history['has_rental_history'],
|
| 86 |
+
'is_currently_rented': rental_history['is_currently_rented']
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
# 打印房产详情
|
| 90 |
+
for key, value in property_data.items():
|
| 91 |
+
print(f"{key}: {value}")
|
| 92 |
+
|
| 93 |
+
else:
|
| 94 |
+
print(f"Failed to fetch details for: {property_url}")
|
| 95 |
+
|
| 96 |
+
# Step 3: Parse sold details (remove 'Last Sold on' and keep the correct date and price)
|
| 97 |
+
# Step 3: Parse sold details (strip 'Last Sold on', keep date and price)
def parse_sold_details(soup):
    """Split 'Last Sold on <date> for <price>' into (price, date), else (None, None)."""
    tag = soup.find('strong', {'testid': 'lastSoldAttribute'})
    if tag is None:
        return None, None

    text = tag.get_text(strip=True)
    if 'Last Sold on' not in text or 'for' not in text:
        return None, None

    # Date sits between 'Last Sold on' and the first 'for'
    sold_date = text.replace('Last Sold on', '').split('for')[0].strip()
    # Price follows the last 'for'
    sold_price = text.split('for')[-1].strip()
    return sold_price, sold_date
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Helper to extract values like Capital Value, Land Value, Improvement Value
|
| 116 |
+
# Helper to extract values like Capital Value, Land Value, Improvement Value
def extract_value(soup, value_type):
    """Text of the div following the label div `value_type`, or 'N/A'."""
    label = soup.find('div', string=value_type)
    try:
        # Missing label (None) raises AttributeError here, caught below
        return label.find_next_sibling('div').get_text(strip=True)
    except AttributeError:
        return 'N/A'
| 122 |
+
|
| 123 |
+
# Fetch rental history function
|
| 124 |
+
# Fetch rental history function
def fetch_rental_history(soup):
    """Collect 'Listed for Rent at' events and derive the rental flags."""
    history_entries = []
    seen_rental = False
    currently_rented = False

    # Each timeline row sits in this flex container
    rows = soup.find_all('div', class_='d-flex flex-row w-100 align-items-center pr-3 mb-2')
    for row in rows:
        desc_tag = row.find('strong', {'testid': lambda x: x and x.startswith('pt-description')})
        if not desc_tag:
            continue
        text = desc_tag.get_text(strip=True)
        if "Listed for Rent at" not in text:
            continue
        seen_rental = True
        currently_rented = True  # a rent listing marks the property as rented
        price = text.split('Listed for Rent at')[-1].strip()
        history_entries.append(f"Rented for {price}")

    return {
        'history': history_entries if history_entries else "No rental history available",
        'has_rental_history': seen_rental,
        'is_currently_rented': currently_rented
    }
fetch_property_links.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import time
|
| 4 |
+
from requests.adapters import HTTPAdapter
|
| 5 |
+
from urllib3.util.retry import Retry
|
| 6 |
+
|
| 7 |
+
def fetch_property_links(main_url, page=1, max_retries=3):
    """Fetch property detail links (and their title attributes) from one listing page.

    Returns a (links, titles) pair of equal-length lists; both empty when
    the request fails.
    """
    property_links = []
    titles = []

    url = f"{main_url}?page={page}" if page > 1 else main_url
    print(f"Fetching page {page}...")

    # Session with automatic retries for transient 5xx responses
    session = requests.Session()
    retries = Retry(total=max_retries,
                    backoff_factor=0.1,
                    status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))

    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()  # raises for any non-2xx status

        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every property card link on the page
        for link in soup.find_all('a', class_='PropertyCard_PropertyCardLink__icVIl'):
            full_link = "https://propertyvalue.co.nz" + link['href']
            property_links.append(full_link)
            # .get() instead of ['title']: a card missing its title attribute
            # previously raised KeyError and aborted the whole page
            titles.append(link.get('title', ''))

        print(f"\nFound {len(property_links)} properties on page {page}:")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {page}: {e}")

    finally:
        time.sleep(2)  # throttle to avoid overloading the server

    return property_links, titles
| 49 |
+
|
| 50 |
+
# 使用示例
|
| 51 |
+
# Usage example
if __name__ == "__main__":
    start_url = "https://propertyvalue.co.nz/wellington/wellington-city/khandallah-6035/200020"
    found_links, found_titles = fetch_property_links(start_url)
    print(f"Total properties found: {len(found_links)}")
main.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import requests
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
from fetch_property_links import fetch_property_links
|
| 5 |
+
from properties import fetch_property_details
|
| 6 |
+
# from config.redis_config import create_redis_client, check_property_in_redis, add_property_to_redis
|
| 7 |
+
from config.supabase_config import insert_property_and_history
|
| 8 |
+
|
| 9 |
+
# Main function to scrape properties
|
| 10 |
+
|
| 11 |
+
def fetch_suburbs(url, city):
    """
    Fetches the list of suburbs and their links from a given URL, then
    scrapes every suburb's listing pages via scrape_properties().
    """
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        suburb_links_container = soup.find('div', {'testid': 'suburbLinksContainer'})
        if suburb_links_container:
            suburb_links = suburb_links_container.find_all('a')
            for link in suburb_links:
                suburb_name = link.get_text(strip=True)
                suburb_link = "https://propertyvalue.co.nz" + link.get('href')
                print(f"Suburb: {suburb_name}, Link: {suburb_link}")

                # Default to one page: previously max_page stayed unbound
                # (NameError at the scrape_properties call) when the suburb
                # request failed or the pagination labels were missing.
                max_page = 1

                # Fetch the page content for the suburb link
                suburb_response = requests.get(suburb_link)
                print(f" Status code for {suburb_name}: {suburb_response.status_code}")
                if suburb_response.status_code == 200:
                    suburb_soup = BeautifulSoup(suburb_response.content, 'html.parser')
                    # Pagination widget: a btn-group with "of <N>" labels
                    pagination = suburb_soup.find('div', {'role': 'group', 'class': 'btn-group'})
                    if pagination:
                        of_label = pagination.find('label', string='of')
                        if of_label and of_label.find_next_sibling('label'):
                            max_page = int(of_label.find_next_sibling('label').get_text(strip=True))
                            print(f"Suburb: {suburb_name}, Max Pages: {max_page}")
                        else:
                            print(f" No page numbers found for {suburb_name}")
                    else:
                        print(f" No pagination element found for {suburb_name}")

                scrape_properties(suburb_link, max_page, city, suburb_name)
| 48 |
+
|
| 49 |
+
def scrape_properties(main_url, max_pages, city, suburb):
    """Walk every listing page of a suburb and store each property."""
    for page in range(1, max_pages + 1):
        # Links and their title attributes for this page
        links, titles = fetch_property_links(main_url, page)

        for property_url, title in zip(links, titles):
            print(f"Fetching details for: {title}")

            # Scrape the detail page, then persist property + history
            property_data, history_data = fetch_property_details(property_url, title, city, suburb)
            insert_property_and_history(property_data, history_data)
| 75 |
+
|
| 76 |
+
# Run the scraper
|
| 77 |
+
# Run the scraper
if __name__ == "__main__":
    fetch_suburbs("https://www.propertyvalue.co.nz/auckland/auckland/7", "Auckland - City")
properties.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
from property_history import fetch_property_history
|
| 4 |
+
from config.supabase_config import insert_property_and_history # Assuming this function exists
|
| 5 |
+
|
| 6 |
+
# 固定的城市和 suburb
|
| 7 |
+
# CITY = "Porirua City"
|
| 8 |
+
# SUBURB = "Aotea"
|
| 9 |
+
|
| 10 |
+
# Fetch property details
|
| 11 |
+
# Fetch property details
def fetch_property_details(property_url, title, city, suburb):
    """Scrape a single property page and assemble its details and history.

    Args:
        property_url: URL of the property detail page.
        title: Listing title; expected to start with the street address and
            end with the postcode, comma-separated.
        city: City name recorded with the property.
        suburb: Suburb name recorded with the property.

    Returns:
        A ``(property_data, history_data)`` tuple ready for insertion into
        Supabase, or ``(None, None)`` when the page could not be fetched.
    """
    print(f"\nFetching details for {property_url}...")
    try:
        # A timeout keeps the scraper from hanging forever on a stalled server.
        response = requests.get(property_url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch details for: {property_url}")
        print(f"Request error: {exc}")
        return None, None

    if response.status_code != 200:
        print(f"Failed to fetch details for: {property_url}")
        return None, None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Address parts: street from the title, line 2 from the page,
    # postcode from the last comma-separated chunk of the title.
    address_line1 = title.split(',')[0].strip()
    line2_tag = soup.find('span', {'testid': 'addressLine2'})  # look up once, not twice
    address_line2 = line2_tag.get_text(strip=True) if line2_tag else 'N/A'
    postcode = title.split(',')[-1].strip()

    # Combine address_line1 and address_line2 into a single address field.
    address = f"{address_line1}, {address_line2}"

    # Numeric attributes share one missing/unparseable -> None helper.
    year_built = _int_or_none(soup.find('div', {'testid': 'yearBuiltValue'}))
    bedrooms = _int_or_none(soup.find('span', {'testid': 'bed'}))
    bathrooms = _int_or_none(soup.find('span', {'testid': 'bath'}))
    car_spaces = _int_or_none(soup.find('span', {'testid': 'car'}))

    # Size attributes fall back to 'N/A' when the element is absent.
    floor_size = _text_or_na(soup.find('span', class_='floor PropertyAttributes_attribute__3bkWm'))
    land_area = _text_or_na(soup.find('span', class_='land PropertyAttributes_attribute__3bkWm'))

    last_sold_price, last_sold_date = parse_sold_details(soup)

    capital_value = extract_value(soup, 'Capital Value')
    land_value = extract_value(soup, 'Land Value')
    improvement_value = extract_value(soup, 'Improvement Value')

    # Fetch rental history from property_history.py.
    rental_history = fetch_property_history(soup)

    # Prepare property data for insertion into Supabase.
    property_data = {
        'property_url': property_url,
        'address': address,
        'suburb': suburb,
        'city': city,
        'postcode': postcode,
        'year_built': year_built,
        'bedrooms': bedrooms,
        'bathrooms': bathrooms,
        'car_spaces': car_spaces,
        'floor_size': floor_size,
        'land_area': land_area,
        'last_sold_price': last_sold_price,
        'last_sold_date': last_sold_date,
        'capital_value': capital_value,
        'land_value': land_value,
        'improvement_value': improvement_value,
        'has_rental_history': rental_history['has_rental_history'],
        'is_currently_rented': rental_history['is_currently_rented']
    }

    # Prepare history data for insertion into Supabase.
    history_data = rental_history['history']

    return property_data, history_data


def _int_or_none(tag):
    """Return the tag's stripped text as an int, or None when the tag is
    missing or its text is not numeric."""
    try:
        return int(tag.get_text(strip=True))
    except (AttributeError, ValueError):
        return None


def _text_or_na(tag):
    """Return the tag's stripped text, or 'N/A' when the tag is missing."""
    try:
        return tag.get_text(strip=True)
    except AttributeError:
        return 'N/A'
|
| 98 |
+
|
| 99 |
+
# Step 3: Parse the "last sold" line into a numeric price and a date string.
def parse_sold_details(soup):
    """Extract ``(last_sold_price, last_sold_date)`` from the page.

    Returns ``(None, None)`` when the element is absent or the text does
    not contain both a 'for' (price) and an 'on' (date) clause.
    """
    sold_tag = soup.find('strong', {'testid': 'lastSoldAttribute'})
    if not sold_tag:
        return None, None

    text = sold_tag.get_text(strip=True)
    if 'for' not in text or 'on' not in text:
        return None, None

    # Price is everything after the final 'for'; drop currency formatting.
    raw_price = text.split('for')[-1].strip().replace('$', '').replace(',', '')
    try:
        price = float(raw_price)
    except ValueError:
        price = None  # price text was not a clean number

    # Date sits between the 'Last Sold on' prefix and the 'for' clause.
    if 'Last Sold on' in text:
        date = text.replace('Last Sold on', '').split('for')[0].strip()
    else:
        date = text.split('on')[-1].strip()

    return price, date
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# Helper to pull valuation figures (Capital/Land/Improvement Value) out of the page.
def extract_value(soup, value_type):
    """Return the text of the div following the label div, or 'N/A' if absent."""
    label = soup.find('div', string=value_type)
    try:
        return label.find_next_sibling('div').get_text(strip=True)
    except AttributeError:
        # Either the label or its sibling is missing.
        return 'N/A'
|
property_history.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
|
| 2 |
+
|
| 3 |
+
# Fetch property history (like sale history, rental history)
def fetch_property_history(soup):
    """Collect the property's timeline events (sales, listings, rentals).

    Returns a dict with the event list under ``'history'`` (or a fallback
    string when no events were found), plus two rental flags derived from
    the event descriptions.
    """
    history = []
    has_rental_history = False
    is_currently_rented = False

    def _testid_prefix(prefix):
        # Attribute matcher: testid exists and starts with `prefix`.
        return {'testid': lambda value: value and value.startswith(prefix)}

    # Each timeline entry lives in one of these flex-row containers.
    for entry in soup.find_all('div', class_='d-flex flex-row w-100 align-items-center pr-3 mb-2'):
        # Date may be split into a day/month part and a year part.
        day_tag = entry.find('div', _testid_prefix('pt-monthDay'))
        year_tag = entry.find('div', _testid_prefix('pt-year'))

        if day_tag and year_tag:
            event_date = f"{day_tag.get_text(strip=True)} {year_tag.get_text(strip=True)}"
        elif year_tag:
            event_date = year_tag.get_text(strip=True)
        else:
            event_date = "Unknown date"

        desc_tag = entry.find('strong', _testid_prefix('pt-description'))
        event_description = desc_tag.get_text(strip=True) if desc_tag else "No description"

        interval_tag = entry.find('div', _testid_prefix('pt-interval'))
        event_interval = interval_tag.get_text(strip=True) if interval_tag else "No interval info"

        history.append({
            'event_date': event_date,
            'event_description': event_description,
            'event_interval': event_interval
        })

        # Any rent-related event marks the property as having rental history.
        if "Listed for Rent at" in event_description or "Rented for" in event_description:
            has_rental_history = True
            is_currently_rented = True  # heuristic: a rent event implies currently rented

    # Dump the collected history for debugging/visibility.
    print("----------------")
    print("Property History")
    print("----------------")

    for item in history:
        print(f"Date: {item['event_date']}")
        print(f"Description: {item['event_description']}")
        print(f"Interval: {item['event_interval']}")
        print("----------------")

    return {
        'history': history if history else "No rental history available",
        'has_rental_history': has_rental_history,
        'is_currently_rented': is_currently_rented
    }

# Example usage (for testing purposes):
# if __name__ == "__main__":
#     sample_html = """<your HTML content here>"""
#     soup = BeautifulSoup(sample_html, 'html.parser')
#     fetch_property_history(soup)
|
real_estate.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from playwright.sync_api import sync_playwright, TimeoutError
|
| 2 |
+
import time
|
| 3 |
+
import random
|
| 4 |
+
|
| 5 |
+
from config.redis_config import add_real_estate_to_redis, check_real_estate_in_redis, create_redis_client
|
| 6 |
+
from config.supabase_config import insert_real_estate
|
| 7 |
+
|
| 8 |
+
def handle_dialog(dialog):
    """Log and accept any browser dialog so it never blocks the scrape."""
    message = dialog.message
    print(f"Dialog message: {message}")
    dialog.accept()
|
| 11 |
+
|
| 12 |
+
def scroll_to_bottom(page):
    """Scroll the page until its height stops growing or pagination appears."""
    print("开始模拟鼠标下滑操作...")
    previous_height = page.evaluate("document.body.scrollHeight")
    while True:
        print(f" - 当前页面高度: {previous_height},继续下滑...")
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(random.uniform(1, 2))  # give lazy-loaded content time to render
        current_height = page.evaluate("document.body.scrollHeight")
        if current_height == previous_height:
            print(" - 已到达页面底部")
            break
        previous_height = current_height

        # Stop scrolling once a pagination control shows up — results are loaded.
        if page.query_selector('nav[aria-label="Pagination"]') or page.query_selector('div[class*="pagination"]'):
            print(" - 检测到页码导航,停止滚动")
            break
|
| 29 |
+
|
| 30 |
+
def simulate_user_behavior(page):
    """Mimic a human visitor: scroll to the bottom, hover over a few listing
    cards, do some randomized scrolling, then scroll to the bottom again.

    Args:
        page: A Playwright page already navigated to a listing URL.
    """
    scroll_to_bottom(page)

    # Hover over up to three property cards, trying selectors from most to
    # least specific and using the first one that matches anything.
    print("模拟查看房产卡片...")
    card_selectors = [
        'div[class*="listing-tile"]',
        'div[class*="property-card"]',
        'div[class*="search-result"]'
    ]
    for selector in card_selectors:
        cards = page.query_selector_all(selector)
        if not cards:
            continue
        for _ in range(min(3, len(cards))):
            card = random.choice(cards)
            try:
                card.scroll_into_view_if_needed()
                card.hover()
                print(f" - 悬停在一个房产卡片上")
                time.sleep(random.uniform(0.5, 1.5))
            except Exception:
                # Best-effort only; was a bare `except:` that also swallowed
                # KeyboardInterrupt/SystemExit.
                pass
        break

    # Extra randomized scrolling to look less like a bot.
    print("模拟额外的滚动操作")
    for _ in range(10):
        scroll_distance = random.randint(500, 1500)
        page.evaluate(f"window.scrollBy(0, {scroll_distance})")
        print(f" - 向下滚动 {scroll_distance} 像素")
        time.sleep(random.uniform(1, 2))

    # Finish by returning to the bottom of the page.
    print("再次滚动到页面底部")
    scroll_to_bottom(page)
|
| 65 |
+
|
| 66 |
+
def fetch_addresses(page, url):
    """Navigate ``page`` to ``url`` and scrape listing addresses from it.

    Dismisses the cookie banner if present, simulates user behavior so
    lazy-loaded results render, then tries a list of progressively looser
    CSS selectors and returns the first non-empty batch of addresses.

    Returns:
        A (possibly empty) list of address strings.
    """
    try:
        page.goto(url, wait_until="networkidle", timeout=60000)
    except TimeoutError:
        # A partial page load is still worth scraping.
        print(f"Timeout while loading {url}. Continuing with partial page load.")

    try:
        page.wait_for_selector('button:has-text("Accept")', timeout=5000)
        page.click('button:has-text("Accept")')
        print("Clicked cookie consent button.")
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
        print("No cookie consent button found or unable to click it.")

    # Behave like a human before reading the DOM.
    simulate_user_behavior(page)

    addresses = []
    try:
        # Most-specific selector first; fall back to looser matches.
        selectors = [
            'h3[data-test="standard-tile__search-result__address"]',
            '.standard-tile__search-result__address',
            'h3[class*="address"]',
            'div[class*="address"]',
            'div[class*="listing-tile"] h3',
            'div[class*="listing-tile"] div[class*="address"]'
        ]

        for selector in selectors:
            address_elements = page.query_selector_all(selector)
            if address_elements:
                addresses = [element.inner_text().strip() for element in address_elements if element.inner_text().strip()]
                print(f"Found {len(addresses)} addresses using selector: {selector}")
                break

        if not addresses:
            # Dump diagnostics so selector drift is easy to spot in the logs.
            print(f"No address elements found on {url} using any of the selectors.")
            print("Page Title:", page.title())
            print("Current URL:", page.url)
            print("HTML content:", page.content()[:1000])
    except Exception as e:
        print(f"An error occurred while scraping {url}: {str(e)}")

    return addresses
|
| 109 |
+
|
| 110 |
+
def scrape_properties(main_url, max_pages):
    """Scrape up to ``max_pages`` paginated listing pages from ``main_url``.

    For each new address found, inserts it into Supabase and records it in
    Redis so later runs skip duplicates. The function's useful output is
    these side effects; ``all_addresses`` is accumulated but never returned.
    """
    redis_client = create_redis_client()  # Instantiate the Redis client
    all_addresses = []

    with sync_playwright() as p:
        # --no-sandbox / --disable-dev-shm-usage are common flags for
        # running Chromium inside containers.
        browser = p.chromium.launch(
            headless=True,
            args=[
                "--no-sandbox",
                "--disable-dev-shm-usage",
            ],
        )

        # A desktop Chrome user agent reduces the chance of bot detection.
        context = browser.new_context(
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        )
        page = context.new_page()
        page.on("dialog", handle_dialog)  # auto-accept any popup dialogs

        for page_num in range(1, max_pages + 1):
            url = f"{main_url}?page={page_num}"
            print(f"\nScraping page {page_num}: {url}")

            addresses = fetch_addresses(page, url)
            if addresses:
                all_addresses.extend(addresses)
                print(f"Found {len(addresses)} addresses on page {page_num}")
                print("Addresses found on this page:")
                for addr in addresses:
                    print(f" - {addr}")
                    if not check_real_estate_in_redis(redis_client, addr):
                        # Insert into Supabase (status assumed to be "for Sale").
                        insert_real_estate(addr, "for Sale")
                        # Record the address in Redis to avoid duplicates on later runs.
                        add_real_estate_to_redis(redis_client, addr)
                    else:
                        print(f"Address {addr} already exists in Redis. Skipping...")
            else:
                print(f"No addresses found on page {page_num}. Continuing to next page.")

            if page_num < max_pages:
                # Randomized delay between pages to avoid rate limiting.
                delay = random.uniform(5, 10)
                print(f"Waiting for {delay:.2f} seconds before next request...")
                time.sleep(delay)

        browser.close()
|
| 156 |
+
|
| 157 |
+
def main():
    """Entry point: scrape up to 500 pages of Auckland residential listings."""
    listing_url = "https://www.realestate.co.nz/residential/sale/auckland"
    page_limit = 500
    scrape_properties(listing_url, page_limit)


if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
supabase==2.15.1 # Python client library for Supabase
|
| 2 |
+
python-dotenv==1.0.0 # For loading environment variables from a .env file
|
| 3 |
+
requests==2.31.0 # HTTP library for making requests
|
| 4 |
+
upstash-redis==1.3.0 # Redis client for Upstash
|
| 5 |
+
python-dateutil==2.8.2 # For flexible date parsing
|
| 6 |
+
playwright>=1.33.0
|
| 7 |
+
# greenlet>=3.1.0
|
| 8 |
+
beautifulsoup4>=4.12.2 # BeautifulSoup HTML parser
|
| 9 |
+
|
utils/data_processing.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
|
| 3 |
+
def parse_event_date(date_str):
    """Parse a 'day month-name year' string (e.g. '12 March 2021').

    Returns a ``datetime.datetime`` on success, or ``None`` when the
    string does not match the expected format.
    """
    expected_format = "%d %B %Y"
    try:
        parsed = datetime.datetime.strptime(date_str, expected_format)
    except ValueError:
        return None  # None signals an unrecognized date format
    return parsed
|