Spaces:
Paused
Paused
Add code for fetch property data of Auckland NZ.
Browse files- .gitignore +7 -0
- config/config_test.py +141 -0
- config/redis_config.py +35 -0
- config/supabase_config.py +163 -0
- fetch_property_details.py +147 -0
- fetch_property_links.py +54 -0
- main.py +79 -0
- properties.py +131 -0
- property_history.py +66 -0
- real_estate.py +163 -0
- requirements.txt +9 -0
- utils/data_processing.py +8 -0
.gitignore
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/config/__pycache__
|
| 2 |
+
/__pycache__
|
| 3 |
+
/.venv
|
| 4 |
+
.vscode
|
| 5 |
+
/.idx/
|
| 6 |
+
.env
|
| 7 |
+
shell.nix
|
config/config_test.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from supabase_config import create_supabase_client
|
| 4 |
+
from redis_config import create_redis_client
|
| 5 |
+
from upstash_redis import Redis
|
| 6 |
+
# from redis import Redis
|
| 7 |
+
|
| 8 |
+
def test_redis_connection():
    """Smoke-test the Redis connection; True only when a round-trip works.

    Previously the function printed "Redis connection failed." but still
    returned True when the test key could not be read back.
    """
    try:
        # Connect via environment variables (Upstash REST URL/token)
        redis_client = Redis.from_env()

        # Round-trip a test key to prove both write and read work
        redis_client.set('test_key', 'test_value')
        value = redis_client.get('test_key')
        if value:
            print("Redis connection successful, test key inserted.")
        else:
            print("Redis connection failed.")
        # Reflect the actual outcome instead of always returning True
        return bool(value)
    except Exception as e:
        print(f"Error connecting to Redis: {e}")
        print("Redis test failed.")
        return False
| 29 |
+
|
| 30 |
+
def test_supabase_connection():
    """Connect to Supabase and read one row from 'properties'; True on success."""
    try:
        client = create_supabase_client()
        print("Successfully connected to Supabase.")

        # One-row probe of the 'properties' table
        result = client.from_('properties').select('*').limit(1).execute()

        if result.data:
            print("Successfully fetched data from 'properties' table:", result.data)
        else:
            print("Failed to fetch data from 'properties' table or table is empty.")
        return True
    except Exception as e:
        print(f"Error connecting to Supabase: {e}")
        print("Supabase test failed.")
        return False
| 49 |
+
|
| 50 |
+
def insert_property(supabase_client, property_data):
    """Insert one property row; return the generated id, or None on failure."""
    try:
        # Let the database generate the primary key
        property_data.pop('id', None)

        # Normalise monetary display strings before insert
        for field in ('last_sold_price', 'capital_value', 'land_value', 'improvement_value'):
            if field in property_data and property_data[field]:
                property_data[field] = clean_price(property_data[field])

        response = supabase_client.table('properties').insert(property_data).execute()

        # Guard-clause style error handling
        if hasattr(response, 'error') and response.error:
            print(f"Failed to insert property: {response.error}")
            return None
        if not response.data:
            print("Failed to insert property: No data returned")
            return None

        print(f"Property inserted: {property_data['address']}")
        return response.data[0]['id']

    except Exception as e:
        print(f"Error inserting property: {str(e)}")
        return None
| 77 |
+
|
| 78 |
+
# Helper function to clean price values
|
| 79 |
+
# Helper function to clean price values
def clean_price(price_str):
    """Convert a display price like '$1,280,000' to a float.

    None passes through and already-numeric values are returned unchanged;
    previously a numeric input raised an uncaught AttributeError (only
    ValueError was handled). This now matches supabase_config.clean_price.
    Returns None when the string cannot be parsed.
    """
    if price_str is None:
        return None
    if isinstance(price_str, (int, float)):
        return price_str
    try:
        # Remove dollar sign and commas, then convert to float
        return float(price_str.replace('$', '').replace(',', '').strip())
    except ValueError:
        return None  # unparseable price -> None
| 86 |
+
|
| 87 |
+
# Test inserting a property into the database
|
| 88 |
+
def test_insert_property():
|
| 89 |
+
# Create a test property with a sample price that contains special characters
|
| 90 |
+
test_property_data = {
|
| 91 |
+
'address': '15 Agra Crescent, Khandallah, Wellington, 6035',
|
| 92 |
+
'suburb': 'Khandallah',
|
| 93 |
+
'city': 'Wellington',
|
| 94 |
+
'postcode': '6035',
|
| 95 |
+
'year_built': 1985,
|
| 96 |
+
'bedrooms': 3,
|
| 97 |
+
'bathrooms': 2,
|
| 98 |
+
'car_spaces': 2,
|
| 99 |
+
'floor_size': '150 sqm',
|
| 100 |
+
'land_area': '500 sqm',
|
| 101 |
+
'last_sold_price': '$1,280,000', # This needs cleaning
|
| 102 |
+
'last_sold_date': '2023-08-01',
|
| 103 |
+
'capital_value': '$1,000,000', # This needs cleaning
|
| 104 |
+
'land_value': '$800,000', # This needs cleaning
|
| 105 |
+
'improvement_value': '$200,000', # This needs cleaning
|
| 106 |
+
'has_rental_history': False,
|
| 107 |
+
'is_currently_rented': False,
|
| 108 |
+
'status': 'For Sale' # Adding status as per SQL definition
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
try:
|
| 112 |
+
# Create a Supabase client
|
| 113 |
+
supabase_client = create_supabase_client()
|
| 114 |
+
|
| 115 |
+
# Attempt to insert the property
|
| 116 |
+
property_id = insert_property(supabase_client, test_property_data)
|
| 117 |
+
if property_id:
|
| 118 |
+
print(f"Inserted property with generated ID: {property_id}")
|
| 119 |
+
|
| 120 |
+
except Exception as e:
|
| 121 |
+
print(f"Error during test insertion: {e}")
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def main():
    """Run the Redis, Supabase and insert smoke tests in sequence."""
    # Redis check runs first but does not gate the overall result
    redis_ok = test_redis_connection()

    print("\nTesting Supabase connection now...")
    supabase_ok = test_supabase_connection()

    print("\nTesting Supabase property insertion...")
    test_insert_property()

    # Only the Supabase result decides the summary line
    if supabase_ok:
        print("\nAll tests passed!")
    else:
        print("\nOne or more tests failed. Please check the above messages.")

if __name__ == '__main__':
    main()
|
config/redis_config.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# redis_config.py
|
| 2 |
+
import os
|
| 3 |
+
# from redis import Redis
|
| 4 |
+
from upstash_redis import Redis # Import Redis instead of UpstashRedis
|
| 5 |
+
|
| 6 |
+
# Redis connection configuration
|
| 7 |
+
# Redis connection configuration
def create_redis_client():
    """Build an Upstash Redis client from environment variables."""
    client = Redis.from_env()
    return client
| 11 |
+
|
| 12 |
+
# Check if a property address exists in Redis
|
| 13 |
+
# Check if a property address exists in Redis
def check_property_in_redis(redis_client, address):
    """True when the address key is already stored in Redis."""
    stored = redis_client.get(address)
    return stored is not None
| 16 |
+
|
| 17 |
+
# Add a property address to Redis after insertion
|
| 18 |
+
# Add a property address to Redis after insertion
def add_property_to_redis(redis_client, address):
    """Record the address so later runs can skip it (value is a dummy 1)."""
    redis_client.set(address, 1)
| 20 |
+
|
| 21 |
+
# Check if a real estate property address exists in Redis
|
| 22 |
+
# Check if a real estate property address exists in Redis
def check_real_estate_in_redis(redis_client, address):
    """True when this sale listing's key ("real" + address) is in Redis."""
    key = "real" + address
    return redis_client.get(key) is not None
| 24 |
+
|
| 25 |
+
# Add a real estate property address to Redis after insertion
|
| 26 |
+
# Add a real estate property address to Redis after insertion
def add_real_estate_to_redis(redis_client, address):
    """Record a sale listing under its "real"-prefixed key."""
    key = "real" + address
    redis_client.set(key, 1)
| 28 |
+
|
| 29 |
+
# Check if a real estate rent property address exists in Redis
|
| 30 |
+
# Check if a real estate rent property address exists in Redis
def check_real_estate_rent_in_redis(redis_client, address):
    """True when this rent listing's key is already in Redis.

    NOTE(review): uses the same "real" prefix as the sale-listing helpers,
    so rent and sale listings share one key namespace — confirm this
    collision is intended (a distinct "rent" prefix may be meant).
    """
    key = "real" + address
    return redis_client.get(key) is not None
| 32 |
+
|
| 33 |
+
# Add a real estate property rent address to Redis after insertion
|
| 34 |
+
# Add a real estate property rent address to Redis after insertion
def add_real_estate_rent_to_redis(redis_client, address):
    """Record a rent listing under its "real"-prefixed key.

    NOTE(review): same "real" prefix as the sale-listing helpers — rent and
    sale keys collide; confirm this is intended.
    """
    key = "real" + address
    redis_client.set(key, 1)
config/supabase_config.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# supabase_config.py
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
import os
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
from supabase import create_client, Client
|
| 6 |
+
|
| 7 |
+
# 连接到 Supabase
|
| 8 |
+
load_dotenv() # 默认加载根目录的 .env 文件
|
| 9 |
+
|
| 10 |
+
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
| 11 |
+
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
|
| 12 |
+
|
| 13 |
+
# Create a Supabase client
|
| 14 |
+
# Create a Supabase client
def create_supabase_client() -> Client:
    """Return a Supabase client, failing fast when credentials are absent."""
    if SUPABASE_URL and SUPABASE_KEY:
        return create_client(SUPABASE_URL, SUPABASE_KEY)
    raise ValueError("Supabase URL and API key must be provided")
| 18 |
+
|
| 19 |
+
# Insert property details into the properties table
|
| 20 |
+
# Insert property details into the properties table
def insert_property(supabase_client, property_data):
    """Insert one property row and return its generated id.

    Pops any caller-supplied 'id' so the database assigns one, normalises
    the monetary fields via clean_price(), and inserts into 'properties'.
    Returns the new row's id on success, or None on any failure.
    """
    try:
        # Remove the 'id' so it can be auto-generated by the database
        property_data.pop('id', None)

        # Normalise every monetary field in one pass (mirrors config_test.py)
        for field in ('last_sold_price', 'capital_value', 'land_value', 'improvement_value'):
            if property_data.get(field):
                property_data[field] = clean_price(property_data[field])

        response = supabase_client.table('properties').insert(property_data).execute()

        # Newer supabase-py APIResponse objects have no `.error` attribute —
        # accessing it unconditionally raised AttributeError and masked the
        # real outcome. Guard with hasattr and also treat empty data as failure.
        if hasattr(response, 'error') and response.error:
            print(f"Failed to insert property: {response.error}")
            return None
        if not response.data:
            print("Failed to insert property: No data returned")
            return None

        print(f"Property inserted: {property_data['address']}")
        return response.data[0]['id']  # Return the property ID

    except Exception as e:
        print(f"Error inserting property: {str(e)}")
        return None
| 52 |
+
|
| 53 |
+
def clean_price(price_str):
    """Parse a '$1,234'-style string to float; pass None/numerics through."""
    if price_str is None:
        return None
    if isinstance(price_str, (int, float)):
        return price_str
    stripped = price_str.replace('$', '').replace(',', '').strip()
    try:
        return float(stripped)
    except ValueError:
        return None
| 62 |
+
|
| 63 |
+
def clean_property_data(property_data):
    """Normalise every monetary field in-place via clean_price(); returns the dict."""
    for field in ('last_sold_price', 'capital_value', 'land_value', 'improvement_value'):
        if field in property_data:
            property_data[field] = clean_price(property_data[field])
    return property_data
| 69 |
+
|
| 70 |
+
def parse_date(date_str):
    """Parse one of the site's date formats into a datetime.date, else None."""
    if not date_str:
        return None
    # Try each known listing-site format in order
    for fmt in ('%d %b %Y', '%Y', '%b %Y', '%d/%m/%Y', '%Y-%m-%d'):
        try:
            return datetime.strptime(date_str, fmt).date()
        except ValueError:
            pass
    print(f"Warning: Unable to parse date '{date_str}'")
    return None
| 81 |
+
|
| 82 |
+
def format_date_for_json(date_obj):
    """ISO-8601 string for a date object, passing None through unchanged."""
    return None if date_obj is None else date_obj.isoformat()
| 86 |
+
|
| 87 |
+
def insert_property_and_history(property_data, history_data):
    """Insert a property row, then its history events.

    Duplicate properties (unique-constraint violations) are skipped with a
    notice; history rows with unparseable dates are skipped individually.
    """
    supabase = create_supabase_client()

    # Clean before the try: previously this ran inside it, so an exception
    # raised during cleaning left cleaned_property_data unbound and the
    # except handler itself crashed with NameError.
    cleaned_property_data = clean_property_data(property_data)

    try:
        response = supabase.table('properties').insert(cleaned_property_data).execute()

        if response.data:
            property_id = response.data[0]['id']
            print(f"✅ Property inserted successfully. ID: {property_id}")
        else:
            print(f"⚠️ Failed to insert property. Maybe already exists. URL: {cleaned_property_data.get('property_url')}")
            return  # nothing to attach history to

    except Exception as e:
        error_str = str(e).lower()
        if "duplicate key" in error_str or "unique constraint" in error_str:
            print(f"🔁 Duplicate property skipped (URL: {cleaned_property_data.get('property_url')})")
        else:
            print(f"❌ Unexpected error during property insert: {e}")
        return

    # Insert the history events, one row per event
    if history_data and isinstance(history_data, list):
        for event in history_data:
            history_entry = {
                'property_id': property_id,
                'event_description': event.get('event_description', ''),
                'event_date': format_date_for_json(parse_date(event.get('event_date'))),
                'interval_since_last_event': event.get('event_interval', '')
            }
            if history_entry['event_date'] is not None:
                try:
                    history_response = supabase.table('property_history').insert(history_entry).execute()
                    if not history_response.data:
                        print(f"⚠️ Failed to insert history: {event}")
                except Exception as e:
                    print(f"❌ Error inserting history: {str(e)}")
                    print(f"⏩ Skipped history entry: {event}")
            else:
                print(f"🕒 Skipped invalid date entry: {event}")
        print("📜 Property history insertion completed.")
    else:
        print("ℹ️ No history data to insert.")
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def insert_real_estate(address, status):
    """Insert one row into the real_estate table (best-effort, logs only)."""
    try:
        client = create_supabase_client()
        row = {
            "address": address,
            "status": status
        }
        result = client.table('real_estate').insert(row).execute()
        if result.data:
            print(f"Inserted {address} into Supabase successfully.")
        else:
            print(f"Failed to insert {address} into Supabase.")
    except Exception as e:
        print(f"Error inserting {address} into Supabase: {str(e)}")
| 149 |
+
|
| 150 |
+
def insert_real_estate_rent(address, status):
    """Insert one row into the real_estate_rent table (best-effort, logs only)."""
    try:
        client = create_supabase_client()
        row = {
            "address": address,
            "status": status
        }
        result = client.table('real_estate_rent').insert(row).execute()
        if result.data:
            print(f"Inserted {address} into Supabase successfully.")
        else:
            print(f"Failed to insert {address} into Supabase.")
    except Exception as e:
        print(f"Error inserting {address} into Supabase: {str(e)}")
fetch_property_details.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
|
| 4 |
+
# 固定的城市和 suburb
|
| 5 |
+
CITY = "Porirua City"
|
| 6 |
+
SUBURB = "Aotea"
|
| 7 |
+
|
| 8 |
+
# Step 2: Fetch details for each property
|
| 9 |
+
def fetch_property_details(property_url, title):
|
| 10 |
+
print(f"\nFetching details for {property_url}...")
|
| 11 |
+
response = requests.get(property_url)
|
| 12 |
+
|
| 13 |
+
if response.status_code == 200:
|
| 14 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
| 15 |
+
|
| 16 |
+
# Extracting property details
|
| 17 |
+
address_line1 = title.split(',')[0].strip() # 从 title 中提取 address_line1
|
| 18 |
+
address_line2 = soup.find('span', {'testid': 'addressLine2'}).get_text(strip=True) if soup.find('span', {'testid': 'addressLine2'}) else 'N/A'
|
| 19 |
+
postcode = title.split(',')[-1].strip() # 获取最后一个逗号后的邮政编码
|
| 20 |
+
|
| 21 |
+
# Combine address_line1 and address_line2 into a single address field
|
| 22 |
+
address = f"{address_line1}, {address_line2}"
|
| 23 |
+
|
| 24 |
+
suburb = SUBURB
|
| 25 |
+
city = CITY
|
| 26 |
+
|
| 27 |
+
try:
|
| 28 |
+
year_built = int(soup.find('div', {'testid': 'yearBuiltValue'}).get_text(strip=True))
|
| 29 |
+
except (AttributeError, ValueError):
|
| 30 |
+
year_built = None
|
| 31 |
+
|
| 32 |
+
try:
|
| 33 |
+
bedrooms = int(soup.find('span', {'testid': 'bed'}).get_text(strip=True))
|
| 34 |
+
except (AttributeError, ValueError):
|
| 35 |
+
bedrooms = None
|
| 36 |
+
|
| 37 |
+
try:
|
| 38 |
+
bathrooms = int(soup.find('span', {'testid': 'bath'}).get_text(strip=True))
|
| 39 |
+
except (AttributeError, ValueError):
|
| 40 |
+
bathrooms = None
|
| 41 |
+
|
| 42 |
+
try:
|
| 43 |
+
car_spaces = int(soup.find('span', {'testid': 'car'}).get_text(strip=True))
|
| 44 |
+
except (AttributeError, ValueError):
|
| 45 |
+
car_spaces = None
|
| 46 |
+
|
| 47 |
+
try:
|
| 48 |
+
floor_size = soup.find('span', class_='floor PropertyAttributes_attribute__3bkWm').get_text(strip=True)
|
| 49 |
+
except AttributeError:
|
| 50 |
+
floor_size = 'N/A'
|
| 51 |
+
|
| 52 |
+
try:
|
| 53 |
+
land_area = soup.find('span', class_='land PropertyAttributes_attribute__3bkWm').get_text(strip=True)
|
| 54 |
+
except AttributeError:
|
| 55 |
+
land_area = 'N/A'
|
| 56 |
+
|
| 57 |
+
last_sold_price, last_sold_date = parse_sold_details(soup)
|
| 58 |
+
|
| 59 |
+
capital_value = extract_value(soup, 'Capital Value')
|
| 60 |
+
land_value = extract_value(soup, 'Land Value')
|
| 61 |
+
improvement_value = extract_value(soup, 'Improvement Value')
|
| 62 |
+
|
| 63 |
+
# Fetch rental history (this is where rental status is determined)
|
| 64 |
+
rental_history = fetch_rental_history(soup)
|
| 65 |
+
|
| 66 |
+
# Storing and printing the property data
|
| 67 |
+
property_data = {
|
| 68 |
+
'property_url': property_url,
|
| 69 |
+
'address': address,
|
| 70 |
+
'suburb': suburb,
|
| 71 |
+
'city': city,
|
| 72 |
+
'postcode': postcode,
|
| 73 |
+
'year_built': year_built,
|
| 74 |
+
'bedrooms': bedrooms,
|
| 75 |
+
'bathrooms': bathrooms,
|
| 76 |
+
'car_spaces': car_spaces,
|
| 77 |
+
'floor_size': floor_size,
|
| 78 |
+
'land_area': land_area,
|
| 79 |
+
'last_sold_price': last_sold_price,
|
| 80 |
+
'last_sold_date': last_sold_date,
|
| 81 |
+
'capital_value': capital_value,
|
| 82 |
+
'land_value': land_value,
|
| 83 |
+
'improvement_value': improvement_value,
|
| 84 |
+
'rental_history': rental_history['history'],
|
| 85 |
+
'has_rental_history': rental_history['has_rental_history'],
|
| 86 |
+
'is_currently_rented': rental_history['is_currently_rented']
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
# 打印房产详情
|
| 90 |
+
for key, value in property_data.items():
|
| 91 |
+
print(f"{key}: {value}")
|
| 92 |
+
|
| 93 |
+
else:
|
| 94 |
+
print(f"Failed to fetch details for: {property_url}")
|
| 95 |
+
|
| 96 |
+
# Step 3: Parse sold details (remove 'Last Sold on' and keep the correct date and price)
|
| 97 |
+
# Step 3: Parse sold details (strip 'Last Sold on', keep date and price)
def parse_sold_details(soup):
    """Split 'Last Sold on <date> for <price>' into (price, date), else (None, None)."""
    tag = soup.find('strong', {'testid': 'lastSoldAttribute'})
    if tag is None:
        return None, None

    text = tag.get_text(strip=True)
    if 'Last Sold on' not in text or 'for' not in text:
        return None, None

    # Date sits between 'Last Sold on' and the first 'for'
    sold_date = text.replace('Last Sold on', '').split('for')[0].strip()
    # Price follows the last 'for'
    sold_price = text.split('for')[-1].strip()
    return sold_price, sold_date
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Helper to extract values like Capital Value, Land Value, Improvement Value
|
| 116 |
+
# Helper to extract values like Capital Value, Land Value, Improvement Value
def extract_value(soup, value_type):
    """Text of the div following the label div `value_type`, or 'N/A'."""
    label = soup.find('div', string=value_type)
    try:
        # Missing label (None) raises AttributeError here, caught below
        return label.find_next_sibling('div').get_text(strip=True)
    except AttributeError:
        return 'N/A'
| 122 |
+
|
| 123 |
+
# Fetch rental history function
|
| 124 |
+
# Fetch rental history function
def fetch_rental_history(soup):
    """Collect 'Listed for Rent at' events and derive the rental flags."""
    history_entries = []
    seen_rental = False
    currently_rented = False

    # Each timeline row sits in this flex container
    rows = soup.find_all('div', class_='d-flex flex-row w-100 align-items-center pr-3 mb-2')
    for row in rows:
        desc_tag = row.find('strong', {'testid': lambda x: x and x.startswith('pt-description')})
        if not desc_tag:
            continue
        text = desc_tag.get_text(strip=True)
        if "Listed for Rent at" not in text:
            continue
        seen_rental = True
        currently_rented = True  # a rent listing marks the property as rented
        price = text.split('Listed for Rent at')[-1].strip()
        history_entries.append(f"Rented for {price}")

    return {
        'history': history_entries if history_entries else "No rental history available",
        'has_rental_history': seen_rental,
        'is_currently_rented': currently_rented
    }
fetch_property_links.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import time
|
| 4 |
+
from requests.adapters import HTTPAdapter
|
| 5 |
+
from urllib3.util.retry import Retry
|
| 6 |
+
|
| 7 |
+
def fetch_property_links(main_url, page=1, max_retries=3):
    """Fetch property detail links (and their title attributes) from one listing page.

    Returns a (links, titles) pair of equal-length lists; both empty when
    the request fails.
    """
    property_links = []
    titles = []

    url = f"{main_url}?page={page}" if page > 1 else main_url
    print(f"Fetching page {page}...")

    # Session with automatic retries for transient 5xx responses
    session = requests.Session()
    retries = Retry(total=max_retries,
                    backoff_factor=0.1,
                    status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))

    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()  # raises for any non-2xx status

        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every property card link on the page
        for link in soup.find_all('a', class_='PropertyCard_PropertyCardLink__icVIl'):
            full_link = "https://propertyvalue.co.nz" + link['href']
            property_links.append(full_link)
            # .get() instead of ['title']: a card missing its title attribute
            # previously raised KeyError and aborted the whole page
            titles.append(link.get('title', ''))

        print(f"\nFound {len(property_links)} properties on page {page}:")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {page}: {e}")

    finally:
        time.sleep(2)  # throttle to avoid overloading the server

    return property_links, titles
| 49 |
+
|
| 50 |
+
# 使用示例
|
| 51 |
+
# Usage example
if __name__ == "__main__":
    start_url = "https://propertyvalue.co.nz/wellington/wellington-city/khandallah-6035/200020"
    found_links, found_titles = fetch_property_links(start_url)
    print(f"Total properties found: {len(found_links)}")
main.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import requests
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
from fetch_property_links import fetch_property_links
|
| 5 |
+
from properties import fetch_property_details
|
| 6 |
+
# from config.redis_config import create_redis_client, check_property_in_redis, add_property_to_redis
|
| 7 |
+
from config.supabase_config import insert_property_and_history
|
| 8 |
+
|
| 9 |
+
# Main function to scrape properties
|
| 10 |
+
|
| 11 |
+
def fetch_suburbs(url, city):
    """
    Fetches the list of suburbs and their links from a given URL, then
    scrapes every suburb's listing pages via scrape_properties().
    """
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        suburb_links_container = soup.find('div', {'testid': 'suburbLinksContainer'})
        if suburb_links_container:
            suburb_links = suburb_links_container.find_all('a')
            for link in suburb_links:
                suburb_name = link.get_text(strip=True)
                suburb_link = "https://propertyvalue.co.nz" + link.get('href')
                print(f"Suburb: {suburb_name}, Link: {suburb_link}")

                # Default to one page: previously max_page stayed unbound
                # (NameError at the scrape_properties call) when the suburb
                # request failed or the pagination labels were missing.
                max_page = 1

                # Fetch the page content for the suburb link
                suburb_response = requests.get(suburb_link)
                print(f" Status code for {suburb_name}: {suburb_response.status_code}")
                if suburb_response.status_code == 200:
                    suburb_soup = BeautifulSoup(suburb_response.content, 'html.parser')
                    # Pagination widget: a btn-group with "of <N>" labels
                    pagination = suburb_soup.find('div', {'role': 'group', 'class': 'btn-group'})
                    if pagination:
                        of_label = pagination.find('label', string='of')
                        if of_label and of_label.find_next_sibling('label'):
                            max_page = int(of_label.find_next_sibling('label').get_text(strip=True))
                            print(f"Suburb: {suburb_name}, Max Pages: {max_page}")
                        else:
                            print(f" No page numbers found for {suburb_name}")
                    else:
                        print(f" No pagination element found for {suburb_name}")

                scrape_properties(suburb_link, max_page, city, suburb_name)
| 48 |
+
|
| 49 |
+
def scrape_properties(main_url, max_pages, city, suburb):
    """Walk every listing page of a suburb and store each property."""
    for page in range(1, max_pages + 1):
        # Links and their title attributes for this page
        links, titles = fetch_property_links(main_url, page)

        for property_url, title in zip(links, titles):
            print(f"Fetching details for: {title}")

            # Scrape the detail page, then persist property + history
            property_data, history_data = fetch_property_details(property_url, title, city, suburb)
            insert_property_and_history(property_data, history_data)
| 75 |
+
|
| 76 |
+
# Run the scraper
|
| 77 |
+
# Run the scraper
if __name__ == "__main__":
    fetch_suburbs("https://www.propertyvalue.co.nz/auckland/auckland/7", "Auckland - City")
properties.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
from property_history import fetch_property_history
|
| 4 |
+
from config.supabase_config import insert_property_and_history # Assuming this function exists
|
| 5 |
+
|
| 6 |
+
# 固定的城市和 suburb
|
| 7 |
+
# CITY = "Porirua City"
|
| 8 |
+
# SUBURB = "Aotea"
|
| 9 |
+
|
| 10 |
+
# Fetch property details
|
| 11 |
+
# Fetch property details
def fetch_property_details(property_url, title, city, suburb):
    """Scrape a single property page and assemble its details and history.

    Args:
        property_url: URL of the property detail page.
        title: Listing title; expected to start with the street address and
            end with the postcode, comma-separated.
        city: City name recorded with the property.
        suburb: Suburb name recorded with the property.

    Returns:
        A ``(property_data, history_data)`` tuple ready for insertion into
        Supabase, or ``(None, None)`` when the page could not be fetched.
    """
    print(f"\nFetching details for {property_url}...")
    try:
        # A timeout keeps the scraper from hanging forever on a stalled server.
        response = requests.get(property_url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch details for: {property_url}")
        print(f"Request error: {exc}")
        return None, None

    if response.status_code != 200:
        print(f"Failed to fetch details for: {property_url}")
        return None, None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Address parts: street from the title, line 2 from the page,
    # postcode from the last comma-separated chunk of the title.
    address_line1 = title.split(',')[0].strip()
    line2_tag = soup.find('span', {'testid': 'addressLine2'})  # look up once, not twice
    address_line2 = line2_tag.get_text(strip=True) if line2_tag else 'N/A'
    postcode = title.split(',')[-1].strip()

    # Combine address_line1 and address_line2 into a single address field.
    address = f"{address_line1}, {address_line2}"

    # Numeric attributes share one missing/unparseable -> None helper.
    year_built = _int_or_none(soup.find('div', {'testid': 'yearBuiltValue'}))
    bedrooms = _int_or_none(soup.find('span', {'testid': 'bed'}))
    bathrooms = _int_or_none(soup.find('span', {'testid': 'bath'}))
    car_spaces = _int_or_none(soup.find('span', {'testid': 'car'}))

    # Size attributes fall back to 'N/A' when the element is absent.
    floor_size = _text_or_na(soup.find('span', class_='floor PropertyAttributes_attribute__3bkWm'))
    land_area = _text_or_na(soup.find('span', class_='land PropertyAttributes_attribute__3bkWm'))

    last_sold_price, last_sold_date = parse_sold_details(soup)

    capital_value = extract_value(soup, 'Capital Value')
    land_value = extract_value(soup, 'Land Value')
    improvement_value = extract_value(soup, 'Improvement Value')

    # Fetch rental history from property_history.py.
    rental_history = fetch_property_history(soup)

    # Prepare property data for insertion into Supabase.
    property_data = {
        'property_url': property_url,
        'address': address,
        'suburb': suburb,
        'city': city,
        'postcode': postcode,
        'year_built': year_built,
        'bedrooms': bedrooms,
        'bathrooms': bathrooms,
        'car_spaces': car_spaces,
        'floor_size': floor_size,
        'land_area': land_area,
        'last_sold_price': last_sold_price,
        'last_sold_date': last_sold_date,
        'capital_value': capital_value,
        'land_value': land_value,
        'improvement_value': improvement_value,
        'has_rental_history': rental_history['has_rental_history'],
        'is_currently_rented': rental_history['is_currently_rented']
    }

    # Prepare history data for insertion into Supabase.
    history_data = rental_history['history']

    return property_data, history_data


def _int_or_none(tag):
    """Return the tag's stripped text as an int, or None when the tag is
    missing or its text is not numeric."""
    try:
        return int(tag.get_text(strip=True))
    except (AttributeError, ValueError):
        return None


def _text_or_na(tag):
    """Return the tag's stripped text, or 'N/A' when the tag is missing."""
    try:
        return tag.get_text(strip=True)
    except AttributeError:
        return 'N/A'
|
| 98 |
+
|
| 99 |
+
# Step 3: Parse the "last sold" line into a numeric price and a date string.
def parse_sold_details(soup):
    """Extract ``(last_sold_price, last_sold_date)`` from the page.

    Returns ``(None, None)`` when the element is absent or the text does
    not contain both a 'for' (price) and an 'on' (date) clause.
    """
    sold_tag = soup.find('strong', {'testid': 'lastSoldAttribute'})
    if not sold_tag:
        return None, None

    text = sold_tag.get_text(strip=True)
    if 'for' not in text or 'on' not in text:
        return None, None

    # Price is everything after the final 'for'; drop currency formatting.
    raw_price = text.split('for')[-1].strip().replace('$', '').replace(',', '')
    try:
        price = float(raw_price)
    except ValueError:
        price = None  # price text was not a clean number

    # Date sits between the 'Last Sold on' prefix and the 'for' clause.
    if 'Last Sold on' in text:
        date = text.replace('Last Sold on', '').split('for')[0].strip()
    else:
        date = text.split('on')[-1].strip()

    return price, date
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# Helper to pull valuation figures (Capital/Land/Improvement Value) out of the page.
def extract_value(soup, value_type):
    """Return the text of the div following the label div, or 'N/A' if absent."""
    label = soup.find('div', string=value_type)
    try:
        return label.find_next_sibling('div').get_text(strip=True)
    except AttributeError:
        # Either the label or its sibling is missing.
        return 'N/A'
|
property_history.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
|
| 2 |
+
|
| 3 |
+
# Fetch property history (like sale history, rental history)
def fetch_property_history(soup):
    """Collect the property's timeline events (sales, listings, rentals).

    Returns a dict with the event list under ``'history'`` (or a fallback
    string when no events were found), plus two rental flags derived from
    the event descriptions.
    """
    history = []
    has_rental_history = False
    is_currently_rented = False

    def _testid_prefix(prefix):
        # Attribute matcher: testid exists and starts with `prefix`.
        return {'testid': lambda value: value and value.startswith(prefix)}

    # Each timeline entry lives in one of these flex-row containers.
    for entry in soup.find_all('div', class_='d-flex flex-row w-100 align-items-center pr-3 mb-2'):
        # Date may be split into a day/month part and a year part.
        day_tag = entry.find('div', _testid_prefix('pt-monthDay'))
        year_tag = entry.find('div', _testid_prefix('pt-year'))

        if day_tag and year_tag:
            event_date = f"{day_tag.get_text(strip=True)} {year_tag.get_text(strip=True)}"
        elif year_tag:
            event_date = year_tag.get_text(strip=True)
        else:
            event_date = "Unknown date"

        desc_tag = entry.find('strong', _testid_prefix('pt-description'))
        event_description = desc_tag.get_text(strip=True) if desc_tag else "No description"

        interval_tag = entry.find('div', _testid_prefix('pt-interval'))
        event_interval = interval_tag.get_text(strip=True) if interval_tag else "No interval info"

        history.append({
            'event_date': event_date,
            'event_description': event_description,
            'event_interval': event_interval
        })

        # Any rent-related event marks the property as having rental history.
        if "Listed for Rent at" in event_description or "Rented for" in event_description:
            has_rental_history = True
            is_currently_rented = True  # heuristic: a rent event implies currently rented

    # Dump the collected history for debugging/visibility.
    print("----------------")
    print("Property History")
    print("----------------")

    for item in history:
        print(f"Date: {item['event_date']}")
        print(f"Description: {item['event_description']}")
        print(f"Interval: {item['event_interval']}")
        print("----------------")

    return {
        'history': history if history else "No rental history available",
        'has_rental_history': has_rental_history,
        'is_currently_rented': is_currently_rented
    }

# Example usage (for testing purposes):
# if __name__ == "__main__":
#     sample_html = """<your HTML content here>"""
#     soup = BeautifulSoup(sample_html, 'html.parser')
#     fetch_property_history(soup)
|
real_estate.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from playwright.sync_api import sync_playwright, TimeoutError
|
| 2 |
+
import time
|
| 3 |
+
import random
|
| 4 |
+
|
| 5 |
+
from config.redis_config import add_real_estate_to_redis, check_real_estate_in_redis, create_redis_client
|
| 6 |
+
from config.supabase_config import insert_real_estate
|
| 7 |
+
|
| 8 |
+
def handle_dialog(dialog):
    """Log and accept any browser dialog so it never blocks the scrape."""
    message = dialog.message
    print(f"Dialog message: {message}")
    dialog.accept()
|
| 11 |
+
|
| 12 |
+
def scroll_to_bottom(page):
    """Scroll the page until its height stops growing or pagination appears."""
    print("开始模拟鼠标下滑操作...")
    previous_height = page.evaluate("document.body.scrollHeight")
    while True:
        print(f" - 当前页面高度: {previous_height},继续下滑...")
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(random.uniform(1, 2))  # give lazy-loaded content time to render
        current_height = page.evaluate("document.body.scrollHeight")
        if current_height == previous_height:
            print(" - 已到达页面底部")
            break
        previous_height = current_height

        # Stop scrolling once a pagination control shows up — results are loaded.
        if page.query_selector('nav[aria-label="Pagination"]') or page.query_selector('div[class*="pagination"]'):
            print(" - 检测到页码导航,停止滚动")
            break
|
| 29 |
+
|
| 30 |
+
def simulate_user_behavior(page):
    """Mimic a human visitor: scroll to the bottom, hover over a few listing
    cards, do some randomized scrolling, then scroll to the bottom again.

    Args:
        page: A Playwright page already navigated to a listing URL.
    """
    scroll_to_bottom(page)

    # Hover over up to three property cards, trying selectors from most to
    # least specific and using the first one that matches anything.
    print("模拟查看房产卡片...")
    card_selectors = [
        'div[class*="listing-tile"]',
        'div[class*="property-card"]',
        'div[class*="search-result"]'
    ]
    for selector in card_selectors:
        cards = page.query_selector_all(selector)
        if not cards:
            continue
        for _ in range(min(3, len(cards))):
            card = random.choice(cards)
            try:
                card.scroll_into_view_if_needed()
                card.hover()
                print(f" - 悬停在一个房产卡片上")
                time.sleep(random.uniform(0.5, 1.5))
            except Exception:
                # Best-effort only; was a bare `except:` that also swallowed
                # KeyboardInterrupt/SystemExit.
                pass
        break

    # Extra randomized scrolling to look less like a bot.
    print("模拟额外的滚动操作")
    for _ in range(10):
        scroll_distance = random.randint(500, 1500)
        page.evaluate(f"window.scrollBy(0, {scroll_distance})")
        print(f" - 向下滚动 {scroll_distance} 像素")
        time.sleep(random.uniform(1, 2))

    # Finish by returning to the bottom of the page.
    print("再次滚动到页面底部")
    scroll_to_bottom(page)
|
| 65 |
+
|
| 66 |
+
def fetch_addresses(page, url):
    """Navigate ``page`` to ``url`` and scrape listing addresses from it.

    Dismisses the cookie banner if present, simulates user behavior so
    lazy-loaded results render, then tries a list of progressively looser
    CSS selectors and returns the first non-empty batch of addresses.

    Returns:
        A (possibly empty) list of address strings.
    """
    try:
        page.goto(url, wait_until="networkidle", timeout=60000)
    except TimeoutError:
        # A partial page load is still worth scraping.
        print(f"Timeout while loading {url}. Continuing with partial page load.")

    try:
        page.wait_for_selector('button:has-text("Accept")', timeout=5000)
        page.click('button:has-text("Accept")')
        print("Clicked cookie consent button.")
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
        print("No cookie consent button found or unable to click it.")

    # Behave like a human before reading the DOM.
    simulate_user_behavior(page)

    addresses = []
    try:
        # Most-specific selector first; fall back to looser matches.
        selectors = [
            'h3[data-test="standard-tile__search-result__address"]',
            '.standard-tile__search-result__address',
            'h3[class*="address"]',
            'div[class*="address"]',
            'div[class*="listing-tile"] h3',
            'div[class*="listing-tile"] div[class*="address"]'
        ]

        for selector in selectors:
            address_elements = page.query_selector_all(selector)
            if address_elements:
                addresses = [element.inner_text().strip() for element in address_elements if element.inner_text().strip()]
                print(f"Found {len(addresses)} addresses using selector: {selector}")
                break

        if not addresses:
            # Dump diagnostics so selector drift is easy to spot in the logs.
            print(f"No address elements found on {url} using any of the selectors.")
            print("Page Title:", page.title())
            print("Current URL:", page.url)
            print("HTML content:", page.content()[:1000])
    except Exception as e:
        print(f"An error occurred while scraping {url}: {str(e)}")

    return addresses
|
| 109 |
+
|
| 110 |
+
def scrape_properties(main_url, max_pages):
    """Scrape up to ``max_pages`` paginated listing pages from ``main_url``.

    For each new address found, inserts it into Supabase and records it in
    Redis so later runs skip duplicates. The function's useful output is
    these side effects; ``all_addresses`` is accumulated but never returned.
    """
    redis_client = create_redis_client()  # Instantiate the Redis client
    all_addresses = []

    with sync_playwright() as p:
        # --no-sandbox / --disable-dev-shm-usage are common flags for
        # running Chromium inside containers.
        browser = p.chromium.launch(
            headless=True,
            args=[
                "--no-sandbox",
                "--disable-dev-shm-usage",
            ],
        )

        # A desktop Chrome user agent reduces the chance of bot detection.
        context = browser.new_context(
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        )
        page = context.new_page()
        page.on("dialog", handle_dialog)  # auto-accept any popup dialogs

        for page_num in range(1, max_pages + 1):
            url = f"{main_url}?page={page_num}"
            print(f"\nScraping page {page_num}: {url}")

            addresses = fetch_addresses(page, url)
            if addresses:
                all_addresses.extend(addresses)
                print(f"Found {len(addresses)} addresses on page {page_num}")
                print("Addresses found on this page:")
                for addr in addresses:
                    print(f" - {addr}")
                    if not check_real_estate_in_redis(redis_client, addr):
                        # Insert into Supabase (status assumed to be "for Sale").
                        insert_real_estate(addr, "for Sale")
                        # Record the address in Redis to avoid duplicates on later runs.
                        add_real_estate_to_redis(redis_client, addr)
                    else:
                        print(f"Address {addr} already exists in Redis. Skipping...")
            else:
                print(f"No addresses found on page {page_num}. Continuing to next page.")

            if page_num < max_pages:
                # Randomized delay between pages to avoid rate limiting.
                delay = random.uniform(5, 10)
                print(f"Waiting for {delay:.2f} seconds before next request...")
                time.sleep(delay)

        browser.close()
|
| 156 |
+
|
| 157 |
+
def main():
    """Entry point: scrape up to 500 pages of Auckland residential listings."""
    listing_url = "https://www.realestate.co.nz/residential/sale/auckland"
    page_limit = 500
    scrape_properties(listing_url, page_limit)


if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
supabase==2.15.1 # Python client library for Supabase
|
| 2 |
+
python-dotenv==1.0.0 # For loading environment variables from a .env file
|
| 3 |
+
requests==2.31.0 # HTTP library for making requests
|
| 4 |
+
upstash-redis==1.3.0 # Redis client for Upstash
|
| 5 |
+
python-dateutil==2.8.2 # For flexible date parsing
|
| 6 |
+
playwright>=1.33.0
|
| 7 |
+
# greenlet>=3.1.0
|
| 8 |
+
beautifulsoup4>=4.12.2 # BeautifulSoup HTML parser
|
| 9 |
+
|
utils/data_processing.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
|
| 3 |
+
def parse_event_date(date_str):
    """Parse a 'day month-name year' string (e.g. '12 March 2021').

    Returns a ``datetime.datetime`` on success, or ``None`` when the
    string does not match the expected format.
    """
    expected_format = "%d %B %Y"
    try:
        parsed = datetime.datetime.strptime(date_str, expected_format)
    except ValueError:
        return None  # None signals an unrecognized date format
    return parsed
|