NZLouislu commited on
Commit
3624bf2
·
1 Parent(s): 0ea8d23

Add code to fetch property data for Auckland, NZ.

Browse files
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ /config/__pycache__
2
+ /__pycache__
3
+ /.venv
4
+ .vscode
5
+ /.idx/
6
+ .env
7
+ shell.nix
config/config_test.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from datetime import datetime
3
+ from supabase_config import create_supabase_client
4
+ from redis_config import create_redis_client
5
+ from upstash_redis import Redis
6
+ # from redis import Redis
7
+
8
def test_redis_connection():
    """Smoke-test the Redis connection by writing and reading one key.

    Returns:
        True only when the write/read round trip succeeds, False otherwise.
        (Previously this returned True even when the read-back failed.)
    """
    try:
        # Connect to the environment-configured Upstash Redis instance.
        redis_client = Redis.from_env()

        # Round-trip a test key to prove both write and read work.
        redis_client.set('test_key', 'test_value')
        value = redis_client.get('test_key')
        if value:
            print("Redis connection successful, test key inserted.")
            return True
        print("Redis connection failed.")
        return False
    except Exception as e:
        print(f"Error connecting to Redis: {e}")
        print("Redis test failed.")
        return False
29
+
30
def test_supabase_connection():
    """Smoke-test Supabase by reading at most one row from 'properties'.

    Returns True when the client can be created and the query executes,
    False on any exception.
    """
    try:
        supabase_client = create_supabase_client()
        print("Successfully connected to Supabase.")

        # Pull a single row just to prove the table is reachable.
        response = supabase_client.from_('properties').select('*').limit(1).execute()

        if response.data:
            print("Successfully fetched data from 'properties' table:", response.data)
        else:
            print("Failed to fetch data from 'properties' table or table is empty.")
        return True
    except Exception as e:
        print(f"Error connecting to Supabase: {e}")
        print("Supabase test failed.")
        return False
49
+
50
def insert_property(supabase_client, property_data):
    """Insert one property row; return the generated id, or None on failure."""
    try:
        # Let the database generate the primary key.
        property_data.pop('id', None)

        # Normalize every monetary field that actually carries a value.
        money_fields = ('last_sold_price', 'capital_value', 'land_value', 'improvement_value')
        for name in money_fields:
            if property_data.get(name):
                property_data[name] = clean_price(property_data[name])

        response = supabase_client.table('properties').insert(property_data).execute()

        # Guard against both error-carrying and empty responses.
        if hasattr(response, 'error') and response.error:
            print(f"Failed to insert property: {response.error}")
            return None
        if not response.data:
            print("Failed to insert property: No data returned")
            return None

        print(f"Property inserted: {property_data['address']}")
        return response.data[0]['id']  # id generated by the database
    except Exception as e:
        print(f"Error inserting property: {str(e)}")
        return None
77
+
78
# Helper function to clean price values
def clean_price(price_str):
    """Normalize a price such as '$1,280,000' to a float.

    Mirrors config/supabase_config.clean_price for consistency: numbers
    pass through unchanged and None/unparsable input yields None (the
    original raised an uncaught AttributeError on non-string input).
    """
    if price_str is None:
        return None
    if isinstance(price_str, (int, float)):
        return price_str
    try:
        # Remove dollar sign and commas, then convert to float
        return float(price_str.replace('$', '').replace(',', '').strip())
    except ValueError:
        return None  # unparsable price
86
+
87
# Test inserting a property into the database
def test_insert_property():
    """Insert one hand-built sample property to exercise the insert path."""
    # Sample record whose price fields still carry '$'/',' and need cleaning.
    sample = {
        'address': '15 Agra Crescent, Khandallah, Wellington, 6035',
        'suburb': 'Khandallah',
        'city': 'Wellington',
        'postcode': '6035',
        'year_built': 1985,
        'bedrooms': 3,
        'bathrooms': 2,
        'car_spaces': 2,
        'floor_size': '150 sqm',
        'land_area': '500 sqm',
        'last_sold_price': '$1,280,000',   # needs cleaning
        'last_sold_date': '2023-08-01',
        'capital_value': '$1,000,000',     # needs cleaning
        'land_value': '$800,000',          # needs cleaning
        'improvement_value': '$200,000',   # needs cleaning
        'has_rental_history': False,
        'is_currently_rented': False,
        'status': 'For Sale',              # status column per the SQL definition
    }

    try:
        client = create_supabase_client()
        new_id = insert_property(client, sample)
        if new_id:
            print(f"Inserted property with generated ID: {new_id}")
    except Exception as e:
        print(f"Error during test insertion: {e}")
122
+
123
+
124
def main():
    """Run the Redis/Supabase connectivity and insertion smoke tests."""
    # print("Testing Redis connection first...")
    redis_ok = test_redis_connection()

    print("\nTesting Supabase connection now...")
    supabase_ok = test_supabase_connection()

    print("\nTesting Supabase property insertion...")
    test_insert_property()

    # The Redis result is currently informational only.
    # if redis_ok and supabase_ok:
    if supabase_ok:
        print("\nAll tests passed!")
    else:
        print("\nOne or more tests failed. Please check the above messages.")
139
+
140
# Run all connectivity tests when executed as a script.
if __name__ == '__main__':
    main()
config/redis_config.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # redis_config.py
2
+ import os
3
+ # from redis import Redis
4
+ from upstash_redis import Redis # Import Redis instead of UpstashRedis
5
+
6
# Redis connection configuration
def create_redis_client():
    """Build an Upstash Redis client from environment variables.

    Redis.from_env() reads the connection settings from the environment
    (presumably the Upstash REST URL/token pair — confirm against .env).
    """
    return Redis.from_env()
    # Local-Redis alternative kept for development:
    # return Redis(host='localhost', port=6379, db=0)
11
+
12
# Check if a property address exists in Redis
def check_property_in_redis(redis_client, address):
    """Return True when *address* has already been recorded in Redis."""
    cached = redis_client.get(address)
    return cached is not None
16
+
17
# Add a property address to Redis after insertion
def add_property_to_redis(redis_client, address):
    """Record *address* in Redis so later runs can skip it."""
    redis_client.set(address, 1)
20
+
21
# Check if a real estate property address exists in Redis
def check_real_estate_in_redis(redis_client, address):
    """Return True when a sale listing for *address* was already processed."""
    key = "real" + address
    return redis_client.get(key) is not None
24
+
25
# Add a real estate property address to Redis after insertion
def add_real_estate_to_redis(redis_client, address):
    """Mark a sale listing for *address* as processed."""
    key = "real" + address
    redis_client.set(key, 1)
28
+
29
# Check if a real estate rent property address exists in Redis
def check_real_estate_rent_in_redis(redis_client, address):
    # Use a distinct "rent" key prefix so rental listings are deduplicated
    # independently of sale listings (which use the "real" prefix above).
    # Previously both used "real", so a property listed for sale blocked
    # the same address from ever being recorded as a rental.
    return redis_client.get("rent" + address) is not None

# Add a real estate property rent address to Redis after insertion
def add_real_estate_rent_to_redis(redis_client, address):
    redis_client.set("rent" + address, 1)
config/supabase_config.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # supabase_config.py
2
+ from datetime import datetime
3
+ import os
4
+ from dotenv import load_dotenv
5
+ from supabase import create_client, Client
6
+
7
# Connect to Supabase
load_dotenv()  # loads the .env file from the project root by default

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
12
+
13
# Create a Supabase client
def create_supabase_client() -> Client:
    """Return a Supabase client built from SUPABASE_URL / SUPABASE_KEY.

    Raises:
        ValueError: if either environment variable is missing.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        raise ValueError("Supabase URL and API key must be provided")
    return create_client(SUPABASE_URL, SUPABASE_KEY)
18
+
19
# Insert property details into the properties table
def insert_property(supabase_client, property_data):
    """Insert one property row and return its generated id (None on failure).

    Strips any caller-supplied 'id' so the database can generate it and
    normalizes the monetary fields via clean_price() before inserting.
    """
    try:
        # Drop 'id' so it is auto-generated by the database.
        property_data.pop('id', None)

        # Normalize all monetary fields in one pass (matches config_test.py).
        for field in ('last_sold_price', 'capital_value', 'land_value', 'improvement_value'):
            if property_data.get(field):
                property_data[field] = clean_price(property_data[field])

        # Insert the property data (without 'id')
        response = supabase_client.table('properties').insert(property_data).execute()

        # supabase-py v2 responses have no 'error' attribute (failures raise
        # instead), so probe defensively rather than touching response.error —
        # the unconditional access raised AttributeError on every successful
        # insert and sent it into the except path.
        if getattr(response, 'error', None):
            print(f"Failed to insert property: {response.error}")
            return None
        if not response.data:
            print("Failed to insert property: No data returned")
            return None

        # If successful, return the ID of the inserted property
        print(f"Property inserted: {property_data['address']}")
        return response.data[0]['id']

    except Exception as e:
        print(f"Error inserting property: {str(e)}")
        return None
52
+
53
def clean_price(price_str):
    """Coerce a price like '$1,280,000' to a float; numbers pass through."""
    if price_str is None:
        return None
    if isinstance(price_str, (int, float)):
        return price_str
    # Strip currency punctuation before conversion.
    stripped = price_str.replace('$', '').replace(',', '').strip()
    try:
        return float(stripped)
    except ValueError:
        return None
62
+
63
def clean_property_data(property_data):
    """Normalize every monetary field of *property_data* in place; return it."""
    for field in ('last_sold_price', 'capital_value', 'land_value', 'improvement_value'):
        if field in property_data:
            property_data[field] = clean_price(property_data[field])
    return property_data
69
+
70
def parse_date(date_str):
    """Parse *date_str* against the known site formats.

    Returns a datetime.date, or None (with a printed warning) when the
    string is empty or matches none of the formats.
    """
    if not date_str:
        return None
    for fmt in ('%d %b %Y', '%Y', '%b %Y', '%d/%m/%Y', '%Y-%m-%d'):
        try:
            return datetime.strptime(date_str, fmt).date()
        except ValueError:
            continue
    print(f"Warning: Unable to parse date '{date_str}'")
    return None
81
+
82
def format_date_for_json(date_obj):
    """Render a date as an ISO-8601 string; None passes through unchanged."""
    return None if date_obj is None else date_obj.isoformat()
86
+
87
def insert_property_and_history(property_data, history_data):
    """Insert a property row plus its history events into Supabase.

    The history insert is skipped entirely when the property row cannot
    be created (e.g. duplicate property_url).
    """
    supabase = create_supabase_client()

    # Guard against upstream scrape failures handing us None/empty data —
    # previously this crashed inside clean_property_data.
    if not property_data:
        print("ℹ️ No property data supplied; nothing to insert.")
        return

    # Clean before the try block so the except handler can always
    # reference cleaned_property_data safely (it was previously unbound
    # there if cleaning itself raised).
    cleaned_property_data = clean_property_data(property_data)

    try:
        # Insert the property row.
        response = supabase.table('properties').insert(cleaned_property_data).execute()

        if response.data:
            property_id = response.data[0]['id']
            print(f"✅ Property inserted successfully. ID: {property_id}")
        else:
            print(f"⚠️ Failed to insert property. Maybe already exists. URL: {cleaned_property_data.get('property_url')}")
            return  # do not insert history without a parent row

    except Exception as e:
        error_str = str(e).lower()
        if "duplicate key" in error_str or "unique constraint" in error_str:
            print(f"🔁 Duplicate property skipped (URL: {cleaned_property_data.get('property_url')})")
        else:
            print(f"❌ Unexpected error during property insert: {e}")
        return

    # Insert each history event linked to the new property row.
    if history_data and isinstance(history_data, list):
        for event in history_data:
            history_entry = {
                'property_id': property_id,
                'event_description': event.get('event_description', ''),
                'event_date': format_date_for_json(parse_date(event.get('event_date'))),
                'interval_since_last_event': event.get('event_interval', '')
            }
            if history_entry['event_date'] is not None:
                try:
                    history_response = supabase.table('property_history').insert(history_entry).execute()
                    if not history_response.data:
                        print(f"⚠️ Failed to insert history: {event}")
                except Exception as e:
                    print(f"❌ Error inserting history: {str(e)}")
                    print(f"⏩ Skipped history entry: {event}")
            else:
                print(f"🕒 Skipped invalid date entry: {event}")
        print("📜 Property history insertion completed.")
    else:
        print("ℹ️ No history data to insert.")
133
+
134
+
135
def insert_real_estate(address, status):
    """Insert a sale listing (address, status) into the real_estate table."""
    try:
        supabase = create_supabase_client()
        record = {
            "address": address,
            "status": status
        }
        response = supabase.table('real_estate').insert(record).execute()
        if response.data:
            print(f"Inserted {address} into Supabase successfully.")
        else:
            print(f"Failed to insert {address} into Supabase.")
    except Exception as e:
        print(f"Error inserting {address} into Supabase: {str(e)}")
149
+
150
def insert_real_estate_rent(address, status):
    """Insert a rental listing (address, status) into real_estate_rent."""
    try:
        supabase = create_supabase_client()
        record = {
            "address": address,
            "status": status
        }
        response = supabase.table('real_estate_rent').insert(record).execute()
        if response.data:
            print(f"Inserted {address} into Supabase successfully.")
        else:
            print(f"Failed to insert {address} into Supabase.")
    except Exception as e:
        print(f"Error inserting {address} into Supabase: {str(e)}")
fetch_property_details.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+
4
# Fixed city and suburb used by this standalone script
CITY = "Porirua City"
SUBURB = "Aotea"
7
+
8
# Step 2: Fetch details for each property
def fetch_property_details(property_url, title):
    """Scrape one property page and print its details.

    Args:
        property_url: full URL of the property detail page.
        title: listing title; address line 1 and the postcode are parsed
            out of it (postcode assumed after the last comma — confirm).

    Note: this variant only prints the scraped fields and returns
    nothing; the version in properties.py returns the data instead.
    """
    print(f"\nFetching details for {property_url}...")
    response = requests.get(property_url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extracting property details
        address_line1 = title.split(',')[0].strip()  # street part of the title
        address_line2 = soup.find('span', {'testid': 'addressLine2'}).get_text(strip=True) if soup.find('span', {'testid': 'addressLine2'}) else 'N/A'
        postcode = title.split(',')[-1].strip()  # text after the last comma

        # Combine address_line1 and address_line2 into a single address field
        address = f"{address_line1}, {address_line2}"

        suburb = SUBURB
        city = CITY

        # Numeric attributes: None when the markup is missing or non-numeric.
        try:
            year_built = int(soup.find('div', {'testid': 'yearBuiltValue'}).get_text(strip=True))
        except (AttributeError, ValueError):
            year_built = None

        try:
            bedrooms = int(soup.find('span', {'testid': 'bed'}).get_text(strip=True))
        except (AttributeError, ValueError):
            bedrooms = None

        try:
            bathrooms = int(soup.find('span', {'testid': 'bath'}).get_text(strip=True))
        except (AttributeError, ValueError):
            bathrooms = None

        try:
            car_spaces = int(soup.find('span', {'testid': 'car'}).get_text(strip=True))
        except (AttributeError, ValueError):
            car_spaces = None

        # Size attributes: keep the raw site text; 'N/A' when missing.
        try:
            floor_size = soup.find('span', class_='floor PropertyAttributes_attribute__3bkWm').get_text(strip=True)
        except AttributeError:
            floor_size = 'N/A'

        try:
            land_area = soup.find('span', class_='land PropertyAttributes_attribute__3bkWm').get_text(strip=True)
        except AttributeError:
            land_area = 'N/A'

        last_sold_price, last_sold_date = parse_sold_details(soup)

        capital_value = extract_value(soup, 'Capital Value')
        land_value = extract_value(soup, 'Land Value')
        improvement_value = extract_value(soup, 'Improvement Value')

        # Fetch rental history (this is where rental status is determined)
        rental_history = fetch_rental_history(soup)

        # Storing and printing the property data
        property_data = {
            'property_url': property_url,
            'address': address,
            'suburb': suburb,
            'city': city,
            'postcode': postcode,
            'year_built': year_built,
            'bedrooms': bedrooms,
            'bathrooms': bathrooms,
            'car_spaces': car_spaces,
            'floor_size': floor_size,
            'land_area': land_area,
            'last_sold_price': last_sold_price,
            'last_sold_date': last_sold_date,
            'capital_value': capital_value,
            'land_value': land_value,
            'improvement_value': improvement_value,
            'rental_history': rental_history['history'],
            'has_rental_history': rental_history['has_rental_history'],
            'is_currently_rented': rental_history['is_currently_rented']
        }

        # Print the property details
        for key, value in property_data.items():
            print(f"{key}: {value}")

    else:
        print(f"Failed to fetch details for: {property_url}")
95
+
96
# Step 3: Parse sold details (remove 'Last Sold on' and keep the correct date and price)
def parse_sold_details(soup):
    """Extract (price, date) from a 'Last Sold on <date> for <price>' tag.

    Returns (None, None) when the tag or the expected phrasing is absent.
    """
    node = soup.find('strong', {'testid': 'lastSoldAttribute'})
    if not node:
        return None, None

    text = node.get_text(strip=True)
    if 'Last Sold on' not in text or 'for' not in text:
        return None, None

    # Date sits between the 'Last Sold on' prefix and the first 'for'.
    sold_date = text.replace('Last Sold on', '').split('for')[0].strip()
    # Price is everything after the last 'for'.
    sold_price = text.split('for')[-1].strip()
    return sold_price, sold_date
113
+
114
+
115
# Helper to extract values like Capital Value, Land Value, Improvement Value
def extract_value(soup, value_type):
    """Return the text of the div following the *value_type* label, or 'N/A'."""
    try:
        label = soup.find('div', string=value_type)
        return label.find_next_sibling('div').get_text(strip=True)
    except AttributeError:
        # Label or sibling missing on the page.
        return 'N/A'
122
+
123
# Fetch rental history function
def fetch_rental_history(soup):
    """Collect 'Listed for Rent at ...' events from the property timeline.

    Returns a dict with the rental entries (or a placeholder string when
    none exist) plus two flags derived from their presence.
    """
    entries = []
    timeline = soup.find_all('div', class_='d-flex flex-row w-100 align-items-center pr-3 mb-2')
    for row in timeline:
        desc_tag = row.find('strong', {'testid': lambda x: x and x.startswith('pt-description')})
        if not desc_tag:
            continue
        text = desc_tag.get_text(strip=True)
        if "Listed for Rent at" in text:
            price = text.split('Listed for Rent at')[-1].strip()
            entries.append(f"Rented for {price}")

    # Heuristic: any rent listing flips both flags, matching the original.
    rented = bool(entries)
    return {
        'history': entries if entries else "No rental history available",
        'has_rental_history': rented,
        'is_currently_rented': rented
    }
fetch_property_links.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import time
4
+ from requests.adapters import HTTPAdapter
5
+ from urllib3.util.retry import Retry
6
+
7
def fetch_property_links(main_url, page=1, max_retries=3):
    """Fetch property-card links and titles from one search-results page.

    Args:
        main_url: suburb results URL; '?page=N' is appended when N > 1.
        page: 1-based page number.
        max_retries: retry budget for transient HTTP 5xx responses.

    Returns:
        (property_links, titles) — parallel lists, both empty on failure.
    """
    property_links = []
    titles = []

    url = f"{main_url}?page={page}" if page > 1 else main_url
    print(f"Fetching page {page}...")

    # Create a session with a retry policy for transient server errors.
    session = requests.Session()
    retries = Retry(total=max_retries,
                    backoff_factor=0.1,
                    status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))

    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()  # raises when the status code is not 2xx

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Collect every property-card link on the page.
            for link in soup.find_all('a', class_='PropertyCard_PropertyCardLink__icVIl'):
                full_link = "https://propertyvalue.co.nz" + link['href']
                property_links.append(full_link)
                titles.append(link['title'])  # the card's title attribute

            print(f"\nFound {len(property_links)} properties on page {page}:")
            # Uncomment to print each title:
            # for title in titles:
            #     print(title)

        else:
            print(f"Unexpected status code {response.status_code} for URL: {url}")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {page}: {e}")

    finally:
        time.sleep(2)  # 2-second delay to avoid overloading the server

    return property_links, titles
49
+
50
# Example usage
if __name__ == "__main__":
    main_url = "https://propertyvalue.co.nz/wellington/wellington-city/khandallah-6035/200020"
    links, titles = fetch_property_links(main_url)
    print(f"Total properties found: {len(links)}")
main.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from fetch_property_links import fetch_property_links
5
+ from properties import fetch_property_details
6
+ # from config.redis_config import create_redis_client, check_property_in_redis, add_property_to_redis
7
+ from config.supabase_config import insert_property_and_history
8
+
9
# Main function to scrape properties

def fetch_suburbs(url, city):
    """
    Fetches the list of suburbs and their links from a given URL, then
    scrapes every suburb's property pages.

    Args:
        url: city-level page listing suburb links.
        city: city name stored with every scraped property.
    """
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        suburb_links_container = soup.find('div', {'testid': 'suburbLinksContainer'})
        if suburb_links_container:
            suburb_links = suburb_links_container.find_all('a')
            for link in suburb_links:
                suburb_name = link.get_text(strip=True)
                suburb_link = "https://propertyvalue.co.nz" + link.get('href')
                print(f"Suburb: {suburb_name}, Link: {suburb_link}")

                # Default to one page; raised only when pagination is found.
                # (Previously max_page could be unbound — or stale from the
                # previous suburb — whenever the 'of <N>' label was missing.)
                max_page = 1

                # Fetch the page content for the suburb link
                suburb_response = requests.get(suburb_link)
                print(f"  Status code for {suburb_name}: {suburb_response.status_code}")
                if suburb_response.status_code == 200:
                    suburb_soup = BeautifulSoup(suburb_response.content, 'html.parser')
                    # Find the pagination element using role='group' and class_='btn-group'
                    pagination = suburb_soup.find('div', {'role': 'group', 'class': 'btn-group'})
                    if pagination:
                        # The label after the 'of' label holds the max page number.
                        of_label = pagination.find('label', string='of')
                        if of_label and of_label.find_next_sibling('label'):
                            max_page = int(of_label.find_next_sibling('label').get_text(strip=True))
                            print(f"Suburb: {suburb_name}, Max Pages: {max_page}")
                        else:
                            print(f"  No page numbers found for {suburb_name}")
                    else:
                        print(f"  No pagination element found for {suburb_name}")

                    scrape_properties(suburb_link, max_page, city, suburb_name)
+ scrape_properties(suburb_link, max_page, city, suburb_name)
48
+
49
def scrape_properties(main_url, max_pages, city, suburb):
    """Walk pages 1..max_pages of *main_url* and persist every property."""
    # redis_client = create_redis_client()  # Redis dedupe currently disabled

    for page_number in range(1, max_pages + 1):
        # Fetch property links and titles for the current page.
        links, titles = fetch_property_links(main_url, page_number)

        for property_url, title in zip(links, titles):
            print(f"Fetching details for: {title}")

            # Redis duplicate check kept for when dedupe is re-enabled:
            # if check_property_in_redis(redis_client, title):
            #     print(f"Property {title} already exists in Redis. Skipping...")
            #     continue

            # Scrape the detail page, then persist property + history rows.
            property_data, history_data = fetch_property_details(property_url, title, city, suburb)
            insert_property_and_history(property_data, history_data)

            # add_property_to_redis(redis_client, title)
            # time.sleep(0.5)  # delay to avoid overloading the server
+
76
+ # Run the scraper
77
+ if __name__ == "__main__":
78
+ city = "Auckland - City"
79
+ fetch_suburbs("https://www.propertyvalue.co.nz/auckland/auckland/7", city)
properties.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from property_history import fetch_property_history
4
+ from config.supabase_config import insert_property_and_history # Assuming this function exists
5
+
6
+ # 固定的城市和 suburb
7
+ # CITY = "Porirua City"
8
+ # SUBURB = "Aotea"
9
+
10
# Fetch property details
def fetch_property_details(property_url, title, city, suburb):
    """Scrape a property page into (property_data, history_data).

    Args:
        property_url: full URL of the property detail page.
        title: listing title; address line 1 and the postcode are parsed
            out of it (postcode assumed after the last comma — confirm).
        city: city label to store with the record.
        suburb: suburb label to store with the record.

    Returns:
        (property_data dict, history data) on success; (None, None) when
        the page request does not return HTTP 200.
    """
    print(f"\nFetching details for {property_url}...")
    response = requests.get(property_url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extracting property details
        address_line1 = title.split(',')[0].strip()  # street part of the title
        address_line2 = soup.find('span', {'testid': 'addressLine2'}).get_text(strip=True) if soup.find('span', {'testid': 'addressLine2'}) else 'N/A'
        postcode = title.split(',')[-1].strip()  # text after the last comma

        # Combine address_line1 and address_line2 into a single address field
        address = f"{address_line1}, {address_line2}"

        # suburb = SUBURB
        # city = CITY

        # Numeric attributes: None when the markup is missing or non-numeric.
        try:
            year_built = int(soup.find('div', {'testid': 'yearBuiltValue'}).get_text(strip=True))
        except (AttributeError, ValueError):
            year_built = None

        try:
            bedrooms = int(soup.find('span', {'testid': 'bed'}).get_text(strip=True))
        except (AttributeError, ValueError):
            bedrooms = None

        try:
            bathrooms = int(soup.find('span', {'testid': 'bath'}).get_text(strip=True))
        except (AttributeError, ValueError):
            bathrooms = None

        try:
            car_spaces = int(soup.find('span', {'testid': 'car'}).get_text(strip=True))
        except (AttributeError, ValueError):
            car_spaces = None

        # Size attributes: keep the raw site text; 'N/A' when missing.
        try:
            floor_size = soup.find('span', class_='floor PropertyAttributes_attribute__3bkWm').get_text(strip=True)
        except AttributeError:
            floor_size = 'N/A'

        try:
            land_area = soup.find('span', class_='land PropertyAttributes_attribute__3bkWm').get_text(strip=True)
        except AttributeError:
            land_area = 'N/A'

        last_sold_price, last_sold_date = parse_sold_details(soup)

        capital_value = extract_value(soup, 'Capital Value')
        land_value = extract_value(soup, 'Land Value')
        improvement_value = extract_value(soup, 'Improvement Value')

        # Fetch rental history from property_history.py
        rental_history = fetch_property_history(soup)

        # Prepare property data for insertion into Supabase
        property_data = {
            'property_url': property_url,
            'address': address,
            'suburb': suburb,
            'city': city,
            'postcode': postcode,
            'year_built': year_built,
            'bedrooms': bedrooms,
            'bathrooms': bathrooms,
            'car_spaces': car_spaces,
            'floor_size': floor_size,
            'land_area': land_area,
            'last_sold_price': last_sold_price,
            'last_sold_date': last_sold_date,
            'capital_value': capital_value,
            'land_value': land_value,
            'improvement_value': improvement_value,
            'has_rental_history': rental_history['has_rental_history'],
            'is_currently_rented': rental_history['is_currently_rented']
        }

        # Prepare history data for insertion into Supabase
        history_data = rental_history['history']

        return property_data, history_data  # Return the data for insertion

    else:
        print(f"Failed to fetch details for: {property_url}")
        return None, None
98
+
99
# Step 3: Parse sold details
def parse_sold_details(soup):
    """Return (price_as_float_or_None, date_string) from the last-sold tag.

    Expects text shaped like 'Last Sold on 1 Aug 2023 for $1,280,000';
    yields (None, None) when the tag or the expected phrasing is absent.
    """
    tag = soup.find('strong', {'testid': 'lastSoldAttribute'})
    if not tag:
        return None, None

    text = tag.get_text(strip=True)
    if 'for' not in text or 'on' not in text:
        return None, None

    # Price: text after the last 'for', stripped of currency punctuation.
    raw_price = text.split('for')[-1].strip().replace('$', '').replace(',', '')
    try:
        price = float(raw_price)
    except ValueError:
        price = None  # unparsable price

    # Date: between the 'Last Sold on' prefix (when present) and 'for'.
    if 'Last Sold on' in text:
        date_part = text.replace('Last Sold on', '').split('for')[0].strip()
    else:
        date_part = text.split('on')[-1].strip()

    return price, date_part
123
+
124
+
125
+ # Helper to extract values like Capital Value, Land Value, Improvement Value
126
+ def extract_value(soup, value_type):
127
+ try:
128
+ value = soup.find('div', string=value_type).find_next_sibling('div').get_text(strip=True)
129
+ return value
130
+ except AttributeError:
131
+ return 'N/A'
property_history.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+
3
# Fetch property history (like sale history, rental history)
def fetch_property_history(soup):
    """Parse the timeline rows of a property page into event dicts.

    Each event carries a date, a description and the interval since the
    previous event; any rent-related description flips the two rental
    flags in the returned dict.
    """
    events = []
    rented = False

    rows = soup.find_all('div', class_='d-flex flex-row w-100 align-items-center pr-3 mb-2')
    for row in rows:
        # Date: month/day + year when both exist, else year only.
        day_tag = row.find('div', {'testid': lambda x: x and x.startswith('pt-monthDay')})
        year_tag = row.find('div', {'testid': lambda x: x and x.startswith('pt-year')})
        if day_tag and year_tag:
            when = f"{day_tag.get_text(strip=True)} {year_tag.get_text(strip=True)}"
        elif year_tag:
            when = year_tag.get_text(strip=True)
        else:
            when = "Unknown date"

        # Event description.
        desc_tag = row.find('strong', {'testid': lambda x: x and x.startswith('pt-description')})
        what = desc_tag.get_text(strip=True) if desc_tag else "No description"

        # Interval since the previous event.
        interval_tag = row.find('div', {'testid': lambda x: x and x.startswith('pt-interval')})
        gap = interval_tag.get_text(strip=True) if interval_tag else "No interval info"

        events.append({
            'event_date': when,
            'event_description': what,
            'event_interval': gap
        })

        # Heuristic from the original: any rent event ⇒ treated as rented.
        if "Listed for Rent at" in what or "Rented for" in what:
            rented = True

    # Print the collected history for visibility while scraping.
    print("----------------")
    print("Property History")
    print("----------------")
    for item in events:
        print(f"Date: {item['event_date']}")
        print(f"Description: {item['event_description']}")
        print(f"Interval: {item['event_interval']}")
        print("----------------")

    return {
        'history': events if events else "No rental history available",
        'has_rental_history': rented,
        'is_currently_rented': rented
    }
61
+
62
+ # Example usage (for testing purposes):
63
+ # if __name__ == "__main__":
64
+ # sample_html = """<your HTML content here>"""
65
+ # soup = BeautifulSoup(sample_html, 'html.parser')
66
+ # fetch_property_history(soup)
real_estate.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from playwright.sync_api import sync_playwright, TimeoutError
2
+ import time
3
+ import random
4
+
5
+ from config.redis_config import add_real_estate_to_redis, check_real_estate_in_redis, create_redis_client
6
+ from config.supabase_config import insert_real_estate
7
+
8
def handle_dialog(dialog):
    """Log a browser dialog's message, then dismiss it by accepting."""
    message = dialog.message
    print(f"Dialog message: {message}")
    dialog.accept()
11
+
12
def scroll_to_bottom(page):
    """Scroll the page to the bottom repeatedly until the document height
    stops growing, or a pagination control appears.

    Assumes `page` is a Playwright Page -- TODO confirm at call sites.
    """
    print("开始模拟鼠标下滑操作...")
    previous_height = page.evaluate("document.body.scrollHeight")
    while True:
        print(f" - 当前页面高度: {previous_height},继续下滑...")
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        # Give lazily-loaded content a moment to render before re-measuring.
        time.sleep(random.uniform(1, 2))
        current_height = page.evaluate("document.body.scrollHeight")
        if current_height == previous_height:
            print(" - 已到达页面底部")
            break
        previous_height = current_height

        # Stop early once a page-number navigation shows up.
        has_pagination = (
            page.query_selector('nav[aria-label="Pagination"]')
            or page.query_selector('div[class*="pagination"]')
        )
        if has_pagination:
            print(" - 检测到页码导航,停止滚动")
            break
29
+
30
def simulate_user_behavior(page):
    """Mimic a human browsing a search-results page to reduce the chance
    of being flagged as a bot.

    Sequence: scroll to the bottom, hover over up to three randomly
    chosen property cards, do several randomized partial scrolls, then
    scroll to the bottom once more.

    Args:
        page: Playwright Page already navigated to the results URL.
    """
    scroll_to_bottom(page)

    # Hover over a few random property cards; try selectors in order and
    # use the first one that matches anything.
    print("模拟查看房产卡片...")
    card_selectors = [
        'div[class*="listing-tile"]',
        'div[class*="property-card"]',
        'div[class*="search-result"]'
    ]
    for selector in card_selectors:
        cards = page.query_selector_all(selector)
        if cards:
            for _ in range(min(3, len(cards))):
                card = random.choice(cards)
                try:
                    card.scroll_into_view_if_needed()
                    card.hover()
                    print(f" - 悬停在一个房产卡片上")
                    time.sleep(random.uniform(0.5, 1.5))
                except Exception:
                    # BUG FIX: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit. Hovering is best-effort,
                    # so only ordinary exceptions are ignored.
                    pass
            break

    # Extra randomized partial scrolls to look organic.
    print("模拟额外的滚动操作")
    for i in range(10):
        scroll_distance = random.randint(500, 1500)
        page.evaluate(f"window.scrollBy(0, {scroll_distance})")
        print(f" - 向下滚动 {scroll_distance} 像素")
        time.sleep(random.uniform(1, 2))

    # Scroll all the way down once more before the caller scrapes.
    print("再次滚动到页面底部")
    scroll_to_bottom(page)
65
+
66
def fetch_addresses(page, url):
    """Navigate to `url` and scrape listing addresses from the page.

    Dismisses the cookie-consent banner if present, simulates user
    behavior, then tries a list of CSS selectors in order and returns
    the stripped inner texts matched by the first selector that yields
    any elements.

    Args:
        page: Playwright Page to drive.
        url: Search-results URL to load.

    Returns:
        list[str]: scraped addresses; empty when nothing matched or
        scraping failed.
    """
    try:
        page.goto(url, wait_until="networkidle", timeout=60000)
    except TimeoutError:
        # Playwright's TimeoutError (imported at the top of the file);
        # a partial page may still contain usable listings.
        print(f"Timeout while loading {url}. Continuing with partial page load.")

    try:
        page.wait_for_selector('button:has-text("Accept")', timeout=5000)
        page.click('button:has-text("Accept")')
        print("Clicked cookie consent button.")
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. The banner is optional, so only
        # ordinary failures fall through to the message below.
        print("No cookie consent button found or unable to click it.")

    # Behave like a human so the site is less likely to block us.
    simulate_user_behavior(page)

    addresses = []
    try:
        # Selectors ordered from most to least specific.
        selectors = [
            'h3[data-test="standard-tile__search-result__address"]',
            '.standard-tile__search-result__address',
            'h3[class*="address"]',
            'div[class*="address"]',
            'div[class*="listing-tile"] h3',
            'div[class*="listing-tile"] div[class*="address"]'
        ]

        for selector in selectors:
            address_elements = page.query_selector_all(selector)
            if address_elements:
                addresses = [element.inner_text().strip() for element in address_elements if element.inner_text().strip()]
                print(f"Found {len(addresses)} addresses using selector: {selector}")
                break

        if not addresses:
            # Dump diagnostics so selectors can be adjusted when the
            # site's markup changes.
            print(f"No address elements found on {url} using any of the selectors.")
            print("Page Title:", page.title())
            print("Current URL:", page.url)
            print("HTML content:", page.content()[:1000])
    except Exception as e:
        print(f"An error occurred while scraping {url}: {str(e)}")

    return addresses
109
+
110
def scrape_properties(main_url, max_pages):
    """Scrape listing addresses from paginated search results and persist
    new ones.

    For each page, addresses not yet recorded in Redis are inserted into
    Supabase and then added to Redis so subsequent runs skip them.

    Args:
        main_url: Base search URL without the `?page=` query parameter.
        max_pages: Number of result pages to visit, starting at page 1.
    """
    redis_client = create_redis_client()  # Instantiate the Redis client
    all_addresses = []

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            args=[
                "--no-sandbox",
                "--disable-dev-shm-usage",
            ],
        )

        # BUG FIX: browser.close() previously ran only on the happy path;
        # any exception while scraping leaked the browser process. The
        # try/finally guarantees cleanup.
        try:
            context = browser.new_context(
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            )
            page = context.new_page()
            page.on("dialog", handle_dialog)

            for page_num in range(1, max_pages + 1):
                url = f"{main_url}?page={page_num}"
                print(f"\nScraping page {page_num}: {url}")

                addresses = fetch_addresses(page, url)
                if addresses:
                    all_addresses.extend(addresses)
                    print(f"Found {len(addresses)} addresses on page {page_num}")
                    print("Addresses found on this page:")
                    for addr in addresses:
                        print(f" - {addr}")
                        if not check_real_estate_in_redis(redis_client, addr):
                            # Insert into Supabase, then record in Redis to
                            # avoid duplicates on later runs. Status is
                            # hard-coded -- NOTE(review): confirm "for Sale"
                            # capitalization matches the table's convention.
                            insert_real_estate(addr, "for Sale")
                            add_real_estate_to_redis(redis_client, addr)
                        else:
                            print(f"Address {addr} already exists in Redis. Skipping...")
                else:
                    print(f"No addresses found on page {page_num}. Continuing to next page.")

                if page_num < max_pages:
                    # Randomized politeness delay between page requests.
                    delay = random.uniform(5, 10)
                    print(f"Waiting for {delay:.2f} seconds before next request...")
                    time.sleep(delay)
        finally:
            browser.close()
156
+
157
def main():
    """Entry point: scrape Auckland residential for-sale listings."""
    search_url = "https://www.realestate.co.nz/residential/sale/auckland"
    scrape_properties(search_url, max_pages=500)


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ supabase==2.15.1 # Python client library for Supabase
2
+ python-dotenv==1.0.0 # For loading environment variables from a .env file
3
+ requests==2.31.0 # HTTP library for making requests
4
+ upstash-redis==1.3.0 # Redis client for Upstash
5
+ python-dateutil==2.8.2 # For flexible date parsing
6
+ playwright>=1.33.0
7
+ # greenlet>=3.1.0
8
+ beautifulsoup4>=4.12.2 # BeautifulSoup HTML parser
9
+
utils/data_processing.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+
3
def parse_event_date(date_str):
    """Parse a scraped property-history date string into a datetime.

    Accepts both full English month names ("14 June 2023") and, as a
    backward-compatible generalization, abbreviated ones ("14 Jun 2023"),
    since scraped timeline text may use either form.

    Args:
        date_str: Date text scraped from the page, e.g. "14 June 2023".

    Returns:
        datetime.datetime on success, or None when the string matches no
        supported format (the original contract for bad input).
    """
    for fmt in ("%d %B %Y", "%d %b %Y"):
        try:
            return datetime.datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    return None  # unrecognized date format