File size: 3,880 Bytes
3624bf2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import time
import requests
from bs4 import BeautifulSoup
from fetch_property_links import fetch_property_links
from properties import fetch_property_details
# from config.redis_config import create_redis_client, check_property_in_redis, add_property_to_redis
from config.supabase_config import insert_property_and_history

# Main function to scrape properties

def fetch_suburbs(url, city):
    """
    Fetch the list of suburbs and their links from a city landing page,
    then scrape every suburb's property listing pages.

    Args:
        url: City landing page that contains the suburb links container.
        city: City label, passed through unchanged to scrape_properties.

    Notes:
        Looks for a ``<div testid="suburbLinksContainer">`` — presumably the
        site's test-id attribute; verify against the live markup if parsing
        breaks (TODO confirm it is not ``data-testid``).
    """
    response = requests.get(url, timeout=30)  # timeout so a dead server can't hang the scraper
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        suburb_links_container = soup.find('div', {'testid': 'suburbLinksContainer'})
        if suburb_links_container:
            suburb_links = suburb_links_container.find_all('a')
            for link in suburb_links:
                suburb_name = link.get_text(strip=True)
                suburb_link = "https://propertyvalue.co.nz" + link.get('href')
                print(f"Suburb: {suburb_name}, Link: {suburb_link}")

                # Fetch the page content for the suburb link
                suburb_response = requests.get(suburb_link, timeout=30)
                print(f"  Status code for {suburb_name}: {suburb_response.status_code}")
                if suburb_response.status_code == 200:
                    suburb_soup = BeautifulSoup(suburb_response.content, 'html.parser')
                    # BUG FIX: max_page must be initialized per suburb. Previously,
                    # when pagination existed but the "of" label was missing, the
                    # call below used an unbound (or stale) max_page -> NameError
                    # on the first suburb, wrong page count on later ones.
                    max_page = 1  # default to a single page unless pagination says otherwise
                    # Pagination is rendered as a button group (role='group')
                    pagination = suburb_soup.find('div', {'role': 'group', 'class': 'btn-group'})
                    if pagination:
                        # The max page number is the label immediately after the "of" label
                        of_label = pagination.find('label', string='of')
                        if of_label and of_label.find_next_sibling('label'):
                            max_page = int(of_label.find_next_sibling('label').get_text(strip=True))
                            print(f"Suburb: {suburb_name}, Max Pages: {max_page}")
                        else:
                            print(f"  No page numbers found for {suburb_name}")
                    else:
                        print(f"  No pagination element found for {suburb_name}")

                    scrape_properties(suburb_link, max_page, city, suburb_name)

def scrape_properties(main_url, max_pages, city, suburb):
    """
    Scrape every property on pages 1..max_pages of a suburb listing and
    persist each property, together with its history, to Supabase.

    Args:
        main_url: Base listing URL for the suburb.
        max_pages: Number of listing pages to walk (inclusive).
        city: City label stored with each property record.
        suburb: Suburb label stored with each property record.
    """
    for page_number in range(1, max_pages + 1):
        # One page at a time: URLs plus their display titles, in lockstep.
        links, names = fetch_property_links(main_url, page_number)

        for url, name in zip(links, names):
            print(f"Fetching details for: {name}")

            # Pull the full detail record and its history, then store both.
            details, history = fetch_property_details(url, name, city, suburb)
            insert_property_and_history(details, history)

# Script entry point: kick off a full scrape of Auckland City suburbs.
if __name__ == "__main__":
    target_city = "Auckland - City"
    fetch_suburbs("https://www.propertyvalue.co.nz/auckland/auckland/7", target_city)