"""Scraper for rental listings on rent.591.com.tw.

Searches a fixed region/section by default (Kaohsiung / Gushan, whole-flat,
2 rooms, elevator building), collects listing title/price/address/area/floor,
and saves the results as JSON and CSV under ``output/``.
"""
import json
import os
import re
import time
from typing import Dict, List, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager


class Rent591Scraper:
    """591 rental-site scraper."""

    def __init__(self):
        self.base_url = "https://rent.591.com.tw"
        # Browser-like headers reduce the chance of the site blocking us.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def setup_driver(self):
        """Create and return a headless Chrome WebDriver."""
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # run without a visible window
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument(f'--user-agent={self.headers["User-Agent"]}')

        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        return driver

    def get_csrf_token(self, driver) -> Optional[str]:
        """Fetch a CSRF token from the site's landing page.

        Tries the element named ``csrf-token`` first, then falls back to any
        cookie whose name contains "token".

        Returns:
            The token string, or None if none could be found.
        """
        try:
            driver.get("https://rent.591.com.tw/")
            time.sleep(2)

            # find_element raises when the element is missing; guard it so
            # the cookie fallback below is actually reachable (it was dead
            # code before — the raise jumped straight to the outer handler).
            try:
                token_element = driver.find_element(By.NAME, "csrf-token")
                if token_element:
                    return token_element.get_attribute("content")
            except Exception:
                pass  # no meta token — fall through to the cookie lookup

            for cookie in driver.get_cookies():
                if 'token' in cookie['name'].lower():
                    return cookie['value']
        except Exception as e:
            print(f"獲取token失敗: {e}")
        return None

    def scrape_rental_data(self, max_pages: int = 10,
                           search_params: Optional[Dict[str, str]] = None) -> List[Dict]:
        """Scrape rental listings page by page.

        Args:
            max_pages: Maximum number of result pages to fetch.
            search_params: 591 query parameters. Defaults to the original
                hard-coded search (Kaohsiung / Gushan, whole flat, 2 rooms,
                elevator building) for backward compatibility.

        Returns:
            A list of per-listing dicts (see ``extract_rental_info``).
        """
        driver = self.setup_driver()
        all_data: List[Dict] = []

        try:
            if search_params is None:
                search_params = {
                    'region': '17',   # Kaohsiung City
                    'section': '247', # Gushan District
                    'kind': '1',      # whole flat
                    'layout': '2',    # 2 rooms
                    'shape': '2',     # elevator building
                }
            params = dict(search_params)  # don't mutate the caller's dict

            for page in range(1, max_pages + 1):
                print(f"正在爬取第 {page} 頁...")

                params['page'] = page
                url = f"{self.base_url}/list?" + "&".join(
                    f"{k}={v}" for k, v in params.items())

                driver.get(url)
                time.sleep(3)  # wait for the page to load

                # Stop as soon as a page comes back empty.
                rental_items = driver.find_elements(By.CSS_SELECTOR, '.rent-item')
                if not rental_items:
                    print(f"第 {page} 頁沒有找到資料,停止爬取")
                    break

                page_data = self.parse_page_data(driver)
                all_data.extend(page_data)
                print(f"第 {page} 頁獲取 {len(page_data)} 筆資料")

                # Throttle requests to avoid an IP ban.
                time.sleep(2)
        except Exception as e:
            print(f"爬取資料時發生錯誤: {e}")
        finally:
            driver.quit()

        return all_data

    def parse_page_data(self, driver) -> List[Dict]:
        """Parse all listing items out of the currently loaded page."""
        page_data: List[Dict] = []
        try:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            rental_items = soup.find_all('div', class_='rent-item')

            for item in rental_items:
                try:
                    rental_info = self.extract_rental_info(item)
                    if rental_info:
                        page_data.append(rental_info)
                except Exception as e:
                    # One bad listing shouldn't abort the whole page.
                    print(f"解析單筆資料時發生錯誤: {e}")
                    continue
        except Exception as e:
            print(f"解析頁面資料時發生錯誤: {e}")
        return page_data

    def extract_rental_info(self, item) -> Optional[Dict]:
        """Extract one listing's fields from its BeautifulSoup element.

        Returns:
            A dict with title/price/address/area/floor/link/raw_info,
            or None if extraction failed.
        """
        try:
            title_elem = item.find('h3', class_='rent-item-title')
            title = title_elem.get_text(strip=True) if title_elem else "N/A"

            price_elem = item.find('div', class_='rent-item-price')
            price_text = price_elem.get_text(strip=True) if price_elem else "0"
            price = self.extract_price(price_text)

            address_elem = item.find('div', class_='rent-item-address')
            address = address_elem.get_text(strip=True) if address_elem else "N/A"

            info_elem = item.find('div', class_='rent-item-info')
            info_text = info_elem.get_text(strip=True) if info_elem else ""

            area = self.extract_area(info_text)
            floor = self.extract_floor(info_text)

            link_elem = item.find('a')
            link = self.base_url + link_elem.get('href') if link_elem and link_elem.get('href') else ""

            return {
                'title': title,
                'price': price,
                'address': address,
                'area': area,
                'floor': floor,
                'link': link,
                'raw_info': info_text,
            }
        except Exception as e:
            print(f"提取租屋資訊時發生錯誤: {e}")
            return None

    def extract_price(self, price_text: str) -> int:
        """Extract the rent as an integer; 0 when no number is found."""
        try:
            # Commas are stripped first, so a plain digit run is enough
            # (the old pattern searched for commas it had just removed).
            price_match = re.search(r'\d+', price_text.replace(',', ''))
            if price_match:
                return int(price_match.group())
        except (TypeError, ValueError):
            pass
        return 0

    def extract_area(self, info_text: str) -> float:
        """Extract the floor area in ping (坪); 0.0 when absent."""
        try:
            area_match = re.search(r'(\d+(?:\.\d+)?)\s*坪', info_text)
            if area_match:
                return float(area_match.group(1))
        except (TypeError, ValueError):
            pass
        return 0.0

    def extract_floor(self, info_text: str) -> str:
        """Extract the floor number as e.g. ``"3樓"``; "N/A" when absent."""
        try:
            floor_match = re.search(r'(\d+)樓', info_text)
            if floor_match:
                return floor_match.group(1) + '樓'
        except TypeError:
            pass
        return "N/A"

    def save_data(self, data: List[Dict], filename: str = "rental_data.json"):
        """Save the scraped data as JSON under ``output/<filename>``.

        Fixed: the original wrote to a hard-coded broken path and ignored
        the ``filename`` parameter entirely.
        """
        try:
            os.makedirs("output", exist_ok=True)
            path = os.path.join("output", filename)
            with open(path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            print(f"資料已儲存到 {path}")
        except Exception as e:
            print(f"儲存資料時發生錯誤: {e}")

    def to_dataframe(self, data: List[Dict]) -> pd.DataFrame:
        """Convert the listing dicts into a pandas DataFrame."""
        return pd.DataFrame(data)


if __name__ == "__main__":
    scraper = Rent591Scraper()

    print("開始爬取591租屋資料...")
    rental_data = scraper.scrape_rental_data(max_pages=5)

    if rental_data:
        print(f"總共爬取到 {len(rental_data)} 筆資料")

        # Save the raw JSON dump.
        scraper.save_data(rental_data)

        # Also save as CSV (utf-8-sig so Excel opens it correctly).
        os.makedirs("output", exist_ok=True)
        df = scraper.to_dataframe(rental_data)
        df.to_csv("output/rental_data.csv", index=False, encoding='utf-8-sig')

        print("資料爬取完成!")
        print(df.head())
    else:
        print("沒有爬取到任何資料")