"""Scraper for rental listings on rent.591.com.tw.

Searches a fixed region/section by default (Kaohsiung / Gushan, whole-flat,
2 rooms, elevator building), collects listing title/price/address/area/floor,
and saves the results as JSON and CSV under ``output/``.
"""
import json
import os
import re
import time
from typing import Dict, List, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager


class Rent591Scraper:
    """591 rental-site scraper."""

    def __init__(self):
        self.base_url = "https://rent.591.com.tw"
        # Browser-like headers reduce the chance of the site blocking us.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def setup_driver(self):
        """Create and return a headless Chrome WebDriver."""
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # run without a visible window
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument(f'--user-agent={self.headers["User-Agent"]}')

        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        return driver

    def get_csrf_token(self, driver) -> Optional[str]:
        """Fetch a CSRF token from the site's landing page.

        Tries the element named ``csrf-token`` first, then falls back to any
        cookie whose name contains "token".

        Returns:
            The token string, or None if none could be found.
        """
        try:
            driver.get("https://rent.591.com.tw/")
            time.sleep(2)

            # find_element raises when the element is missing; guard it so
            # the cookie fallback below is actually reachable (it was dead
            # code before — the raise jumped straight to the outer handler).
            try:
                token_element = driver.find_element(By.NAME, "csrf-token")
                if token_element:
                    return token_element.get_attribute("content")
            except Exception:
                pass  # no meta token — fall through to the cookie lookup

            for cookie in driver.get_cookies():
                if 'token' in cookie['name'].lower():
                    return cookie['value']
        except Exception as e:
            print(f"獲取token失敗: {e}")
        return None

    def scrape_rental_data(self, max_pages: int = 10,
                           search_params: Optional[Dict[str, str]] = None) -> List[Dict]:
        """Scrape rental listings page by page.

        Args:
            max_pages: Maximum number of result pages to fetch.
            search_params: 591 query parameters. Defaults to the original
                hard-coded search (Kaohsiung / Gushan, whole flat, 2 rooms,
                elevator building) for backward compatibility.

        Returns:
            A list of per-listing dicts (see ``extract_rental_info``).
        """
        driver = self.setup_driver()
        all_data: List[Dict] = []

        try:
            if search_params is None:
                search_params = {
                    'region': '17',   # Kaohsiung City
                    'section': '247', # Gushan District
                    'kind': '1',      # whole flat
                    'layout': '2',    # 2 rooms
                    'shape': '2',     # elevator building
                }
            params = dict(search_params)  # don't mutate the caller's dict

            for page in range(1, max_pages + 1):
                print(f"正在爬取第 {page} 頁...")

                params['page'] = page
                url = f"{self.base_url}/list?" + "&".join(
                    f"{k}={v}" for k, v in params.items())

                driver.get(url)
                time.sleep(3)  # wait for the page to load

                # Stop as soon as a page comes back empty.
                rental_items = driver.find_elements(By.CSS_SELECTOR, '.rent-item')
                if not rental_items:
                    print(f"第 {page} 頁沒有找到資料,停止爬取")
                    break

                page_data = self.parse_page_data(driver)
                all_data.extend(page_data)
                print(f"第 {page} 頁獲取 {len(page_data)} 筆資料")

                # Throttle requests to avoid an IP ban.
                time.sleep(2)
        except Exception as e:
            print(f"爬取資料時發生錯誤: {e}")
        finally:
            driver.quit()

        return all_data

    def parse_page_data(self, driver) -> List[Dict]:
        """Parse all listing items out of the currently loaded page."""
        page_data: List[Dict] = []
        try:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            rental_items = soup.find_all('div', class_='rent-item')

            for item in rental_items:
                try:
                    rental_info = self.extract_rental_info(item)
                    if rental_info:
                        page_data.append(rental_info)
                except Exception as e:
                    # One bad listing shouldn't abort the whole page.
                    print(f"解析單筆資料時發生錯誤: {e}")
                    continue
        except Exception as e:
            print(f"解析頁面資料時發生錯誤: {e}")
        return page_data

    def extract_rental_info(self, item) -> Optional[Dict]:
        """Extract one listing's fields from its BeautifulSoup element.

        Returns:
            A dict with title/price/address/area/floor/link/raw_info,
            or None if extraction failed.
        """
        try:
            title_elem = item.find('h3', class_='rent-item-title')
            title = title_elem.get_text(strip=True) if title_elem else "N/A"

            price_elem = item.find('div', class_='rent-item-price')
            price_text = price_elem.get_text(strip=True) if price_elem else "0"
            price = self.extract_price(price_text)

            address_elem = item.find('div', class_='rent-item-address')
            address = address_elem.get_text(strip=True) if address_elem else "N/A"

            info_elem = item.find('div', class_='rent-item-info')
            info_text = info_elem.get_text(strip=True) if info_elem else ""

            area = self.extract_area(info_text)
            floor = self.extract_floor(info_text)

            link_elem = item.find('a')
            link = self.base_url + link_elem.get('href') if link_elem and link_elem.get('href') else ""

            return {
                'title': title,
                'price': price,
                'address': address,
                'area': area,
                'floor': floor,
                'link': link,
                'raw_info': info_text,
            }
        except Exception as e:
            print(f"提取租屋資訊時發生錯誤: {e}")
            return None

    def extract_price(self, price_text: str) -> int:
        """Extract the rent as an integer; 0 when no number is found."""
        try:
            # Commas are stripped first, so a plain digit run is enough
            # (the old pattern searched for commas it had just removed).
            price_match = re.search(r'\d+', price_text.replace(',', ''))
            if price_match:
                return int(price_match.group())
        except (TypeError, ValueError):
            pass
        return 0

    def extract_area(self, info_text: str) -> float:
        """Extract the floor area in ping (坪); 0.0 when absent."""
        try:
            area_match = re.search(r'(\d+(?:\.\d+)?)\s*坪', info_text)
            if area_match:
                return float(area_match.group(1))
        except (TypeError, ValueError):
            pass
        return 0.0

    def extract_floor(self, info_text: str) -> str:
        """Extract the floor number as e.g. ``"3樓"``; "N/A" when absent."""
        try:
            floor_match = re.search(r'(\d+)樓', info_text)
            if floor_match:
                return floor_match.group(1) + '樓'
        except TypeError:
            pass
        return "N/A"

    def save_data(self, data: List[Dict], filename: str = "rental_data.json"):
        """Save the scraped data as JSON under ``output/<filename>``.

        Fixed: the original wrote to a hard-coded broken path and ignored
        the ``filename`` parameter entirely.
        """
        try:
            os.makedirs("output", exist_ok=True)
            path = os.path.join("output", filename)
            with open(path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            print(f"資料已儲存到 {path}")
        except Exception as e:
            print(f"儲存資料時發生錯誤: {e}")

    def to_dataframe(self, data: List[Dict]) -> pd.DataFrame:
        """Convert the listing dicts into a pandas DataFrame."""
        return pd.DataFrame(data)


if __name__ == "__main__":
    scraper = Rent591Scraper()

    print("開始爬取591租屋資料...")
    rental_data = scraper.scrape_rental_data(max_pages=5)

    if rental_data:
        print(f"總共爬取到 {len(rental_data)} 筆資料")

        # Save the raw JSON dump.
        scraper.save_data(rental_data)

        # Also save as CSV (utf-8-sig so Excel opens it correctly).
        os.makedirs("output", exist_ok=True)
        df = scraper.to_dataframe(rental_data)
        df.to_csv("output/rental_data.csv", index=False, encoding='utf-8-sig')

        print("資料爬取完成!")
        print(df.head())
    else:
        print("沒有爬取到任何資料")