File size: 8,163 Bytes
548d9ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d6c766
 
548d9ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import streamlit as st
from bs4 import BeautifulSoup
from datetime import datetime
import requests
import json
import emoji
import re
import pandas as pd
from io import BytesIO

class GoogleMapSpider:
    """Scrape store ids, display names, and reviews from Google Maps.

    Relies on undocumented Google Maps endpoints (search-page scraping and
    the ``listugcposts`` RPC); the review payload is positional nested lists
    whose layout may change without notice, so all field access is guarded.
    """

    # Seconds to wait for any single HTTP request before giving up
    # (the original calls had no timeout and could hang indefinitely).
    REQUEST_TIMEOUT = 15

    def __init__(self):
        # Desktop-browser UA: Google serves different markup to unknown agents.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
                       "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
        }
        self.store_id_url = "https://www.google.com.tw/maps/search/{store_name}"
        self.store_name_url = "https://www.google.com.tw/maps/place/data=!4m5!3m4!1s{store_id}!8m2!3d25.0564743!4d121.5204167?authuser=0&hl=zh-TW&rclk=1"
        self.comment_url = "https://www.google.com.tw/maps/rpc/listugcposts"

    @staticmethod
    def _dig(data, *path):
        """Follow *path* of indices/keys into nested containers.

        Returns None instead of raising when any step is missing — review
        records are positional and individual fields are frequently absent.
        """
        try:
            for step in path:
                data = data[step]
        except (IndexError, KeyError, TypeError):
            return None
        return data

    def get_store_id(self, store_name):
        """Search Google Maps for *store_name* and return its hex store id.

        Raises:
            ValueError: when no store-id pattern appears in the search page.
        """
        url = self.store_id_url.format(store_name=store_name)
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "lxml")
        # Store ids look like "0x<16 hex>:0x<16 hex>" inside the page scripts.
        match = re.search(r'0x.{16}:0x.{16}', str(soup))
        if match is None:
            # was: unguarded match.group() -> opaque AttributeError on no match
            raise ValueError(f"no store id found for: {store_name}")
        return match.group()

    def get_store_name(self, store_id):
        """Resolve *store_id* back to its display name via place-page meta tags.

        Raises:
            ValueError: when no matching meta tag is found.
        """
        url = self.store_name_url.format(store_id=store_id)
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "lxml")
        for meta in soup.find_all('meta'):
            # The tag looks like <meta itemprop="name" content="Name · ...">;
            # the slice strips the opening quote and the trailing " ·".
            if '''itemprop="name"''' in str(meta):
                found = re.search('".*·', str(meta))
                if found:
                    return found.group()[1:-2]
        raise ValueError(f"no store name found for: {store_id}")

    def get_comment(self, store_id, page_count=1, sorted_by=2, progress_callback=None):
        """Fetch up to *page_count* pages of reviews for *store_id*.

        Args:
            store_id: hex id returned by :meth:`get_store_id`.
            page_count: maximum number of pages (10 reviews per page).
            sorted_by: sort mode passed to the RPC (the UI maps 1=relevance,
                2=newest, 3=highest, 4=lowest — not officially documented).
            progress_callback: optional ``f(page, page_count)`` invoked before
                each page fetch.

        Returns:
            list[dict]: one dict per review; absent fields are None.
        """
        next_token = ""
        raw_comments = []
        for page in range(1, page_count + 1):
            if progress_callback:
                progress_callback(page, page_count)

            params = {
                "authuser": "0",
                "hl": "zh-TW",
                "gl": "tw",
                "pb": (
                    f"!1m6!1s{store_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s"
                    f"{next_token}"
                    f"!5m2!1s0OBwZ4OnGsrM1e8PxIjW6AI!7e81!8m5!1b1!2b1!3b1!5b1!7b1!11m0!13m1!1e{sorted_by}"
                )
            }

            response = requests.get(self.comment_url, params=params,
                                    headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            # The response carries a 4-char anti-XSSI prefix before the JSON;
            # demojize converts emoji to text aliases before parsing.
            data = json.loads(emoji.demojize(response.text[4:]))

            next_token = data[1]  # pagination token; falsy on the last page
            raw_comments.extend(data[2])
            if not next_token:
                break

        return [self._parse_comment(item) for item in raw_comments]

    def _parse_comment(self, comment_data):
        """Map one positional review record onto a readable dict.

        Any field missing from the record becomes None instead of crashing
        the whole batch (the original dict build was unguarded).
        """
        date_parts = self._dig(comment_data, 0, 2, 2, 0, 1, 21, 6, -1)
        try:
            comment_date = datetime(date_parts[0], date_parts[1], date_parts[2],
                                    date_parts[3]).strftime('%Y/%m/%d %H:%M:%S')
        except (TypeError, IndexError, ValueError):
            comment_date = None

        return {
            "評論者": self._dig(comment_data, 0, 1, 4, 5, 0),
            "評論者id": self._dig(comment_data, 0, 0),
            "評論者狀態": self._dig(comment_data, 0, 1, 4, 5, 10, 0),
            "評論者等級": self._dig(comment_data, 0, 1, 4, 5, 9),
            "留言時間": self._dig(comment_data, 0, 1, 6),
            "留言日期": comment_date,
            "評論": self._dig(comment_data, 0, 2, -1, 0, 0),
            "評論分數": self._dig(comment_data, 0, 2, 0, 0)
        }

def main():
    """Streamlit entry point: collect inputs, run the scraper, show/export results.

    Layout: sidebar gathers store name / page count / sort order; the main
    pane shows progress while scraping, then metrics, a data table, and
    CSV/JSON download buttons. Scraped data is kept in ``st.session_state``
    so it survives Streamlit reruns triggered by widget interaction.
    """
    st.set_page_config(page_title="Google Maps 評論爬蟲", page_icon="🗺️", layout="wide")

    st.title("🗺️ Google Maps 評論爬蟲")
    st.markdown("---")

    # Sidebar for input
    with st.sidebar:
        st.header("⚙️ 設定")
        store_name = st.text_input("店家名稱", placeholder="例如:台北101")
        page_count = st.number_input("爬取頁數", min_value=1, max_value=50, value=1)

        # Display label -> listugcposts sort code (see GoogleMapSpider.get_comment).
        sort_options = {
            "最相關": 1,
            "最新": 2,
            "評分最高": 3,
            "評分最低": 4
        }
        sorted_by_name = st.selectbox("排序方式", list(sort_options.keys()))
        sorted_by = sort_options[sorted_by_name]

        start_button = st.button("🚀 開始爬取", type="primary", use_container_width=True)

    # Initialize session state so results persist across Streamlit reruns.
    if 'comments_data' not in st.session_state:
        st.session_state.comments_data = None
    if 'store_name_used' not in st.session_state:
        st.session_state.store_name_used = None

    # Main content: run the scrape when the button is pressed.
    if start_button:
        if not store_name:
            st.error("❌ 請輸入店家名稱!")
        else:
            # Top-level boundary: surface any scraper failure in the UI
            # instead of crashing the app.
            try:
                spider = GoogleMapSpider()

                # Progress indicators
                progress_bar = st.progress(0)
                status_text = st.empty()

                # Get store ID
                status_text.text("🔍 正在獲取店家ID...")
                store_id = spider.get_store_id(store_name)
                st.success(f"✅ 店家ID: {store_id}")

                # Get comments
                status_text.text("📝 開始爬取評論...")

                def update_progress(current, total):
                    # Called by the spider before each page fetch.
                    progress_bar.progress(current / total)
                    status_text.text(f"📝 正在爬取第 {current} 頁,共 {total} 頁")

                comments_data = spider.get_comment(
                    store_id=store_id,
                    page_count=page_count,
                    sorted_by=sorted_by,
                    progress_callback=update_progress
                )

                progress_bar.progress(1.0)
                status_text.text("✅ 爬取完成!")

                # Save to session state
                st.session_state.comments_data = comments_data
                st.session_state.store_name_used = store_name

                st.success(f"🎉 完成!共爬取 {len(comments_data)} 則評論")

            except Exception as e:
                st.error(f"❌ 發生錯誤: {str(e)}")

    # Display results (also on reruns, from session state).
    if st.session_state.comments_data:
        st.markdown("---")
        st.header("📊 評論結果")

        df = pd.DataFrame(st.session_state.comments_data)

        # Statistics
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("總評論數", len(df))
        with col2:
            avg_score = df['評論分數'].mean()
            st.metric("平均評分", f"{avg_score:.2f}")
        with col3:
            max_score = df['評論分數'].max()
            st.metric("最高評分", int(max_score))
        with col4:
            min_score = df['評論分數'].min()
            st.metric("最低評分", int(min_score))

        # Data table
        st.subheader("📝 評論詳細內容")
        st.dataframe(df, use_container_width=True, height=400)

        # Download options
        st.subheader("💾 下載資料")
        col1, col2 = st.columns(2)

        with col1:
            # pandas ignores `encoding=` when to_csv returns a string, so
            # encode explicitly; utf-8-sig prepends a BOM so Excel detects UTF-8.
            csv = df.to_csv(index=False).encode('utf-8-sig')
            st.download_button(
                label="📥 下載 CSV",
                data=csv,
                file_name=f"{st.session_state.store_name_used}_評論.csv",
                mime="text/csv",
                use_container_width=True
            )

        with col2:
            # JSON download; ensure_ascii=False keeps Chinese text readable.
            json_str = json.dumps(st.session_state.comments_data, ensure_ascii=False, indent=2)
            st.download_button(
                label="📥 下載 JSON",
                data=json_str,
                file_name=f"{st.session_state.store_name_used}_評論.json",
                mime="application/json",
                use_container_width=True
            )

# Launch the Streamlit app when the script is executed directly
# (e.g. via `streamlit run <this file>`).
if __name__ == "__main__":
    main()