File size: 8,163 Bytes
548d9ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d6c766
 
548d9ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import streamlit as st
from bs4 import BeautifulSoup
from datetime import datetime
import requests
import json
import emoji
import re
import pandas as pd
from io import BytesIO

class GoogleMapSpider:
    """Scrape store ids, display names, and reviews from Google Maps.

    Relies on undocumented Google Maps endpoints (search-page scraping and
    the ``listugcposts`` RPC); the review payload is positional nested lists
    whose layout may change without notice, so all field access is guarded.
    """

    # Seconds to wait for any single HTTP request before giving up
    # (the original calls had no timeout and could hang indefinitely).
    REQUEST_TIMEOUT = 15

    def __init__(self):
        # Desktop-browser UA: Google serves different markup to unknown agents.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
                       "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
        }
        self.store_id_url = "https://www.google.com.tw/maps/search/{store_name}"
        self.store_name_url = "https://www.google.com.tw/maps/place/data=!4m5!3m4!1s{store_id}!8m2!3d25.0564743!4d121.5204167?authuser=0&hl=zh-TW&rclk=1"
        self.comment_url = "https://www.google.com.tw/maps/rpc/listugcposts"

    @staticmethod
    def _dig(data, *path):
        """Follow *path* of indices/keys into nested containers.

        Returns None instead of raising when any step is missing — review
        records are positional and individual fields are frequently absent.
        """
        try:
            for step in path:
                data = data[step]
        except (IndexError, KeyError, TypeError):
            return None
        return data

    def get_store_id(self, store_name):
        """Search Google Maps for *store_name* and return its hex store id.

        Raises:
            ValueError: when no store-id pattern appears in the search page.
        """
        url = self.store_id_url.format(store_name=store_name)
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "lxml")
        # Store ids look like "0x<16 hex>:0x<16 hex>" inside the page scripts.
        match = re.search(r'0x.{16}:0x.{16}', str(soup))
        if match is None:
            # was: unguarded match.group() -> opaque AttributeError on no match
            raise ValueError(f"no store id found for: {store_name}")
        return match.group()

    def get_store_name(self, store_id):
        """Resolve *store_id* back to its display name via place-page meta tags.

        Raises:
            ValueError: when no matching meta tag is found.
        """
        url = self.store_name_url.format(store_id=store_id)
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "lxml")
        for meta in soup.find_all('meta'):
            # The tag looks like <meta itemprop="name" content="Name · ...">;
            # the slice strips the opening quote and the trailing " ·".
            if '''itemprop="name"''' in str(meta):
                found = re.search('".*·', str(meta))
                if found:
                    return found.group()[1:-2]
        raise ValueError(f"no store name found for: {store_id}")

    def get_comment(self, store_id, page_count=1, sorted_by=2, progress_callback=None):
        """Fetch up to *page_count* pages of reviews for *store_id*.

        Args:
            store_id: hex id returned by :meth:`get_store_id`.
            page_count: maximum number of pages (10 reviews per page).
            sorted_by: sort mode passed to the RPC (the UI maps 1=relevance,
                2=newest, 3=highest, 4=lowest — not officially documented).
            progress_callback: optional ``f(page, page_count)`` invoked before
                each page fetch.

        Returns:
            list[dict]: one dict per review; absent fields are None.
        """
        next_token = ""
        raw_comments = []
        for page in range(1, page_count + 1):
            if progress_callback:
                progress_callback(page, page_count)

            params = {
                "authuser": "0",
                "hl": "zh-TW",
                "gl": "tw",
                "pb": (
                    f"!1m6!1s{store_id}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s"
                    f"{next_token}"
                    f"!5m2!1s0OBwZ4OnGsrM1e8PxIjW6AI!7e81!8m5!1b1!2b1!3b1!5b1!7b1!11m0!13m1!1e{sorted_by}"
                )
            }

            response = requests.get(self.comment_url, params=params,
                                    headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            # The response carries a 4-char anti-XSSI prefix before the JSON;
            # demojize converts emoji to text aliases before parsing.
            data = json.loads(emoji.demojize(response.text[4:]))

            next_token = data[1]  # pagination token; falsy on the last page
            raw_comments.extend(data[2])
            if not next_token:
                break

        return [self._parse_comment(item) for item in raw_comments]

    def _parse_comment(self, comment_data):
        """Map one positional review record onto a readable dict.

        Any field missing from the record becomes None instead of crashing
        the whole batch (the original dict build was unguarded).
        """
        date_parts = self._dig(comment_data, 0, 2, 2, 0, 1, 21, 6, -1)
        try:
            comment_date = datetime(date_parts[0], date_parts[1], date_parts[2],
                                    date_parts[3]).strftime('%Y/%m/%d %H:%M:%S')
        except (TypeError, IndexError, ValueError):
            comment_date = None

        return {
            "評論者": self._dig(comment_data, 0, 1, 4, 5, 0),
            "評論者id": self._dig(comment_data, 0, 0),
            "評論者狀態": self._dig(comment_data, 0, 1, 4, 5, 10, 0),
            "評論者等級": self._dig(comment_data, 0, 1, 4, 5, 9),
            "留言時間": self._dig(comment_data, 0, 1, 6),
            "留言日期": comment_date,
            "評論": self._dig(comment_data, 0, 2, -1, 0, 0),
            "評論分數": self._dig(comment_data, 0, 2, 0, 0)
        }

def main():
    """Streamlit entry point: collect inputs, run the scraper, show/export results.

    Layout: sidebar gathers store name / page count / sort order; the main
    pane shows progress while scraping, then metrics, a data table, and
    CSV/JSON download buttons. Scraped data is kept in ``st.session_state``
    so it survives Streamlit reruns triggered by widget interaction.
    """
    st.set_page_config(page_title="Google Maps 評論爬蟲", page_icon="🗺️", layout="wide")

    st.title("🗺️ Google Maps 評論爬蟲")
    st.markdown("---")

    # Sidebar for input
    with st.sidebar:
        st.header("⚙️ 設定")
        store_name = st.text_input("店家名稱", placeholder="例如:台北101")
        page_count = st.number_input("爬取頁數", min_value=1, max_value=50, value=1)

        # Display label -> listugcposts sort code (see GoogleMapSpider.get_comment).
        sort_options = {
            "最相關": 1,
            "最新": 2,
            "評分最高": 3,
            "評分最低": 4
        }
        sorted_by_name = st.selectbox("排序方式", list(sort_options.keys()))
        sorted_by = sort_options[sorted_by_name]

        start_button = st.button("🚀 開始爬取", type="primary", use_container_width=True)

    # Initialize session state so results persist across Streamlit reruns.
    if 'comments_data' not in st.session_state:
        st.session_state.comments_data = None
    if 'store_name_used' not in st.session_state:
        st.session_state.store_name_used = None

    # Main content: run the scrape when the button is pressed.
    if start_button:
        if not store_name:
            st.error("❌ 請輸入店家名稱!")
        else:
            # Top-level boundary: surface any scraper failure in the UI
            # instead of crashing the app.
            try:
                spider = GoogleMapSpider()

                # Progress indicators
                progress_bar = st.progress(0)
                status_text = st.empty()

                # Get store ID
                status_text.text("🔍 正在獲取店家ID...")
                store_id = spider.get_store_id(store_name)
                st.success(f"✅ 店家ID: {store_id}")

                # Get comments
                status_text.text("📝 開始爬取評論...")

                def update_progress(current, total):
                    # Called by the spider before each page fetch.
                    progress_bar.progress(current / total)
                    status_text.text(f"📝 正在爬取第 {current} 頁,共 {total} 頁")

                comments_data = spider.get_comment(
                    store_id=store_id,
                    page_count=page_count,
                    sorted_by=sorted_by,
                    progress_callback=update_progress
                )

                progress_bar.progress(1.0)
                status_text.text("✅ 爬取完成!")

                # Save to session state
                st.session_state.comments_data = comments_data
                st.session_state.store_name_used = store_name

                st.success(f"🎉 完成!共爬取 {len(comments_data)} 則評論")

            except Exception as e:
                st.error(f"❌ 發生錯誤: {str(e)}")

    # Display results (also on reruns, from session state).
    if st.session_state.comments_data:
        st.markdown("---")
        st.header("📊 評論結果")

        df = pd.DataFrame(st.session_state.comments_data)

        # Statistics
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("總評論數", len(df))
        with col2:
            avg_score = df['評論分數'].mean()
            st.metric("平均評分", f"{avg_score:.2f}")
        with col3:
            max_score = df['評論分數'].max()
            st.metric("最高評分", int(max_score))
        with col4:
            min_score = df['評論分數'].min()
            st.metric("最低評分", int(min_score))

        # Data table
        st.subheader("📝 評論詳細內容")
        st.dataframe(df, use_container_width=True, height=400)

        # Download options
        st.subheader("💾 下載資料")
        col1, col2 = st.columns(2)

        with col1:
            # pandas ignores `encoding=` when to_csv returns a string, so
            # encode explicitly; utf-8-sig prepends a BOM so Excel detects UTF-8.
            csv = df.to_csv(index=False).encode('utf-8-sig')
            st.download_button(
                label="📥 下載 CSV",
                data=csv,
                file_name=f"{st.session_state.store_name_used}_評論.csv",
                mime="text/csv",
                use_container_width=True
            )

        with col2:
            # JSON download; ensure_ascii=False keeps Chinese text readable.
            json_str = json.dumps(st.session_state.comments_data, ensure_ascii=False, indent=2)
            st.download_button(
                label="📥 下載 JSON",
                data=json_str,
                file_name=f"{st.session_state.store_name_used}_評論.json",
                mime="application/json",
                use_container_width=True
            )

# Launch the Streamlit app when the script is executed directly
# (e.g. via `streamlit run <this file>`).
if __name__ == "__main__":
    main()