import streamlit as st
import plotly.graph_objs as go
import requests
import json
import pandas as pd
import time
from pytrends.request import TrendReq
from pytrends.exceptions import TooManyRequestsError

# Streamlit app setup
st.title("PCHOME 和 MOMO 商品價格爬蟲分析與趨勢分析")

# Base URLs for the two shopping-site search APIs
pchome_base_url = 'https://ecshweb.pchome.com.tw/search/v3.3/all/results?q='
momo_url = "https://apisearch.momoshop.com.tw/momoSearchCloud/moec/textSearch"


def crawl_pchome(keyword, num_pages):
    """Scrape PCHOME search results for *keyword*.

    Args:
        keyword: search term typed by the user.
        num_pages: number of result pages to fetch (1-based).

    Returns:
        DataFrame with 'name' and 'price' columns; empty (but with those
        columns) when no products were found.
    """
    alldata = pd.DataFrame()
    for i in range(1, num_pages + 1):
        url = f'{pchome_base_url}{keyword}&page={i}&sort=sale/dc'
        # timeout so a stalled request cannot hang the Streamlit app forever
        list_req = requests.get(url, timeout=10)
        if list_req.status_code == 200:
            getdata = list_req.json()
            # 'prods' is absent when the page has no results — guard KeyError
            prods = getdata.get('prods') or []
            if prods:
                alldata = pd.concat([alldata, pd.DataFrame(prods)])
        time.sleep(1)  # be polite to the API between page fetches
    if alldata.empty:
        # Return a well-formed empty frame; selecting the columns on an
        # empty concat result would otherwise raise KeyError.
        return pd.DataFrame(columns=["name", "price"])
    return alldata[["name", "price"]]


def crawl_momo(keyword, num_pages):
    """Scrape MOMO search results for *keyword* via its search API.

    Args:
        keyword: search term typed by the user.
        num_pages: number of result pages to fetch (1-based).

    Returns:
        DataFrame with 'name' and 'price' columns; may be empty.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
    }
    product_list = []
    for page in range(1, num_pages + 1):
        payload = {
            "host": "momoshop",
            "flag": "searchEngine",
            "data": {
                "searchValue": keyword,
                "curPage": str(page),
                "priceS": "0",
                "priceE": "9999999",
                "searchType": "1"
            }
        }
        # timeout so a stalled request cannot hang the Streamlit app forever
        response = requests.post(momo_url, headers=headers, json=payload, timeout=10)
        if response.status_code == 200:
            data_from_api = response.json()
            products = data_from_api.get('rtnSearchData', {}).get('goodsInfoList', [])
            for product in products:
                name = product.get('goodsName', '')
                price = product.get('goodsPrice', '')
                # Prices may arrive like "1,234(促銷)" or "$999" — strip the
                # parenthesised annotation and separators before parsing.
                price_str = str(price).split('(')[0].replace(',', '').replace('$', '')
                try:
                    product_price = float(price_str)
                except ValueError:
                    product_price = 0  # unparseable price → fall back to 0
                product_list.append({'name': name, 'price': product_price})
        time.sleep(1)  # be polite to the API between page fetches
    # Fix the columns so an empty result still has 'name'/'price',
    # consistent with crawl_pchome.
    return pd.DataFrame(product_list, columns=['name', 'price'])


def get_trends_data(keyword, start_date, end_date):
    """Fetch Google Trends interest-over-time for *keyword* (geo=TW).

    Retries up to 3 times when rate-limited, waiting 60 s between attempts.

    Returns:
        DataFrame with a 'date' column plus one column named after the
        keyword, or None when every attempt failed.
    """
    pytrends = TrendReq(hl='en-US', tz=360)
    kw_list = [keyword]
    timeframe = f'{start_date} {end_date}'
    for _ in range(3):  # retry at most 3 times
        try:
            pytrends.build_payload(kw_list, cat=0, timeframe=timeframe, geo='TW', gprop='')
            trends_data = pytrends.interest_over_time()
            if not trends_data.empty:
                trends_data = trends_data.reset_index()
                return trends_data
        except TooManyRequestsError:
            st.warning("Google 趨勢請求過多,正在等待重試...")
            time.sleep(60)  # back off before retrying
    st.error("無法獲取 Google 趨勢數據,請稍後再試。")
    return None


# --- User inputs -----------------------------------------------------------
keyword = st.text_input("請輸入關鍵字:", "平板")
num_pages = st.number_input("請輸入要爬取的頁數:", min_value=1, max_value=100, value=1)

# Date range for the Google Trends query
start_date = st.date_input("選擇開始日期:", value=pd.to_datetime("2023-01-01"))
end_date = st.date_input("選擇結束日期:", value=pd.to_datetime("2023-12-31"))

# --- Main flow --------------------------------------------------------------
if st.button("開始爬取"):
    start_time = time.time()

    # Scrape PCHOME
    st.subheader("爬取 PCHOME 資料")
    pchome_progress = st.progress(0)
    pchome_data = crawl_pchome(keyword, num_pages)
    pchome_progress.progress(100)

    # Scrape MOMO
    st.subheader("爬取 MOMO 資料")
    momo_progress = st.progress(0)
    momo_data = crawl_momo(keyword, num_pages)
    momo_progress.progress(100)

    if pchome_data.empty and momo_data.empty:
        st.error("查無商品")
    else:
        # Tag each row with its source site, then merge and sort by price.
        pchome_data['source'] = 'PCHOME'
        momo_data['source'] = 'MOMO'
        combined_data = pd.concat([pchome_data, momo_data])
        combined_data = combined_data.sort_values('price', ascending=False).reset_index(drop=True)

        st.subheader("爬取結果:")
        st.write(combined_data)

        # CSV export of the merged table
        csv = combined_data.to_csv(index=False)
        st.download_button(
            label="下載 CSV 檔案",
            data=csv,
            file_name=f"{keyword}_price_data.csv",
            mime="text/csv",
        )

        # Bucket prices into 1000-wide ranges shared by all four charts.
        # max(..., 1) guards the degenerate case where every price is 0,
        # which would otherwise give pd.cut a single bin edge and raise.
        max_price = max(int(combined_data['price'].max()), 1)
        combined_data['price_range'] = pd.cut(combined_data['price'],
                                              bins=range(0, max_price + 1000, 1000))
        price_range_counts = combined_data['price_range'].value_counts().sort_index()

        # Price distribution — pie chart
        st.subheader("價格分布圖 - 圓餅圖")
        fig_pie = go.Figure(data=[go.Pie(labels=price_range_counts.index.astype(str),
                                         values=price_range_counts.values)])
        fig_pie.update_layout(
            title='價格分布 - 圓餅圖',
            width=800,
            height=800
        )
        st.plotly_chart(fig_pie)

        # Price distribution — line chart
        st.subheader("價格分布圖 - 折線圖")
        fig_line = go.Figure(data=[go.Scatter(x=price_range_counts.index.astype(str),
                                              y=price_range_counts.values,
                                              mode='lines+markers')])
        fig_line.update_layout(
            title='價格分布 - 折線圖',
            xaxis_title='價格區間',
            yaxis_title='商品數量',
            width=800,
            height=800
        )
        st.plotly_chart(fig_line)

        # Price distribution — Nightingale rose chart
        st.subheader("價格分布圖 - 南丁格爾玫瑰圖")
        fig_rose = go.Figure(data=[go.Barpolar(r=price_range_counts.values,
                                               theta=price_range_counts.index.astype(str),
                                               marker=dict(color=price_range_counts.values,
                                                           colorscale='Viridis'))])
        fig_rose.update_layout(
            title='價格分布 - 南丁格爾玫瑰圖',
            width=800,
            height=800
        )
        st.plotly_chart(fig_rose)

        # Price distribution — donut chart
        st.subheader("價格分布圖 - 環形圖")
        fig_donut = go.Figure(data=[go.Pie(labels=price_range_counts.index.astype(str),
                                           values=price_range_counts.values,
                                           hole=0.4)])
        fig_donut.update_layout(
            title='價格分布 - 環形圖',
            width=800,
            height=800
        )
        st.plotly_chart(fig_donut)

        # Summary statistics over the merged data
        st.subheader("統計數據")
        st.write(f"平均價格: {combined_data['price'].mean():.2f}")
        st.write(f"最高價格: {combined_data['price'].max():.2f}")
        st.write(f"最低價格: {combined_data['price'].min():.2f}")

        # Google Trends overlay for the same keyword
        trends_data = get_trends_data(keyword, start_date, end_date)
        if trends_data is not None:
            st.subheader("Google 搜尋趨勢")
            fig = go.Figure()
            fig.add_trace(go.Scatter(x=trends_data['date'], y=trends_data[keyword],
                                     mode='lines', name='趨勢指數'))
            fig.update_layout(title=f'Google 搜尋趨勢: {keyword}',
                              xaxis_title='日期', yaxis_title='趨勢指數')
            st.plotly_chart(fig)
        else:
            st.warning("無法獲取 Google 趨勢數據")

    # Total elapsed time for the whole run
    end_time = time.time()
    execution_time = end_time - start_time
    st.write(f"執行時間: {execution_time:.2f} 秒")