# test01 / app.py
# Uploaded by KKingzor ("Update app.py", revision 4049afa verified).
import streamlit as st
import plotly.graph_objs as go
import requests
import json
import pandas as pd
import time
from pytrends.request import TrendReq
from pytrends.exceptions import TooManyRequestsError
# Streamlit app: scrape product prices from PCHOME and MOMO for a keyword,
# chart the price distribution, and plot Google search trends.
st.title("PCHOME 和 MOMO 商品價格爬蟲分析與趨勢分析")
# Base endpoints of the two shops' search APIs.
pchome_base_url = 'https://ecshweb.pchome.com.tw/search/v3.3/all/results?q='
momo_url = "https://apisearch.momoshop.com.tw/momoSearchCloud/moec/textSearch"
# Scraper for PCHOME search results.
def crawl_pchome(keyword, num_pages):
    """Scrape PCHOME search results for *keyword*.

    Args:
        keyword: Search term to query.
        num_pages: Number of result pages to fetch (1-based, inclusive).

    Returns:
        pandas.DataFrame with columns ["name", "price"]. When the search
        yields no products, an empty frame with those columns is returned
        so callers can safely do `.empty` checks and column selection.
    """
    frames = []
    for page in range(1, num_pages + 1):
        url = f'{pchome_base_url}{keyword}&page={page}&sort=sale/dc'
        response = requests.get(url)
        payload = response.json()
        # 'prods' is missing/None when a page has no results; the original
        # code indexed it directly and crashed with KeyError.
        products = payload.get('prods') or []
        if products:
            frames.append(pd.DataFrame(products))
        time.sleep(1)  # throttle so we don't hammer the API
    if not frames:
        # No products at all: return an empty frame with the expected
        # columns (selecting columns from a bare DataFrame would raise).
        return pd.DataFrame(columns=["name", "price"])
    # Concatenate once at the end instead of re-concatenating every page.
    alldata = pd.concat(frames)
    return alldata[["name", "price"]]
# Scraper for MOMO search results.
def crawl_momo(keyword, num_pages):
    """Query MOMO's search API page by page and return name/price rows.

    Pages that respond with a non-200 status are skipped silently.
    Unparseable prices fall back to 0, matching the original behavior.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
    }
    rows = []
    for current_page in range(1, num_pages + 1):
        body = {
            "host": "momoshop",
            "flag": "searchEngine",
            "data": {
                "searchValue": keyword,
                "curPage": str(current_page),
                "priceS": "0",
                "priceE": "9999999",
                "searchType": "1"
            }
        }
        resp = requests.post(momo_url, headers=request_headers, json=body)
        if resp.status_code == 200:
            api_result = resp.json()
            goods = api_result.get('rtnSearchData', {}).get('goodsInfoList', [])
            for item in goods:
                # Prices arrive as display strings, e.g. "$1,299(折扣)";
                # keep only the leading numeric part.
                raw = str(item.get('goodsPrice', '')).split('(')[0]
                cleaned = raw.replace(',', '').replace('$', '')
                try:
                    numeric_price = float(cleaned)
                except ValueError:
                    numeric_price = 0
                rows.append({'name': item.get('goodsName', ''), 'price': numeric_price})
        time.sleep(1)  # throttle between pages
    return pd.DataFrame(rows)
# Fetch Google Trends interest-over-time data for a keyword.
def get_trends_data(keyword, start_date, end_date):
    """Fetch Google Trends 'interest over time' for *keyword* (geo=TW).

    Retries up to three times, sleeping 60 s after a throttling error.
    Returns a DataFrame with the index reset (so 'date' is a column) on
    success, or None when all attempts fail or return no data.
    """
    client = TrendReq(hl='en-US', tz=360)
    window = f'{start_date} {end_date}'
    attempts_left = 3
    while attempts_left > 0:
        attempts_left -= 1
        try:
            client.build_payload([keyword], cat=0, timeframe=window, geo='TW', gprop='')
            result = client.interest_over_time()
        except TooManyRequestsError:
            st.warning("Google 趨勢請求過多,正在等待重試...")
            time.sleep(60)  # back off before the next attempt
            continue
        if not result.empty:
            return result.reset_index()
        # Empty result: retry immediately, same as the original flow.
    st.error("無法獲取 Google 趨勢數據,請稍後再試。")
    return None
# --- User inputs: search keyword and pages to crawl per site ---
keyword = st.text_input("請輸入關鍵字:", "平板")
num_pages = st.number_input("請輸入要爬取的頁數:", min_value=1, max_value=100, value=1)
# Date range for the Google Trends (pytrends) query.
start_date = st.date_input("選擇開始日期:", value=pd.to_datetime("2023-01-01"))
end_date = st.date_input("選擇結束日期:", value=pd.to_datetime("2023-12-31"))
# Main flow: runs when the user clicks the crawl button.
if st.button("開始爬取"):
    start_time = time.time()
    # Crawl PCHOME.
    st.subheader("爬取 PCHOME 資料")
    pchome_progress = st.progress(0)
    pchome_data = crawl_pchome(keyword, num_pages)
    pchome_progress.progress(100)
    # Crawl MOMO.
    st.subheader("爬取 MOMO 資料")
    momo_progress = st.progress(0)
    momo_data = crawl_momo(keyword, num_pages)
    momo_progress.progress(100)
    # Bail out when neither site returned any products.
    if pchome_data.empty and momo_data.empty:
        st.error("查無商品")
    else:
        # Tag each row with its source site.
        pchome_data['source'] = 'PCHOME'
        momo_data['source'] = 'MOMO'
        # Merge both result sets and sort by price, highest first.
        combined_data = pd.concat([pchome_data, momo_data])
        combined_data = combined_data.sort_values('price', ascending=False).reset_index(drop=True)
        # Show the combined table.
        st.subheader("爬取結果:")
        st.write(combined_data)
        # Offer the merged data as a CSV download.
        csv = combined_data.to_csv(index=False)
        st.download_button(
            label="下載 CSV 檔案",
            data=csv,
            file_name=f"{keyword}_price_data.csv",
            mime="text/csv",
        )
        # Price distribution - pie chart, bucketed into NT$1000-wide bins.
        # NOTE(review): pd.cut raises if the max price is 0 (only one bin
        # edge) -- confirm results always contain a priced item.
        st.subheader("價格分布圖 - 圓餅圖")
        combined_data['price_range'] = pd.cut(combined_data['price'], bins=range(0, int(combined_data['price'].max()) + 1000, 1000))
        price_range_counts = combined_data['price_range'].value_counts().sort_index()
        fig_pie = go.Figure(data=[go.Pie(labels=price_range_counts.index.astype(str), values=price_range_counts.values)])
        fig_pie.update_layout(
            title='價格分布 - 圓餅圖',
            width=800,  # chart width
            height=800  # chart height
        )
        st.plotly_chart(fig_pie)
        # Price distribution - line chart.
        st.subheader("價格分布圖 - 折線圖")
        fig_line = go.Figure(data=[go.Scatter(x=price_range_counts.index.astype(str), y=price_range_counts.values, mode='lines+markers')])
        fig_line.update_layout(title='價格分布 - 折線圖', xaxis_title='價格區間', yaxis_title='商品數量',
                               width=800,  # chart width
                               height=800  # chart height
                               )
        st.plotly_chart(fig_line)
        # Price distribution - Nightingale rose chart.
        st.subheader("價格分布圖 - 南丁格爾玫瑰圖")
        fig_rose = go.Figure(data=[go.Barpolar(r=price_range_counts.values, theta=price_range_counts.index.astype(str), marker=dict(color=price_range_counts.values, colorscale='Viridis'))])
        fig_rose.update_layout(title='價格分布 - 南丁格爾玫瑰圖',
                               width=800,  # chart width
                               height=800  # chart height
                               )
        st.plotly_chart(fig_rose)
        # Price distribution - donut chart.
        st.subheader("價格分布圖 - 環形圖")
        fig_donut = go.Figure(data=[go.Pie(labels=price_range_counts.index.astype(str), values=price_range_counts.values, hole=0.4)])
        fig_donut.update_layout(title='價格分布 - 環形圖',
                                width=800,  # chart width
                                height=800  # chart height
                                )
        st.plotly_chart(fig_donut)
        # Summary statistics over the merged price column.
        st.subheader("統計數據")
        st.write(f"平均價格: {combined_data['price'].mean():.2f}")
        st.write(f"最高價格: {combined_data['price'].max():.2f}")
        st.write(f"最低價格: {combined_data['price'].min():.2f}")
        # Fetch and plot Google Trends data for the same keyword.
        trends_data = get_trends_data(keyword, start_date, end_date)
        if trends_data is not None:
            st.subheader("Google 搜尋趨勢")
            fig = go.Figure()
            fig.add_trace(go.Scatter(x=trends_data['date'], y=trends_data[keyword], mode='lines', name='趨勢指數'))
            fig.update_layout(title=f'Google 搜尋趨勢: {keyword}', xaxis_title='日期', yaxis_title='趨勢指數')
            st.plotly_chart(fig)
        else:
            st.warning("無法獲取 Google 趨勢數據")
    # Report total execution time.
    end_time = time.time()
    execution_time = end_time - start_time
    st.write(f"執行時間: {execution_time:.2f} 秒")