Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import requests | |
| import pandas as pd | |
| import time | |
| import json | |
| from urllib.parse import urlencode | |
| import resend | |
| from datetime import datetime | |
| import io | |
# Page configuration (Streamlit expects this before any other st.* call).
_PAGE_CONFIG = {
    "page_title": "MeetTaiwan 活動爬蟲系統",
    "page_icon": "🎯",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_CONFIG)
class MeetTaiwanAPIScraper:
    """Small client for the MeetTaiwan events JSON API.

    The scraper probes a couple of candidate endpoints, normalises whatever
    list of events it finds and returns plain dictionaries with the keys
    ``name``, ``link``, ``form``, ``event_date``, ``upload_date`` and
    ``page_num``.
    """

    def __init__(self):
        self.base_url = "https://service.meettaiwan.com"
        self.api_base = "https://service.meettaiwan.com/gpa/api/v2/events"
        # One shared session so every request carries the same browser-like headers.
        self.session = requests.Session()
        # NOTE: no explicit Accept-Encoding here. The previous value advertised
        # "br"; if no brotli decoder is installed the body cannot be decoded
        # and JSON parsing fails silently. requests' default header only lists
        # encodings it can actually decode.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
            'Referer': 'https://service.meettaiwan.com/gpa/zh/events/list;type=all',
            'Origin': 'https://service.meettaiwan.com',
        })

    @staticmethod
    def _extract_events(data):
        """Return the event list found in a decoded JSON payload, or ``None``.

        A top-level list is returned as-is (even when empty). For a dict the
        first of the keys ``data``/``items``/``results``/``events`` that is
        present is used; the result only counts when it is a non-empty list.
        ``None`` tells the caller to try the next endpoint.
        """
        if isinstance(data, list):
            return data
        if not isinstance(data, dict):
            return None
        events = data
        for key in ('data', 'items', 'results', 'events'):
            if key in data:
                events = data[key]
                break
        if isinstance(events, list) and events:
            return events
        return None

    @staticmethod
    def _first_present(record, keys, default=''):
        """Return the first value among ``keys`` that is neither ``None`` nor ``''``."""
        for key in keys:
            value = record.get(key)
            if value is not None and value != '':
                return value
        return default

    def get_events_by_page(self, page=1, page_size=10, event_type=None):
        """Fetch one page of events from the API.

        Args:
            page: 1-based page number sent as the ``page`` query parameter.
            page_size: value sent as the ``pageSize`` query parameter.
            event_type: optional value sent as the ``type`` query parameter.

        Returns:
            ``(events, raw_payload)`` from the first endpoint that yields a
            usable list, or ``(None, None)`` when no endpoint does.
        """
        params = {'page': page, 'pageSize': page_size}
        if event_type:
            params['type'] = event_type

        # Candidate endpoints, tried in order.
        api_endpoints = [
            f"{self.api_base}/",          # main API
            f"{self.api_base}/tt-events", # TT events API
        ]

        for api_url in api_endpoints:
            try:
                response = self.session.get(api_url, params=params, timeout=30)
                if response.status_code != 200:
                    continue
                data = response.json()
            except (requests.RequestException, ValueError):
                # Network failure or a non-JSON body: fall through to the next
                # endpoint. Anything else is a programming error and is allowed
                # to propagate to the caller.
                continue

            events = self._extract_events(data)
            if events is not None:
                return events, data

        return None, None

    def get_all_events(self, progress_callback=None):
        """Collect events across pages, de-duplicated by (name, event_date).

        Page sizes 50, 30 and 20 are tried in turn; a smaller size is only
        attempted when the first page of the previous size returned nothing.
        Paging stops at the first empty page, at a page shorter than the
        requested size, or after 20 pages.

        Args:
            progress_callback: optional callable receiving a status string
                before each page request.

        Returns:
            List of processed event dictionaries (possibly empty).
        """
        all_events = []
        seen_keys = set()  # (name, event_date) pairs already collected
        page_size_options = [50, 30, 20]
        max_pages = 20

        for page_size in page_size_options:
            for page in range(1, max_pages + 1):
                if progress_callback:
                    progress_callback(f"正在獲取第 {page} 頁資料 (頁面大小: {page_size})")

                events, _raw_data = self.get_events_by_page(
                    page=page, page_size=page_size, event_type=None
                )
                if not events:
                    if page == 1:
                        break  # nothing for this page size; try the next one
                    return all_events

                for event in events:
                    processed_event = self.process_event_data(event, page, 'All')
                    if not processed_event:
                        continue
                    key = (processed_event['name'], processed_event['event_date'])
                    if key not in seen_keys:
                        seen_keys.add(key)
                        all_events.append(processed_event)

                # A short page is treated as the last page.
                # NOTE(review): if the server ignores pageSize and caps pages
                # at a smaller size, this stops after page 1 - confirm.
                if len(events) < page_size:
                    return all_events

                time.sleep(0.5)  # small delay between page requests

            # Data was obtained with this page size; no need to try others.
            if all_events:
                break

        return all_events

    def process_event_data(self, event, page_num, event_type):
        """Normalise one raw event record.

        Args:
            event: raw record; anything that is not a ``dict`` is rejected.
            page_num: page the record came from, stored as ``page_num``.
            event_type: fallback used for ``form`` when the record has none.

        Returns:
            Dict with ``name``, ``link``, ``form``, ``event_date``,
            ``upload_date`` (all ``str``) and ``page_num``, or ``None`` for a
            non-dict record. Fields that are missing, ``None`` or empty fall
            back to the next alternative key instead of becoming ``"None"``.
        """
        if not isinstance(event, dict):
            return None

        pick = self._first_present
        name = pick(event, ('name', 'title', 'eventName'))
        form = pick(event, ('type', 'category', 'eventType'), event_type or '')
        event_date = pick(event, ('eventDate', 'startDate', 'date'))
        upload_date = pick(event, ('createdAt', 'uploadDate', 'publishDate'))

        # Build the detail-page link only when the record carries an id.
        event_id = pick(event, ('id', 'eventId'))
        link = f"{self.base_url}/gpa/zh/events/{form}/{event_id}" if event_id else ""

        return {
            'name': str(name),
            'link': link,
            'form': str(form),
            'event_date': str(event_date),
            'upload_date': str(upload_date),
            'page_num': page_num,
        }
def create_html_table(df, max_display=10):
    """Render the first ``max_display`` events of ``df`` as an HTML e-mail body.

    Args:
        df: DataFrame with the columns ``名稱``, ``超連結網址``, ``形式``,
            ``活動日期`` and ``上載日期``; may be ``None`` or empty.
        max_display: maximum number of rows rendered in the table.

    Returns:
        A complete HTML document as ``str``, or a short ``<p>`` placeholder
        when there is no data. All cell values are HTML-escaped because they
        originate from a remote API. Rows are numbered 1..N by position, not
        by DataFrame index label, so numbering stays contiguous after rows
        have been dropped.
    """
    # Local import: the module's top-level imports do not include ``html``.
    from html import escape

    if df is None or df.empty:
        return "<p>沒有找到活動資料</p>"

    total = len(df)
    display_count = max(0, min(total, max_display))
    df_display = df.head(display_count)

    # Static head section (plain string, so CSS braces need no doubling).
    page_head = """
<html>
<head>
<meta charset="UTF-8">
<style>
body { font-family: Arial, sans-serif; margin: 20px; line-height: 1.6; }
.greeting {
font-size: 18px;
color: #2c3e50;
margin-bottom: 20px;
font-weight: bold;
}
h2 { color: #2c3e50; margin-top: 20px; }
.data-source {
background-color: #e8f4f8;
padding: 15px;
border-left: 4px solid #3498db;
margin: 20px 0;
font-weight: bold;
color: #2c3e50;
}
table {
border-collapse: collapse;
width: 100%;
margin-top: 20px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
th, td {
border: 1px solid #ddd;
padding: 12px;
text-align: left;
}
th {
background-color: #3498db;
color: white;
font-weight: bold;
}
tr:nth-child(even) { background-color: #f2f2f2; }
tr:hover { background-color: #e8f4f8; }
a { color: #3498db; text-decoration: none; }
a:hover { text-decoration: underline; }
.summary {
background-color: #ecf0f1;
padding: 15px;
border-radius: 5px;
margin-bottom: 20px;
}
.copyright {
text-align: center;
margin-top: 30px;
padding: 20px;
background-color: #34495e;
color: white;
border-radius: 5px;
font-size: 14px;
}
</style>
</head>
<body>
<div class="greeting">
親愛的會員您好:
</div>
<div class="data-source">
📋 資料來源:全球政府採購商機網
</div>
<h2>🎯 最新活動資訊</h2>
"""

    parts = [page_head]
    parts.append(f"""<div class="summary">
<strong>📊 資料統計:</strong>顯示前 {display_count} 筆,共 {total} 筆活動
</div>
<table>
<thead>
<tr>
<th>序號</th>
<th>名稱</th>
<th>形式</th>
<th>活動日期</th>
<th>上載日期</th>
<th>網址</th>
</tr>
</thead>
<tbody>
""")

    for position, (_, row) in enumerate(df_display.iterrows(), start=1):
        url = row["超連結網址"]
        if url:
            href = escape(str(url), quote=True)
            link_html = f'<a href="{href}" target="_blank">查看詳情</a>'
        else:
            link_html = "無連結"
        name = escape(str(row["名稱"]))
        form = escape(str(row["形式"]))
        event_date = escape(str(row["活動日期"]))
        upload_date = escape(str(row["上載日期"]))
        parts.append(f"""
<tr>
<td>{position}</td>
<td><strong>{name}</strong></td>
<td>{form}</td>
<td>{event_date}</td>
<td>{upload_date}</td>
<td>{link_html}</td>
</tr>
""")

    parts.append("""
</tbody>
</table>
""")

    remaining = total - display_count
    if remaining > 0:
        parts.append(f"""
<div class="summary" style="margin-top: 20px;">
<strong>📝 提醒:</strong>還有 {remaining} 筆資料未顯示,
請查看附加的 CSV 檔案獲取完整資料。
</div>
""")

    parts.append("""
<div class="summary" style="margin-top: 20px;">
<strong>🤖 自動爬蟲系統</strong><br>
此郵件由 MeetTaiwan API 爬蟲系統自動產生並發送
</div>
<div class="copyright">
2025 © Copyright robert_studio
</div>
</body>
</html>
""")
    return "".join(parts)
def send_events_email(df_events, recipient_email, api_key, max_display=5):
    """Send the scraped events as an HTML e-mail through Resend.

    The HTML body shows the first ``max_display`` rows; the complete data set
    is attached as a CSV file, which is what the body's "see the attached
    CSV" reminder refers to.

    Args:
        df_events: DataFrame of events; ``None`` or empty aborts early.
        recipient_email: address placed in the ``to`` field.
        api_key: Resend API key, assigned to ``resend.api_key``.
        max_display: number of rows rendered in the HTML table.

    Returns:
        ``(True, message)`` on success, ``(False, message)`` when there is
        nothing to send or sending raised an exception.
    """
    if df_events is None or df_events.empty:
        return False, "沒有資料可發送"

    try:
        # Configure the Resend client.
        resend.api_key = api_key

        # Pass the FULL frame: create_html_table truncates by itself and needs
        # the real total for its summary line and "remaining rows" reminder.
        html_content = create_html_table(df_events, max_display=max_display)

        now = datetime.now()
        current_time = now.strftime("%Y-%m-%d %H:%M:%S")
        subject = f"📊 MeetTaiwan 最新活動資訊 - {len(df_events)}筆活動 ({current_time})"

        # Full data set as CSV; the UTF-8 BOM lets Excel detect the encoding.
        csv_bytes = df_events.to_csv(index=False).encode("utf-8-sig")
        attachment = {
            "filename": f"meettaiwan_events_{now.strftime('%Y%m%d_%H%M%S')}.csv",
            # Resend's Python SDK takes attachment content as a list of byte values.
            "content": list(csv_bytes),
        }

        r = resend.Emails.send({
            "from": "onboarding@resend.dev",
            "to": recipient_email,
            "subject": subject,
            "html": html_content,
            "attachments": [attachment],
        })
        return True, f"郵件發送成功!郵件 ID: {r.get('id', 'N/A')}"
    except Exception as e:
        # Boundary handler: any failure is reported to the caller as text so
        # the UI can display it instead of crashing.
        return False, f"郵件發送失敗: {str(e)}"
# Streamlit application body: private rendering helpers followed by main().
def _scrape_into_session():
    """Run the scraper and store the resulting DataFrame in session state."""
    # Drop any previous result before starting a new run.
    st.session_state.df_events = None
    st.session_state.scraping_done = False

    progress_bar = st.progress(0)
    status_text = st.empty()
    scraper = MeetTaiwanAPIScraper()

    def update_progress(message):
        status_text.text(f"🔄 {message}")

    # Internal field name -> displayed column label, in display order.
    column_labels = {
        'name': "名稱",
        'link': "超連結網址",
        'form': "形式",
        'event_date': "活動日期",
        'upload_date': "上載日期",
        'page_num': "頁數",
    }

    try:
        with st.spinner("正在爬取資料..."):
            events_data = scraper.get_all_events(progress_callback=update_progress)

        if not events_data:
            status_text.text("❌ 無法獲取活動資料")
            st.error("爬取失敗,可能原因:API端點變更、需要認證或網路問題")
            return

        # Select columns by name so the labels cannot be mis-assigned if the
        # key order of the event dictionaries ever changes.
        df_events = pd.DataFrame(events_data, columns=list(column_labels))
        df_events = df_events.rename(columns=column_labels)

        original_count = len(df_events)
        df_events = df_events.drop_duplicates(subset=['名稱', '活動日期'])
        deduplicated_count = len(df_events)

        st.session_state.df_events = df_events
        st.session_state.scraping_done = True

        progress_bar.progress(100)
        status_text.text(f"✅ 爬取完成!獲得 {deduplicated_count} 筆活動資料")
        if original_count != deduplicated_count:
            st.info(f"📝 去除了 {original_count - deduplicated_count} 筆重複資料")
        st.success(f"🎉 成功獲取 {len(df_events)} 筆活動資料!")
    except Exception as e:
        # UI boundary: show the failure instead of crashing the app.
        status_text.text(f"❌ 爬取過程發生錯誤: {str(e)}")
        st.error(f"執行錯誤:{str(e)}")


def _render_email_panel(api_key, recipient_email, max_display):
    """Render the e-mail column and send the report when the button is pressed."""
    st.header("📧 郵件發送")
    df_events = st.session_state.df_events
    if df_events is None:
        st.info("請先爬取資料")
        return

    st.success(f"📊 已載入 {len(df_events)} 筆資料")
    if not st.button("📨 發送郵件報告", type="secondary", use_container_width=True):
        return

    if not api_key:
        st.error("請輸入 Resend API Key")
    elif not recipient_email:
        st.error("請輸入收件人郵箱")
    else:
        with st.spinner("正在發送郵件..."):
            success, message = send_events_email(
                df_events, recipient_email, api_key, max_display
            )
        if success:
            st.success(f"✅ {message}")
        else:
            st.error(f"❌ {message}")


def _render_metrics(df_events):
    """Render the four summary metrics above the data table."""
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("總活動數", len(df_events))
    with col2:
        st.metric("活動形式數", df_events['形式'].nunique())
    with col3:
        try:
            parsed = pd.to_datetime(df_events['活動日期'], errors='coerce')
            st.metric("涵蓋年份", parsed.dt.year.nunique())
        except Exception:
            # Best-effort metric: unparseable dates must not break the page.
            st.metric("涵蓋年份", "N/A")
    with col4:
        st.metric("來源頁面數", df_events['頁數'].nunique())


def _render_table(df_events):
    """Render the filter controls and the filtered data table."""
    st.subheader("📊 詳細資料")
    col1, col2 = st.columns(2)
    with col1:
        form_options = ['全部'] + list(df_events['形式'].unique())
        selected_form = st.selectbox("選擇活動形式", form_options)
    with col2:
        total = len(df_events)
        if total > 10:
            upper = min(100, total)
            # Default must not exceed the upper bound when fewer than 20 rows exist.
            display_count = st.slider("顯示筆數", 10, upper, min(20, upper))
        else:
            # st.slider needs min_value < max_value; with 10 rows or fewer
            # there is nothing to choose, so show everything.
            display_count = total

    if selected_form != '全部':
        filtered_df = df_events[df_events['形式'] == selected_form]
    else:
        filtered_df = df_events

    st.dataframe(
        filtered_df.head(display_count),
        use_container_width=True,
        hide_index=True,
    )


def _render_downloads(df_events):
    """Render the CSV and Excel download buttons."""
    st.subheader("💾 資料下載")
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    col1, col2 = st.columns(2)
    with col1:
        # to_csv() ignores ``encoding`` when it returns a string, so encode
        # explicitly; the BOM lets Excel detect UTF-8.
        csv = df_events.to_csv(index=False).encode('utf-8-sig')
        st.download_button(
            label="📄 下載 CSV 檔案",
            data=csv,
            file_name=f"meettaiwan_events_{timestamp}.csv",
            mime="text/csv",
            use_container_width=True,
        )
    with col2:
        buffer = io.BytesIO()
        with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
            df_events.to_excel(writer, index=False, sheet_name='活動資料')
        st.download_button(
            label="📊 下載 Excel 檔案",
            data=buffer.getvalue(),
            file_name=f"meettaiwan_events_{timestamp}.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            use_container_width=True,
        )


def _render_charts(df_events):
    """Render the distribution bar charts."""
    st.subheader("📈 資料統計")
    col1, col2 = st.columns(2)
    with col1:
        st.bar_chart(df_events['形式'].value_counts(), use_container_width=True)
        st.caption("活動形式分布")
    with col2:
        if '頁數' in df_events.columns:
            page_counts = df_events['頁數'].value_counts().sort_index()
            st.bar_chart(page_counts, use_container_width=True)
            st.caption("各頁面資料分布")


def main():
    """Build the Streamlit page: sidebar settings, scrape/e-mail actions, preview."""
    # Local import: the module's top-level imports do not include ``os``.
    import os

    st.title("🎯 MeetTaiwan 活動爬蟲系統")
    st.markdown("**全球政府採購商機網活動資訊自動抓取與郵件發送系統**")

    # Sidebar settings
    st.sidebar.header("⚙️ 系統設定")
    # SECURITY: never ship a real API key as the widget default. The key is
    # taken from the RESEND_API_KEY environment variable when set, otherwise
    # the user must type it in.
    api_key = st.sidebar.text_input(
        "Resend API Key",
        value=os.environ.get("RESEND_API_KEY", ""),
        type="password",
        help="請輸入您的 Resend API Key",
    )
    recipient_email = st.sidebar.text_input(
        "收件人郵箱",
        value="cjhuang38@gmail.com",
        help="請輸入要接收報告的郵箱地址",
    )
    max_display = st.sidebar.slider(
        "郵件顯示筆數",
        min_value=5,
        max_value=20,
        value=10,
        help="設定郵件中要顯示的活動筆數",
    )

    # Session state survives Streamlit reruns; initialise it once.
    if 'df_events' not in st.session_state:
        st.session_state.df_events = None
    if 'scraping_done' not in st.session_state:
        st.session_state.scraping_done = False

    # Main action area
    col1, col2 = st.columns([2, 1])
    with col1:
        st.header("📊 資料爬取")
        if st.button("🚀 開始爬取活動資料", type="primary", use_container_width=True):
            _scrape_into_session()
    with col2:
        _render_email_panel(api_key, recipient_email, max_display)

    # Data preview area
    df_events = st.session_state.df_events
    if df_events is not None:
        st.header("📋 資料預覽")
        _render_metrics(df_events)
        _render_table(df_events)
        _render_downloads(df_events)
        _render_charts(df_events)

    # Footer
    st.markdown("---")
    st.markdown(
        """
<div style='text-align: center; color: #666;'>
<p>🤖 MeetTaiwan API 爬蟲系統 | 2025 © Copyright robert_studio</p>
<p>資料來源:全球政府採購商機網</p>
</div>
""",
        unsafe_allow_html=True,
    )
# Script entry point: build the page only when this file is executed directly.
if __name__ == "__main__":
    main()