cjian2025 committed on
Commit
2fa1566
·
verified ·
1 Parent(s): 53c4377

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +568 -0
app.py ADDED
@@ -0,0 +1,568 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ import pandas as pd
4
+ import time
5
+ import json
6
+ from urllib.parse import urlencode
7
+ import resend
8
+ from datetime import datetime
9
+ import io
10
+
11
# Page configuration: browser-tab title/icon, wide layout, sidebar open on load.
st.set_page_config(
    page_title="MeetTaiwan 活動爬蟲系統",
    page_icon="🎯",
    layout="wide",
    initial_sidebar_state="expanded",
)
18
+
19
class MeetTaiwanAPIScraper:
    """Client that pages through the MeetTaiwan events JSON API.

    Holds a ``requests.Session`` pre-configured with browser-like headers and
    normalises every raw event into a flat dict with the keys ``name``,
    ``link``, ``form``, ``event_date``, ``upload_date`` and ``page_num``
    (in that order).
    """

    # Keys under which a dict-shaped response may carry its event list,
    # probed in this order; the first key that is present wins.
    _EVENT_LIST_KEYS = ('data', 'items', 'results', 'events')

    def __init__(self):
        self.base_url = "https://service.meettaiwan.com"
        self.api_base = "https://service.meettaiwan.com/gpa/api/v2/events"

        # Reuse one session so every request shares the same headers.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Referer': 'https://service.meettaiwan.com/gpa/zh/events/list;type=all',
            'Origin': 'https://service.meettaiwan.com'
        })

    @classmethod
    def _extract_events(cls, data):
        """Return the event list carried by a decoded payload, or None."""
        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            for key in cls._EVENT_LIST_KEYS:
                if key in data:
                    candidate = data[key]
                    return candidate if isinstance(candidate, list) else None
        return None

    @staticmethod
    def _first_value(event, keys, default=''):
        """Return the first value among *keys* that is neither None nor ''."""
        for key in keys:
            value = event.get(key)
            if value is not None and value != '':
                return value
        return default

    def get_events_by_page(self, page=1, page_size=10, event_type=None):
        """Fetch one page of events, trying each known endpoint in turn.

        Args:
            page: 1-based page number sent as the ``page`` query parameter.
            page_size: Sent as the ``pageSize`` query parameter.
            event_type: Optional value for the ``type`` query parameter;
                omitted from the request when falsy.

        Returns:
            ``(events, raw_payload)`` from the first endpoint that answers
            HTTP 200 with a non-empty list of events, otherwise
            ``(None, None)``.
        """
        params = {'page': page, 'pageSize': page_size}
        if event_type:
            params['type'] = event_type

        api_endpoints = (
            f"{self.api_base}/",          # primary API
            f"{self.api_base}/tt-events"  # TT events API
        )

        for api_url in api_endpoints:
            try:
                response = self.session.get(api_url, params=params, timeout=30)
                if response.status_code != 200:
                    continue
                data = response.json()
            except (requests.RequestException, ValueError):
                # Network failure or a non-JSON body: best effort, so move on
                # to the next endpoint instead of aborting the whole scrape.
                continue

            events = self._extract_events(data)
            # An empty list is treated like "no data" for both list- and
            # dict-shaped payloads so the fallback endpoint still gets tried.
            if events:
                return events, data

        return None, None

    def get_all_events(self, progress_callback=None):
        """Collect events across pages, de-duplicated by (name, event_date).

        Page sizes 50, 30 and 20 are tried in order; a smaller size is only
        attempted when the previous one produced nothing on its first page.
        Paging stops at the first empty page, at the first page shorter than
        the page size, or after 20 pages.

        Args:
            progress_callback: Optional callable receiving one status string
                before each page request.

        Returns:
            List of normalised event dicts (see ``process_event_data``).
        """
        all_events = []
        seen = set()  # (name, event_date) keys already collected
        page_size_options = [50, 30, 20]
        max_pages = 20

        for page_size in page_size_options:
            for page in range(1, max_pages + 1):
                if progress_callback:
                    progress_callback(f"正在獲取第 {page} 頁資料 (頁面大小: {page_size})")

                events, _raw = self.get_events_by_page(
                    page=page, page_size=page_size, event_type=None
                )

                if not events:
                    if page == 1:
                        break  # nothing at this page size; try the next one
                    return all_events

                for event in events:
                    processed = self.process_event_data(event, page, 'All')
                    if not processed:
                        continue
                    key = (processed['name'], processed['event_date'])
                    if key in seen:
                        continue
                    seen.add(key)
                    all_events.append(processed)

                # A short page is taken to be the last one.
                if len(events) < page_size:
                    return all_events

                time.sleep(0.5)  # brief pause between page requests

            # Data was obtained with this page size, so do not retry smaller ones.
            if all_events:
                break

        return all_events

    def process_event_data(self, event, page_num, event_type):
        """Normalise one raw event into the flat record used by the app.

        Each output field is taken from the first of several alternative keys
        that holds a non-empty value, so a key that is present but None/''
        no longer produces the literal string 'None'.

        Args:
            event: Raw event as decoded from the API; anything that is not a
                dict is rejected.
            page_num: Page the event was found on, stored as ``page_num``.
            event_type: Fallback for ``form`` when the event carries no
                type/category of its own.

        Returns:
            Dict with keys ``name``, ``link``, ``form``, ``event_date``,
            ``upload_date``, ``page_num`` (in that order), or None when
            *event* is not a dict.
        """
        if not isinstance(event, dict):
            return None

        name = self._first_value(event, ('name', 'title', 'eventName'))
        form = self._first_value(
            event, ('type', 'category', 'eventType'), event_type or ''
        )
        event_date = self._first_value(event, ('eventDate', 'startDate', 'date'))
        upload_date = self._first_value(
            event, ('createdAt', 'uploadDate', 'publishDate')
        )

        # NOTE(review): when the event has no type of its own, the fallback
        # event_type (e.g. 'All') ends up in the URL path - confirm that the
        # site actually resolves such links.
        event_id = self._first_value(event, ('id', 'eventId'))
        if event_id:
            link = f"{self.base_url}/gpa/zh/events/{form}/{event_id}"
        else:
            link = ""

        return {
            'name': str(name),
            'link': link,
            'form': str(form),
            'event_date': str(event_date),
            'upload_date': str(upload_date),
            'page_num': page_num
        }
170
+
171
def create_html_table(df, max_display=10):
    """Render the first rows of the events DataFrame as a full HTML e-mail page.

    Args:
        df: DataFrame with the columns 名稱, 形式, 活動日期, 上載日期 and
            超連結網址. May be None or empty.
        max_display: Maximum number of rows to render in the table.

    Returns:
        A complete HTML document as a string, or a short ``<p>`` notice when
        *df* is None or empty. When *df* has more than *max_display* rows, a
        notice stating how many rows were left out is appended.
    """
    # Local import keeps this fix self-contained; the file does not import html.
    from html import escape

    if df is None or df.empty:
        return "<p>沒有找到活動資料</p>"

    total = len(df)
    display_count = min(total, max_display)
    df_display = df.head(display_count)

    parts = [f"""
<html>
<head>
    <meta charset="UTF-8">
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; line-height: 1.6; }}
        .greeting {{
            font-size: 18px;
            color: #2c3e50;
            margin-bottom: 20px;
            font-weight: bold;
        }}
        h2 {{ color: #2c3e50; margin-top: 20px; }}
        .data-source {{
            background-color: #e8f4f8;
            padding: 15px;
            border-left: 4px solid #3498db;
            margin: 20px 0;
            font-weight: bold;
            color: #2c3e50;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
            margin-top: 20px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        }}
        th, td {{
            border: 1px solid #ddd;
            padding: 12px;
            text-align: left;
        }}
        th {{
            background-color: #3498db;
            color: white;
            font-weight: bold;
        }}
        tr:nth-child(even) {{ background-color: #f2f2f2; }}
        tr:hover {{ background-color: #e8f4f8; }}
        a {{ color: #3498db; text-decoration: none; }}
        a:hover {{ text-decoration: underline; }}
        .summary {{
            background-color: #ecf0f1;
            padding: 15px;
            border-radius: 5px;
            margin-bottom: 20px;
        }}
        .copyright {{
            text-align: center;
            margin-top: 30px;
            padding: 20px;
            background-color: #34495e;
            color: white;
            border-radius: 5px;
            font-size: 14px;
        }}
    </style>
</head>
<body>
    <div class="greeting">
        親愛的會員您好:
    </div>

    <div class="data-source">
        📋 資料來源:全球政府採購商機網
    </div>

    <h2>🎯 最新活動資訊</h2>
    <div class="summary">
        <strong>📊 資料統計:</strong>顯示前 {display_count} 筆,共 {total} 筆活動
    </div>
    <table>
        <thead>
            <tr>
                <th>序號</th>
                <th>名稱</th>
                <th>形式</th>
                <th>活動日期</th>
                <th>上載日期</th>
                <th>網址</th>
            </tr>
        </thead>
        <tbody>
"""]

    # Number rows by position: the DataFrame index may have gaps (for example
    # after drop_duplicates), so index + 1 is not a reliable sequence number.
    for seq, (_, row) in enumerate(df_display.iterrows(), start=1):
        url = row["超連結網址"]
        # Every value originates from a remote API, so escape it before it is
        # placed inside markup or an attribute.
        if url:
            link_html = (
                f'<a href="{escape(str(url), quote=True)}" '
                'target="_blank">查看詳情</a>'
            )
        else:
            link_html = "無連結"
        parts.append(f"""
            <tr>
                <td>{seq}</td>
                <td><strong>{escape(str(row['名稱']))}</strong></td>
                <td>{escape(str(row['形式']))}</td>
                <td>{escape(str(row['活動日期']))}</td>
                <td>{escape(str(row['上載日期']))}</td>
                <td>{link_html}</td>
            </tr>
""")

    parts.append("""
        </tbody>
    </table>
""")

    if total > max_display:
        parts.append(f"""
    <div class="summary" style="margin-top: 20px;">
        <strong>📝 提醒:</strong>還有 {total - max_display} 筆資料未顯示,
        請查看附加的 CSV 檔案獲取完整資料。
    </div>
""")

    parts.append("""
    <div class="summary" style="margin-top: 20px;">
        <strong>🤖 自動爬蟲系統</strong><br>
        此郵件由 MeetTaiwan API 爬蟲系統自動產生並發送
    </div>

    <div class="copyright">
        2025 © Copyright robert_studio
    </div>
</body>
</html>
""")

    return "".join(parts)
305
+
306
def send_events_email(df_events, recipient_email, api_key, max_display=5):
    """Send the scraped events as an HTML e-mail through Resend.

    Args:
        df_events: DataFrame of events; nothing is sent when None or empty.
        recipient_email: Address passed as the message's ``to`` field.
        api_key: Resend API key, assigned to ``resend.api_key``.
        max_display: Number of rows rendered in the e-mail body.

    Returns:
        ``(success, message)`` - a bool plus a user-facing status string.
        Exceptions raised while building or sending the mail are caught and
        reported through the message instead of propagating.
    """
    if df_events is None or df_events.empty:
        return False, "沒有資料可發送"

    try:
        resend.api_key = api_key

        # Pass the complete frame: create_html_table truncates to max_display
        # itself and needs the real total for its summary line and for the
        # "rows not shown" notice. Truncating here first made the e-mail
        # report max_display as the total and suppressed that notice.
        html_content = create_html_table(df_events, max_display=max_display)

        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        subject = f"📊 MeetTaiwan 最新活動資訊 - {len(df_events)}筆活動 ({current_time})"

        r = resend.Emails.send({
            "from": "onboarding@resend.dev",
            "to": recipient_email,
            "subject": subject,
            "html": html_content
        })

        return True, f"郵件發送成功!郵件 ID: {r.get('id', 'N/A')}"

    except Exception as e:
        # Boundary handler: the UI shows the message rather than a traceback.
        return False, f"郵件發送失敗: {str(e)}"
337
+
338
+ # Streamlit 應用程式主體
339
def main():
    """Render the Streamlit UI: scrape events, preview/download them, e-mail a report.

    The scraped DataFrame is kept in ``st.session_state.df_events`` so that it
    survives Streamlit's script reruns between button clicks.
    """
    # Local import keeps this fix self-contained; the file does not import os.
    import os

    # Title and description
    st.title("🎯 MeetTaiwan 活動爬蟲系統")
    st.markdown("**全球政府採購商機網活動資訊自動抓取與郵件發送系統**")

    # Sidebar settings
    st.sidebar.header("⚙️ 系統設定")

    # SECURITY: the API key used to be hard-coded here as the widget default,
    # which publishes the credential with the source. It is now read from the
    # RESEND_API_KEY environment variable (empty when unset). The key that was
    # committed should be treated as compromised and rotated.
    api_key = st.sidebar.text_input(
        "Resend API Key",
        value=os.environ.get("RESEND_API_KEY", ""),
        type="password",
        help="請輸入您的 Resend API Key"
    )

    # Recipient; REPORT_RECIPIENT_EMAIL overrides the built-in default.
    recipient_email = st.sidebar.text_input(
        "收件人郵箱",
        value=os.environ.get("REPORT_RECIPIENT_EMAIL", "cjhuang38@gmail.com"),
        help="請輸入要接收報告的郵箱地址"
    )

    # Number of rows rendered in the e-mail body
    max_display = st.sidebar.slider(
        "郵件顯示筆數",
        min_value=5,
        max_value=20,
        value=10,
        help="設定郵件中要顯示的活動筆數"
    )

    # Initialise session state
    if 'df_events' not in st.session_state:
        st.session_state.df_events = None
    if 'scraping_done' not in st.session_state:
        st.session_state.scraping_done = False

    # Main action area
    scrape_col, mail_col = st.columns([2, 1])

    with scrape_col:
        st.header("📊 資料爬取")

        if st.button("🚀 開始爬取活動資料", type="primary", use_container_width=True):
            # Drop any previous result
            st.session_state.df_events = None
            st.session_state.scraping_done = False

            progress_bar = st.progress(0)
            status_text = st.empty()

            scraper = MeetTaiwanAPIScraper()

            def update_progress(message):
                status_text.text(f"🔄 {message}")

            try:
                with st.spinner("正在爬取資料..."):
                    events_data = scraper.get_all_events(progress_callback=update_progress)

                if events_data:
                    # Rename by key rather than by position so the labels stay
                    # correct even if the record's key order ever changes.
                    column_labels = {
                        'name': "名稱",
                        'link': "超連結網址",
                        'form': "形式",
                        'event_date': "活動日期",
                        'upload_date': "上載日期",
                        'page_num': "頁數",
                    }
                    df_events = pd.DataFrame(events_data).rename(columns=column_labels)

                    # Remove duplicates
                    original_count = len(df_events)
                    df_events = df_events.drop_duplicates(subset=['名稱', '活動日期'])
                    deduplicated_count = len(df_events)

                    st.session_state.df_events = df_events
                    st.session_state.scraping_done = True

                    progress_bar.progress(100)
                    status_text.text(f"✅ 爬取完成!獲得 {deduplicated_count} 筆活動資料")

                    if original_count != deduplicated_count:
                        st.info(f"📝 去除了 {original_count - deduplicated_count} 筆重複資料")

                    st.success(f"🎉 成功獲取 {len(df_events)} 筆活動資料!")

                else:
                    status_text.text("❌ 無法獲取活動資料")
                    st.error("爬取失敗,可能原因:API端點變更、需要認證或網路問題")

            except Exception as e:
                # UI boundary: surface the failure to the user.
                status_text.text(f"❌ 爬取過程發生錯誤: {str(e)}")
                st.error(f"執行錯誤:{str(e)}")

    with mail_col:
        st.header("📧 郵件發送")

        if st.session_state.df_events is not None:
            st.success(f"📊 已載入 {len(st.session_state.df_events)} 筆資料")

            if st.button("📨 發送郵件報告", type="secondary", use_container_width=True):
                if not api_key:
                    st.error("請輸入 Resend API Key")
                elif not recipient_email:
                    st.error("請輸入收件人郵箱")
                else:
                    with st.spinner("正在發送郵件..."):
                        success, message = send_events_email(
                            st.session_state.df_events,
                            recipient_email,
                            api_key,
                            max_display
                        )

                    if success:
                        st.success(f"✅ {message}")
                    else:
                        st.error(f"❌ {message}")
        else:
            st.info("請先爬取資料")

    # Data preview area
    if st.session_state.df_events is not None:
        events_df = st.session_state.df_events
        total_rows = len(events_df)

        st.header("📋 資料預覽")

        # Summary metrics
        m_total, m_forms, m_years, m_pages = st.columns(4)
        with m_total:
            st.metric("總活動數", total_rows)
        with m_forms:
            st.metric("活動形式數", events_df['形式'].nunique())
        with m_years:
            try:
                year_counts = pd.to_datetime(
                    events_df['活動日期'], errors='coerce'
                ).dt.year.nunique()
                st.metric("涵蓋年份", year_counts)
            except (AttributeError, TypeError, ValueError):
                # Dates that cannot be handled as datetimes: show a placeholder.
                st.metric("涵蓋年份", "N/A")
        with m_pages:
            st.metric("來源頁面數", events_df['頁數'].nunique())

        # Data table
        st.subheader("📊 詳細資料")

        # Filter options
        filter_col, count_col = st.columns(2)
        with filter_col:
            form_options = ['全部'] + list(events_df['形式'].unique())
            selected_form = st.selectbox("選擇活動形式", form_options)

        with count_col:
            # The slider used fixed min=10 / value=20, which Streamlit rejects
            # when there are fewer than 20 rows (default above the maximum) or
            # at most 10 rows (no valid range). Clamp the default, and skip
            # the slider entirely when there is nothing to choose.
            if total_rows > 10:
                upper = min(100, total_rows)
                display_count = st.slider("顯示筆數", 10, upper, min(20, upper))
            else:
                display_count = total_rows

        # Apply the filter
        if selected_form != '全部':
            filtered_df = events_df[events_df['形式'] == selected_form]
        else:
            filtered_df = events_df

        st.dataframe(
            filtered_df.head(display_count),
            use_container_width=True,
            hide_index=True
        )

        # Downloads
        st.subheader("💾 資料下載")
        csv_col, xlsx_col = st.columns(2)

        with csv_col:
            csv = events_df.to_csv(index=False, encoding='utf-8-sig')
            st.download_button(
                label="📄 下載 CSV 檔案",
                data=csv,
                file_name=f"meettaiwan_events_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime="text/csv",
                use_container_width=True
            )

        with xlsx_col:
            buffer = io.BytesIO()
            with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
                events_df.to_excel(writer, index=False, sheet_name='活動資料')

            st.download_button(
                label="📊 下載 Excel 檔案",
                data=buffer.getvalue(),
                file_name=f"meettaiwan_events_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                use_container_width=True
            )

        # Charts
        st.subheader("📈 資料統計")

        chart_form_col, chart_page_col = st.columns(2)

        with chart_form_col:
            # Distribution by event form
            form_counts = events_df['形式'].value_counts()
            st.bar_chart(form_counts, use_container_width=True)
            st.caption("活動形式分布")

        with chart_page_col:
            # Distribution by source page
            if '頁數' in events_df.columns:
                page_counts = events_df['頁數'].value_counts().sort_index()
                st.bar_chart(page_counts, use_container_width=True)
                st.caption("各頁面資料分布")

    # Footer
    st.markdown("---")
    st.markdown(
        """
        <div style='text-align: center; color: #666;'>
            <p>🤖 MeetTaiwan API 爬蟲系統 | 2025 © Copyright robert_studio</p>
            <p>資料來源:全球政府採購商機網</p>
        </div>
        """,
        unsafe_allow_html=True
    )
566
+
567
# Script entry point: build the UI only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()