# (Repository-viewer header residue, kept for provenance; not program code.)
# cormort's picture
# Update app.py
# 1758b3a verified
import gradio as gr
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
# --- Core functions ---

# Root of the IVOD site: used to build the request URLs and to resolve the
# relative playback links found in the scraped pages.
BASE_URL = "https://ivod.ly.gov.tw"

# Headers sent with every HTTP request made by the crawlers.
# NOTE(review): the WebKit/Safari tokens read "5.37.36" rather than the usual
# "537.36". Kept exactly as-is here; confirm whether that is intentional.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/5.37.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/5.37.36'
    ),
}
def get_all_legislators_map():
    """
    (Crawler A1) Scrape the <select> options on the page and build a name -> ID lookup.

    Returns:
        A 2-tuple ``(mapping, dropdown)``:

        * ``mapping`` maps each option's stripped text to its ``value``
          attribute, keeping only options whose value is all digits and whose
          text is non-empty. It is ``{}`` when nothing matched or the request
          failed.
        * ``dropdown`` is a ``gr.Dropdown`` holding the sorted names (first
          name preselected) on success, or an empty dropdown whose label
          carries the error text otherwise.
    """
    roster_url = f"{BASE_URL}/Demand?targetSession=current"
    print("--- 爬蟲 A1 啟動 ---")
    try:
        reply = requests.get(roster_url, headers=HEADERS, timeout=10)
        # Force UTF-8 decoding of the body before it is parsed.
        reply.encoding = 'utf-8'
        reply.raise_for_status()
        page = BeautifulSoup(reply.text, 'html.parser')

        name_to_id = {}
        for select_tag in page.find_all('select'):
            for option_tag in select_tag.find_all('option'):
                option_value = option_tag.get('value')
                option_label = option_tag.text.strip()
                # Filter: skip anything without a purely numeric value or
                # without visible text.
                if not (option_value and option_value.isdigit() and option_label):
                    continue
                name_to_id[option_label] = option_value

        print(f"爬取完成,共找到 {len(name_to_id)} 位委員。")

        if not name_to_id:
            empty_dropdown = gr.Dropdown(choices=[], value=None, label="錯誤:未找到資料")
            return {}, empty_dropdown

        ordered_names = sorted(name_to_id)
        loaded_dropdown = gr.Dropdown(
            choices=ordered_names,
            value=ordered_names[0],
            interactive=True,
            label="1. 選擇委員 (已載入)"
        )
        return name_to_id, loaded_dropdown
    except Exception as e:
        # Boundary handler: any failure is reported to the UI via the
        # dropdown label, and nothing is raised to the caller.
        msg = f"錯誤:爬取委員列表失敗。 {e}"
        print(msg)
        return {}, gr.Dropdown(choices=[], label=msg)
def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
    """
    (Crawler A2) Crawl the list of speech clips for the given legislator.

    Args:
        legislator_name: Name used as the key into ``legislator_map``.
        legislator_map: name -> ID mapping. May be ``None`` or empty (for
            example when called through the API client); in that case, or
            when the name is not in it, the roster is fetched again with
            ``get_all_legislators_map``.

    Returns:
        A 2-tuple ``(clips, message)``. On success ``clips`` is a list of
        dicts with the keys ``legislator_name``, ``meeting_name`` and
        ``playback_page_url`` and ``message`` is a summary line. On any
        failure ``clips`` is ``None`` and ``message`` describes the error.
        No exception is raised for crawl failures.
    """
    # --- [Fix 2] API call guard ---
    # When called through the API client, legislator_map may be None or an
    # empty dict. Fetch the roster again on the spot so the ID can be found.
    if not legislator_map or legislator_name not in legislator_map:
        print(f"⚠️ 檢測到 Map 為空或找不到 '{legislator_name}',正在重新獲取名單...")
        new_map, _ = get_all_legislators_map()
        if isinstance(new_map, dict) and new_map:
            legislator_map = new_map
        else:
            return None, "錯誤:無法獲取委員名單,請稍後再試。"

    # 1. Look up the ID
    legislator_id = legislator_map.get(legislator_name)
    if not legislator_id:
        return None, f"錯誤:找不到委員 '{legislator_name}' 的 ID。"

    legislator_page_url = f"{BASE_URL}/Demand/ListLgltVideo/{legislator_id}"
    print(f"--- 爬蟲 A2 啟動: {legislator_name} (ID: {legislator_id}) ---")

    all_clips = []
    page_num = 1
    # Compiled once instead of once per <li>.
    play_title_pattern = re.compile(r"委員寬頻影片")
    total_marker_pattern = re.compile(r"var total =")
    try:
        while True:
            page_url = f"{legislator_page_url}?page={page_num}"
            response = requests.get(page_url, headers=HEADERS, timeout=10)
            # Decode as UTF-8, consistent with crawler A1, so meeting names
            # are not mis-decoded when the server omits a charset.
            response.encoding = 'utf-8'
            response.raise_for_status()
            # The 'lxml' parser needs the lxml package (pip install lxml).
            soup = BeautifulSoup(response.text, 'lxml')

            clip_lists = soup.find_all('ul', id='clipUl')
            if not clip_lists:
                break

            found_clips_on_page = 0
            for ul in clip_lists:
                for item in ul.find_all('li'):
                    play_link_tag = item.find('a', title=play_title_pattern)
                    # Only items that carry a playback link are recorded.
                    if not (play_link_tag and play_link_tag.get('href')):
                        continue

                    # Tag.string is None when the span is empty or contains
                    # nested markup; get_text() handles those cases instead
                    # of raising and aborting the whole crawl.
                    meet_name_tag = item.select_one("span.metdec")
                    meeting_name = meet_name_tag.get_text(strip=True) if meet_name_tag else ""

                    all_clips.append({
                        "legislator_name": legislator_name,
                        "meeting_name": meeting_name or "會議名稱未知",
                        "playback_page_url": urljoin(BASE_URL, play_link_tag.get('href')),
                    })
                    found_clips_on_page += 1

            if found_clips_on_page == 0:
                break

            # Simple pagination check: the page embeds `var total` and
            # `var pageSize` in a script. Stop once the pages fetched so far
            # cover the total, or when that script is missing entirely.
            pagination_script = soup.find('script', string=total_marker_pattern)
            if not pagination_script:
                break
            script_text = pagination_script.string
            total_match = re.search(r'var total = "(\d+)"', script_text)
            size_match = re.search(r'var pageSize = "(\d+)"', script_text)
            if total_match and size_match and (
                page_num * int(size_match.group(1)) >= int(total_match.group(1))
            ):
                break
            page_num += 1

        msg = f"爬取完成!共抓取 {legislator_name} 委員 {len(all_clips)} 筆發言片段。"
        print(msg)
        return all_clips, msg
    except Exception as e:
        # Boundary handler: report the failure as a status message rather
        # than raising into the Gradio event handler.
        msg = f"錯誤:爬取影片失敗。 {e}"
        print(msg)
        return None, msg
# --- Build the Gradio interface ---
# NOTE(review): the original indentation of this section was lost. The nesting
# below (dropdown and button inside the Row; outputs and event wiring inside
# the Blocks context) is the conventional reading. Confirm against the
# deployed layout.
with gr.Blocks() as iface:
    gr.Markdown("# IVOD 委員發言爬蟲 API (修正版)")
    legislator_map_state = gr.State(value={})  # start with an empty dict
    with gr.Row():
        # --- [Fix 1] allow_custom_value=True ---
        # Lets an API client pass in any string without it being rejected
        # because the initial choices list is empty.
        legislator_input = gr.Dropdown(
            label="1. 選擇委員",
            choices=[],
            interactive=True,
            allow_custom_value=True
        )
        analyze_button = gr.Button("開始爬取", variant="primary")
    status_output = gr.Textbox(label="處理狀態", interactive=False)
    json_output = gr.JSON(label="發言片段 URL 列表")
    # Fetch the roster automatically when the page loads; the result fills
    # both the state holding the name -> ID map and the dropdown.
    iface.load(
        fn=get_all_legislators_map,
        inputs=[],
        outputs=[legislator_map_state, legislator_input],
        api_name="get_all_legislators_map"
    )
    # Clicking the button crawls the clips for the selected legislator and
    # writes the clip list to the JSON view and the message to the status box.
    analyze_button.click(
        fn=get_videos_for_legislator,
        inputs=[legislator_input, legislator_map_state],
        outputs=[json_output, status_output],
        api_name="get_videos_for_legislator"
    )
# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()