# Hugging Face Space: IVOD (Legislative Yuan video-on-demand) speech scraper.
import gradio as gr
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json

# --- Core definitions ---
# Root of the Legislative Yuan IVOD (video-on-demand) site; all scraped
# URLs are built relative to this.
BASE_URL = "https://ivod.ly.gov.tw"
# Browser-like User-Agent so the site serves the regular HTML pages
# instead of rejecting a bare scripted client.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/5.37.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/5.37.36'
}
def get_all_legislators_map():
    """(Scraper A1) Build a legislator name -> ID lookup from the page's
    <select> options.

    Returns:
        tuple: ``(name_to_id, gr.Dropdown)`` — the lookup dict plus a
        Dropdown update pre-filled with the sorted names. On any failure
        the dict is empty and the Dropdown label carries the error text.
    """
    listing_url = f"{BASE_URL}/Demand?targetSession=current"
    print(f"--- 爬蟲 A1 啟動 ---")
    name_to_id = {}
    try:
        resp = requests.get(listing_url, headers=HEADERS, timeout=10)
        resp.encoding = 'utf-8'
        resp.raise_for_status()
        page = BeautifulSoup(resp.text, 'html.parser')
        # Walk every <option> under every <select>; keep only entries whose
        # value is a pure numeric ID and whose label is non-empty.
        for opt in page.select('select option'):
            opt_id = opt.get('value')
            opt_name = opt.text.strip()
            if opt_id and opt_id.isdigit() and opt_name:
                name_to_id[opt_name] = opt_id
        print(f"爬取完成,共找到 {len(name_to_id)} 位委員。")
        if not name_to_id:
            return {}, gr.Dropdown(choices=[], value=None, label="錯誤:未找到資料")
        sorted_names = sorted(name_to_id)
        dropdown = gr.Dropdown(
            choices=sorted_names,
            value=sorted_names[0],
            interactive=True,
            label="1. 選擇委員 (已載入)"
        )
        return name_to_id, dropdown
    except Exception as e:
        msg = f"錯誤:爬取委員列表失敗。 {e}"
        print(msg)
        return {}, gr.Dropdown(choices=[], label=msg)
def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
    """(Scraper A2) Collect every speech-clip entry for one legislator.

    Args:
        legislator_name: Display name as shown in the IVOD dropdown.
        legislator_map: name -> ID map from ``get_all_legislators_map()``.
            May be None/empty when invoked through an API client.

    Returns:
        tuple: ``(clips, status_message)`` where ``clips`` is a list of dicts
        with ``legislator_name``, ``meeting_name`` and ``playback_page_url``;
        on failure returns ``(None, error_message)``.
    """
    # --- [Fix 2] API-call guard ---
    # An API client may pass a None/empty map; re-fetch the roster on the
    # spot so the ID lookup below can still succeed.
    if not legislator_map or legislator_name not in legislator_map:
        print(f"⚠️ 檢測到 Map 為空或找不到 '{legislator_name}',正在重新獲取名單...")
        new_map, _ = get_all_legislators_map()
        if isinstance(new_map, dict) and new_map:
            legislator_map = new_map
        else:
            return None, "錯誤:無法獲取委員名單,請稍後再試。"
    # 1. Resolve the legislator's numeric ID.
    legislator_id = legislator_map.get(legislator_name)
    if not legislator_id:
        return None, f"錯誤:找不到委員 '{legislator_name}' 的 ID。"
    legislator_page_url = f"{BASE_URL}/Demand/ListLgltVideo/{legislator_id}"
    print(f"--- 爬蟲 A2 啟動: {legislator_name} (ID: {legislator_id}) ---")
    all_clips = []
    page_num = 1
    try:
        while True:
            page_url = f"{legislator_page_url}?page={page_num}"
            response = requests.get(page_url, headers=HEADERS, timeout=10)
            response.raise_for_status()
            # 'html.parser' for consistency with scraper A1 and to avoid a
            # hard lxml dependency (bs4 raises FeatureNotFound without it).
            soup = BeautifulSoup(response.text, 'html.parser')
            clip_lists = soup.find_all('ul', id='clipUl')
            if not clip_lists:
                break
            found_clips_on_page = 0
            for ul in clip_lists:
                for item in ul.find_all('li'):
                    clip_info = {"legislator_name": legislator_name}
                    meet_name_tag = item.select_one("span.metdec")
                    # get_text() instead of .string: .string is None when the
                    # span has nested children, which made .strip() raise.
                    clip_info['meeting_name'] = (
                        meet_name_tag.get_text(strip=True) if meet_name_tag
                        else "會議名稱未知"
                    )
                    play_link_tag = item.find('a', title=re.compile(r"委員寬頻影片"))
                    if play_link_tag and play_link_tag.get('href'):
                        clip_info['playback_page_url'] = urljoin(
                            BASE_URL, play_link_tag.get('href')
                        )
                        all_clips.append(clip_info)
                        found_clips_on_page += 1
            if found_clips_on_page == 0:
                break
            # Lightweight pagination check: the page embeds total/pageSize in
            # an inline script; stop once page_num covers the total count.
            pagination_script = soup.find('script', string=re.compile(r"var total ="))
            if pagination_script:
                total_match = re.search(r'var total = "(\d+)"', pagination_script.string)
                size_match = re.search(r'var pageSize = "(\d+)"', pagination_script.string)
                if total_match and size_match and (page_num * int(size_match.group(1)) >= int(total_match.group(1))):
                    break
            else:
                break
            page_num += 1
        msg = f"爬取完成!共抓取 {legislator_name} 委員 {len(all_clips)} 筆發言片段。"
        print(msg)
        return all_clips, msg
    except Exception as e:
        msg = f"錯誤:爬取影片失敗。 {e}"
        print(msg)
        return None, msg
# --- Build the Gradio interface ---
with gr.Blocks() as iface:
    gr.Markdown("# IVOD 委員發言爬蟲 API (修正版)")
    # Session-scoped holder for the name -> ID map; starts as an empty dict.
    legislator_map_state = gr.State(value={})
    with gr.Row():
        # --- [Fix 1] allow_custom_value=True ---
        # Lets an API client submit any name string instead of being
        # rejected because the initial choices list is empty.
        legislator_input = gr.Dropdown(
            label="1. 選擇委員",
            choices=[],
            interactive=True,
            allow_custom_value=True
        )
    analyze_button = gr.Button("開始爬取", variant="primary")
    status_output = gr.Textbox(label="處理狀態", interactive=False)
    json_output = gr.JSON(label="發言片段 URL 列表")
    # Fetch the legislator roster automatically when the page loads, filling
    # both the state dict and the dropdown choices.
    iface.load(
        fn=get_all_legislators_map,
        inputs=[],
        outputs=[legislator_map_state, legislator_input],
        api_name="get_all_legislators_map"
    )
    analyze_button.click(
        fn=get_videos_for_legislator,
        inputs=[legislator_input, legislator_map_state],
        outputs=[json_output, status_output],
        api_name="get_videos_for_legislator"
    )

if __name__ == "__main__":
    iface.launch()