Spaces: cormort / ivod-legislator-scraper

Sleeping

App Files Community

cormort committed on Nov 18, 2025

Commit

1758b3a

verified ·

1 Parent(s): 91d3dcb

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -76

app.py CHANGED Viewed

@@ -6,123 +6,108 @@ from urllib.parse import urljoin
 import json
 # --- 核心函式 ---
 BASE_URL = "https://ivod.ly.gov.tw"
 HEADERS = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/5.37.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/5.37.36'
 }
-# (!!! 依據你的程式碼邏輯更新 !!!)
 def get_all_legislators_map():
     """
-    (爬蟲 A1)
-    參考你的邏輯：抓取頁面上的 select 選項，建立 姓名->ID 的對照表。
     """
     target_url = f"{BASE_URL}/Demand?targetSession=current"
     print(f"--- 爬蟲 A1 啟動 ---")
-    print(f"正在爬取委員總列表: {target_url}")
     legislator_map = {}
     try:
         response = requests.get(target_url, headers=HEADERS, timeout=10)
-        response.encoding = 'utf-8' # 確保中文顯示正確
         response.raise_for_status()
-        soup = BeautifulSoup(response.text, 'html.parser') # 使用 html.parser，與你提供的腳本一致
-        # 找到所有 select 元素
         selects = soup.find_all('select')
-        count = 0
         for select in selects:
             options = select.find_all('option')
             for option in options:
                 code = option.get('value')
                 name = option.text.strip()
-                # (!!! 優化過濾 !!!)
-                # 只抓取代碼是 "數字" 的選項 (例如 7005)，排除 "current"(屆期) 等無關選項
                 if code and code.isdigit() and name:
                     legislator_map[name] = code
-                    count += 1
-        print(f"爬取完成，共找到 {count} 位委員 (含現任與離職)。")
         if not legislator_map:
-             return {}, gr.Dropdown(choices=[], value=None, interactive=False, label="錯誤：未找到任何委員資料")
-        # 準備 Dropdown 的選項 (依姓名排序)
         names_list = sorted(list(legislator_map.keys()))
         return (
             legislator_map,
             gr.Dropdown(
                 choices=names_list,
-                value=names_list[0], # 預設選取第一位
                 interactive=True,
-                label="1. 選擇委員 (自動載入)"
             )
         )
     except Exception as e:
-        msg = f"錯誤：爬取委員總列表失敗。 {e}"
         print(msg)
-        return {}, gr.Dropdown(choices=[], value=None, interactive=False, label=msg)
 def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
     """
-    (爬蟲 A2)
-    爬取指定委員的影片列表頁面 (自動處理分頁)。
     """
-    # 1. 從 map 中查詢 ID
     legislator_id = legislator_map.get(legislator_name)
     if not legislator_id:
         return None, f"錯誤：找不到委員 '{legislator_name}' 的 ID。"
     legislator_page_url = f"{BASE_URL}/Demand/ListLgltVideo/{legislator_id}"
-    print(f"--- 爬蟲 A2 啟動 ---")
-    print(f"正在爬取 {legislator_name} (ID: {legislator_id}) 的所有影片...")
     all_clips = []
     page_num = 1
     try:
         while True:
-            # 2. 組合分頁 URL
             page_url = f"{legislator_page_url}?page={page_num}"
-            print(f"正在爬取第 {page_num} 頁: {page_url}")
             response = requests.get(page_url, headers=HEADERS, timeout=10)
             response.raise_for_status()
-            soup = BeautifulSoup(response.text, 'lxml') # 這裡維持 lxml 解析影片列表結構較穩
-            # 3. 抓取影片片段 (clipUl)
             clip_lists = soup.find_all('ul', id='clipUl')
             if not clip_lists:
-                print(f"第 {page_num} 頁找不到 'clipUl'，判斷為爬取完畢。")
                 break
             found_clips_on_page = 0
             for ul in clip_lists:
                 list_items = ul.find_all('li')
-                if not list_items:
-                    continue
                 for item in list_items:
                     clip_info = {"legislator_name": legislator_name}
-                    # 抓取會議名稱
                     meet_name_tag = item.select_one("span.metdec")
                     clip_info['meeting_name'] = meet_name_tag.string.strip() if meet_name_tag else "會議名稱未知"
-                    # 抓取發言時間
-                    time_tag = item.find('p', string=re.compile(r"委員發言時間："))
-                    clip_info['speech_time'] = time_tag.string.replace("委員發言時間：", "").strip() if time_tag else "時間未知"
-                    # 抓取播放頁面 URL (優先抓寬頻 1M)
                     play_link_tag = item.find('a', title=re.compile(r"委員寬頻影片"))
                     if play_link_tag and play_link_tag.get('href'):
                         absolute_url = urljoin(BASE_URL, play_link_tag.get('href'))
@@ -130,27 +115,19 @@ def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
                         all_clips.append(clip_info)
                         found_clips_on_page += 1
-            # 4. 檢查是否為最後一頁
             if found_clips_on_page == 0:
-                 print(f"第 {page_num} 頁未找到任何影片，判斷為爬取完畢。")
                  break
-            # 5. 檢查分頁腳本 (判斷是否還有下一頁)
             pagination_script = soup.find('script', string=re.compile(r"var total ="))
             if pagination_script:
                 total_match = re.search(r'var total = "(\d+)"', pagination_script.string)
                 size_match = re.search(r'var pageSize = "(\d+)"', pagination_script.string)
-                if total_match and size_match:
-                    total_items = int(total_match.group(1))
-                    page_size = int(size_match.group(1))
-                    # 如果當前頁數 * 每頁筆數 >= 總筆數，代表這是最後一頁
-                    if page_num * page_size >= total_items:
-                        print("已達總頁數，爬取完畢。")
-                        break
             else:
-                # 找不到分頁資訊，通常代表只有一頁
                 break
             page_num += 1
         msg = f"爬取完成！共抓取 {legislator_name} 委員 {len(all_clips)} 筆發言片段。"
@@ -158,53 +135,44 @@ def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
         return all_clips, msg
     except Exception as e:
-        msg = f"錯誤：爬取委員頁面失敗。 {e}"
         print(msg)
         return None, msg
 # --- 建立 Gradio 介面 ---
 with gr.Blocks() as iface:
-    gr.Markdown("# IVOD 委員發言爬蟲 API (新版爬蟲 A)")
-    gr.Markdown("自動載入所有委員名單，選擇後即可抓取該委員所有的發言影片連結。")
-    # 建立一個隱藏的 State 來儲存 {"姓名": "ID"} 的完整 map
-    legislator_map_state = gr.State()
     with gr.Row():
-        # 下拉選單初始為空，等待 load 事件填入
         legislator_input = gr.Dropdown(
-            label="1. 選擇委員 (載入中...)",
             choices=[],
-            interactive=False
         )
-    analyze_button = gr.Button("開始爬取", variant="primary")
     status_output = gr.Textbox(label="處理狀態", interactive=False)
-    gr.Markdown("---")
-    gr.Markdown("## 爬取結果 (JSON)")
     json_output = gr.JSON(label="發言片段 URL 列表")
-   # --- 綁定事件 ---
-    # 1. 介面載入時，自動觸發一次
-    # (!!! 修正：加上 api_name，讓外部腳本可以呼叫 !!!)
     iface.load(
         fn=get_all_legislators_map,
         inputs=[],
         outputs=[legislator_map_state, legislator_input],
-        api_name="get_all_legislators_map"
     )
-    # 2. "開始爬取" 按鈕的事件
-    # (!!! 修正：加上 api_name，讓外部腳本可以呼叫 !!!)
     analyze_button.click(
         fn=get_videos_for_legislator,
-        inputs=[legislator_input, legislator_map_state],
         outputs=[json_output, status_output],
         api_name="get_videos_for_legislator"
     )
-# 啟動介面
 if __name__ == "__main__":
     iface.launch()

 import json
 # --- 核心函式 ---
 BASE_URL = "https://ivod.ly.gov.tw"
 HEADERS = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/5.37.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/5.37.36'
 }
 def get_all_legislators_map():
     """
+    (爬蟲 A1) 抓取頁面上的 select 選項，建立 姓名->ID 的對照表。
     """
     target_url = f"{BASE_URL}/Demand?targetSession=current"
     print(f"--- 爬蟲 A1 啟動 ---")
     legislator_map = {}
     try:
         response = requests.get(target_url, headers=HEADERS, timeout=10)
+        response.encoding = 'utf-8'
         response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
         selects = soup.find_all('select')
         for select in selects:
             options = select.find_all('option')
             for option in options:
                 code = option.get('value')
                 name = option.text.strip()
+                # 過濾邏輯
                 if code and code.isdigit() and name:
                     legislator_map[name] = code
+        print(f"爬取完成，共找到 {len(legislator_map)} 位委員。")
         if not legislator_map:
+             return {}, gr.Dropdown(choices=[], value=None, label="錯誤：未找到資料")
         names_list = sorted(list(legislator_map.keys()))
         return (
             legislator_map,
             gr.Dropdown(
                 choices=names_list,
+                value=names_list[0],
                 interactive=True,
+                label="1. 選擇委員 (已載入)"
             )
         )
     except Exception as e:
+        msg = f"錯誤：爬取委員列表失敗。 {e}"
         print(msg)
+        return {}, gr.Dropdown(choices=[], label=msg)
 def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
     """
+    (爬蟲 A2) 爬取指定委員的影片列表。
     """
+    # --- [修正重點 2] API 呼叫防護 ---
+    # 如果是透過 API Client 呼叫，legislator_map 可能是 None 或空字典
+    # 這時我們需要現場重新抓一次名單，確保能查到 ID
+    if not legislator_map or legislator_name not in legislator_map:
+        print(f"⚠️ 檢測到 Map 為空或找不到 '{legislator_name}'，正在重新獲取名單...")
+        new_map, _ = get_all_legislators_map()
+        if isinstance(new_map, dict) and new_map:
+            legislator_map = new_map
+        else:
+            return None, "錯誤：無法獲取委員名單，請稍後再試。"
+    # 1. 查詢 ID
     legislator_id = legislator_map.get(legislator_name)
     if not legislator_id:
         return None, f"錯誤：找不到委員 '{legislator_name}' 的 ID。"
     legislator_page_url = f"{BASE_URL}/Demand/ListLgltVideo/{legislator_id}"
+    print(f"--- 爬蟲 A2 啟動: {legislator_name} (ID: {legislator_id}) ---")
     all_clips = []
     page_num = 1
     try:
         while True:
             page_url = f"{legislator_page_url}?page={page_num}"
+            # print(f"正在爬取第 {page_num} 頁...")
             response = requests.get(page_url, headers=HEADERS, timeout=10)
             response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'lxml') # 建議安裝 lxml: pip install lxml
             clip_lists = soup.find_all('ul', id='clipUl')
             if not clip_lists:
                 break
             found_clips_on_page = 0
             for ul in clip_lists:
                 list_items = ul.find_all('li')
                 for item in list_items:
                     clip_info = {"legislator_name": legislator_name}
                     meet_name_tag = item.select_one("span.metdec")
                     clip_info['meeting_name'] = meet_name_tag.string.strip() if meet_name_tag else "會議名稱未知"
                     play_link_tag = item.find('a', title=re.compile(r"委員寬頻影片"))
                     if play_link_tag and play_link_tag.get('href'):
                         absolute_url = urljoin(BASE_URL, play_link_tag.get('href'))
                         all_clips.append(clip_info)
                         found_clips_on_page += 1
             if found_clips_on_page == 0:
                  break
+            # 簡易分頁檢查
             pagination_script = soup.find('script', string=re.compile(r"var total ="))
             if pagination_script:
                 total_match = re.search(r'var total = "(\d+)"', pagination_script.string)
                 size_match = re.search(r'var pageSize = "(\d+)"', pagination_script.string)
+                if total_match and size_match and (page_num * int(size_match.group(1)) >= int(total_match.group(1))):
+                    break
             else:
                 break
             page_num += 1
         msg = f"爬取完成！共抓取 {legislator_name} 委員 {len(all_clips)} 筆發言片段。"
         return all_clips, msg
     except Exception as e:
+        msg = f"錯誤：爬取影片失敗。 {e}"
         print(msg)
         return None, msg
 # --- 建立 Gradio 介面 ---
 with gr.Blocks() as iface:
+    gr.Markdown("# IVOD 委員發言爬蟲 API (修正版)")
+    legislator_map_state = gr.State(value={}) # 初始給空字典
     with gr.Row():
+        # --- [修正重點 1] allow_custom_value=True ---
+        # 這允許 API Client 傳入任何字串，不會因為初始 choices 為空而被擋下
         legislator_input = gr.Dropdown(
+            label="1. 選擇委員",
             choices=[],
+            interactive=True,
+            allow_custom_value=True
         )
+        analyze_button = gr.Button("開始爬取", variant="primary")
     status_output = gr.Textbox(label="處理狀態", interactive=False)
     json_output = gr.JSON(label="發言片段 URL 列表")
+    # 頁面載入時自動抓名單
     iface.load(
         fn=get_all_legislators_map,
         inputs=[],
         outputs=[legislator_map_state, legislator_input],
+        api_name="get_all_legislators_map"
     )
     analyze_button.click(
         fn=get_videos_for_legislator,
+        inputs=[legislator_input, legislator_map_state],
         outputs=[json_output, status_output],
         api_name="get_videos_for_legislator"
     )
 if __name__ == "__main__":
     iface.launch()