Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,123 +6,108 @@ from urllib.parse import urljoin
|
|
| 6 |
import json
|
| 7 |
|
| 8 |
# --- 核心函式 ---
|
| 9 |
-
|
| 10 |
BASE_URL = "https://ivod.ly.gov.tw"
|
| 11 |
HEADERS = {
|
| 12 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/5.37.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/5.37.36'
|
| 13 |
}
|
| 14 |
|
| 15 |
-
# (!!! 依據你的程式碼邏輯更新 !!!)
|
| 16 |
def get_all_legislators_map():
|
| 17 |
"""
|
| 18 |
-
(爬蟲 A1)
|
| 19 |
-
參考你的邏輯:抓取頁面上的 select 選項,建立 姓名->ID 的對照表。
|
| 20 |
"""
|
| 21 |
target_url = f"{BASE_URL}/Demand?targetSession=current"
|
| 22 |
print(f"--- 爬蟲 A1 啟動 ---")
|
| 23 |
-
print(f"正在爬取委員總列表: {target_url}")
|
| 24 |
|
| 25 |
legislator_map = {}
|
| 26 |
try:
|
| 27 |
response = requests.get(target_url, headers=HEADERS, timeout=10)
|
| 28 |
-
response.encoding = 'utf-8'
|
| 29 |
response.raise_for_status()
|
| 30 |
-
soup = BeautifulSoup(response.text, 'html.parser')
|
| 31 |
|
| 32 |
-
# 找到所有 select 元素
|
| 33 |
selects = soup.find_all('select')
|
| 34 |
|
| 35 |
-
count = 0
|
| 36 |
for select in selects:
|
| 37 |
options = select.find_all('option')
|
| 38 |
for option in options:
|
| 39 |
code = option.get('value')
|
| 40 |
name = option.text.strip()
|
| 41 |
|
| 42 |
-
#
|
| 43 |
-
# 只抓取代碼是 "數字" 的選項 (例如 7005),排除 "current"(屆期) 等無關選項
|
| 44 |
if code and code.isdigit() and name:
|
| 45 |
legislator_map[name] = code
|
| 46 |
-
count += 1
|
| 47 |
|
| 48 |
-
print(f"爬取完成,共找到 {
|
| 49 |
|
| 50 |
if not legislator_map:
|
| 51 |
-
return {}, gr.Dropdown(choices=[], value=None,
|
| 52 |
-
|
| 53 |
-
# 準備 Dropdown 的選項 (依姓名排序)
|
| 54 |
names_list = sorted(list(legislator_map.keys()))
|
| 55 |
|
| 56 |
return (
|
| 57 |
legislator_map,
|
| 58 |
gr.Dropdown(
|
| 59 |
choices=names_list,
|
| 60 |
-
value=names_list[0],
|
| 61 |
interactive=True,
|
| 62 |
-
label="1. 選擇委員 (
|
| 63 |
)
|
| 64 |
)
|
| 65 |
-
|
| 66 |
except Exception as e:
|
| 67 |
-
msg = f"錯誤:爬取委員
|
| 68 |
print(msg)
|
| 69 |
-
return {}, gr.Dropdown(choices=[],
|
| 70 |
-
|
| 71 |
|
| 72 |
def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
|
| 73 |
"""
|
| 74 |
-
(爬蟲 A2)
|
| 75 |
-
爬取指定委員的影片列表頁面 (自動處理分頁)。
|
| 76 |
"""
|
| 77 |
|
| 78 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
legislator_id = legislator_map.get(legislator_name)
|
| 80 |
|
| 81 |
if not legislator_id:
|
| 82 |
return None, f"錯誤:找不到委員 '{legislator_name}' 的 ID。"
|
| 83 |
|
| 84 |
legislator_page_url = f"{BASE_URL}/Demand/ListLgltVideo/{legislator_id}"
|
| 85 |
-
|
| 86 |
-
print(f"--- 爬蟲 A2 啟動 ---")
|
| 87 |
-
print(f"正在爬取 {legislator_name} (ID: {legislator_id}) 的所有影片...")
|
| 88 |
|
| 89 |
all_clips = []
|
| 90 |
page_num = 1
|
| 91 |
|
| 92 |
try:
|
| 93 |
while True:
|
| 94 |
-
# 2. 組合分頁 URL
|
| 95 |
page_url = f"{legislator_page_url}?page={page_num}"
|
| 96 |
-
print(f"正在爬取第 {page_num} 頁
|
| 97 |
|
| 98 |
response = requests.get(page_url, headers=HEADERS, timeout=10)
|
| 99 |
response.raise_for_status()
|
| 100 |
-
soup = BeautifulSoup(response.text, 'lxml') #
|
| 101 |
|
| 102 |
-
# 3. 抓取影片片段 (clipUl)
|
| 103 |
clip_lists = soup.find_all('ul', id='clipUl')
|
| 104 |
if not clip_lists:
|
| 105 |
-
print(f"第 {page_num} 頁找不到 'clipUl',判斷為爬取完畢。")
|
| 106 |
break
|
| 107 |
|
| 108 |
found_clips_on_page = 0
|
| 109 |
for ul in clip_lists:
|
| 110 |
list_items = ul.find_all('li')
|
| 111 |
-
if not list_items:
|
| 112 |
-
continue
|
| 113 |
-
|
| 114 |
for item in list_items:
|
| 115 |
clip_info = {"legislator_name": legislator_name}
|
| 116 |
|
| 117 |
-
# 抓取會議名稱
|
| 118 |
meet_name_tag = item.select_one("span.metdec")
|
| 119 |
clip_info['meeting_name'] = meet_name_tag.string.strip() if meet_name_tag else "會議名稱未知"
|
| 120 |
|
| 121 |
-
# 抓取發言時間
|
| 122 |
-
time_tag = item.find('p', string=re.compile(r"委員發言時間:"))
|
| 123 |
-
clip_info['speech_time'] = time_tag.string.replace("委員發言時間:", "").strip() if time_tag else "時間未知"
|
| 124 |
-
|
| 125 |
-
# 抓取播放頁面 URL (優先抓寬頻 1M)
|
| 126 |
play_link_tag = item.find('a', title=re.compile(r"委員寬頻影片"))
|
| 127 |
if play_link_tag and play_link_tag.get('href'):
|
| 128 |
absolute_url = urljoin(BASE_URL, play_link_tag.get('href'))
|
|
@@ -130,27 +115,19 @@ def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
|
|
| 130 |
all_clips.append(clip_info)
|
| 131 |
found_clips_on_page += 1
|
| 132 |
|
| 133 |
-
# 4. 檢查是否為最後一頁
|
| 134 |
if found_clips_on_page == 0:
|
| 135 |
-
print(f"第 {page_num} 頁未找到任何影片,判斷為爬取完畢。")
|
| 136 |
break
|
| 137 |
-
|
| 138 |
-
#
|
| 139 |
pagination_script = soup.find('script', string=re.compile(r"var total ="))
|
| 140 |
if pagination_script:
|
| 141 |
total_match = re.search(r'var total = "(\d+)"', pagination_script.string)
|
| 142 |
size_match = re.search(r'var pageSize = "(\d+)"', pagination_script.string)
|
| 143 |
-
if total_match and size_match:
|
| 144 |
-
|
| 145 |
-
page_size = int(size_match.group(1))
|
| 146 |
-
# 如果當前頁數 * 每頁筆數 >= 總筆數,代表這是最後一頁
|
| 147 |
-
if page_num * page_size >= total_items:
|
| 148 |
-
print("已達總頁數,爬取完畢。")
|
| 149 |
-
break
|
| 150 |
else:
|
| 151 |
-
# 找不到分頁資訊,通常代表只有一頁
|
| 152 |
break
|
| 153 |
-
|
| 154 |
page_num += 1
|
| 155 |
|
| 156 |
msg = f"爬取完成!共抓取 {legislator_name} 委員 {len(all_clips)} 筆發言片段。"
|
|
@@ -158,53 +135,44 @@ def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
|
|
| 158 |
return all_clips, msg
|
| 159 |
|
| 160 |
except Exception as e:
|
| 161 |
-
msg = f"錯誤:爬取
|
| 162 |
print(msg)
|
| 163 |
return None, msg
|
| 164 |
|
| 165 |
# --- 建立 Gradio 介面 ---
|
| 166 |
with gr.Blocks() as iface:
|
| 167 |
-
gr.Markdown("# IVOD 委員發言爬蟲 API (
|
| 168 |
-
gr.Markdown("自動載入所有委員名單,選擇後即可抓取該委員所有的發言影片連結。")
|
| 169 |
|
| 170 |
-
|
| 171 |
-
legislator_map_state = gr.State()
|
| 172 |
|
| 173 |
with gr.Row():
|
| 174 |
-
#
|
|
|
|
| 175 |
legislator_input = gr.Dropdown(
|
| 176 |
-
label="1. 選擇委員
|
| 177 |
choices=[],
|
| 178 |
-
interactive=
|
|
|
|
| 179 |
)
|
|
|
|
| 180 |
|
| 181 |
-
analyze_button = gr.Button("開始爬取", variant="primary")
|
| 182 |
status_output = gr.Textbox(label="處理狀態", interactive=False)
|
| 183 |
-
|
| 184 |
-
gr.Markdown("---")
|
| 185 |
-
gr.Markdown("## 爬取結果 (JSON)")
|
| 186 |
json_output = gr.JSON(label="發言片段 URL 列表")
|
| 187 |
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
# 1. 介面載入時,自動觸發一次
|
| 191 |
-
# (!!! 修正:加上 api_name,讓外部腳本可以呼叫 !!!)
|
| 192 |
iface.load(
|
| 193 |
fn=get_all_legislators_map,
|
| 194 |
inputs=[],
|
| 195 |
outputs=[legislator_map_state, legislator_input],
|
| 196 |
-
api_name="get_all_legislators_map"
|
| 197 |
)
|
| 198 |
|
| 199 |
-
# 2. "開始爬取" 按鈕的事件
|
| 200 |
-
# (!!! 修正:加上 api_name,讓外部腳本可以呼叫 !!!)
|
| 201 |
analyze_button.click(
|
| 202 |
fn=get_videos_for_legislator,
|
| 203 |
-
inputs=[legislator_input, legislator_map_state],
|
| 204 |
outputs=[json_output, status_output],
|
| 205 |
api_name="get_videos_for_legislator"
|
| 206 |
)
|
| 207 |
|
| 208 |
-
# 啟動介面
|
| 209 |
if __name__ == "__main__":
|
| 210 |
iface.launch()
|
|
|
|
| 6 |
import json
|
| 7 |
|
| 8 |
# --- Core functions ---

# Root URL of the IVOD site; the scraper builds every request URL from it.
BASE_URL = "https://ivod.ly.gov.tw"

# HTTP headers sent with each scraping request so the client presents a
# browser-style User-Agent.
# NOTE(review): the "AppleWebKit/5.37.36" and "Safari/5.37.36" tokens look like
# typos of "537.36"; the string is kept verbatim here -- confirm before changing.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/5.37.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/5.37.36'
}
|
| 13 |
|
|
|
|
| 14 |
def get_all_legislators_map():
    """(Scraper A1) Build a name -> ID lookup from the page's <select> options.

    Fetches ``{BASE_URL}/Demand?targetSession=current``, walks every
    ``<option>`` inside every ``<select>`` element, and keeps the options
    whose ``value`` attribute is purely numeric and whose text is non-empty.

    Returns:
        A ``(legislator_map, dropdown)`` tuple. ``legislator_map`` maps the
        option text (name) to the option value (ID string). ``dropdown`` is a
        ``gr.Dropdown`` listing the names in sorted order with the first one
        preselected. If no option matches, or any exception is raised while
        fetching or parsing, the map is ``{}`` and the dropdown has no
        choices and carries the error text as its label.
    """
    target_url = f"{BASE_URL}/Demand?targetSession=current"
    print("--- 爬蟲 A1 啟動 ---")

    legislator_map = {}
    try:
        response = requests.get(target_url, headers=HEADERS, timeout=10)
        # Force UTF-8 before reading response.text instead of relying on the
        # encoding requests would otherwise pick for the body.
        response.encoding = 'utf-8'
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for select in soup.find_all('select'):
            for option in select.find_all('option'):
                code = option.get('value')
                name = option.text.strip()

                # Filter: keep only options with an all-digit value and a
                # non-empty label; everything else is ignored.
                if code and code.isdigit() and name:
                    legislator_map[name] = code

        print(f"爬取完成,共找到 {len(legislator_map)} 位委員。")

        if not legislator_map:
            return {}, gr.Dropdown(choices=[], value=None, label="錯誤:未找到資料")

        # Iterating a dict yields its keys, so this sorts the names directly.
        names_list = sorted(legislator_map)

        return (
            legislator_map,
            gr.Dropdown(
                choices=names_list,
                value=names_list[0],
                interactive=True,
                label="1. 選擇委員 (已載入)",
            ),
        )
    except Exception as e:
        # Deliberately broad: this function feeds UI outputs, so any
        # network or parsing failure is reported through the dropdown label
        # rather than raised to the caller.
        msg = f"錯誤:爬取委員列表失敗。 {e}"
        print(msg)
        return {}, gr.Dropdown(choices=[], label=msg)
|
|
|
|
| 60 |
|
| 61 |
def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
|
| 62 |
"""
|
| 63 |
+
(爬蟲 A2) 爬取指定委員的影片列表。
|
|
|
|
| 64 |
"""
|
| 65 |
|
| 66 |
+
# --- [修正重點 2] API 呼叫防護 ---
|
| 67 |
+
# 如果是透過 API Client 呼叫,legislator_map 可能是 None 或空字典
|
| 68 |
+
# 這時我們需要現場重新抓一次名單,確保能查到 ID
|
| 69 |
+
if not legislator_map or legislator_name not in legislator_map:
|
| 70 |
+
print(f"⚠️ 檢測到 Map 為空或找不到 '{legislator_name}',正在重新獲取名單...")
|
| 71 |
+
new_map, _ = get_all_legislators_map()
|
| 72 |
+
if isinstance(new_map, dict) and new_map:
|
| 73 |
+
legislator_map = new_map
|
| 74 |
+
else:
|
| 75 |
+
return None, "錯誤:無法獲取委員名單,請稍後再試。"
|
| 76 |
+
|
| 77 |
+
# 1. 查詢 ID
|
| 78 |
legislator_id = legislator_map.get(legislator_name)
|
| 79 |
|
| 80 |
if not legislator_id:
|
| 81 |
return None, f"錯誤:找不到委員 '{legislator_name}' 的 ID。"
|
| 82 |
|
| 83 |
legislator_page_url = f"{BASE_URL}/Demand/ListLgltVideo/{legislator_id}"
|
| 84 |
+
print(f"--- 爬蟲 A2 啟動: {legislator_name} (ID: {legislator_id}) ---")
|
|
|
|
|
|
|
| 85 |
|
| 86 |
all_clips = []
|
| 87 |
page_num = 1
|
| 88 |
|
| 89 |
try:
|
| 90 |
while True:
|
|
|
|
| 91 |
page_url = f"{legislator_page_url}?page={page_num}"
|
| 92 |
+
# print(f"正在爬取第 {page_num} 頁...")
|
| 93 |
|
| 94 |
response = requests.get(page_url, headers=HEADERS, timeout=10)
|
| 95 |
response.raise_for_status()
|
| 96 |
+
soup = BeautifulSoup(response.text, 'lxml') # 建議安裝 lxml: pip install lxml
|
| 97 |
|
|
|
|
| 98 |
clip_lists = soup.find_all('ul', id='clipUl')
|
| 99 |
if not clip_lists:
|
|
|
|
| 100 |
break
|
| 101 |
|
| 102 |
found_clips_on_page = 0
|
| 103 |
for ul in clip_lists:
|
| 104 |
list_items = ul.find_all('li')
|
|
|
|
|
|
|
|
|
|
| 105 |
for item in list_items:
|
| 106 |
clip_info = {"legislator_name": legislator_name}
|
| 107 |
|
|
|
|
| 108 |
meet_name_tag = item.select_one("span.metdec")
|
| 109 |
clip_info['meeting_name'] = meet_name_tag.string.strip() if meet_name_tag else "會議名稱未知"
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
play_link_tag = item.find('a', title=re.compile(r"委員寬頻影片"))
|
| 112 |
if play_link_tag and play_link_tag.get('href'):
|
| 113 |
absolute_url = urljoin(BASE_URL, play_link_tag.get('href'))
|
|
|
|
| 115 |
all_clips.append(clip_info)
|
| 116 |
found_clips_on_page += 1
|
| 117 |
|
|
|
|
| 118 |
if found_clips_on_page == 0:
|
|
|
|
| 119 |
break
|
| 120 |
+
|
| 121 |
+
# 簡易分頁檢查
|
| 122 |
pagination_script = soup.find('script', string=re.compile(r"var total ="))
|
| 123 |
if pagination_script:
|
| 124 |
total_match = re.search(r'var total = "(\d+)"', pagination_script.string)
|
| 125 |
size_match = re.search(r'var pageSize = "(\d+)"', pagination_script.string)
|
| 126 |
+
if total_match and size_match and (page_num * int(size_match.group(1)) >= int(total_match.group(1))):
|
| 127 |
+
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
else:
|
|
|
|
| 129 |
break
|
| 130 |
+
|
| 131 |
page_num += 1
|
| 132 |
|
| 133 |
msg = f"爬取完成!共抓取 {legislator_name} 委員 {len(all_clips)} 筆發言片段。"
|
|
|
|
| 135 |
return all_clips, msg
|
| 136 |
|
| 137 |
except Exception as e:
|
| 138 |
+
msg = f"錯誤:爬取影片失敗。 {e}"
|
| 139 |
print(msg)
|
| 140 |
return None, msg
|
| 141 |
|
| 142 |
# --- Build the Gradio interface ---
with gr.Blocks() as iface:
    gr.Markdown("# IVOD 委員發言爬蟲 API (修正版)")

    # Holds the name -> ID map between events; starts as an empty dict.
    legislator_map_state = gr.State(value={})

    # NOTE(review): the original indentation was lost in the diff rendering;
    # placing the button inside the row next to the dropdown is an assumption
    # that only affects layout -- confirm against the real file.
    with gr.Row():
        # allow_custom_value=True lets an API client pass any string instead
        # of being rejected because the initial choices list is empty.
        legislator_input = gr.Dropdown(
            label="1. 選擇委員",
            choices=[],
            interactive=True,
            allow_custom_value=True,
        )
        analyze_button = gr.Button("開始爬取", variant="primary")

    status_output = gr.Textbox(label="處理狀態", interactive=False)
    json_output = gr.JSON(label="發言片段 URL 列表")

    # Fetch the legislator list automatically when the page loads; this
    # fills both the state map and the dropdown.
    iface.load(
        fn=get_all_legislators_map,
        inputs=[],
        outputs=[legislator_map_state, legislator_input],
        api_name="get_all_legislators_map",
    )

    # Scrape the selected legislator's clips when the button is clicked.
    analyze_button.click(
        fn=get_videos_for_legislator,
        inputs=[legislator_input, legislator_map_state],
        outputs=[json_output, status_output],
        api_name="get_videos_for_legislator",
    )

if __name__ == "__main__":
    iface.launch()
|