cormort commited on
Commit
1758b3a
·
verified ·
1 Parent(s): 91d3dcb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -76
app.py CHANGED
@@ -6,123 +6,108 @@ from urllib.parse import urljoin
6
  import json
7
 
8
  # --- 核心函式 ---
9
-
10
  BASE_URL = "https://ivod.ly.gov.tw"
11
  HEADERS = {
12
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/5.37.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/5.37.36'
13
  }
14
 
15
- # (!!! 依據你的程式碼邏輯更新 !!!)
16
  def get_all_legislators_map():
17
  """
18
- (爬蟲 A1)
19
- 參考你的邏輯:抓取頁面上的 select 選項,建立 姓名->ID 的對照表。
20
  """
21
  target_url = f"{BASE_URL}/Demand?targetSession=current"
22
  print(f"--- 爬蟲 A1 啟動 ---")
23
- print(f"正在爬取委員總列表: {target_url}")
24
 
25
  legislator_map = {}
26
  try:
27
  response = requests.get(target_url, headers=HEADERS, timeout=10)
28
- response.encoding = 'utf-8' # 確保中文顯示正確
29
  response.raise_for_status()
30
- soup = BeautifulSoup(response.text, 'html.parser') # 使用 html.parser,與你提供的腳本一致
31
 
32
- # 找到所有 select 元素
33
  selects = soup.find_all('select')
34
 
35
- count = 0
36
  for select in selects:
37
  options = select.find_all('option')
38
  for option in options:
39
  code = option.get('value')
40
  name = option.text.strip()
41
 
42
- # (!!! 優化過濾 !!!)
43
- # 只抓取代碼是 "數字" 的選項 (例如 7005),排除 "current"(屆期) 等無關選項
44
  if code and code.isdigit() and name:
45
  legislator_map[name] = code
46
- count += 1
47
 
48
- print(f"爬取完成,共找到 {count} 位委員 (含現任與離職)。")
49
 
50
  if not legislator_map:
51
- return {}, gr.Dropdown(choices=[], value=None, interactive=False, label="錯誤:未找到任何委員資料")
52
-
53
- # 準備 Dropdown 的選項 (依姓名排序)
54
  names_list = sorted(list(legislator_map.keys()))
55
 
56
  return (
57
  legislator_map,
58
  gr.Dropdown(
59
  choices=names_list,
60
- value=names_list[0], # 預設選取第一位
61
  interactive=True,
62
- label="1. 選擇委員 (自動載入)"
63
  )
64
  )
65
-
66
  except Exception as e:
67
- msg = f"錯誤:爬取委員列表失敗。 {e}"
68
  print(msg)
69
- return {}, gr.Dropdown(choices=[], value=None, interactive=False, label=msg)
70
-
71
 
72
  def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
73
  """
74
- (爬蟲 A2)
75
- 爬取指定委員的影片列表頁面 (自動處理分頁)。
76
  """
77
 
78
- # 1. map 中查詢 ID
 
 
 
 
 
 
 
 
 
 
 
79
  legislator_id = legislator_map.get(legislator_name)
80
 
81
  if not legislator_id:
82
  return None, f"錯誤:找不到委員 '{legislator_name}' 的 ID。"
83
 
84
  legislator_page_url = f"{BASE_URL}/Demand/ListLgltVideo/{legislator_id}"
85
-
86
- print(f"--- 爬蟲 A2 啟動 ---")
87
- print(f"正在爬取 {legislator_name} (ID: {legislator_id}) 的所有影片...")
88
 
89
  all_clips = []
90
  page_num = 1
91
 
92
  try:
93
  while True:
94
- # 2. 組合分頁 URL
95
  page_url = f"{legislator_page_url}?page={page_num}"
96
- print(f"正在爬取第 {page_num} 頁: {page_url}")
97
 
98
  response = requests.get(page_url, headers=HEADERS, timeout=10)
99
  response.raise_for_status()
100
- soup = BeautifulSoup(response.text, 'lxml') # 這裡維持 lxml 解析影片列表結構較穩
101
 
102
- # 3. 抓取影片片段 (clipUl)
103
  clip_lists = soup.find_all('ul', id='clipUl')
104
  if not clip_lists:
105
- print(f"第 {page_num} 頁找不到 'clipUl',判斷為爬取完畢。")
106
  break
107
 
108
  found_clips_on_page = 0
109
  for ul in clip_lists:
110
  list_items = ul.find_all('li')
111
- if not list_items:
112
- continue
113
-
114
  for item in list_items:
115
  clip_info = {"legislator_name": legislator_name}
116
 
117
- # 抓取會議名稱
118
  meet_name_tag = item.select_one("span.metdec")
119
  clip_info['meeting_name'] = meet_name_tag.string.strip() if meet_name_tag else "會議名稱未知"
120
 
121
- # 抓取發言時間
122
- time_tag = item.find('p', string=re.compile(r"委員發言時間:"))
123
- clip_info['speech_time'] = time_tag.string.replace("委員發言時間:", "").strip() if time_tag else "時間未知"
124
-
125
- # 抓取播放頁面 URL (優先抓寬頻 1M)
126
  play_link_tag = item.find('a', title=re.compile(r"委員寬頻影片"))
127
  if play_link_tag and play_link_tag.get('href'):
128
  absolute_url = urljoin(BASE_URL, play_link_tag.get('href'))
@@ -130,27 +115,19 @@ def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
130
  all_clips.append(clip_info)
131
  found_clips_on_page += 1
132
 
133
- # 4. 檢查是否為最後一頁
134
  if found_clips_on_page == 0:
135
- print(f"第 {page_num} 頁未找到任何影片,判斷為爬取完畢。")
136
  break
137
-
138
- # 5. 檢查分頁腳本 (判斷是否還有下一頁)
139
  pagination_script = soup.find('script', string=re.compile(r"var total ="))
140
  if pagination_script:
141
  total_match = re.search(r'var total = "(\d+)"', pagination_script.string)
142
  size_match = re.search(r'var pageSize = "(\d+)"', pagination_script.string)
143
- if total_match and size_match:
144
- total_items = int(total_match.group(1))
145
- page_size = int(size_match.group(1))
146
- # 如果當前頁數 * 每頁筆數 >= 總筆數,代表這是最後一頁
147
- if page_num * page_size >= total_items:
148
- print("已達總頁數,爬取完畢。")
149
- break
150
  else:
151
- # 找不到分頁資訊,通常代表只有一頁
152
  break
153
-
154
  page_num += 1
155
 
156
  msg = f"爬取完成!共抓取 {legislator_name} 委員 {len(all_clips)} 筆發言片段。"
@@ -158,53 +135,44 @@ def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
158
  return all_clips, msg
159
 
160
  except Exception as e:
161
- msg = f"錯誤:爬取委員頁面失敗。 {e}"
162
  print(msg)
163
  return None, msg
164
 
165
  # --- 建立 Gradio 介面 ---
166
  with gr.Blocks() as iface:
167
- gr.Markdown("# IVOD 委員發言爬蟲 API (爬蟲 A)")
168
- gr.Markdown("自動載入所有委員名單,選擇後即可抓取該委員所有的發言影片連結。")
169
 
170
- # 建立一個隱藏的 State 來儲存 {"姓名": "ID"} 的完整 map
171
- legislator_map_state = gr.State()
172
 
173
  with gr.Row():
174
- # 下拉選單初始為空,等待 load 事件填入
 
175
  legislator_input = gr.Dropdown(
176
- label="1. 選擇委員 (載入中...)",
177
  choices=[],
178
- interactive=False
 
179
  )
 
180
 
181
- analyze_button = gr.Button("開始爬取", variant="primary")
182
  status_output = gr.Textbox(label="處理狀態", interactive=False)
183
-
184
- gr.Markdown("---")
185
- gr.Markdown("## 爬取結果 (JSON)")
186
  json_output = gr.JSON(label="發言片段 URL 列表")
187
 
188
- # --- 綁定事件 ---
189
-
190
- # 1. 介面載入時,自動觸發一次
191
- # (!!! 修正:加上 api_name,讓外部腳本可以呼叫 !!!)
192
  iface.load(
193
  fn=get_all_legislators_map,
194
  inputs=[],
195
  outputs=[legislator_map_state, legislator_input],
196
- api_name="get_all_legislators_map"
197
  )
198
 
199
- # 2. "開始爬取" 按鈕的事件
200
- # (!!! 修正:加上 api_name,讓外部腳本可以呼叫 !!!)
201
  analyze_button.click(
202
  fn=get_videos_for_legislator,
203
- inputs=[legislator_input, legislator_map_state],
204
  outputs=[json_output, status_output],
205
  api_name="get_videos_for_legislator"
206
  )
207
 
208
- # 啟動介面
209
  if __name__ == "__main__":
210
  iface.launch()
 
6
  import json
7
 
8
  # --- 核心函式 ---
 
9
# Base host of the Legislative Yuan IVOD (video-on-demand) site.
BASE_URL = "https://ivod.ly.gov.tw"

# Desktop-browser User-Agent so the site serves its normal HTML pages.
# NOTE(review): "AppleWebKit/5.37.36" looks like a typo for "537.36" —
# left as-is since the server evidently accepts it; confirm before changing.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/5.37.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/5.37.36'
}
13
 
 
14
  def get_all_legislators_map():
15
  """
16
+ (爬蟲 A1) 抓取頁面上的 select 選項,建立 姓名->ID 的對照表。
 
17
  """
18
  target_url = f"{BASE_URL}/Demand?targetSession=current"
19
  print(f"--- 爬蟲 A1 啟動 ---")
 
20
 
21
  legislator_map = {}
22
  try:
23
  response = requests.get(target_url, headers=HEADERS, timeout=10)
24
+ response.encoding = 'utf-8'
25
  response.raise_for_status()
26
+ soup = BeautifulSoup(response.text, 'html.parser')
27
 
 
28
  selects = soup.find_all('select')
29
 
 
30
  for select in selects:
31
  options = select.find_all('option')
32
  for option in options:
33
  code = option.get('value')
34
  name = option.text.strip()
35
 
36
+ # 過濾邏輯
 
37
  if code and code.isdigit() and name:
38
  legislator_map[name] = code
 
39
 
40
+ print(f"爬取完成,共找到 {len(legislator_map)} 位委員。")
41
 
42
  if not legislator_map:
43
+ return {}, gr.Dropdown(choices=[], value=None, label="錯誤:未找到資料")
44
+
 
45
  names_list = sorted(list(legislator_map.keys()))
46
 
47
  return (
48
  legislator_map,
49
  gr.Dropdown(
50
  choices=names_list,
51
+ value=names_list[0],
52
  interactive=True,
53
+ label="1. 選擇委員 (載入)"
54
  )
55
  )
 
56
  except Exception as e:
57
+ msg = f"錯誤:爬取委員列表失敗。 {e}"
58
  print(msg)
59
+ return {}, gr.Dropdown(choices=[], label=msg)
 
60
 
61
def get_videos_for_legislator(legislator_name: str, legislator_map: dict):
    """
    (Scraper A2) Collect every speech-clip link for one legislator.

    Args:
        legislator_name: Display name as shown in the dropdown.
        legislator_map: Mapping of name -> IVOD legislator ID. May arrive
            empty/None when invoked through the API client.

    Returns:
        (clips, message): clips is a list of per-clip dicts, or None on
        error; message is a human-readable status string.
    """
    # --- [修正重點 2] API-call guard ---
    # A Gradio API client may submit with an empty State, so re-fetch the
    # roster on demand to resolve the ID.
    if not legislator_map or legislator_name not in legislator_map:
        print(f"⚠️ 檢測到 Map 為空或找不到 '{legislator_name}',正在重新獲取名單...")
        new_map, _ = get_all_legislators_map()
        if isinstance(new_map, dict) and new_map:
            legislator_map = new_map
        else:
            return None, "錯誤:無法獲取委員名單,請稍後再試。"

    # 1. Resolve the legislator ID.
    legislator_id = legislator_map.get(legislator_name)
    if not legislator_id:
        return None, f"錯誤:找不到委員 '{legislator_name}' 的 ID。"

    legislator_page_url = f"{BASE_URL}/Demand/ListLgltVideo/{legislator_id}"
    print(f"--- 爬蟲 A2 啟動: {legislator_name} (ID: {legislator_id}) ---")

    all_clips = []
    page_num = 1

    try:
        while True:
            page_url = f"{legislator_page_url}?page={page_num}"

            response = requests.get(page_url, headers=HEADERS, timeout=10)
            response.raise_for_status()
            # lxml parses this listing markup more reliably (pip install lxml).
            soup = BeautifulSoup(response.text, 'lxml')

            clip_lists = soup.find_all('ul', id='clipUl')
            if not clip_lists:
                break  # no clip container at all -> finished

            found_clips_on_page = 0
            for ul in clip_lists:
                for item in ul.find_all('li'):
                    clip_info = {"legislator_name": legislator_name}

                    # Meeting name. Use get_text() rather than .string:
                    # .string is None when the span contains nested tags,
                    # which would raise AttributeError on .strip().
                    meet_name_tag = item.select_one("span.metdec")
                    clip_info['meeting_name'] = (
                        meet_name_tag.get_text(strip=True) if meet_name_tag
                        else "會議名稱未知"
                    )

                    # Broadband (1M) play-page link, resolved to an absolute URL.
                    play_link_tag = item.find('a', title=re.compile(r"委員寬頻影片"))
                    if play_link_tag and play_link_tag.get('href'):
                        absolute_url = urljoin(BASE_URL, play_link_tag.get('href'))
                        # NOTE(review): the assignment line here was cut out of
                        # the visible diff hunk — confirm the key name matches
                        # whatever downstream consumers expect.
                        clip_info['video_url'] = absolute_url
                        all_clips.append(clip_info)
                        found_clips_on_page += 1

            if found_clips_on_page == 0:
                break  # page rendered but empty -> past the last page

            # Pagination: the page embeds total/pageSize in an inline script.
            pagination_script = soup.find('script', string=re.compile(r"var total ="))
            if pagination_script:
                total_match = re.search(r'var total = "(\d+)"', pagination_script.string)
                size_match = re.search(r'var pageSize = "(\d+)"', pagination_script.string)
                # page_num * pageSize >= total -> this was the last page.
                if total_match and size_match and (
                    page_num * int(size_match.group(1)) >= int(total_match.group(1))
                ):
                    break
            else:
                break  # no pagination info usually means a single page

            page_num += 1

        msg = f"爬取完成!共抓取 {legislator_name} 委員 {len(all_clips)} 筆發言片段。"
        return all_clips, msg

    except Exception as e:
        msg = f"錯誤:爬取影片失敗。 {e}"
        print(msg)
        return None, msg
141
 
142
# --- 建立 Gradio 介面 ---
with gr.Blocks() as iface:
    gr.Markdown("# IVOD 委員發言爬蟲 API (修正版)")

    # Hidden state holding the full {name: id} roster; starts as an empty dict.
    roster_state = gr.State(value={})

    with gr.Row():
        # --- [修正重點 1] allow_custom_value=True ---
        # Lets an API client submit an arbitrary name string even while the
        # dropdown's choices are still empty.
        member_dropdown = gr.Dropdown(
            label="1. 選擇委員",
            choices=[],
            interactive=True,
            allow_custom_value=True
        )
        fetch_button = gr.Button("開始爬取", variant="primary")

    status_box = gr.Textbox(label="處理狀態", interactive=False)
    result_json = gr.JSON(label="發言片段 URL 列表")

    # Populate the roster automatically when the page loads.
    iface.load(
        fn=get_all_legislators_map,
        inputs=[],
        outputs=[roster_state, member_dropdown],
        api_name="get_all_legislators_map"
    )

    fetch_button.click(
        fn=get_videos_for_legislator,
        inputs=[member_dropdown, roster_state],
        outputs=[result_json, status_box],
        api_name="get_videos_for_legislator"
    )

if __name__ == "__main__":
    iface.launch()