deneve07 committed on
Commit
7c8bbb7
·
verified ·
1 Parent(s): 85ade6f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -139
app.py CHANGED
@@ -4,6 +4,7 @@ import datetime
4
  from urllib.parse import quote
5
  from bs4 import BeautifulSoup
6
  from playwright.sync_api import sync_playwright
 
7
  import gradio as gr
8
  import os
9
 
@@ -20,64 +21,28 @@ def translate_en_to_ja(text):
20
  return text
21
 
22
  # ==========================================
23
- # 🇬🇧 eMC
24
  # ==========================================
25
- def get_uk_originator(ingredient_query, page):
26
  log = []
27
  brands = set()
28
  companies = set()
29
 
30
  try:
31
- log.append("1. 前往 eMC 搜尋頁面...")
32
- page.goto(f"https://www.medicines.org.uk/emc/search?q={ingredient_query}", timeout=30000)
33
- page.wait_for_selector('.search-results-product-info-title-link', timeout=15000)
34
 
35
- soup = BeautifulSoup(page.content(), 'html.parser')
36
- links = soup.find_all('a', class_='search-results-product-info-title-link')
37
- log.append(f"2. 找到 {len(links)} 筆結果,篩選非成分名開頭的項目...")
 
 
 
 
38
 
39
- for link in links:
40
- raw_title = link.get_text(strip=True)
41
- if not raw_title.lower().startswith(ingredient_query.lower()):
42
- clean_brand = re.split(r'\s+\d', raw_title)[0].strip()
43
- brands.add(clean_brand)
44
-
45
- parent_div = link.find_parent(class_='search-results-product-info')
46
- if parent_div:
47
- comp_tag = parent_div.find(class_='search-results-product-info-company')
48
- if comp_tag: companies.add(comp_tag.get_text(strip=True))
49
-
50
- if brands:
51
- log.append("✅ 成功找到原廠藥!")
52
- return ", ".join(brands), ", ".join(companies), "\n".join(log)
53
- else:
54
- log.append("❌ 查無原廠 (皆以成分名開頭)。")
55
- return "查無原廠", "-", "\n".join(log)
56
- except Exception as e:
57
- log.append(f"❌ 發生錯誤: {str(e)}")
58
- return "執行失敗", "-", "\n".join(log)
59
-
60
- # ==========================================
61
- # 🇺🇸 美國 FDA Orange Book (加入偽裝)
62
- # ==========================================
63
- def get_usa_originator(ingredient_query, page):
64
- log = []
65
- brands = set()
66
- companies = set()
67
-
68
- try:
69
- log.append("1. 前往 FDA Orange Book...")
70
- # 加上 wait_until="domcontentloaded" 避免等待外部資源卡住
71
- page.goto("https://www.accessdata.fda.gov/scripts/cder/ob/index.cfm", timeout=30000, wait_until="domcontentloaded")
72
-
73
- log.append("2. 切換頁籤並搜尋...")
74
- page.locator('a[aria-controls="ingredient"], button:has-text("Active Ingredient")').first.click()
75
- page.locator('input[name="activeIngredient"], input#ingredient').first.fill(ingredient_query)
76
- page.keyboard.press("Enter")
77
-
78
- log.append("3. 等待結果表格...")
79
- page.wait_for_selector('table#example', timeout=15000)
80
- soup = BeautifulSoup(page.content(), 'html.parser')
81
  table = soup.find('table', id='example')
82
 
83
  if table:
@@ -99,22 +64,53 @@ def get_usa_originator(ingredient_query, page):
99
  log.append("✅ 成功找到 RLD 原廠藥!")
100
  return ", ".join(brands), ", ".join(companies), "\n".join(log)
101
  else:
102
- log.append("❌ 尚未核准或無 RLD。")
103
  return "查無原廠", "-", "\n".join(log)
 
104
  except Exception as e:
105
  log.append(f"❌ 發生錯誤: {str(e)}")
106
- return "執行失敗 (可能被阻擋)", "-", "\n".join(log)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  # ==========================================
109
- # 🇨🇦 加拿大 DPD (DIN 排序優化版)
110
  # ==========================================
111
  def get_canada_originator(ingredient_query, page):
112
  log = []
113
  generic_companies = ['apotex', 'teva', 'sandoz', 'jamp', 'mint', 'pharmascience', 'sanis', 'sivem', 'auro', 'glenmark', 'taro', 'marcan', 'nora', 'mantra', 'reddy']
114
 
115
  try:
116
- log.append("1. 前往 Canada DPD...")
117
- page.goto("https://health-products.canada.ca/dpd-bdpp/index-eng.jsp", timeout=30000)
 
118
  page.locator('input[id="activeIngredient"]').fill(ingredient_query)
119
  page.keyboard.press("Enter")
120
 
@@ -129,74 +125,48 @@ def get_canada_originator(ingredient_query, page):
129
  rows = table.find('tbody').find_all('tr')
130
  all_candidates = []
131
 
132
- log.append(f"3. 找到 {len(rows)} 筆,過濾黑名單並擷取 DIN...")
133
  for tr in rows:
134
  tds = tr.find_all('td')
135
  if len(tds) >= 4:
136
  comp_name = tds[2].get_text(strip=True)
 
137
 
138
- # 擋掉已知學名藥廠
139
- if any(gc in comp_name.lower() for gc in generic_companies):
140
- continue
141
-
142
- # 擷取 DIN 並轉為數字
143
- din_text = tds[1].get_text(strip=True)
144
- din_match = re.search(r'\d+', din_text)
145
  if din_match:
146
- din_num = int(din_match.group())
147
- product_name = tds[3].get_text(strip=True)
148
  link_tag = tds[1].find('a')
149
-
150
  if link_tag:
151
- url = "https://health-products.canada.ca" + link_tag['href']
152
  all_candidates.append({
153
  "company": comp_name,
154
- "product": product_name,
155
- "din": din_num,
156
- "url": url
157
  })
158
 
159
  if not all_candidates:
160
- log.append("❌ 剩下的全為學名藥廠。")
161
- return "查無原廠", "-", "\n".join(log)
162
 
163
- # 🟢 核心優化:將候選名單依照 DIN 號碼由小到大排序!
164
  all_candidates = sorted(all_candidates, key=lambda x: x['din'])
165
-
166
- # 排序後,DIN 最小的第一家公司,極大概率就是原廠
167
  originator_company = all_candidates[0]['company']
168
- log.append(f"4. 依 DIN 排序後,鎖定最古老藥廠: {originator_company}")
169
 
170
- # 為了嚴謹,我們還是進去這家公司的第一個連結抓一下日期
171
- earliest_date_str = "未知"
172
  try:
173
- log.append(f"5. 進入詳細頁面確認日期...")
174
  page.goto(all_candidates[0]['url'], timeout=15000)
175
  detail_soup = BeautifulSoup(page.content(), 'html.parser')
176
-
177
- # 🟢 修正:使用您提供的 HTML 結構尋找 "Original market date:"
178
  strong_tag = detail_soup.find(lambda tag: tag.name == "strong" and "Original market date" in tag.get_text(strip=True))
179
  if strong_tag:
180
  parent_row = strong_tag.find_parent('div', class_='row')
181
  if parent_row:
182
- date_p = parent_row.find('p', class_='col-sm-8')
183
- if date_p:
184
- earliest_date_str = date_p.get_text(strip=True)
185
- except Exception as e:
186
- log.append(" - 日期抓取失敗,但仍以 DIN 排序結果為準")
187
 
188
- log.append(f"✅ 確認原廠為: {originator_company} (上市日: {earliest_date_str})")
189
-
190
- # 把屬於這家原廠的所有商品名都抓出來!(例如一次抓出 Ozempic, Rybelsus, Wegovy)
191
  final_brands = set([c['product'] for c in all_candidates if c['company'] == originator_company])
192
  return ", ".join(final_brands), originator_company, "\n".join(log)
193
 
194
- except Exception as e:
195
- log.append(f"❌ 發生錯誤: {str(e)}")
196
- return "執行失敗", "-", "\n".join(log)
197
 
198
  # ==========================================
199
- # 🇯🇵 日本 PMDA (廣泛定位器修正版)
200
  # ==========================================
201
  def get_japan_originator(ingredient_query_ja, page):
202
  log = []
@@ -205,43 +175,27 @@ def get_japan_originator(ingredient_query_ja, page):
205
 
206
  log.append(f"使用日文名: {ingredient_query_ja} 進行搜尋")
207
  try:
208
- log.append("1. 前往 PMDA...")
209
- page.goto("https://www.pmda.go.jp/PmdaSearch/iyakuSearch/", timeout=30000)
210
 
211
  try:
212
  agree_btn = page.locator('text=同意する, input[value="同意する"], a:has-text("同意する")').first
213
- if agree_btn.is_visible(timeout=3000):
214
- log.append(" - 發現使用條款畫面,自動點擊同意...")
215
- agree_btn.click()
216
- page.wait_for_load_state('networkidle')
217
  except: pass
218
 
219
- log.append("2. 尋找輸入框並送出...")
220
- # 🟢 修正:使用更強大、涵蓋多種可能定位器
221
- search_input = page.locator('input[name="general_name"], input[name="generalName"], input[title*="一般名"], input[name="t_generalName"], input[type="text"]').first
222
- search_input.fill(ingredient_query_ja)
223
-
224
- # 🟢 修正:使用更強大的按鈕定位器,或是直接按 Enter
225
- try:
226
- search_btn = page.locator('input[value*="検索"], button:has-text("検索"), input[alt="検索"], .searchBtn').first
227
- search_btn.click(timeout=5000)
228
- except:
229
- # 如果找不到按鈕,就直接在輸入框按 Enter
230
- search_input.press("Enter")
231
 
232
- log.append("3. 等待表格 id=ResultList...")
233
- try:
234
- page.wait_for_selector('table#ResultList', timeout=15000)
235
- except:
236
- log.append("❌ 等待逾時,查無此成分。")
237
- return "查無資料", "-", "\n".join(log)
238
 
239
  soup = BeautifulSoup(page.content(), 'html.parser')
240
  table = soup.find('table', id='ResultList')
241
 
242
  if table:
243
  rows = table.find_all('tr')
244
- log.append(f"4. 找到表格,分析 {len(rows)} 列資料...")
245
  for tr in rows:
246
  tds = tr.find_all('td')
247
  if len(tds) >= 3:
@@ -253,19 +207,13 @@ def get_japan_originator(ingredient_query_ja, page):
253
  comp = tds[2].get_text(separator=" ", strip=True).replace('製造販売元/', '')
254
  companies.add(comp)
255
 
256
- if brands:
257
- log.append("✅ 成功找到原廠")
258
- return ", ".join(brands), ", ".join(companies), "\n".join(log)
259
- else:
260
- log.append("❌ 皆為學名藥括號。")
261
- return "查無原廠", "-", "\n".join(log)
262
 
263
- except Exception as e:
264
- log.append(f"❌ 發生錯誤: {str(e)}")
265
- return "執行失敗", "-", "\n".join(log)
266
 
267
  # ==========================================
268
- # 🚀 主執行函數 (加入「分頁隔離」機制)
269
  # ==========================================
270
  def run_diagnostic_search(ingredient_en, ingredient_ja_manual):
271
  if not ingredient_en:
@@ -274,44 +222,41 @@ def run_diagnostic_search(ingredient_en, ingredient_ja_manual):
274
  ingredient_ja = ingredient_ja_manual if ingredient_ja_manual else translate_en_to_ja(ingredient_en)
275
 
276
  results = []
 
 
 
 
 
 
277
  with sync_playwright() as p:
278
- # 啟動瀏覽器與設定 User-Agent
279
  browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
280
  context = browser.new_context(
281
  user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
282
  )
283
 
284
- # 🇬🇧 英國 (開啟專屬分頁)
285
  page_uk = context.new_page()
286
  uk_b, uk_c, uk_log = get_uk_originator(ingredient_en, page_uk)
287
  page_uk.close()
288
  results.append(["🇬🇧 英國 (eMC)", uk_b, uk_c, uk_log])
289
 
290
- # 🇺🇸 美國 (開啟專屬分頁)
291
- page_us = context.new_page()
292
- us_b, us_c, us_log = get_usa_originator(ingredient_en, page_us)
293
- page_us.close()
294
- results.append(["🇺🇸 美國 (FDA)", us_b, us_c, us_log])
295
-
296
- # 🇨🇦 加拿大 (開啟專屬分頁)
297
  page_ca = context.new_page()
298
  ca_b, ca_c, ca_log = get_canada_originator(ingredient_en, page_ca)
299
  page_ca.close()
300
  results.append(["🇨🇦 加拿大 (DPD)", ca_b, ca_c, ca_log])
301
 
302
- # 🇯🇵 日本 (開啟專屬分頁)
303
  page_ja = context.new_page()
304
  ja_b, ja_c, ja_log = get_japan_originator(ingredient_ja, page_ja)
305
  page_ja.close()
306
  results.append(["🇯🇵 日本 (PMDA)", ja_b, ja_c, ja_log])
307
 
308
  browser.close()
 
309
  return results
310
 
311
  # ==========================================
312
  # 🎨 UI 介面
313
  # ==========================================
314
- with gr.Blocks(title="四國原廠智能檢索 (精準多重版)") as demo:
315
  gr.Markdown("## 🌐 跨國原廠商品名檢索器 (支援多重商品名與防爬蟲突破)")
316
 
317
  with gr.Row():
 
4
  from urllib.parse import quote
5
  from bs4 import BeautifulSoup
6
  from playwright.sync_api import sync_playwright
7
+ from curl_cffi import requests as curl_req
8
  import gradio as gr
9
  import os
10
 
 
21
  return text
22
 
23
  # ==========================================
24
+ # 🇺🇸 FDA Orange Book (改用 curl_cffi 突破防火牆)
25
  # ==========================================
26
+ def get_usa_originator(ingredient_query):
27
  log = []
28
  brands = set()
29
  companies = set()
30
 
31
  try:
32
+ log.append("1. 使用 curl_cffi 偽裝成 Chrome 120 發送請求...")
33
+ session = curl_req.Session(impersonate="chrome120")
34
+ url = "https://www.accessdata.fda.gov/scripts/cder/ob/search_product.cfm"
35
 
36
+ # 根據您提供的 HTML,直接建構表單 Payload
37
+ payload = {
38
+ "drugname": ingredient_query,
39
+ "discontinued": "RX,OTC,DISCN",
40
+ "submit": "Search"
41
+ }
42
+ res = session.post(url, data=payload, timeout=30, verify=False)
43
 
44
+ log.append("2. 成功取得 FDA 伺服器回應,解析表格...")
45
+ soup = BeautifulSoup(res.text, 'html.parser')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  table = soup.find('table', id='example')
47
 
48
  if table:
 
64
  log.append("✅ 成功找到 RLD 原廠藥!")
65
  return ", ".join(brands), ", ".join(companies), "\n".join(log)
66
  else:
67
+ log.append("❌ 表格中未發現 RLD,或該藥品尚未核准。")
68
  return "查無原廠", "-", "\n".join(log)
69
+
70
  except Exception as e:
71
  log.append(f"❌ 發生錯誤: {str(e)}")
72
+ return "執行失敗", "-", "\n".join(log)
73
+
74
+ # ==========================================
75
+ # 🇬🇧 英國 eMC
76
+ # ==========================================
77
def get_uk_originator(ingredient_query, page):
    """Search the UK eMC site for brand-name products of an active ingredient.

    A search result is treated as an originator candidate when its title
    does NOT start with the ingredient name (case-insensitive); titles that
    start with the ingredient name are taken to be generics and skipped.

    Args:
        ingredient_query: Ingredient name to search for.
        page: A Playwright sync ``Page``; only ``goto``,
            ``wait_for_selector`` and ``content`` are used.

    Returns:
        A ``(brands, companies, log)`` tuple of strings. ``brands`` and
        ``companies`` are comma-separated lists on success, or
        ``"查無原廠"``/``"-"`` when every result is a generic, or
        ``"執行失敗"``/``"-"`` when any exception occurs. ``log`` is the
        newline-joined step log. This function never raises.
    """
    log = []
    brands = set()
    companies = set()

    try:
        log.append("1. 前往 eMC 搜尋頁面...")
        # Percent-encode the term so spaces, '&', '#', '/' etc. cannot
        # truncate or alter the query string.
        search_url = "https://www.medicines.org.uk/emc/search?q=" + quote(ingredient_query, safe="")
        page.goto(search_url, timeout=30000)
        page.wait_for_selector('.search-results-product-info-title-link', timeout=15000)

        soup = BeautifulSoup(page.content(), 'html.parser')
        links = soup.find_all('a', class_='search-results-product-info-title-link')
        query_lower = ingredient_query.lower()  # hoisted: invariant across the loop

        for link in links:
            raw_title = link.get_text(strip=True)
            if raw_title.lower().startswith(query_lower):
                continue  # title begins with the ingredient name -> generic

            # Keep only the brand part: cut at the first whitespace that is
            # followed by a digit (where the strength normally starts).
            brands.add(re.split(r'\s+\d', raw_title)[0].strip())

            # NOTE(review): the company is collected only for brand entries;
            # the scraped diff lost its indentation, so confirm this nesting.
            parent_div = link.find_parent(class_='search-results-product-info')
            if parent_div:
                comp_tag = parent_div.find(class_='search-results-product-info-company')
                if comp_tag:
                    companies.add(comp_tag.get_text(strip=True))

        if brands:
            log.append("✅ 成功找到原廠藥!")
            # sorted() makes the output order deterministic (sets are unordered).
            return ", ".join(sorted(brands)), ", ".join(sorted(companies)), "\n".join(log)

        log.append("❌ 皆以成分名開頭")
        return "查無原廠", "-", "\n".join(log)
    except Exception as e:
        # Top-level boundary for this scraper: report the failure in the log
        # instead of raising, so the other countries' lookups still run.
        log.append(f"❌ 發生錯誤: {str(e)}")
        return "執行失敗", "-", "\n".join(log)
102
 
103
  # ==========================================
104
+ # 🇨🇦 加拿大 DPD (修正 domcontentloaded 避免超時)
105
  # ==========================================
106
  def get_canada_originator(ingredient_query, page):
107
  log = []
108
  generic_companies = ['apotex', 'teva', 'sandoz', 'jamp', 'mint', 'pharmascience', 'sanis', 'sivem', 'auro', 'glenmark', 'taro', 'marcan', 'nora', 'mantra', 'reddy']
109
 
110
  try:
111
+ log.append("1. 前往 Canada DPD (放寬載入條件)...")
112
+ # 🟢 修正:使用 domcontentloaded 且放寬到 45 秒
113
+ page.goto("https://health-products.canada.ca/dpd-bdpp/index-eng.jsp", timeout=45000, wait_until="domcontentloaded")
114
  page.locator('input[id="activeIngredient"]').fill(ingredient_query)
115
  page.keyboard.press("Enter")
116
 
 
125
  rows = table.find('tbody').find_all('tr')
126
  all_candidates = []
127
 
 
128
  for tr in rows:
129
  tds = tr.find_all('td')
130
  if len(tds) >= 4:
131
  comp_name = tds[2].get_text(strip=True)
132
+ if any(gc in comp_name.lower() for gc in generic_companies): continue
133
 
134
+ din_match = re.search(r'\d+', tds[1].get_text(strip=True))
 
 
 
 
 
 
135
  if din_match:
 
 
136
  link_tag = tds[1].find('a')
 
137
  if link_tag:
 
138
  all_candidates.append({
139
  "company": comp_name,
140
+ "product": tds[3].get_text(strip=True),
141
+ "din": int(din_match.group()),
142
+ "url": "https://health-products.canada.ca" + link_tag['href']
143
  })
144
 
145
  if not all_candidates:
146
+ return "查無原廠", "-", "❌ 剩下的全為學名藥廠。"
 
147
 
 
148
  all_candidates = sorted(all_candidates, key=lambda x: x['din'])
 
 
149
  originator_company = all_candidates[0]['company']
150
+ log.append(f"3. 依 DIN 排序後,鎖定最古老藥廠: {originator_company}")
151
 
 
 
152
  try:
 
153
  page.goto(all_candidates[0]['url'], timeout=15000)
154
  detail_soup = BeautifulSoup(page.content(), 'html.parser')
 
 
155
  strong_tag = detail_soup.find(lambda tag: tag.name == "strong" and "Original market date" in tag.get_text(strip=True))
156
  if strong_tag:
157
  parent_row = strong_tag.find_parent('div', class_='row')
158
  if parent_row:
159
+ date_str = parent_row.find('p', class_='col-sm-8').get_text(strip=True)
160
+ log.append(f"✅ 上市日: {date_str}")
161
+ except: pass
 
 
162
 
 
 
 
163
  final_brands = set([c['product'] for c in all_candidates if c['company'] == originator_company])
164
  return ", ".join(final_brands), originator_company, "\n".join(log)
165
 
166
+ except Exception as e: return "執行失敗", "-", str(e)
 
 
167
 
168
  # ==========================================
169
+ # 🇯🇵 日本 PMDA (依據真實 HTML 精準定位)
170
  # ==========================================
171
  def get_japan_originator(ingredient_query_ja, page):
172
  log = []
 
175
 
176
  log.append(f"使用日文名: {ingredient_query_ja} 進行搜尋")
177
  try:
178
+ page.goto("https://www.pmda.go.jp/PmdaSearch/iyakuSearch/", timeout=30000, wait_until="domcontentloaded")
 
179
 
180
  try:
181
  agree_btn = page.locator('text=同意する, input[value="同意する"], a:has-text("同意する")').first
182
+ if agree_btn.is_visible(timeout=3000): agree_btn.click(); page.wait_for_load_state('networkidle')
 
 
 
183
  except: pass
184
 
185
+ log.append("1. 尋找輸入框並送出...")
186
+ # 🟢 修正:使用您提供 id="txtName" 與 name="nameWord"
187
+ page.locator('input#txtName, input[name="nameWord"]').first.fill(ingredient_query_ja)
188
+ # 🟢 修正:使用您提供的 name="btnA" 與 type="image"
189
+ page.locator('input[name="btnA"], input[type="image"][src*="SearchBtn"]').first.click()
 
 
 
 
 
 
 
190
 
191
+ log.append("2. 等待表格 id=ResultList...")
192
+ page.wait_for_selector('table#ResultList', timeout=15000)
 
 
 
 
193
 
194
  soup = BeautifulSoup(page.content(), 'html.parser')
195
  table = soup.find('table', id='ResultList')
196
 
197
  if table:
198
  rows = table.find_all('tr')
 
199
  for tr in rows:
200
  tds = tr.find_all('td')
201
  if len(tds) >= 3:
 
207
  comp = tds[2].get_text(separator=" ", strip=True).replace('製造販売元/', '')
208
  companies.add(comp)
209
 
210
+ if brands: return ", ".join(brands), ", ".join(companies), "✅ 成功找到原廠!"
211
+ return "查無原廠", "-", "❌ 皆為學名藥括號"
 
 
 
 
212
 
213
+ except Exception as e: return "執行失敗", "-", str(e)
 
 
214
 
215
  # ==========================================
216
+ # 🚀 主執行函數 (美國使用 curl_cffi,其他使用 Playwright 分頁隔離)
217
  # ==========================================
218
  def run_diagnostic_search(ingredient_en, ingredient_ja_manual):
219
  if not ingredient_en:
 
222
  ingredient_ja = ingredient_ja_manual if ingredient_ja_manual else translate_en_to_ja(ingredient_en)
223
 
224
  results = []
225
+
226
+ # 🇺🇸 美國:獨立使用 curl_cffi 執行,完全不受 Playwright 影響
227
+ us_b, us_c, us_log = get_usa_originator(ingredient_en)
228
+ results.append(["🇺🇸 美國 (FDA)", us_b, us_c, us_log])
229
+
230
+ # 🇬🇧🇨🇦🇯🇵 英加日:使用 Playwright 執行
231
  with sync_playwright() as p:
 
232
  browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
233
  context = browser.new_context(
234
  user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
235
  )
236
 
 
237
  page_uk = context.new_page()
238
  uk_b, uk_c, uk_log = get_uk_originator(ingredient_en, page_uk)
239
  page_uk.close()
240
  results.append(["🇬🇧 英國 (eMC)", uk_b, uk_c, uk_log])
241
 
 
 
 
 
 
 
 
242
  page_ca = context.new_page()
243
  ca_b, ca_c, ca_log = get_canada_originator(ingredient_en, page_ca)
244
  page_ca.close()
245
  results.append(["🇨🇦 加拿大 (DPD)", ca_b, ca_c, ca_log])
246
 
 
247
  page_ja = context.new_page()
248
  ja_b, ja_c, ja_log = get_japan_originator(ingredient_ja, page_ja)
249
  page_ja.close()
250
  results.append(["🇯🇵 日本 (PMDA)", ja_b, ja_c, ja_log])
251
 
252
  browser.close()
253
+
254
  return results
255
 
256
  # ==========================================
257
  # 🎨 UI 介面
258
  # ==========================================
259
+ with gr.Blocks(title="四國原廠智能檢索 (抗防護終極版)") as demo:
260
  gr.Markdown("## 🌐 跨國原廠商品名檢索器 (支援多重商品名與防爬蟲突破)")
261
 
262
  with gr.Row():