Spaces:

deneve07
/

OriginatorFinder

Running

App Files Files Community

deneve07 committed 25 days ago

Commit

7c8bbb7

verified ·

1 Parent(s): 85ade6f

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -139

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import datetime
 from urllib.parse import quote
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
 import gradio as gr
 import os
@@ -20,64 +21,28 @@ def translate_en_to_ja(text):
     return text
 # ==========================================
-# 🇬🇧 英國 eMC
 # ==========================================
-def get_uk_originator(ingredient_query, page):
     log = []
     brands = set()
     companies = set()
     try:
-        log.append("1. 前往 eMC 搜尋頁面...")
-        page.goto(f"https://www.medicines.org.uk/emc/search?q={ingredient_query}", timeout=30000)
-        page.wait_for_selector('.search-results-product-info-title-link', timeout=15000)
-        soup = BeautifulSoup(page.content(), 'html.parser')
-        links = soup.find_all('a', class_='search-results-product-info-title-link')
-        log.append(f"2. 找到 {len(links)} 筆結果，篩選非成分名開頭的項目...")
-        for link in links:
-            raw_title = link.get_text(strip=True)
-            if not raw_title.lower().startswith(ingredient_query.lower()):
-                clean_brand = re.split(r'\s+\d', raw_title)[0].strip()
-                brands.add(clean_brand)
-                parent_div = link.find_parent(class_='search-results-product-info')
-                if parent_div:
-                    comp_tag = parent_div.find(class_='search-results-product-info-company')
-                    if comp_tag: companies.add(comp_tag.get_text(strip=True))
-        if brands:
-            log.append("✅ 成功找到原廠藥！")
-            return ", ".join(brands), ", ".join(companies), "\n".join(log)
-        else:
-            log.append("❌ 查無原廠 (皆以成分名開頭)。")
-            return "查無原廠", "-", "\n".join(log)
-    except Exception as e:
-        log.append(f"❌ 發生錯誤: {str(e)}")
-        return "執行失敗", "-", "\n".join(log)
-# ==========================================
-# 🇺🇸 美國 FDA Orange Book (加入偽裝)
-# ==========================================
-def get_usa_originator(ingredient_query, page):
-    log = []
-    brands = set()
-    companies = set()
-    try:
-        log.append("1. 前往 FDA Orange Book...")
-        # 加上 wait_until="domcontentloaded" 避免等待外部資源卡住
-        page.goto("https://www.accessdata.fda.gov/scripts/cder/ob/index.cfm", timeout=30000, wait_until="domcontentloaded")
-        log.append("2. 切換頁籤並搜尋...")
-        page.locator('a[aria-controls="ingredient"], button:has-text("Active Ingredient")').first.click()
-        page.locator('input[name="activeIngredient"], input#ingredient').first.fill(ingredient_query)
-        page.keyboard.press("Enter")
-        log.append("3. 等待結果表格...")
-        page.wait_for_selector('table#example', timeout=15000)
-        soup = BeautifulSoup(page.content(), 'html.parser')
         table = soup.find('table', id='example')
         if table:
@@ -99,22 +64,53 @@ def get_usa_originator(ingredient_query, page):
             log.append("✅ 成功找到 RLD 原廠藥！")
             return ", ".join(brands), ", ".join(companies), "\n".join(log)
         else:
-            log.append("❌ 尚未核准或無 RLD。")
             return "查無原廠", "-", "\n".join(log)
     except Exception as e:
         log.append(f"❌ 發生錯誤: {str(e)}")
-        return "執行失敗 (可能被阻擋)", "-", "\n".join(log)
 # ==========================================
-# 🇨🇦 加拿大 DPD (DIN 排序優化版)
 # ==========================================
 def get_canada_originator(ingredient_query, page):
     log = []
     generic_companies = ['apotex', 'teva', 'sandoz', 'jamp', 'mint', 'pharmascience', 'sanis', 'sivem', 'auro', 'glenmark', 'taro', 'marcan', 'nora', 'mantra', 'reddy']
     try:
-        log.append("1. 前往 Canada DPD...")
-        page.goto("https://health-products.canada.ca/dpd-bdpp/index-eng.jsp", timeout=30000)
         page.locator('input[id="activeIngredient"]').fill(ingredient_query)
         page.keyboard.press("Enter")
@@ -129,74 +125,48 @@ def get_canada_originator(ingredient_query, page):
         rows = table.find('tbody').find_all('tr')
         all_candidates = []
-        log.append(f"3. 找到 {len(rows)} 筆，過濾黑名單並擷取 DIN...")
         for tr in rows:
             tds = tr.find_all('td')
             if len(tds) >= 4:
                 comp_name = tds[2].get_text(strip=True)
-                # 擋掉已知學名藥廠
-                if any(gc in comp_name.lower() for gc in generic_companies):
-                    continue
-                # 擷取 DIN 並轉為數字
-                din_text = tds[1].get_text(strip=True)
-                din_match = re.search(r'\d+', din_text)
                 if din_match:
-                    din_num = int(din_match.group())
-                    product_name = tds[3].get_text(strip=True)
                     link_tag = tds[1].find('a')
                     if link_tag:
-                        url = "https://health-products.canada.ca" + link_tag['href']
                         all_candidates.append({
                             "company": comp_name,
-                            "product": product_name,
-                            "din": din_num,
-                            "url": url
                         })
         if not all_candidates:
-            log.append("❌ 剩下的全為學名藥廠。")
-            return "查無原廠", "-", "\n".join(log)
-        # 🟢 核心優化：將候選名單依照 DIN 號碼由小到大排序！
         all_candidates = sorted(all_candidates, key=lambda x: x['din'])
-        # 排序後，DIN 最小的第一家公司，極大概率就是原廠
         originator_company = all_candidates[0]['company']
-        log.append(f"4. 依 DIN 排序後，鎖定最古老藥廠: {originator_company}")
-        # 為了嚴謹，我們還是進去這家公司的第一個連結抓一下日期
-        earliest_date_str = "未知"
         try:
-            log.append(f"5. 進入詳細頁面確認日期...")
             page.goto(all_candidates[0]['url'], timeout=15000)
             detail_soup = BeautifulSoup(page.content(), 'html.parser')
-            # 🟢 修正：使用您提供的 HTML 結構尋找 "Original market date:"
             strong_tag = detail_soup.find(lambda tag: tag.name == "strong" and "Original market date" in tag.get_text(strip=True))
             if strong_tag:
                 parent_row = strong_tag.find_parent('div', class_='row')
                 if parent_row:
-                    date_p = parent_row.find('p', class_='col-sm-8')
-                    if date_p:
-                        earliest_date_str = date_p.get_text(strip=True)
-        except Exception as e:
-            log.append("   - 日期抓取失敗，但仍以 DIN 排序結果為準")
-        log.append(f"✅ 確認原廠為: {originator_company} (上市日: {earliest_date_str})")
-        # 把屬於這家原廠的所有商品名都抓出來！(例如一次抓出 Ozempic, Rybelsus, Wegovy)
         final_brands = set([c['product'] for c in all_candidates if c['company'] == originator_company])
         return ", ".join(final_brands), originator_company, "\n".join(log)
-    except Exception as e:
-        log.append(f"❌ 發生錯誤: {str(e)}")
-        return "執行失敗", "-", "\n".join(log)
 # ==========================================
-# 🇯🇵 日本 PMDA (廣泛定位器修正版)
 # ==========================================
 def get_japan_originator(ingredient_query_ja, page):
     log = []
@@ -205,43 +175,27 @@ def get_japan_originator(ingredient_query_ja, page):
     log.append(f"使用日文名: {ingredient_query_ja} 進行搜尋")
     try:
-        log.append("1. 前往 PMDA...")
-        page.goto("https://www.pmda.go.jp/PmdaSearch/iyakuSearch/", timeout=30000)
         try:
             agree_btn = page.locator('text=同意する, input[value="同意する"], a:has-text("同意する")').first
-            if agree_btn.is_visible(timeout=3000):
-                log.append("   - 發現使用條款畫面，自動點擊同意...")
-                agree_btn.click()
-                page.wait_for_load_state('networkidle')
         except: pass
-        log.append("2. 尋找輸入框並送出...")
-        # 🟢 修正：使用更強大、涵蓋多種可能的定位器
-        search_input = page.locator('input[name="general_name"], input[name="generalName"], input[title*="一般名"], input[name="t_generalName"], input[type="text"]').first
-        search_input.fill(ingredient_query_ja)
-        # 🟢 修正：使用更強大的按鈕定位器，或是直接按 Enter
-        try:
-            search_btn = page.locator('input[value*="検索"], button:has-text("検索"), input[alt="検索"], .searchBtn').first
-            search_btn.click(timeout=5000)
-        except:
-            # 如果找不到按鈕，就直接在輸入框按 Enter
-            search_input.press("Enter")
-        log.append("3. 等待表格 id=ResultList...")
-        try:
-            page.wait_for_selector('table#ResultList', timeout=15000)
-        except:
-            log.append("❌ 等待逾時，查無此成分。")
-            return "查無資料", "-", "\n".join(log)
         soup = BeautifulSoup(page.content(), 'html.parser')
         table = soup.find('table', id='ResultList')
         if table:
             rows = table.find_all('tr')
-            log.append(f"4. 找到表格，分析 {len(rows)} 列資料...")
             for tr in rows:
                 tds = tr.find_all('td')
                 if len(tds) >= 3:
@@ -253,19 +207,13 @@ def get_japan_originator(ingredient_query_ja, page):
                             comp = tds[2].get_text(separator=" ", strip=True).replace('製造販売元／', '')
                             companies.add(comp)
-        if brands:
-            log.append("✅ 成功找到原廠！")
-            return ", ".join(brands), ", ".join(companies), "\n".join(log)
-        else:
-            log.append("❌ 皆為學名藥括號。")
-            return "查無原廠", "-", "\n".join(log)
-    except Exception as e:
-        log.append(f"❌ 發生錯誤: {str(e)}")
-        return "執行失敗", "-", "\n".join(log)
 # ==========================================
-# 🚀 主執行函數 (加入「分頁隔離」機制)
 # ==========================================
 def run_diagnostic_search(ingredient_en, ingredient_ja_manual):
     if not ingredient_en:
@@ -274,44 +222,41 @@ def run_diagnostic_search(ingredient_en, ingredient_ja_manual):
     ingredient_ja = ingredient_ja_manual if ingredient_ja_manual else translate_en_to_ja(ingredient_en)
     results = []
     with sync_playwright() as p:
-        # 啟動瀏覽器與設定 User-Agent
         browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
         context = browser.new_context(
             user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
         )
-        # 🇬🇧 英國 (開啟專屬分頁)
         page_uk = context.new_page()
         uk_b, uk_c, uk_log = get_uk_originator(ingredient_en, page_uk)
         page_uk.close()
         results.append(["🇬🇧 英國 (eMC)", uk_b, uk_c, uk_log])
-        # 🇺🇸 美國 (開啟專屬分頁)
-        page_us = context.new_page()
-        us_b, us_c, us_log = get_usa_originator(ingredient_en, page_us)
-        page_us.close()
-        results.append(["🇺🇸 美國 (FDA)", us_b, us_c, us_log])
-        # 🇨🇦 加拿大 (開啟專屬分頁)
         page_ca = context.new_page()
         ca_b, ca_c, ca_log = get_canada_originator(ingredient_en, page_ca)
         page_ca.close()
         results.append(["🇨🇦 加拿大 (DPD)", ca_b, ca_c, ca_log])
-        # 🇯🇵 日本 (開啟專屬分頁)
         page_ja = context.new_page()
         ja_b, ja_c, ja_log = get_japan_originator(ingredient_ja, page_ja)
         page_ja.close()
         results.append(["🇯🇵 日本 (PMDA)", ja_b, ja_c, ja_log])
         browser.close()
     return results
 # ==========================================
 # 🎨 UI 介面
 # ==========================================
-with gr.Blocks(title="四國原廠智能檢索 (精準多重版)") as demo:
     gr.Markdown("## 🌐 跨國原廠商品名檢索器 (支援多重商品名與防爬蟲突破)")
     with gr.Row():

 from urllib.parse import quote
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
+from curl_cffi import requests as curl_req
 import gradio as gr
 import os
     return text
 # ==========================================
+# 🇺🇸 美國 FDA Orange Book (改用 curl_cffi 突破防火牆)
 # ==========================================
+def get_usa_originator(ingredient_query):
     log = []
     brands = set()
     companies = set()
     try:
+        log.append("1. 使用 curl_cffi 偽裝成 Chrome 120 發送請求...")
+        session = curl_req.Session(impersonate="chrome120")
+        url = "https://www.accessdata.fda.gov/scripts/cder/ob/search_product.cfm"
+        # 根據您提供的 HTML，直接建構表單 Payload
+        payload = {
+            "drugname": ingredient_query,
+            "discontinued": "RX,OTC,DISCN",
+            "submit": "Search"
+        }
+        res = session.post(url, data=payload, timeout=30, verify=False)
+        log.append("2. 成功取得 FDA 伺服器回應，解析表格...")
+        soup = BeautifulSoup(res.text, 'html.parser')
         table = soup.find('table', id='example')
         if table:
             log.append("✅ 成功找到 RLD 原廠藥！")
             return ", ".join(brands), ", ".join(companies), "\n".join(log)
         else:
+            log.append("❌ 表格中未發現 RLD，或該藥品尚未核准。")
             return "查無原廠", "-", "\n".join(log)
     except Exception as e:
         log.append(f"❌ 發生錯誤: {str(e)}")
+        return "執行失敗", "-", "\n".join(log)
+# ==========================================
+# 🇬🇧 英國 eMC
+# ==========================================
+def get_uk_originator(ingredient_query, page):
+    """Find originator (brand-name) products for an ingredient on the UK eMC site.
+
+    Loads the eMC search results for ``ingredient_query`` and treats every
+    result whose title does NOT start with the ingredient name as a branded
+    product; titles that start with the ingredient name are skipped.
+
+    Args:
+        ingredient_query: Active-ingredient name used as the search term.
+        page: Playwright page object used to load the search results.
+
+    Returns:
+        A ``(brands, companies, status)`` tuple of strings. When brands are
+        found, the first two items are comma-separated, alphabetically sorted
+        lists. When none are found they are ``"查無原廠"`` and ``"-"``. If any
+        exception is raised they are ``"執行失敗"`` and ``"-"``, and ``status``
+        is the exception text.
+    """
+    brands = set()
+    companies = set()
+    needle = ingredient_query.lower()
+    try:
+        # Percent-encode the search term (safe="" also encodes "/") so names
+        # containing spaces, "&", "#" or "/" cannot break the query string.
+        search_url = f"https://www.medicines.org.uk/emc/search?q={quote(ingredient_query, safe='')}"
+        page.goto(search_url, timeout=30000)
+        page.wait_for_selector('.search-results-product-info-title-link', timeout=15000)
+        soup = BeautifulSoup(page.content(), 'html.parser')
+        for link in soup.find_all('a', class_='search-results-product-info-title-link'):
+            raw_title = link.get_text(strip=True)
+            if raw_title.lower().startswith(needle):
+                continue
+            # Keep only the text before the first "whitespace + digit".
+            brands.add(re.split(r'\s+\d', raw_title)[0].strip())
+            parent_div = link.find_parent(class_='search-results-product-info')
+            if parent_div is None:
+                continue
+            comp_tag = parent_div.find(class_='search-results-product-info-company')
+            if comp_tag:
+                companies.add(comp_tag.get_text(strip=True))
+        if brands:
+            # Sort so the joined output is stable between runs (set order is not).
+            return ", ".join(sorted(brands)), ", ".join(sorted(companies)), "✅ 成功找到原廠藥！"
+        return "查無原廠", "-", "❌ 皆以成分名開頭"
+    except Exception as e:
+        # Best-effort scraper: report the failure to the caller instead of raising.
+        return "執行失敗", "-", str(e)
 # ==========================================
+# 🇨🇦 加拿大 DPD (修正 domcontentloaded 避免超時)
 # ==========================================
 def get_canada_originator(ingredient_query, page):
     log = []
     generic_companies = ['apotex', 'teva', 'sandoz', 'jamp', 'mint', 'pharmascience', 'sanis', 'sivem', 'auro', 'glenmark', 'taro', 'marcan', 'nora', 'mantra', 'reddy']
     try:
+        log.append("1. 前往 Canada DPD (放寬載入條件)...")
+        # 🟢 修正：使用 domcontentloaded 且放寬到 45 秒
+        page.goto("https://health-products.canada.ca/dpd-bdpp/index-eng.jsp", timeout=45000, wait_until="domcontentloaded")
         page.locator('input[id="activeIngredient"]').fill(ingredient_query)
         page.keyboard.press("Enter")
         rows = table.find('tbody').find_all('tr')
         all_candidates = []
         for tr in rows:
             tds = tr.find_all('td')
             if len(tds) >= 4:
                 comp_name = tds[2].get_text(strip=True)
+                if any(gc in comp_name.lower() for gc in generic_companies): continue
+                din_match = re.search(r'\d+', tds[1].get_text(strip=True))
                 if din_match:
                     link_tag = tds[1].find('a')
                     if link_tag:
                         all_candidates.append({
                             "company": comp_name,
+                            "product": tds[3].get_text(strip=True),
+                            "din": int(din_match.group()),
+                            "url": "https://health-products.canada.ca" + link_tag['href']
                         })
         if not all_candidates:
+            return "查無原廠", "-", "❌ 剩下的全為學名藥廠。"
         all_candidates = sorted(all_candidates, key=lambda x: x['din'])
         originator_company = all_candidates[0]['company']
+        log.append(f"3. 依 DIN 排序後，鎖定最古老藥廠: {originator_company}")
         try:
             page.goto(all_candidates[0]['url'], timeout=15000)
             detail_soup = BeautifulSoup(page.content(), 'html.parser')
             strong_tag = detail_soup.find(lambda tag: tag.name == "strong" and "Original market date" in tag.get_text(strip=True))
             if strong_tag:
                 parent_row = strong_tag.find_parent('div', class_='row')
                 if parent_row:
+                    date_str = parent_row.find('p', class_='col-sm-8').get_text(strip=True)
+                    log.append(f"✅ 上市日: {date_str}")
+        except: pass
         final_brands = set([c['product'] for c in all_candidates if c['company'] == originator_company])
         return ", ".join(final_brands), originator_company, "\n".join(log)
+    except Exception as e: return "執行失敗", "-", str(e)
 # ==========================================
+# 🇯🇵 日本 PMDA (依據真實 HTML 精準定位)
 # ==========================================
 def get_japan_originator(ingredient_query_ja, page):
     log = []
     log.append(f"使用日文名: {ingredient_query_ja} 進行搜尋")
     try:
+        page.goto("https://www.pmda.go.jp/PmdaSearch/iyakuSearch/", timeout=30000, wait_until="domcontentloaded")
         try:
             agree_btn = page.locator('text=同意する, input[value="同意する"], a:has-text("同意する")').first
+            if agree_btn.is_visible(timeout=3000): agree_btn.click(); page.wait_for_load_state('networkidle')
         except: pass
+        log.append("1. 尋找輸入框並送出...")
+        # 🟢 修正：使用您提供的 id="txtName" 與 name="nameWord"
+        page.locator('input#txtName, input[name="nameWord"]').first.fill(ingredient_query_ja)
+        # 🟢 修正：使用您提供的 name="btnA" 與 type="image"
+        page.locator('input[name="btnA"], input[type="image"][src*="SearchBtn"]').first.click()
+        log.append("2. 等待表格 id=ResultList...")
+        page.wait_for_selector('table#ResultList', timeout=15000)
         soup = BeautifulSoup(page.content(), 'html.parser')
         table = soup.find('table', id='ResultList')
         if table:
             rows = table.find_all('tr')
             for tr in rows:
                 tds = tr.find_all('td')
                 if len(tds) >= 3:
                             comp = tds[2].get_text(separator=" ", strip=True).replace('製造販売元／', '')
                             companies.add(comp)
+        if brands: return ", ".join(brands), ", ".join(companies), "✅ 成功找到原廠！"
+        return "查無原廠", "-", "❌ 皆為學名藥括號"
+    except Exception as e: return "執行失敗", "-", str(e)
 # ==========================================
+# 🚀 主執行函數 (美國使用 curl_cffi，其他使用 Playwright 分頁隔離)
 # ==========================================
 def run_diagnostic_search(ingredient_en, ingredient_ja_manual):
     if not ingredient_en:
     ingredient_ja = ingredient_ja_manual if ingredient_ja_manual else translate_en_to_ja(ingredient_en)
     results = []
+    # 🇺🇸 美國：獨立使用 curl_cffi 執行，完全不受 Playwright 影響
+    us_b, us_c, us_log = get_usa_originator(ingredient_en)
+    results.append(["🇺🇸 美國 (FDA)", us_b, us_c, us_log])
+    # 🇬🇧🇨🇦🇯🇵 英加日：使用 Playwright 執行
     with sync_playwright() as p:
         browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
         context = browser.new_context(
             user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
         )
         page_uk = context.new_page()
         uk_b, uk_c, uk_log = get_uk_originator(ingredient_en, page_uk)
         page_uk.close()
         results.append(["🇬🇧 英國 (eMC)", uk_b, uk_c, uk_log])
         page_ca = context.new_page()
         ca_b, ca_c, ca_log = get_canada_originator(ingredient_en, page_ca)
         page_ca.close()
         results.append(["🇨🇦 加拿大 (DPD)", ca_b, ca_c, ca_log])
         page_ja = context.new_page()
         ja_b, ja_c, ja_log = get_japan_originator(ingredient_ja, page_ja)
         page_ja.close()
         results.append(["🇯🇵 日本 (PMDA)", ja_b, ja_c, ja_log])
         browser.close()
     return results
 # ==========================================
 # 🎨 UI 介面
 # ==========================================
+with gr.Blocks(title="四國原廠智能檢索 (抗防護終極版)") as demo:
     gr.Markdown("## 🌐 跨國原廠商品名檢索器 (支援多重商品名與防爬蟲突破)")
     with gr.Row():