Spaces:

deneve07
/

OriginatorFinder

Sleeping

App Files Files Community

deneve07 committed 29 days ago

Commit

0207f8d

verified ·

1 Parent(s): a429c13

Update app.py

Browse files

Files changed (1) hide show

app.py +160 -122

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import re
-import datetime
 import requests
 from urllib.parse import quote
 from bs4 import BeautifulSoup
@@ -7,219 +6,256 @@ from playwright.sync_api import sync_playwright
 import gradio as gr
 import os
-# 強制安裝 Playwright 瀏覽器核心 (若在 HF Spaces 執行)
 os.system("playwright install chromium")
-# ==========================================
-# 工具函數：英文成分自動翻譯為日文片假名
-# ==========================================
 def translate_en_to_ja(text):
     try:
         url = f"https://translate.googleapis.com/translate_a/single?client=gtx&sl=en&tl=ja&dt=t&q={quote(text)}"
         res = requests.get(url, timeout=5)
         if res.status_code == 200:
             return res.json()[0][0][0].strip()
-    except:
-        pass
-    return text # 若翻譯失敗則回傳原字串
 # ==========================================
-# 🇬🇧 英國 eMC 原廠查詢
 # ==========================================
 def get_uk_originator(ingredient_query, page):
-    print(f"[英國] 搜尋中: {ingredient_query}")
     try:
         page.goto(f"https://www.medicines.org.uk/emc/search?q={ingredient_query}", timeout=30000)
         page.wait_for_selector('.search-results-product-info-title-link', timeout=15000)
         soup = BeautifulSoup(page.content(), 'html.parser')
-        # 尋找所有商品名的 <a> 連結
-        for link in soup.find_all('a', class_='search-results-product-info-title-link'):
             raw_title = link.get_text(strip=True)
-            # 如果標題不是以成分名開頭，就是原廠商品名 (例如 Ilaxten)
             if not raw_title.lower().startswith(ingredient_query.lower()):
                 clean_brand = re.split(r'\s+\d', raw_title)[0].strip()
-                company = "-"
-                # 往上找父層，再往下找公司名稱的 div
                 parent_div = link.find_parent(class_='search-results-product-info')
                 if parent_div:
                     comp_tag = parent_div.find(class_='search-results-product-info-company')
-                    if comp_tag: company = comp_tag.get_text(strip=True)
-                return clean_brand, company
-        return "查無原廠 (可能皆為學名藥)", "-"
     except Exception as e:
-        return "查無資料", "-"
 # ==========================================
-# 🇺🇸 美國 FDA Orange Book 原廠查詢
 # ==========================================
 def get_usa_originator(ingredient_query, page):
-    print(f"[美國] 搜尋中: {ingredient_query}")
     try:
         page.goto("https://www.accessdata.fda.gov/scripts/cder/ob/index.cfm", timeout=30000)
-        page.click('button:has-text("Active Ingredient"), a:has-text("Active Ingredient")')
-        page.fill('input[name="activeIngredient"], input[id*="ingredient"]', ingredient_query)
-        page.click('button[id*="submit"], input[type="submit"]')
-        # 等待 id="example" 的表格出現
         page.wait_for_selector('table#example', timeout=15000)
-        soup = BeautifulSoup(page.content(), 'html.parser')
         table = soup.find('table', id='example')
         if table:
-            # 尋找表頭找出各欄位正確的 Index
             headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
             brand_idx = next((i for i, h in enumerate(headers) if 'proprietary name' in h), 2)
             rld_idx = next((i for i, h in enumerate(headers) if 'rld' in h), 8)
             mfg_idx = next((i for i, h in enumerate(headers) if 'applicant holder' in h), 10)
-            # 解析表格內容
             tbody = table.find('tbody') or table
-            for tr in tbody.find_all('tr'):
                 tds = tr.find_all('td')
                 if len(tds) > max(rld_idx, brand_idx):
-                    # 判斷 RLD 欄位文字是否包含 "RLD"
-                    if "rld" in tds[rld_idx].get_text(strip=True).lower():
-                        brand = tds[brand_idx].get_text(strip=True)
-                        company = tds[mfg_idx].get_text(strip=True) if len(tds) > mfg_idx else "-"
-                        return brand, company
-        return "查無原廠或尚未核准", "-"
     except Exception as e:
-        return "查無資料 (可能無此藥)", "-"
 # ==========================================
-# 🇨🇦 加拿大 DPD 原廠查詢
 # ==========================================
 def get_canada_originator(ingredient_query, page):
-    print(f"[加拿大] 搜尋中: {ingredient_query}")
-    earliest_date = datetime.datetime.now()
-    originator_brand, company_name = None, "-"
-    generic_companies = ['apotex', 'teva', 'sandoz', 'jamp', 'mint', 'pharmascience', 'sanis', 'sivem', 'auro', 'glenmark']
     try:
         page.goto("https://health-products.canada.ca/dpd-bdpp/index-eng.jsp", timeout=30000)
-        page.fill('input[id="activeIngredient"]', ingredient_query)
-        page.click('input[type="submit"][value*="Search"]')
-        # 等待 id="results" 的表格出現
         page.wait_for_selector('table#results', timeout=15000)
-        soup = BeautifulSoup(page.content(), 'html.parser')
-        candidate_links = []
         table = soup.find('table', id='results')
         if table:
             tbody = table.find('tbody')
             if tbody:
-                for tr in tbody.find_all('tr'):
                     tds = tr.find_all('td')
                     if len(tds) >= 4:
-                        # 藥廠在第 3 欄 (index 2)
                         comp_name_full = tds[2].get_text(strip=True)
                         comp_name_lower = comp_name_full.lower()
-                        # 排除學名藥廠
-                        if any(gc in comp_name_lower for gc in generic_companies):
-                            continue
-                        # 連結在第 2 欄 (index 1) 的 <a> 標籤內
-                        link_tag = tds[1].find('a')
-                        if link_tag:
-                            full_url = "https://health-products.canada.ca" + link_tag['href']
-                            # 商品名在第 4 欄 (index 3)
                             product_name = tds[3].get_text(strip=True)
-                            candidate_links.append({"url": full_url, "company": comp_name_full, "product": product_name})
-        # 進入詳細頁面比對最初上市日期
-        for item in candidate_links[:3]:
-            page.goto(item['url'], timeout=15000)
-            detail_soup = BeautifulSoup(page.content(), 'html.parser')
-            strong_tag = detail_soup.find(lambda tag: tag.name == "strong" and "Original Market Authorization Date" in tag.get_text())
-            if strong_tag and strong_tag.next_sibling:
-                try:
-                    auth_date = datetime.datetime.strptime(strong_tag.next_sibling.strip(), "%Y-%m-%d")
-                    if auth_date < earliest_date:
-                        earliest_date = auth_date
-                        originator_brand = item['product']
-                        company_name = item['company']
-                except: pass
-        return originator_brand if originator_brand else "查無原廠", company_name
     except Exception as e:
-        return "查無資料", "-"
 # ==========================================
-# 🇯🇵 日本 PMDA 原廠查詢
 # ==========================================
 def get_japan_originator(ingredient_query_ja, page):
-    print(f"[日本] 搜尋中: {ingredient_query_ja}")
     try:
         page.goto("https://www.pmda.go.jp/PmdaSearch/iyakuSearch/", timeout=30000)
-        page.fill('input[title*="一般名"], input[name="generalName"]', ingredient_query_ja)
-        page.click('input[type="submit"][value*="検索"]')
-        # 等待 id="ResultList" 的表格出現
-        page.wait_for_selector('table#ResultList', timeout=15000)
-        soup = BeautifulSoup(page.content(), 'html.parser')
         table = soup.find('table', id='ResultList')
         if table:
-            # 略過第一、第二列的表頭 (tr有2個是th)
-            for tr in table.find_all('tr'):
                 tds = tr.find_all('td')
                 if len(tds) >= 3:
-                    # 抓取第 2 欄 (index 1) 的商品名
                     raw_brand = tds[1].get_text(strip=True)
-                    # 排除帶有「」或（）的學名藥，且不能包含純成分名
                     if '「' not in raw_brand and '（' not in raw_brand and ingredient_query_ja not in raw_brand:
-                        clean_brand = re.split(r'(錠|カプセル|顆粒|シロップ|OD|細粒|液|\d+)', raw_brand)[0].strip()
-                        # 抓取第 3 欄 (index 2) 的廠商名
-                        company_name = tds[2].get_text(separator=" ", strip=True).replace('製造販売元／', '')
-                        return clean_brand, company_name
-        return "查無原廠 (可能皆為學名藥)", "-"
     except Exception as e:
-        return "查無資料", "-"
 # ==========================================
 # 🚀 主執行函數
 # ==========================================
-def run_all_countries(ingredient_en):
     if not ingredient_en:
-        return [["錯誤", "請輸入英文成分名", "-"]]
-    # 自動翻譯為日文
-    ingredient_ja = translate_en_to_ja(ingredient_en)
-    print(f"成分解析: 英文={ingredient_en}, 日文={ingredient_ja}")
     results = []
     with sync_playwright() as p:
-        # 共用一個 Browser instance 加快速度
         browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
         context = browser.new_context()
         page = context.new_page()
-        # 依序執行四國查詢
-        uk_brand, uk_mfg = get_uk_originator(ingredient_en, page)
-        results.append(["🇬🇧 英國 (eMC)", uk_brand, uk_mfg])
-        us_brand, us_mfg = get_usa_originator(ingredient_en, page)
-        results.append(["🇺🇸 美國 (FDA)", us_brand, us_mfg])
-        ca_brand, ca_mfg = get_canada_originator(ingredient_en, page)
-        results.append(["🇨🇦 加拿大 (DPD)", ca_brand, ca_mfg])
-        ja_brand, ja_mfg = get_japan_originator(ingredient_ja, page)
-        results.append(["🇯🇵 日本 (PMDA)", ja_brand, ja_mfg])
         browser.close()
@@ -228,21 +264,23 @@ def run_all_countries(ingredient_en):
 # ==========================================
 # 🎨 UI 介面
 # ==========================================
-with gr.Blocks(title="四國原廠商品名智能檢索器") as demo:
-    gr.Markdown("## 🌐 跨國原廠商品名自動檢索器")
-    gr.Markdown("只需輸入 **英文成分名**，系統會自動翻譯日文，並同步爬取英、美、加、日四國官方資料庫，智慧判斷原廠商品名。")
     with gr.Row():
-        ing_input = gr.Textbox(label="🧪 請輸入英文成分名 (Active Ingredient)", placeholder="例如: Bilastine")
-        search_btn = gr.Button("🚀 一鍵查詢四國原廠", variant="primary")
     result_table = gr.Dataframe(
-        headers=["國家 / 資料庫", "🌟 判定為原廠的商品名", "🏭 藥廠名稱"],
-        datatype=["str", "str", "str"],
         interactive=False
     )
-    search_btn.click(fn=run_all_countries, inputs=[ing_input], outputs=[result_table])
 if __name__ == "__main__":
     demo.launch()

 import re
 import requests
 from urllib.parse import quote
 from bs4 import BeautifulSoup
 import gradio as gr
 import os
 os.system("playwright install chromium")
 def translate_en_to_ja(text):
     """Translate English text to Japanese via the Google translate endpoint.

     Args:
         text: English text to translate.

     Returns:
         The translated string on success. On any failure (empty input,
         network error, non-200 status, unexpected payload, empty
         translation) the original ``text`` is returned unchanged, so the
         caller never receives an error message in place of a usable term.
     """
     # Nothing to translate; skip the network round trip.
     if not text:
         return text
     try:
         url = f"https://translate.googleapis.com/translate_a/single?client=gtx&sl=en&tl=ja&dt=t&q={quote(text)}"
         res = requests.get(url, timeout=5)
         if res.status_code == 200:
             translated = res.json()[0][0][0].strip()
             # An empty translation is useless downstream; fall back to the input.
             return translated or text
     except (requests.RequestException, ValueError, LookupError, TypeError, AttributeError):
         # Deliberate best-effort: the return value is used as a search term,
         # so an error description must never be returned in its place.
         pass
     return text
 # ==========================================
+# 🇬🇧 英國 eMC (支援多重商品名)
 # ==========================================
 def get_uk_originator(ingredient_query, page):
     """Collect brand-name products for an ingredient from the UK eMC search page.

     A search result whose title does not start with the ingredient name is
     treated as a brand (originator) product; results whose title starts with
     the ingredient name are skipped.

     Args:
         ingredient_query: Active-ingredient name to search for.
         page: Playwright page used to load the search results.

     Returns:
         A 3-tuple of strings ``(brands, companies, log)``: comma-separated
         brand names, comma-separated company names (``"-"`` when none were
         found), and the newline-joined diagnostic log. If any step raises,
         the first element is ``"執行失敗"`` and the log ends with the error.
     """
     log = []
     brands = set()
     companies = set()
     try:
         log.append("1. 前往 eMC 搜尋頁面...")
         # Percent-encode the query so spaces, '&' or '#' in the name cannot
         # break or truncate the search URL.
         search_url = f"https://www.medicines.org.uk/emc/search?q={quote(ingredient_query)}"
         page.goto(search_url, timeout=30000)
         log.append("2. 等待搜尋結果...")
         page.wait_for_selector('.search-results-product-info-title-link', timeout=15000)
         soup = BeautifulSoup(page.content(), 'html.parser')
         links = soup.find_all('a', class_='search-results-product-info-title-link')
         log.append(f"3. 找到 {len(links)} 筆結果，開始篩選...")
         ingredient_lower = ingredient_query.lower()
         for link in links:
             raw_title = link.get_text(strip=True)
             if raw_title.lower().startswith(ingredient_lower):
                 continue
             # Cut the title at the first whitespace followed by a digit
             # (presumably where the strength begins).
             clean_brand = re.split(r'\s+\d', raw_title)[0].strip()
             if not clean_brand:
                 continue
             brands.add(clean_brand)
             parent_div = link.find_parent(class_='search-results-product-info')
             if parent_div:
                 comp_tag = parent_div.find(class_='search-results-product-info-company')
                 if comp_tag:
                     companies.add(comp_tag.get_text(strip=True))
         if brands:
             log.append("✅ 成功找到原廠藥！")
             # Sorted so the displayed order is stable between runs (set
             # iteration order of strings is not).
             return ", ".join(sorted(brands)), ", ".join(sorted(companies)) or "-", "\n".join(log)
         log.append("❌ 結果皆以成分名開頭，判定為無原廠。")
         return "查無原廠", "-", "\n".join(log)
     except Exception as e:
         # Boundary handler: any scraping failure is reported through the log.
         log.append(f"❌ 發生錯誤: {str(e)}")
         return "執行失敗", "-", "\n".join(log)
 # ==========================================
+# 🇺🇸 美國 FDA (支援多重商品名)
 # ==========================================
 def get_usa_originator(ingredient_query, page):
     """Collect RLD-flagged products for an ingredient from the FDA Orange Book.

     A table row is kept when the text of its RLD column contains "RLD".

     Args:
         ingredient_query: Active-ingredient name to search for.
         page: Playwright page used to drive the search form.

     Returns:
         A 3-tuple of strings ``(brands, companies, log)``: comma-separated
         proprietary names, comma-separated applicant holders (``"-"`` when
         none were found), and the newline-joined diagnostic log. If any
         step raises, the first element is ``"執行失敗"``.
     """
     log = []
     brands = set()
     companies = set()
     try:
         log.append("1. 前往 FDA Orange Book...")
         page.goto("https://www.accessdata.fda.gov/scripts/cder/ob/index.cfm", timeout=30000)
         log.append("2. 切換至 Active Ingredient 頁籤並輸入...")
         # Each locator lists two alternative selectors; .first acts on the
         # first element that matches either of them.
         page.locator('a[aria-controls="ingredient"], button:has-text("Active Ingredient")').first.click()
         page.locator('input[name="activeIngredient"], input#ingredient').first.fill(ingredient_query)
         # Submit with Enter so no submit button has to be located.
         page.keyboard.press("Enter")
         log.append("3. 等待結果表格...")
         page.wait_for_selector('table#example', timeout=15000)
         soup = BeautifulSoup(page.content(), 'html.parser')
         table = soup.find('table', id='example')
         if table:
             headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
             # Column positions come from the header text; the literal
             # numbers are fallbacks used when no header matches.
             brand_idx = next((i for i, h in enumerate(headers) if 'proprietary name' in h), 2)
             rld_idx = next((i for i, h in enumerate(headers) if 'rld' in h), 8)
             mfg_idx = next((i for i, h in enumerate(headers) if 'applicant holder' in h), 10)
             tbody = table.find('tbody') or table
             rows = tbody.find_all('tr')
             log.append(f"4. 表格載入完成，共 {len(rows)} 列資料，開始尋找 RLD...")
             highest_needed = max(rld_idx, brand_idx)
             for tr in rows:
                 tds = tr.find_all('td')
                 # Skip rows too short to contain the brand and RLD cells.
                 if len(tds) <= highest_needed:
                     continue
                 if "RLD" not in tds[rld_idx].get_text(strip=True).upper():
                     continue
                 brand = tds[brand_idx].get_text(strip=True)
                 if not brand:
                     continue
                 brands.add(brand)
                 if len(tds) > mfg_idx:
                     companies.add(tds[mfg_idx].get_text(strip=True))
         if brands:
             log.append(f"✅ 成功找到 {len(brands)} 個 RLD 原廠藥！")
             # Sorted so the displayed order is stable between runs; "-" is
             # used when no company cell was available.
             return ", ".join(sorted(brands)), ", ".join(sorted(companies)) or "-", "\n".join(log)
         log.append("❌ 表格中未發現 RLD 標記。")
         return "尚未核准或無 RLD", "-", "\n".join(log)
     except Exception as e:
         # Boundary handler: any scraping failure is reported through the log.
         log.append(f"❌ 發生錯誤: {str(e)}")
         return "執行失敗", "-", "\n".join(log)
 # ==========================================
+# 🇨🇦 加拿大 DPD (支援多重商品名)
 # ==========================================
 def get_canada_originator(ingredient_query, page):
     """Collect non-generic products for an ingredient from the Canada DPD search.

     A result row is kept when its company name contains none of the
     substrings in ``generic_companies``.

     Args:
         ingredient_query: Active-ingredient name to search for.
         page: Playwright page used to drive the search form.

     Returns:
         A 3-tuple of strings ``(brands, companies, log)``: comma-separated
         product names, comma-separated company names, and the newline-joined
         diagnostic log. If any step raises, the first element is
         ``"執行失敗"``.
     """
     log = []
     brands = set()
     companies = set()
     # NOTE(review): matching is by substring, so short entries such as
     # 'mint' or 'auro' also match any company name that merely contains them.
     generic_companies = ['apotex', 'teva', 'sandoz', 'jamp', 'mint', 'pharmascience', 'sanis', 'sivem', 'auro', 'glenmark', 'taro']
     try:
         log.append("1. 前往 Canada DPD...")
         page.goto("https://health-products.canada.ca/dpd-bdpp/index-eng.jsp", timeout=30000)
         log.append("2. 輸入成分並送出...")
         page.locator('input[id="activeIngredient"]').fill(ingredient_query)
         page.keyboard.press("Enter")
         log.append("3. 等待結果表格...")
         page.wait_for_selector('table#results', timeout=15000)
         soup = BeautifulSoup(page.content(), 'html.parser')
         table = soup.find('table', id='results')
         tbody = table.find('tbody') if table else None
         if tbody:
             rows = tbody.find_all('tr')
             log.append(f"4. 找到 {len(rows)} 筆資料，過濾學名藥廠中...")
             for tr in rows:
                 tds = tr.find_all('td')
                 if len(tds) < 4:
                     continue
                 # Company name is read from the third cell, product name
                 # from the fourth.
                 comp_name_full = tds[2].get_text(strip=True)
                 comp_name_lower = comp_name_full.lower()
                 if any(gc in comp_name_lower for gc in generic_companies):
                     continue
                 product_name = tds[3].get_text(strip=True)
                 if not product_name:
                     continue
                 brands.add(product_name)
                 companies.add(comp_name_full)
         if brands:
             log.append("✅ 成功過濾出非學名藥品項！")
             # Sorted so the displayed order is stable between runs (set
             # iteration order of strings is not).
             return ", ".join(sorted(brands)), ", ".join(sorted(companies)) or "-", "\n".join(log)
         log.append("❌ 剩下的全為學名藥廠，查無原廠。")
         return "查無原廠", "-", "\n".join(log)
     except Exception as e:
         # Boundary handler: any scraping failure is reported through the log.
         log.append(f"❌ 發生錯誤: {str(e)}")
         return "執行失敗", "-", "\n".join(log)
 # ==========================================
+# 🇯🇵 日本 PMDA (支援多重商品名)
 # ==========================================
 def get_japan_originator(ingredient_query_ja, page):
     """Collect brand-name products for an ingredient from the PMDA search.

     A result row is kept when its product name contains neither ``「`` nor
     ``（`` and does not contain the ingredient name itself. The kept name is
     cut at the first dosage-form keyword or digit.

     Args:
         ingredient_query_ja: Japanese ingredient name to search for.
         page: Playwright page used to drive the search form.

     Returns:
         A 3-tuple of strings ``(brands, companies, log)``: comma-separated
         brand names, comma-separated company names (``"-"`` when none were
         found), and the newline-joined diagnostic log. Returns ``"查無資料"``
         as the first element when the result table never appears, and
         ``"執行失敗"`` when any other step raises.
     """
     log = []
     brands = set()
     companies = set()
     log.append(f"使用日文名: {ingredient_query_ja} 進行搜尋")
     try:
         log.append("1. 前往 PMDA...")
         page.goto("https://www.pmda.go.jp/PmdaSearch/iyakuSearch/", timeout=30000)
         log.append("2. 輸入並送出...")
         # .first avoids a strict-mode error if the two alternative selectors
         # resolve to more than one input element.
         page.locator('input[title*="一般名"], input[name="generalName"]').first.fill(ingredient_query_ja)
         page.keyboard.press("Enter")
         log.append("3. 等待表格 id=ResultList...")
         try:
             page.wait_for_selector('table#ResultList', timeout=15000)
         except Exception:
             # Exception rather than a bare except, so KeyboardInterrupt and
             # SystemExit still propagate.
             log.append("❌ 等待逾時，可能是完全查無此成分。")
             return "查無資料", "-", "\n".join(log)
         soup = BeautifulSoup(page.content(), 'html.parser')
         table = soup.find('table', id='ResultList')
         if table:
             rows = table.find_all('tr')
             log.append(f"4. 找到表格，分析 {len(rows)} 列資料...")
             for tr in rows:
                 tds = tr.find_all('td')
                 if len(tds) < 3:
                     continue
                 raw_brand = tds[1].get_text(strip=True)
                 # Skip names carrying bracket markers or the ingredient name.
                 if '「' in raw_brand or '（' in raw_brand or ingredient_query_ja in raw_brand:
                     continue
                 # Keep only the text before the first dosage-form keyword or digit.
                 clean_brand = re.split(r'(皮下注|錠|カプセル|顆粒|シロップ|OD|細粒|液|\d+)', raw_brand)[0].strip()
                 if not clean_brand:
                     continue
                 brands.add(clean_brand)
                 comp = tds[2].get_text(separator=" ", strip=True).replace('製造販売元／', '')
                 companies.add(comp)
         if brands:
             log.append("✅ 成功排除學名藥括號，找到原廠！")
             # Sorted so the displayed order is stable between runs (set
             # iteration order of strings is not).
             return ", ".join(sorted(brands)), ", ".join(sorted(companies)) or "-", "\n".join(log)
         log.append("❌ 找到的都是學名藥或格式不符。")
         return "查無原廠", "-", "\n".join(log)
     except Exception as e:
         # Boundary handler: any scraping failure is reported through the log.
         log.append(f"❌ 發生錯誤: {str(e)}")
         return "執行失敗", "-", "\n".join(log)
 # ==========================================
 # 🚀 主執行函數
 # ==========================================
+def run_diagnostic_search(ingredient_en, ingredient_ja_manual):
     """Run the UK, USA, Canada and Japan lookups in sequence on one shared page.

     Args:
         ingredient_en: English ingredient name; an empty value short-circuits
             with a single error row.
         ingredient_ja_manual: Japanese ingredient name supplied by the user;
             when empty, the English name is passed to ``translate_en_to_ja``.

     Returns:
         For empty ``ingredient_en``, a one-row list describing the error.
         NOTE(review): the statement that returns ``results`` is not visible
         in this diff hunk; presumably it follows ``browser.close()`` --
         confirm against the full file.
     """
     if not ingredient_en:
+        return [["錯誤", "請輸入英文成分名", "-", ""]]
+    # If the user did not enter a Japanese name manually, call the translation API.
+    if not ingredient_ja_manual:
+        ingredient_ja = translate_en_to_ja(ingredient_en)
+    else:
+        ingredient_ja = ingredient_ja_manual
     results = []
     with sync_playwright() as p:
+        # NOTE(review): this comment used to say Firefox is used for testing (because Chromium is sometimes blocked by anti-scraping measures), but the call below launches Chromium.
         browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
         context = browser.new_context()
         page = context.new_page()
+        # UK
+        uk_b, uk_c, uk_log = get_uk_originator(ingredient_en, page)
+        results.append(["🇬🇧 英國 (eMC)", uk_b, uk_c, uk_log])
+        # USA
+        us_b, us_c, us_log = get_usa_originator(ingredient_en, page)
+        results.append(["🇺🇸 美國 (FDA)", us_b, us_c, us_log])
+        # Canada
+        ca_b, ca_c, ca_log = get_canada_originator(ingredient_en, page)
+        results.append(["🇨🇦 加拿大 (DPD)", ca_b, ca_c, ca_log])
+        # Japan (uses the Japanese name resolved above)
+        ja_b, ja_c, ja_log = get_japan_originator(ingredient_ja, page)
+        results.append(["🇯🇵 日本 (PMDA)", ja_b, ja_c, ja_log])
         browser.close()
 # ==========================================
 # 🎨 UI 介面
 # ==========================================
+with gr.Blocks(title="四國原廠智能檢索 (診斷與多重版)") as demo:
     # Page heading.
+    gr.Markdown("## 🌐 跨國原廠商品名檢索器 (支援多重商品名與診斷紀錄)")
     # Input row: English ingredient name (labelled required) and Japanese
     # ingredient name (labelled optional).
     with gr.Row():
+        ing_input = gr.Textbox(label="🧪 英文成分名 (必填)", placeholder="例如: Semaglutide")
+        ja_input = gr.Textbox(label="🇯🇵 日文成分名 (選填，若空白則自動翻譯)", placeholder="例如: セマグルチド (若自動翻譯失敗請手動填入)")
+    search_btn = gr.Button("🚀 啟動診斷與查詢", variant="primary")
     # Read-only, four-column result table: country, brand name(s),
     # company name(s), diagnostic log.
     result_table = gr.Dataframe(
+        headers=["國家", "🌟 判定為原廠的商品名", "🏭 藥廠名稱", "🛠️ 系統執行診斷日誌"],
+        datatype=["str", "str", "str", "str"],
+        wrap=True, # let the long log text wrap automatically
         interactive=False
     )
     # Clicking the button calls run_diagnostic_search with both text boxes
     # and sends its output to the table.
+    search_btn.click(fn=run_diagnostic_search, inputs=[ing_input, ja_input], outputs=[result_table])
     # Start the Gradio app only when this file is executed as a script.
 if __name__ == "__main__":
     demo.launch()