Spaces:

deneve07
/

OriginatorFinder

Sleeping

App Files Files Community

deneve07 committed on May 5

Commit

1152cee

verified ·

1 Parent(s): b0bbf11

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -82

app.py CHANGED Viewed

@@ -29,11 +29,11 @@ def is_generic(brand_name, company_name, ingredient):
     generic_keywords = ['sandoz', 'teva', 'apotex', 'ratiopharm', 'jamp', 'mint', 'pharmascience', 'sanis', 'sivem',
                         'auro', 'glenmark', 'taro', 'marcan', 'nora', 'mantra', 'reddy', 'mepha', 'axapharm',
-                        'helvepharm', 'zentiva', 'spirig', 'aliud', 'puren', 'stada', 'eg ', '- gé']
     if b_lower.startswith(i_lower) or i_lower in b_lower: return True
     if any(gk in b_lower or gk in c_lower for gk in generic_keywords): return True
-    if '「' in brand_name or '（' in brand_name: return True # 日本學名藥特徵
     return False
 def clean_brand_name(raw_name):
@@ -41,7 +41,7 @@ def clean_brand_name(raw_name):
     return re.split(r'(皮下注|錠|カプセル|顆粒|シロップ|OD|細粒|液|\d+)', raw_name)[0].replace('®', '').strip()
 # ==========================================
-# 🚀 模組 A：使用 curl_cffi 抓取 (美、比、法)
 # ==========================================
 def get_usa_originator(ingredient):
     log, brands, companies = [], set(), set()
@@ -53,7 +53,9 @@ def get_usa_originator(ingredient):
         table = soup.find('table', id='example')
         if table:
             headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
-            brand_idx, rld_idx, mfg_idx = 2, 8, 10
             for tr in (table.find('tbody') or table).find_all('tr'):
                 tds = tr.find_all('td')
                 if len(tds) > max(rld_idx, brand_idx) and "RLD" in tds[rld_idx].get_text(strip=True).upper():
@@ -78,9 +80,6 @@ def get_belgium_originator(ingredient):
         return "查無原廠", "-", "❌ 皆為學名藥"
     except Exception as e: return "執行失敗", "-", str(e)
-# ==========================================
-# 🇫🇷 法國 (精準解析 result drug 結構)
-# ==========================================
 def get_france_originator(ingredient):
     brands = set()
     try:
@@ -88,13 +87,11 @@ def get_france_originator(ingredient):
         res = session.get(f"https://base-donnees-publique.medicaments.gouv.fr/medicament/recherche/resultat?contains={quote(ingredient)}", timeout=30, verify=False)
         soup = BeautifulSoup(res.text, 'html.parser')
-        # 🟢 修正：針對您提供的 HTML，尋找 div class="result drug"
         for div in soup.find_all('div', class_='result drug'):
             info_div = div.find('div', class_='infos')
             if info_div and info_div.find('a'):
                 title = info_div.find('a').get_text(strip=True)
-                # 排除含有 Gé 標籤或以成分名開頭的學名藥
                 if not is_generic(title, "", ingredient) and '- gé' not in title.lower():
                     brands.add(clean_brand_name(title))
@@ -102,8 +99,47 @@ def get_france_originator(ingredient):
         return "查無原廠", "-", "❌ 查無資料或皆為 Gé"
     except Exception as e: return "執行失敗", "-", str(e)
 # ==========================================
-# 🚀 模組 B：使用 Playwright 抓取 (英、加、日、澳、瑞、德、瑞典)
 # ==========================================
 def get_uk_originator(ingredient, page):
     brands, companies = set(), set()
@@ -153,16 +189,13 @@ def get_canada_originator(ingredient, page):
 def get_japan_originator(ing_ja, page):
     brands, companies = set(), set()
     try:
         page.goto("https://www.pmda.go.jp/PmdaSearch/iyakuSearch/", timeout=30000, wait_until="domcontentloaded")
-        try:
-            btn = page.locator('text=同意する, input[value="同意する"]').first
-            if btn.is_visible(timeout=3000): btn.click(); page.wait_for_load_state('networkidle')
-        except: pass
-        page.locator('input#txtName, input[name="nameWord"]').first.fill(ing_ja)
-        page.locator('input[name="btnA"], input[type="image"][src*="SearchBtn"]').first.click()
         page.wait_for_selector('table#ResultList', timeout=15000)
         for tr in BeautifulSoup(page.content(), 'html.parser').find('table', id='ResultList').find_all('tr'):
             tds = tr.find_all('td')
             if len(tds) >= 3:
@@ -174,42 +207,13 @@ def get_japan_originator(ing_ja, page):
         return "查無原廠", "-", "❌ 皆為學名藥"
     except Exception as e: return "執行失敗", "-", str(e)
-# ==========================================
-# 🇦🇺 澳洲 (改用 curl_cffi 突破 ERR_HTTP2_PROTOCOL_ERROR)
-# ==========================================
-def get_australia_originator(ingredient, page=None):
-    brands = set()
-    try:
-        # 🟢 修正：放棄 Playwright，改用 curl_cffi
-        session = curl_req.Session(impersonate="chrome120")
-        res = session.get(f"https://www.tga.gov.au/resources/artg?keywords={ingredient}", timeout=30, verify=False)
-        soup = BeautifulSoup(res.text, 'html.parser')
-        cands = []
-        for article in soup.find_all('article', class_='node--artg'):
-            title_tag = article.find('h3')
-            time_tag = article.find('time')
-            if title_tag and time_tag:
-                full_t = title_tag.get_text(strip=True)
-                # 切割成分名取前半段
-                parts = re.split(ingredient, full_t, flags=re.IGNORECASE)
-                if len(parts) > 1 and parts[0].strip():
-                    brand = parts[0].strip()
-                    # 過濾常見學名藥廠
-                    if not is_generic(brand, "", ingredient):
-                        cands.append({"brand": brand, "date": time_tag.get('datetime')})
-        if cands:
-            cands = sorted(cands, key=lambda x: x['date'])
-            return cands[0]['brand'], "TGA資料庫", f"✅ 最早註冊: {cands[0]['date'][:10]}"
-        return "查無原廠", "-", "❌ 查無資料"
-    except Exception as e: return "執行失敗", "-", str(e)
 def get_switzerland_originator(ing_de, page):
     brands, companies = set(), set()
     try:
         page.goto(f"https://compendium.ch/search?q={ing_de}", timeout=30000)
-        page.wait_for_selector('.medicament-card', timeout=15000)
         for card in BeautifulSoup(page.content(), 'html.parser').find_all('div', class_=re.compile('medicament-card')):
             h3, strong = card.find('h3'), card.find('strong', class_='info')
             if h3 and strong:
@@ -240,33 +244,7 @@ def get_germany_originator(ing_de, page):
     except Exception as e: return "執行失敗", "-", str(e)
 # ==========================================
-# 🇸🇪 瑞典 (改用 curl_cffi 抓取 FASS)
-# ==========================================
-def get_sweden_originator(ingredient, page=None):
-    brands, companies = set(), set()
-    try:
-        session = curl_req.Session(impersonate="chrome120")
-        # 直接對 FASS 發送搜尋請求
-        res = session.get(f"https://www.fass.se/LIF/search?query={quote(ingredient)}", timeout=30, verify=False)
-        soup = BeautifulSoup(res.text, 'html.parser')
-        # FASS 的搜尋結果通常在 .search-result-item 中
-        for item in soup.find_all('li', class_=re.compile(r'search-result-item')):
-            title_tag = item.find('a', class_='product-name')
-            comp_tag = item.find('span', class_='company-name')
-            if title_tag and comp_tag:
-                title = title_tag.get_text(strip=True)
-                comp = comp_tag.get_text(strip=True)
-                if not is_generic(title, comp, ingredient):
-                    brands.add(clean_brand_name(title))
-                    companies.add(comp)
-        if brands: return ", ".join(brands), ", ".join(companies), "✅ 成功 (FASS)"
-        return "查無原廠", "-", "❌ 查無資料"
-    except Exception as e: return "執行失敗", "-", str(e)
-# ==========================================
-# 🚀 主執行中樞：併發與隔離
 # ==========================================
 def run_all_ten_countries(ing_en, ing_ja_manual, ing_de_manual):
     if not ing_en: return [["錯誤", "請輸入英文成分名", "-", ""]]
@@ -275,7 +253,7 @@ def run_all_ten_countries(ing_en, ing_ja_manual, ing_de_manual):
     ing_de = ing_de_manual if ing_de_manual else translate_lang(ing_en, 'de')
     results = []
-    # 1. API 模組 (不需瀏覽器，極速)
     usa_b, usa_c, usa_log = get_usa_originator(ing_en)
     results.append(["🇺🇸 美國 (FDA)", usa_b, usa_c, usa_log])
@@ -291,7 +269,7 @@ def run_all_ten_countries(ing_en, ing_ja_manual, ing_de_manual):
     se_b, se_c, se_log = get_sweden_originator(ing_en)
     results.append(["🇸🇪 瑞典 (FASS)", se_b, se_c, se_log])
-    # 2. 瀏覽器模組 (Playwright 分頁隔離)
     with sync_playwright() as p:
         browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
         context = browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0 Safari/537.36")
@@ -305,10 +283,8 @@ def run_all_ten_countries(ing_en, ing_ja_manual, ing_de_manual):
         run_pw(get_uk_originator, ing_en, "🇬🇧 英國 (eMC)")
         run_pw(get_canada_originator, ing_en, "🇨🇦 加拿大 (DPD)")
         run_pw(get_japan_originator, ing_ja, "🇯🇵 日本 (PMDA)")
-        run_pw(get_australia_originator, ing_en, "🇦🇺 澳洲 (TGA)")
         run_pw(get_switzerland_originator, ing_de, "🇨🇭 瑞士 (Compendium)")
         run_pw(get_germany_originator, ing_de, "🇩🇪 德國 (Gelbe Liste)")
-        run_pw(get_sweden_originator, ing_en, "🇸🇪 瑞典 (TLV)")
         browser.close()
@@ -318,7 +294,7 @@ def run_all_ten_countries(ing_en, ing_ja_manual, ing_de_manual):
 # 🎨 UI 介面
 # ==========================================
 with gr.Blocks(title="十國原廠商品名智能檢索器") as demo:
-    gr.Markdown("## 🌐 跨國原廠商品名檢索器 (十國完整版)")
     with gr.Row():
         ing_en = gr.Textbox(label="🧪 英文成分名 (必填)", placeholder="例如: Semaglutide")

     generic_keywords = ['sandoz', 'teva', 'apotex', 'ratiopharm', 'jamp', 'mint', 'pharmascience', 'sanis', 'sivem',
                         'auro', 'glenmark', 'taro', 'marcan', 'nora', 'mantra', 'reddy', 'mepha', 'axapharm',
+                        'helvepharm', 'zentiva', 'spirig', 'aliud', 'puren', 'stada', 'eg ', '- gé', 'biogaran', 'arrow', 'viatris', 'zydus']
     if b_lower.startswith(i_lower) or i_lower in b_lower: return True
     if any(gk in b_lower or gk in c_lower for gk in generic_keywords): return True
+    if '「' in brand_name or '（' in brand_name: return True
     return False
 def clean_brand_name(raw_name):
     return re.split(r'(皮下注|錠|カプセル|顆粒|シロップ|OD|細粒|液|\d+)', raw_name)[0].replace('®', '').strip()
 # ==========================================
+# 🚀 模組 A：使用 curl_cffi 抓取 (美、比、法、澳、瑞典)
 # ==========================================
 def get_usa_originator(ingredient):
     log, brands, companies = [], set(), set()
         table = soup.find('table', id='example')
         if table:
             headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
+            brand_idx = next((i for i, h in enumerate(headers) if 'proprietary name' in h), 2)
+            rld_idx = next((i for i, h in enumerate(headers) if 'rld' in h), 8)
+            mfg_idx = next((i for i, h in enumerate(headers) if 'applicant holder' in h), 10)
             for tr in (table.find('tbody') or table).find_all('tr'):
                 tds = tr.find_all('td')
                 if len(tds) > max(rld_idx, brand_idx) and "RLD" in tds[rld_idx].get_text(strip=True).upper():
         return "查無原廠", "-", "❌ 皆為學名藥"
     except Exception as e: return "執行失敗", "-", str(e)
 def get_france_originator(ingredient):
     brands = set()
     try:
         res = session.get(f"https://base-donnees-publique.medicaments.gouv.fr/medicament/recherche/resultat?contains={quote(ingredient)}", timeout=30, verify=False)
         soup = BeautifulSoup(res.text, 'html.parser')
+        # 🟢 確保使用精確的法國 DOM 解析邏輯
         for div in soup.find_all('div', class_='result drug'):
             info_div = div.find('div', class_='infos')
             if info_div and info_div.find('a'):
                 title = info_div.find('a').get_text(strip=True)
                 if not is_generic(title, "", ingredient) and '- gé' not in title.lower():
                     brands.add(clean_brand_name(title))
         return "查無原廠", "-", "❌ 查無資料或皆為 Gé"
     except Exception as e: return "執行失敗", "-", str(e)
+def get_australia_originator(ingredient):
+    brands = set()
+    try:
+        session = curl_req.Session(impersonate="chrome120")
+        res = session.get(f"https://www.tga.gov.au/resources/artg?keywords={ingredient}", timeout=30, verify=False)
+        soup = BeautifulSoup(res.text, 'html.parser')
+        cands = []
+        for article in soup.find_all('article', class_='node--artg'):
+            title_tag, time_tag = article.find('h3'), article.find('time')
+            if title_tag and time_tag:
+                full_t = title_tag.get_text(strip=True)
+                parts = re.split(ingredient, full_t, flags=re.IGNORECASE)
+                if len(parts) > 1 and parts[0].strip():
+                    brand = parts[0].strip()
+                    if not is_generic(brand, "", ingredient):
+                        cands.append({"brand": brand, "date": time_tag.get('datetime')})
+        if cands:
+            cands = sorted(cands, key=lambda x: x['date'])
+            return cands[0]['brand'], "TGA資料庫", f"✅ 最早註冊: {cands[0]['date'][:10]}"
+        return "查無原廠", "-", "❌ 查無資料"
+    except Exception as e: return "執行失敗", "-", str(e)
+def get_sweden_originator(ingredient):
+    brands, companies = set(), set()
+    try:
+        session = curl_req.Session(impersonate="chrome120")
+        res = session.get(f"https://www.fass.se/LIF/search?query={quote(ingredient)}", timeout=30, verify=False)
+        soup = BeautifulSoup(res.text, 'html.parser')
+        for item in soup.find_all('li', class_=re.compile(r'search-result-item')):
+            title_tag, comp_tag = item.find('a', class_='product-name'), item.find('span', class_='company-name')
+            if title_tag and comp_tag:
+                title, comp = title_tag.get_text(strip=True), comp_tag.get_text(strip=True)
+                if not is_generic(title, comp, ingredient):
+                    brands.add(clean_brand_name(title))
+                    companies.add(comp)
+        if brands: return ", ".join(brands), ", ".join(companies), "✅ 成功 (FASS)"
+        return "查無原廠", "-", "❌ 查無資料"
+    except Exception as e: return "執行失敗", "-", str(e)
 # ==========================================
+# 🚀 模組 B：使用 Playwright 抓取 (英、加、日、德、瑞)
 # ==========================================
 def get_uk_originator(ingredient, page):
     brands, companies = set(), set()
 def get_japan_originator(ing_ja, page):
     brands, companies = set(), set()
     try:
+        # 🟢 修正：直接前往網頁，等待輸入框，輸入後按 Enter 提交
         page.goto("https://www.pmda.go.jp/PmdaSearch/iyakuSearch/", timeout=30000, wait_until="domcontentloaded")
+        page.wait_for_selector('input#txtName', timeout=15000)
+        page.fill('input#txtName', ing_ja)
+        page.press('input#txtName', 'Enter')
         page.wait_for_selector('table#ResultList', timeout=15000)
         for tr in BeautifulSoup(page.content(), 'html.parser').find('table', id='ResultList').find_all('tr'):
             tds = tr.find_all('td')
             if len(tds) >= 3:
         return "查無原廠", "-", "❌ 皆為學名藥"
     except Exception as e: return "執行失敗", "-", str(e)
 def get_switzerland_originator(ing_de, page):
     brands, companies = set(), set()
     try:
         page.goto(f"https://compendium.ch/search?q={ing_de}", timeout=30000)
+        # 🟢 修正：Vue.js 需要時間渲染，改為等待網路請求靜止
+        page.wait_for_load_state('networkidle', timeout=20000)
         for card in BeautifulSoup(page.content(), 'html.parser').find_all('div', class_=re.compile('medicament-card')):
             h3, strong = card.find('h3'), card.find('strong', class_='info')
             if h3 and strong:
     except Exception as e: return "執行失敗", "-", str(e)
 # ==========================================
+# 🚀 主執行中樞
 # ==========================================
 def run_all_ten_countries(ing_en, ing_ja_manual, ing_de_manual):
     if not ing_en: return [["錯誤", "請輸入英文成分名", "-", ""]]
     ing_de = ing_de_manual if ing_de_manual else translate_lang(ing_en, 'de')
     results = []
+    # 1. API 模組 (澳洲與瑞典已移至此處)
     usa_b, usa_c, usa_log = get_usa_originator(ing_en)
     results.append(["🇺🇸 美國 (FDA)", usa_b, usa_c, usa_log])
     se_b, se_c, se_log = get_sweden_originator(ing_en)
     results.append(["🇸🇪 瑞典 (FASS)", se_b, se_c, se_log])
+    # 2. 瀏覽器模組 (Playwright 嚴格隔離)
     with sync_playwright() as p:
         browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
         context = browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0 Safari/537.36")
         run_pw(get_uk_originator, ing_en, "🇬🇧 英國 (eMC)")
         run_pw(get_canada_originator, ing_en, "🇨🇦 加拿大 (DPD)")
         run_pw(get_japan_originator, ing_ja, "🇯🇵 日本 (PMDA)")
         run_pw(get_switzerland_originator, ing_de, "🇨🇭 瑞士 (Compendium)")
         run_pw(get_germany_originator, ing_de, "🇩🇪 德國 (Gelbe Liste)")
         browser.close()
 # 🎨 UI 介面
 # ==========================================
 with gr.Blocks(title="十國原廠商品名智能檢索器") as demo:
+    gr.Markdown("## 🌐 跨國原廠商品名檢索器 (十國無重複修復版)")
     with gr.Row():
         ing_en = gr.Textbox(label="🧪 英文成分名 (必填)", placeholder="例如: Semaglutide")