"""Look up originator (brand-name) products for an active ingredient.

One scraper per national drug database (ten in total) returns a
``(brands, companies, log)`` tuple of strings; ``run_all_ten_countries``
runs them all and a small Gradio UI displays the combined table.
"""

import datetime
import os
import re
from urllib.parse import quote

import gradio as gr
import requests
import torch
from bs4 import BeautifulSoup
from curl_cffi import requests as curl_req
from playwright.sync_api import sync_playwright
from transformers import pipeline

# Install the Chromium build that Playwright drives before any scraper runs.
os.system("playwright install chromium")

# ==========================================
# Load the native ElanMT medical translation model
# ==========================================
print("⏳ 系統啟動中:正在載入 ElanMT 醫療翻譯模型...")
device = "cuda" if torch.cuda.is_available() else "cpu"
try:
    translator_en_ja = pipeline("translation", model="Mitsua/elan-mt-bt-en-ja", device=device)
    print(f"✅ ElanMT 模型載入完成!(執行環境: {device})")
except Exception as e:
    # Best effort: translate_lang() falls back to Wikipedia / Google when this is None.
    print(f"⚠️ ElanMT 模型載入失敗,將退回備用機制。錯誤: {e}")
    translator_en_ja = None


# ==========================================
# Shared helpers: translation and the generic-drug filter
# ==========================================

# Substrings that mark a brand or company as a generic / parallel-import
# product. Matched case-insensitively anywhere in the brand or company name.
_GENERIC_KEYWORDS = (
    'sandoz', 'teva', 'apotex', 'ratiopharm', 'jamp', 'mint', 'pharmascience',
    'sanis', 'sivem', 'auro', 'glenmark', 'taro', 'marcan', 'nora', 'mantra',
    'reddy', 'mepha', 'axapharm', 'helvepharm', 'zentiva', 'spirig', 'aliud',
    'puren', 'stada', '- gé', 'biogaran', 'arrow', 'viatris', 'zydus',
    'kohlpharma', 'eurim', 'abacus', 'emra', 'cc pharma', 'orifarm',
    'bb farma', 'fd pharma', 'mpa pharma', '1 4 u', '2care4', 'axicorp',
    'nattermann', '1 0 1 carefarm', 'haemato', 'eql', 'krka', 'actavis',
    'accord', 'aristo', 'mylan', 'sun pharma', 'hexal', 'beximco',
    'bionorica', '1a pharma', 'tiefenbacher', 'cinfa', 'polpharma',
    'pharmaceutical innovation services', 'eurogenerics',
)

# "EG" is only two letters, so it must match as a whole word; as a plain
# substring it also hits originator brands such as "Allegra" or "Tegretol".
_GENERIC_WORD_RE = re.compile(r'\beg\b')

# Where the dosage-form / strength part of a product name begins.
# Japanese form words and digits match anywhere. Latin tokens must follow a
# non-letter character, so they never cut a brand in the middle of a word
# ("IMODIUM", "EpiPen") or at its very start ("Pentasa", "Soliqua").
_DOSAGE_SPLIT_RE = re.compile(
    r'(?:皮下注|錠|カプセル|顆粒|シロップ|細粒|液|\d+'
    r'|(?<=[^A-Za-z])(?:OD|Augentropfen|Schmelztabletten|Tabletten|kids'
    r'|Lingual|Sol|cp|inj|pen|prefilled|flex))',
    flags=re.IGNORECASE,
)


def _join_names(names):
    """Join a collection of names as ", "-separated text in sorted order.

    Sorting keeps the output identical between runs; iterating a set of
    strings directly gives an arbitrary order.
    """
    return ", ".join(sorted(names))


def _tsv_cell(value, newline=" "):
    """Return *value* as text that is safe inside one tab-separated cell."""
    text = str(value).replace("\r\n", "\n").replace("\r", "\n")
    return text.replace("\n", newline).replace("\t", " ")


def get_official_japanese_name(ingredient_en):
    """Return the Japanese Wikipedia title linked from the English article.

    Searches English Wikipedia for *ingredient_en*, takes the first hit and
    follows its ``ja`` language link. Any text after the first space or "("
    in the Japanese title is dropped.

    Returns:
        The Japanese title, or None when there is no hit, no Japanese link,
        or any request/parsing step fails (best effort, errors are swallowed).
    """
    try:
        search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={quote(ingredient_en)}&utf8=&format=json"
        res = requests.get(search_url, timeout=5).json()
        if not res['query']['search']:
            return None
        en_title = res['query']['search'][0]['title']
        lang_url = f"https://en.wikipedia.org/w/api.php?action=query&titles={quote(en_title)}&prop=langlinks&lllang=ja&format=json"
        lang_res = requests.get(lang_url, timeout=5).json()
        for page_info in lang_res['query']['pages'].values():
            if 'langlinks' in page_info:
                ja_title = page_info['langlinks'][0]['*']
                return ja_title.split(' ')[0].split('(')[0]
    except Exception:
        # Deliberate best effort: the caller has further fallbacks.
        pass
    return None


def translate_lang(text, target_lang):
    """Translate an English ingredient name into *target_lang*.

    For ``'ja'`` the lookup order is: built-in overrides, the ElanMT model
    (if it loaded), then the Japanese Wikipedia title. Every language falls
    back to the Google translate endpoint, and finally to *text* unchanged.
    """
    text_lower = text.lower().strip()
    if target_lang == 'ja':
        ja_overrides = {"bilastine": "ビラスチン", "semaglutide": "セマグルチド"}
        if text_lower in ja_overrides:
            return ja_overrides[text_lower]
        if translator_en_ja is not None:
            try:
                return translator_en_ja(text)[0]['translation_text']
            except Exception:
                # Model failure is not fatal; continue with the next source.
                pass
        official_ja = get_official_japanese_name(text)
        if official_ja:
            return official_ja
    try:
        url = f"https://translate.googleapis.com/translate_a/single?client=gtx&sl=en&tl={target_lang}&dt=t&q={quote(text)}"
        res = requests.get(url, timeout=10)
        if res.status_code == 200:
            return res.json()[0][0][0].strip()
    except Exception:
        pass
    return text


def is_generic(brand_name, company_name, ingredient):
    """Return True when a product looks like a generic rather than the originator.

    A product counts as generic when any of these holds:
      * the brand name contains the ingredient name,
      * the brand or company contains a known generic-maker keyword,
      * the brand contains "「" or "(".

    An empty ingredient is ignored instead of matching every brand.
    """
    b_lower = brand_name.lower()
    c_lower = company_name.lower()
    i_lower = ingredient.lower().strip()

    if i_lower and i_lower in b_lower:
        return True
    if any(gk in b_lower or gk in c_lower for gk in _GENERIC_KEYWORDS):
        return True
    if _GENERIC_WORD_RE.search(b_lower) or _GENERIC_WORD_RE.search(c_lower):
        return True
    return '「' in brand_name or '(' in brand_name


def clean_brand_name(raw_name):
    """Strip dosage form, strength and trademark symbols from a product name.

    Everything from the first dosage-form token or digit onwards is dropped.
    If that would leave nothing (the name starts with such a token), the
    symbol-stripped raw name is returned instead of an empty string.
    """
    def _tidy(name):
        return name.replace('®', '').replace('™', '').strip(' -_')

    cleaned = _tidy(_DOSAGE_SPLIT_RE.split(raw_name)[0])
    return cleaned or _tidy(raw_name)


# ==========================================
# Module A: Australia TGA
# ==========================================
def get_australia_originator(ingredient):
    """Find the earliest-dated non-generic ARTG record for *ingredient*.

    Fetches the TGA ARTG search page with curl_cffi (Chrome impersonation),
    keeps records whose title has text before the ingredient name, picks the
    one with the earliest ``<time datetime>`` and reads the sponsor from its
    detail page.

    Returns:
        ``(brand, company, log)``; brand is "查無原廠" when nothing
        qualifies and "執行失敗" when an exception occurred.
    """
    log = []
    try:
        log.append("1. 發送 GET 至澳洲 TGA 搜尋頁面 (啟用 curl_cffi 偽裝)...")
        # The ingredient is literal text, not a regex: escape it and compile once.
        ingredient_re = re.compile(re.escape(ingredient), re.IGNORECASE)
        with curl_req.Session(impersonate="chrome120") as session:
            # NOTE(review): verify=False disables TLS certificate checks. Kept
            # as in the original; confirm it is still required here.
            res = session.get(
                f"https://www.tga.gov.au/resources/artg?keywords={quote(ingredient)}",
                timeout=45,
                verify=False,
            )
            soup = BeautifulSoup(res.text, 'html.parser')
            articles = soup.find_all('article', class_='node--artg')
            log.append(f"2. 找到 {len(articles)} 筆 ARTG 紀錄。")

            cands = []
            for article in articles:
                title_tag, time_tag = article.find('h3'), article.find('time')
                if not (title_tag and time_tag):
                    continue
                full_t = title_tag.get_text(strip=True)
                parts = ingredient_re.split(full_t)
                # The brand is whatever precedes the ingredient in the title.
                if len(parts) > 1 and parts[0].strip():
                    brand = parts[0].strip()
                    if not is_generic(brand, "", ingredient):
                        a_tag = title_tag.find('a')
                        href = a_tag['href'] if a_tag else None
                        cands.append({"brand": brand, "date": time_tag.get('datetime'), "href": href})

            if cands:
                # Earliest date wins. Records without a datetime attribute
                # sort last instead of raising a None-vs-str TypeError.
                target = min(cands, key=lambda c: (c['date'] is None, c['date'] or ""))
                brand = target['brand']
                company = "TGA資料庫"
                if target['href']:
                    detail_url = f"https://www.tga.gov.au{target['href']}" if target['href'].startswith('/') else target['href']
                    res_detail = session.get(detail_url, timeout=45, verify=False)
                    detail_soup = BeautifulSoup(res_detail.text, 'html.parser')
                    sponsor_div = detail_soup.find('div', class_=re.compile(r'field--name-field-sponsor'))
                    if sponsor_div and sponsor_div.find('a'):
                        company = sponsor_div.find('a').get_text(strip=True)
                return brand, company, "\n".join(log)
        return "查無原廠", "-", "\n".join(log)
    except Exception as e:
        return "執行失敗", "-", "\n".join(log) + f"\n錯誤: {str(e)}"


# ==========================================
# Module B: Playwright scrapers (longer timeouts, reworked Germany flow)
# ==========================================
def get_usa_originator(ingredient, page):
    """Collect RLD/RS-flagged products for *ingredient* from the FDA Orange Book.

    Args:
        ingredient: English ingredient name.
        page: Playwright page used for navigation.

    Returns:
        ``(brands, companies, log)`` as comma-separated strings.
    """
    log, brands, companies = [], set(), set()
    try:
        log.append("1. 前往 FDA Orange Book...")
        url = f"https://www.accessdata.fda.gov/scripts/cder/ob/results_product.cfm?Generic_Name={quote(ingredient)}&rx_otc=All"
        # Generous load timeout for the US site.
        page.goto(url, timeout=60000, wait_until="domcontentloaded")
        try:
            page.wait_for_selector('table#example, .alert-warning', state="attached", timeout=15000)
        except Exception:
            # Neither marker appeared in time; parse whatever was rendered.
            pass
        page.wait_for_timeout(2000)

        soup = BeautifulSoup(page.content(), 'html.parser')
        table = soup.find('table', id='example')
        if table and table.find('tbody'):
            rows = table.find('tbody').find_all('tr')
            headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
            # Locate columns by header text; fall back to fixed positions.
            brand_idx = next((i for i, h in enumerate(headers) if 'proprietary name' in h), 2)
            rld_idx = next((i for i, h in enumerate(headers) if 'rld' in h), 8)
            mfg_idx = next((i for i, h in enumerate(headers) if 'applicant holder' in h), 10)
            for tr in rows:
                tds = tr.find_all('td')
                if len(tds) <= max(rld_idx, brand_idx):
                    continue
                rld_text = tds[rld_idx].get_text(strip=True).upper()
                if "RLD" in rld_text or "RS" in rld_text:
                    title = tds[brand_idx].get_text(strip=True)
                    comp = tds[mfg_idx].get_text(strip=True) if len(tds) > mfg_idx else "-"
                    if not is_generic(title, comp, ingredient):
                        brands.add(clean_brand_name(title))
                        if comp != "-":
                            companies.add(comp)
        if brands:
            return _join_names(brands), _join_names(companies), "\n".join(log)
        return "查無原廠", "-", "\n".join(log) + "\n❌ 查無資料"
    except Exception as e:
        return "執行失敗", "-", "\n".join(log) + f"\n錯誤: {str(e)}"


def get_belgium_originator(ingredient, page):
    """Collect non-generic product names for *ingredient* from CBIP (Belgium).

    Types the ingredient into the site's search box, opens the first
    autocomplete suggestion and reads the second cell of every table row on
    the resulting page.

    Returns:
        ``(brands, "CBIP 資料庫", log)`` on success.
    """
    log, brands = [], set()
    try:
        log.append("1. 前往 CBIP (比利時) 並準備定位 Select2 搜尋框...")
        page.goto("https://www.cbip.be/fr/", timeout=60000, wait_until="domcontentloaded")
        page.wait_for_timeout(2000)
        try:
            # Dismiss the cookie banner when it is shown.
            page.locator('text="Tout accepter"').click(timeout=3000)
        except Exception:
            pass

        # Target the Select2-specific id / class selectors.
        log.append("2. 定位隱藏的 Select2 輸入框...")
        search_input = page.locator('#s2id_autogen1, input.select2-focusser, input[type="search"]').first
        search_input.wait_for(state="attached", timeout=15000)
        # Select2 usually needs a click before the real input is expanded.
        try:
            page.locator('#s2id_search-dummy-input').click(timeout=5000)
        except Exception:
            pass
        search_input.fill(ingredient)

        log.append("3. 等待下拉選單 (Autocomplete) 出現...")
        dropdown = page.locator('.select2-results, .tt-menu, .autocomplete-suggestions')
        dropdown.first.wait_for(state='visible', timeout=15000)
        page.wait_for_timeout(1500)

        log.append("4. 點擊下拉選單中的目標連結...")
        links = page.locator('.select2-results__option, .tt-menu a, .autocomplete-suggestions a')
        if links.count() == 0:
            return "查無原廠", "-", "\n".join(log) + "\n❌ 下拉選單未出現可點擊選項"
        links.first.click(force=True)
        page.wait_for_load_state('domcontentloaded', timeout=30000)
        page.wait_for_timeout(2000)

        log.append("5. 解析藥品表格內商品名...")
        soup = BeautifulSoup(page.content(), 'html.parser')
        for tr in soup.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) < 2:
                continue
            raw_name = tds[1].get_text(separator=" ", strip=True)
            # Drop the French dosage-form abbreviation and everything after it.
            name = re.sub(r'(compr\.|gél\.|flac\.|amp\.|sol\.).*', '', raw_name, flags=re.IGNORECASE).strip()
            if len(name) > 3 and not is_generic(name, "", ingredient):
                brands.add(clean_brand_name(name))
        if brands:
            return _join_names(brands), "CBIP 資料庫", "\n".join(log)
        return "查無原廠", "-", "\n".join(log) + "\n❌ 查無資料或全為學名藥"
    except Exception as e:
        return "執行失敗", "-", "\n".join(log) + f"\n錯誤: {str(e)}"


def get_france_originator(ingredient, page):
    """Collect non-generic products for *ingredient* from vidal.fr.

    Each non-generic search hit is opened to read the laboratory name from
    ``div.nomlab``; a missing laboratory is tolerated.

    Returns:
        ``(brands, companies, log)`` as comma-separated strings.
    """
    log, brands, companies = [], set(), set()
    try:
        page.goto(f"https://www.vidal.fr/recherche.html?query={quote(ingredient)}", timeout=45000, wait_until="domcontentloaded")
        page.wait_for_selector('.results, .searchbar', timeout=15000)
        # The result list is parsed once up front, so navigating away below is safe.
        soup = BeautifulSoup(page.content(), 'html.parser')
        for div in soup.find_all('div', class_=re.compile(r'result drug')):
            info_div = div.find('div', class_='infos')
            if not (info_div and info_div.find('a')):
                continue
            a_tag = info_div.find('a')
            title = a_tag.get_text(strip=True)
            href = a_tag.get('href', '')
            if is_generic(title, "", ingredient):
                continue
            if href.startswith('http'):
                detail_url = href
            elif href.startswith('/'):
                detail_url = f"https://www.vidal.fr{href}"
            else:
                detail_url = f"https://www.vidal.fr/{href}"
            page.goto(detail_url, timeout=30000, wait_until="domcontentloaded")
            comp = "-"
            try:
                page.wait_for_selector('div.nomlab', timeout=10000)
                comp = BeautifulSoup(page.content(), 'html.parser').find('div', class_='nomlab').get_text(strip=True)
            except Exception:
                # Laboratory name is optional; keep the brand regardless.
                pass
            brands.add(clean_brand_name(title))
            if comp != "-":
                companies.add(comp)
        if brands:
            return _join_names(brands), _join_names(companies), "\n".join(log)
        return "查無原廠", "-", "\n".join(log)
    except Exception as e:
        return "執行失敗", "-", "\n".join(log) + f"\n錯誤: {str(e)}"


def get_uk_originator(ingredient, page):
    """Collect non-generic products for *ingredient* from the UK eMC search.

    Returns:
        ``(brands, companies, log)`` as comma-separated strings.
    """
    log, brands, companies = [], set(), set()
    try:
        # URL-encode the ingredient, as the other scrapers do.
        page.goto(f"https://www.medicines.org.uk/emc/search?q={quote(ingredient)}", timeout=45000)
        page.wait_for_selector('.search-results-product-info-title-link', timeout=15000)
        soup = BeautifulSoup(page.content(), 'html.parser')
        for link in soup.find_all('a', class_='search-results-product-info-title-link'):
            title = link.get_text(strip=True)
            if title.lower().startswith(ingredient.lower()) or is_generic(title, "", ingredient):
                continue
            brands.add(clean_brand_name(title))
            p_div = link.find_parent(class_='search-results-product-info')
            company_tag = p_div.find(class_='search-results-product-info-company') if p_div else None
            if company_tag:
                companies.add(company_tag.get_text(strip=True))
        if brands:
            return _join_names(brands), _join_names(companies), "\n".join(log)
        return "查無原廠", "-", "\n".join(log)
    except Exception as e:
        return "執行失敗", "-", "\n".join(log) + f"\n錯誤: {str(e)}"


def get_canada_originator(ingredient, page):
    """Find the originator for *ingredient* in Health Canada's DPD.

    The company owning the non-generic product with the lowest DIN is taken
    as the originator; all of that company's products are returned.

    Returns:
        ``(brands, company, log)``; brands is "查無資料" when no result
        table is present and "查無原廠" when every row is filtered out.
    """
    log = []
    try:
        page.goto("https://health-products.canada.ca/dpd-bdpp/index-eng.jsp", timeout=45000, wait_until="domcontentloaded")
        page.locator('input[id="activeIngredient"]').fill(ingredient)
        page.keyboard.press("Enter")
        page.wait_for_selector('table#results, .alert-info, .alert-warning', timeout=15000)
        soup = BeautifulSoup(page.content(), 'html.parser')
        table = soup.find('table', id='results')
        if not table or not table.find('tbody'):
            return "查無資料", "-", "\n".join(log)

        all_cands = []
        for tr in table.find('tbody').find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) < 4:
                continue
            comp, prod = tds[2].get_text(strip=True), tds[3].get_text(strip=True)
            if is_generic(prod, comp, ingredient):
                continue
            m = re.search(r'\d+', tds[1].get_text(strip=True))
            if m:
                all_cands.append({"company": comp, "product": prod, "din": int(m.group())})
        if not all_cands:
            return "查無原廠", "-", "\n".join(log)

        orig_comp = min(all_cands, key=lambda c: c['din'])['company']
        # De-duplicate AFTER cleaning so several strengths of one brand
        # collapse into a single entry.
        brands = {clean_brand_name(c['product']) for c in all_cands if c['company'] == orig_comp}
        return _join_names(brands), orig_comp, "\n".join(log)
    except Exception as e:
        return "執行失敗", "-", "\n".join(log) + f"\n錯誤: {str(e)}"


def get_japan_originator(ing_ja, page):
    """Collect non-generic products for *ing_ja* from the PMDA search.

    The search opens its results in a popup; up to five result pages are
    read from it. The popup is closed even when parsing fails.

    Returns:
        ``(brands, companies, log)`` as comma-separated strings.
    """
    log, brands, companies = [], set(), set()
    try:
        page.goto("https://www.pmda.go.jp/PmdaSearch/iyakuSearch/", timeout=45000, wait_until="domcontentloaded")
        search_input = page.locator('input#txtName')
        search_input.wait_for(state="attached", timeout=15000)
        search_input.fill(ing_ja, force=True)
        with page.expect_popup() as popup_info:
            search_input.press("Enter")
        popup = popup_info.value

        page_limit = 5
        try:
            for _ in range(page_limit):
                try:
                    popup.wait_for_selector('table#ResultList, .errormsg, .non-result', timeout=15000)
                except Exception:
                    break
                table = BeautifulSoup(popup.content(), 'html.parser').find('table', id='ResultList')
                if not table:
                    break
                for tr in table.find_all('tr'):
                    tds = tr.find_all('td')
                    if len(tds) < 3:
                        continue
                    title = tds[1].get_text(strip=True)
                    if is_generic(title, "", ing_ja):
                        continue
                    brands.add(clean_brand_name(title))
                    raw_comp = tds[2].get_text(separator=" ", strip=True)
                    # Remove the role prefixes, then keep the first company listed.
                    raw_comp = re.sub(r'(製造販売元/|販売元/|提携先/)', ' ', raw_comp).strip()
                    clean_comp = re.split(r'\s+', raw_comp)[0]
                    if clean_comp:
                        companies.add(clean_comp)
                next_link = popup.locator('a[href*="changePg"]')
                if next_link.count() == 0:
                    break
                next_link.last.click(force=True)
                popup.wait_for_timeout(2000)
        finally:
            popup.close()

        if brands:
            return _join_names(brands), _join_names(companies), "\n".join(log)
        return "查無原廠", "-", "\n".join(log)
    except Exception as e:
        return "執行失敗", "-", "\n".join(log) + f"\n錯誤: {str(e)}"


def get_switzerland_originator(ing_de, page):
    """Collect non-generic products for *ing_de* from swissmedicinfo-pro.ch.

    Returns:
        ``(brands, companies, log)`` as comma-separated strings.
    """
    log, brands, companies = [], set(), set()
    try:
        page.goto("https://swissmedicinfo-pro.ch/?Lang=EN", timeout=45000, wait_until="domcontentloaded")
        search_input = page.locator('input#MainContent_ucSearch1_txtSubstance')
        search_input.wait_for(state="attached", timeout=15000)
        search_input.fill(ing_de, force=True)
        search_input.press("Enter")
        try:
            page.wait_for_selector('table[id*="GVMonographies"], #MainContent_LabelNoResult', timeout=15000)
        except Exception:
            # Fall through and parse whatever is on the page.
            pass

        table = BeautifulSoup(page.content(), 'html.parser').find('table', id=re.compile(r'GVMonographies'))
        if table and table.find('tbody'):
            for tr in table.find('tbody').find_all('tr', class_=re.compile(r'clickable-row')):
                tds = tr.find_all('td')
                if len(tds) < 4:
                    continue
                title, comp = tds[0].get_text(strip=True), tds[3].get_text(strip=True)
                if not is_generic(title, comp, ing_de):
                    brands.add(clean_brand_name(title))
                    if comp != "-":
                        companies.add(comp)
        if brands:
            return _join_names(brands), _join_names(companies), "\n".join(log)
        return "查無原廠", "-", "\n".join(log)
    except Exception as e:
        return "執行失敗", "-", "\n".join(log) + f"\n錯誤: {str(e)}"


def get_sweden_originator(ingredient, page):
    """Collect non-generic products for *ingredient* from fass.se.

    Returns:
        ``(brands, companies, log)`` as comma-separated strings.
    """
    log, brands, companies = [], set(), set()
    try:
        page.goto(f"https://fass.se/search?query={quote(ingredient)}", timeout=45000, wait_until="domcontentloaded")
        try:
            page.wait_for_selector('details.app-toggle-details-icon, .no-results', state="attached", timeout=15000)
        except Exception:
            pass
        page.wait_for_timeout(2000)

        soup = BeautifulSoup(page.content(), 'html.parser')
        for item in soup.find_all('details', class_=re.compile(r'app-toggle-details-icon')):
            summary = item.find('summary')
            if not summary:
                continue
            title_span = summary.find('span', class_=re.compile(r'font-semibold'))
            title = title_span.get_text(strip=True) if title_span else ""
            comp = "-"
            ol = item.find('ol')
            if ol:
                comp_span = ol.find('span', class_='text-label-md')
                if comp_span:
                    comp = comp_span.get_text(strip=True)
            if title and not is_generic(title, comp, ingredient):
                brands.add(clean_brand_name(title))
                if comp != "-":
                    companies.add(comp)
        if brands:
            return _join_names(brands), _join_names(companies), "\n".join(log)
        return "查無原廠", "-", "\n".join(log)
    except Exception as e:
        return "執行失敗", "-", "\n".join(log) + f"\n錯誤: {str(e)}"


def get_germany_originator(ing_de, page):
    """Collect non-generic products for *ing_de* from PharmNet.Bund (BfArM).

    Accepts the terms-of-use page when redirected to it, submits the search
    and reads up to five result pages.

    Returns:
        ``(brands, companies, log)`` as comma-separated strings.
    """
    log, brands, companies = [], set(), set()
    try:
        log.append("1. 前往 PharmNet.Bund...")
        page.goto("https://portal.bfarm.de/amguifree/am/search.xhtml", timeout=60000)
        page.wait_for_timeout(3000)  # allow time for a possible redirect

        # Accept the terms of use through the link with the known id.
        if "termsofuse" in page.url:
            log.append(" -> 偵測到條款頁面,點擊超連結同意並等待跳轉...")
            try:
                accept_link = page.locator('a#docOutputPromptForm\\:acceptLink, a.button.next')
                if accept_link.count() > 0:
                    accept_link.first.click()
                else:
                    # Fallback: any link carrying accept=true.
                    page.locator('a[href*="accept=true"]').first.click()
                # Wait for the redirect back to the search page.
                page.wait_for_url("**/amguifree/am/search.xhtml**", timeout=15000)
                page.wait_for_timeout(2000)
            except Exception as e:
                log.append(f" ⚠️ 點擊同意條款失敗: {e}")

        log.append("2. 定位搜尋框...")
        search_input = page.locator('input[id="searchForm:searchInputsComponent:searchRows:1:firstSearchTerm"]')
        search_input.wait_for(state="attached", timeout=15000)
        search_input.fill(ing_de)
        page.locator('input[name="searchForm:searchInputsComponent:suchestarten"]').click()

        page_limit = 5
        for _ in range(page_limit):
            try:
                page.wait_for_selector('table[id="searchResultsForm:searchResultsComponent:titles"]', timeout=15000)
            except Exception:
                break
            soup = BeautifulSoup(page.content(), 'html.parser')
            table = soup.find('table', id='searchResultsForm:searchResultsComponent:titles')
            if not table or not table.find('tbody'):
                break
            for tr in table.find('tbody').find_all('tr'):
                tds = tr.find_all('td')
                if len(tds) < 4:
                    continue
                title, comp = tds[1].get_text(strip=True), tds[3].get_text(strip=True)
                if not is_generic(title, comp, ing_de):
                    brands.add(clean_brand_name(title))
                    if comp != "-":
                        companies.add(comp)

            # The pager button after the first one carrying the "disabled"
            # class is clicked as the next page.
            has_next = False
            paging_div = soup.find('div', class_='browse')
            if paging_div:
                buttons = paging_div.find_all('input', class_='submit')
                for i, btn in enumerate(buttons):
                    if 'disabled' in btn.get('class', []) and i + 1 < len(buttons):
                        next_btn_id = buttons[i + 1].get('id')
                        if next_btn_id:
                            page.locator(f'input[id="{next_btn_id}"]').click()
                            page.wait_for_timeout(2500)
                            has_next = True
                        # Without an id there is nothing to click: stop paging
                        # and keep the results gathered so far.
                        break
            if not has_next:
                break

        if brands:
            return _join_names(brands), _join_names(companies), "\n".join(log)
        return "查無原廠", "-", "\n".join(log)
    except Exception as e:
        return "執行失敗", "-", "\n".join(log) + f"\n錯誤: {str(e)}"


# ==========================================
# Main orchestration
# ==========================================
def run_all_ten_countries(ing_en, ing_ja_manual, ing_de_manual):
    """Run every country scraper and build the table plus copy-paste text.

    Args:
        ing_en: English ingredient name (required).
        ing_ja_manual: Optional Japanese name; auto-translated when blank.
        ing_de_manual: Optional German name; auto-translated when blank.

    Returns:
        ``(rows, tsv_text)`` where each row is
        ``[country, brands, companies, log]`` and *tsv_text* is the same data
        as tab-separated lines with a header.
    """
    ing_en = (ing_en or "").strip()
    if not ing_en:
        return [["錯誤", "請輸入英文成分名", "-", ""]], ""
    ing_ja = (ing_ja_manual or "").strip() or translate_lang(ing_en, 'ja')
    ing_de = (ing_de_manual or "").strip() or translate_lang(ing_en, 'de')

    results = []
    au_b, au_c, au_log = get_australia_originator(ing_en)
    results.append(["🇦🇺 澳洲 (TGA)", au_b, au_c, au_log])

    playwright_jobs = [
        (get_usa_originator, ing_en, "🇺🇸 美國 (FDA)"),
        (get_belgium_originator, ing_en, "🇧🇪 比利時 (CBIP)"),
        (get_france_originator, ing_en, "🇫🇷 法國 (Vidal)"),
        (get_uk_originator, ing_en, "🇬🇧 英國 (eMC)"),
        (get_canada_originator, ing_en, "🇨🇦 加拿大 (DPD)"),
        (get_japan_originator, ing_ja, "🇯🇵 日本 (PMDA)"),
        (get_switzerland_originator, ing_de, "🇨🇭 瑞士 (Swissmedicinfo)"),
        (get_sweden_originator, ing_en, "🇸🇪 瑞典 (FASS)"),
        (get_germany_originator, ing_de, "🇩🇪 德國 (PharmNet.Bund)"),
    ]

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
        try:
            context = browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0")
            for func, arg, name in playwright_jobs:
                # A fresh page per country keeps the scrapers isolated.
                page = context.new_page()
                try:
                    b, c, log = func(arg, page)
                finally:
                    page.close()
                results.append([name, b, c, log])
        finally:
            browser.close()

    # The log is part of the copy text; line breaks become " | " and tabs
    # become spaces so a paste into Excel keeps one record per row.
    lines = ["國家\t🌟 判定為原廠的商品名\t🏭 藥廠名稱\t🛠️ 詳細日誌\n"]
    for r in results:
        lines.append(f"{_tsv_cell(r[0])}\t{_tsv_cell(r[1])}\t{_tsv_cell(r[2])}\t{_tsv_cell(r[3], ' | ')}\n")
    copy_text = "".join(lines)
    return results, copy_text


# ==========================================
# UI
# ==========================================
with gr.Blocks(title="十國原廠商品名智能檢索器") as demo:
    gr.Markdown("## 🌐 跨國原廠商品名檢索器 (搭載原生 ElanMT 醫療翻譯)")
    with gr.Row():
        ing_en = gr.Textbox(label="🧪 英文成分名 (必填)", placeholder="例如: bilastine")
    with gr.Row():
        with gr.Accordion("⚙️ 手動覆寫翻譯 (進階)", open=False):
            ing_ja = gr.Textbox(label="🇯🇵 日文成分名", placeholder="若空白則自動啟動 ElanMT 翻譯")
            ing_de = gr.Textbox(label="🇩🇪 德文成分名", placeholder="若空白則自動翻譯")
    search_btn = gr.Button("🚀 啟動十國查詢", variant="primary")
    copy_output = gr.Textbox(
        label="📋 一鍵複製用文字 (包含日誌,點擊右上角圖示,直接貼上 Excel)",
        show_copy_button=True,
        interactive=False,
        lines=11,
    )
    result_table = gr.Dataframe(
        headers=["國家", "🌟 判定為原廠的商品名", "🏭 藥廠名稱", "🛠️ 系統狀態與詳細日誌"],
        datatype=["str", "str", "str", "str"],
        wrap=True,
        interactive=False,
    )
    search_btn.click(fn=run_all_ten_countries, inputs=[ing_en, ing_ja, ing_de], outputs=[result_table, copy_output])

if __name__ == "__main__":
    demo.launch()