deneve07 committed on
Commit
1152cee
·
verified ·
1 Parent(s): b0bbf11

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -82
app.py CHANGED
@@ -29,11 +29,11 @@ def is_generic(brand_name, company_name, ingredient):
29
 
30
  generic_keywords = ['sandoz', 'teva', 'apotex', 'ratiopharm', 'jamp', 'mint', 'pharmascience', 'sanis', 'sivem',
31
  'auro', 'glenmark', 'taro', 'marcan', 'nora', 'mantra', 'reddy', 'mepha', 'axapharm',
32
- 'helvepharm', 'zentiva', 'spirig', 'aliud', 'puren', 'stada', 'eg ', '- gé']
33
 
34
  if b_lower.startswith(i_lower) or i_lower in b_lower: return True
35
  if any(gk in b_lower or gk in c_lower for gk in generic_keywords): return True
36
- if '「' in brand_name or '(' in brand_name: return True # 日本學名藥特徵
37
  return False
38
 
39
  def clean_brand_name(raw_name):
@@ -41,7 +41,7 @@ def clean_brand_name(raw_name):
41
  return re.split(r'(皮下注|錠|カプセル|顆粒|シロップ|OD|細粒|液|\d+)', raw_name)[0].replace('®', '').strip()
42
 
43
  # ==========================================
44
- # 🚀 模組 A:使用 curl_cffi 抓取 (美、比、法)
45
  # ==========================================
46
  def get_usa_originator(ingredient):
47
  log, brands, companies = [], set(), set()
@@ -53,7 +53,9 @@ def get_usa_originator(ingredient):
53
  table = soup.find('table', id='example')
54
  if table:
55
  headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
56
- brand_idx, rld_idx, mfg_idx = 2, 8, 10
 
 
57
  for tr in (table.find('tbody') or table).find_all('tr'):
58
  tds = tr.find_all('td')
59
  if len(tds) > max(rld_idx, brand_idx) and "RLD" in tds[rld_idx].get_text(strip=True).upper():
@@ -78,9 +80,6 @@ def get_belgium_originator(ingredient):
78
  return "查無原廠", "-", "❌ 皆為學名藥"
79
  except Exception as e: return "執行失敗", "-", str(e)
80
 
81
- # ==========================================
82
- # 🇫🇷 法國 (精準解析 result drug 結構)
83
- # ==========================================
84
  def get_france_originator(ingredient):
85
  brands = set()
86
  try:
@@ -88,13 +87,11 @@ def get_france_originator(ingredient):
88
  res = session.get(f"https://base-donnees-publique.medicaments.gouv.fr/medicament/recherche/resultat?contains={quote(ingredient)}", timeout=30, verify=False)
89
  soup = BeautifulSoup(res.text, 'html.parser')
90
 
91
- # 🟢 修正:針對您提供HTML,尋找 div class="result drug"
92
  for div in soup.find_all('div', class_='result drug'):
93
  info_div = div.find('div', class_='infos')
94
  if info_div and info_div.find('a'):
95
  title = info_div.find('a').get_text(strip=True)
96
-
97
- # 排除含有 Gé 標籤或以成分名開頭的學名藥
98
  if not is_generic(title, "", ingredient) and '- gé' not in title.lower():
99
  brands.add(clean_brand_name(title))
100
 
@@ -102,8 +99,47 @@ def get_france_originator(ingredient):
102
  return "查無原廠", "-", "❌ 查無資料或皆為 Gé"
103
  except Exception as e: return "執行失敗", "-", str(e)
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  # ==========================================
106
- # 🚀 模組 B:使用 Playwright 抓取 (英、加、日、澳、瑞、德、瑞)
107
  # ==========================================
108
  def get_uk_originator(ingredient, page):
109
  brands, companies = set(), set()
@@ -153,16 +189,13 @@ def get_canada_originator(ingredient, page):
153
  def get_japan_originator(ing_ja, page):
154
  brands, companies = set(), set()
155
  try:
 
156
  page.goto("https://www.pmda.go.jp/PmdaSearch/iyakuSearch/", timeout=30000, wait_until="domcontentloaded")
157
- try:
158
- btn = page.locator('text=同意する, input[value="同意する"]').first
159
- if btn.is_visible(timeout=3000): btn.click(); page.wait_for_load_state('networkidle')
160
- except: pass
161
 
162
- page.locator('input#txtName, input[name="nameWord"]').first.fill(ing_ja)
163
- page.locator('input[name="btnA"], input[type="image"][src*="SearchBtn"]').first.click()
164
  page.wait_for_selector('table#ResultList', timeout=15000)
165
-
166
  for tr in BeautifulSoup(page.content(), 'html.parser').find('table', id='ResultList').find_all('tr'):
167
  tds = tr.find_all('td')
168
  if len(tds) >= 3:
@@ -174,42 +207,13 @@ def get_japan_originator(ing_ja, page):
174
  return "查無原廠", "-", "❌ 皆為學名藥"
175
  except Exception as e: return "執行失敗", "-", str(e)
176
 
177
- # ==========================================
178
- # 🇦🇺 澳洲 (改用 curl_cffi 突破 ERR_HTTP2_PROTOCOL_ERROR)
179
- # ==========================================
180
- def get_australia_originator(ingredient, page=None):
181
- brands = set()
182
- try:
183
- # 🟢 修正:放棄 Playwright,改用 curl_cffi
184
- session = curl_req.Session(impersonate="chrome120")
185
- res = session.get(f"https://www.tga.gov.au/resources/artg?keywords={ingredient}", timeout=30, verify=False)
186
- soup = BeautifulSoup(res.text, 'html.parser')
187
-
188
- cands = []
189
- for article in soup.find_all('article', class_='node--artg'):
190
- title_tag = article.find('h3')
191
- time_tag = article.find('time')
192
- if title_tag and time_tag:
193
- full_t = title_tag.get_text(strip=True)
194
- # 切割成分名取前半段
195
- parts = re.split(ingredient, full_t, flags=re.IGNORECASE)
196
- if len(parts) > 1 and parts[0].strip():
197
- brand = parts[0].strip()
198
- # 過濾常見學名藥廠
199
- if not is_generic(brand, "", ingredient):
200
- cands.append({"brand": brand, "date": time_tag.get('datetime')})
201
-
202
- if cands:
203
- cands = sorted(cands, key=lambda x: x['date'])
204
- return cands[0]['brand'], "TGA資料庫", f"✅ 最早註冊: {cands[0]['date'][:10]}"
205
- return "查無原廠", "-", "❌ 查無資料"
206
- except Exception as e: return "執行失敗", "-", str(e)
207
-
208
  def get_switzerland_originator(ing_de, page):
209
  brands, companies = set(), set()
210
  try:
211
  page.goto(f"https://compendium.ch/search?q={ing_de}", timeout=30000)
212
- page.wait_for_selector('.medicament-card', timeout=15000)
 
 
213
  for card in BeautifulSoup(page.content(), 'html.parser').find_all('div', class_=re.compile('medicament-card')):
214
  h3, strong = card.find('h3'), card.find('strong', class_='info')
215
  if h3 and strong:
@@ -240,33 +244,7 @@ def get_germany_originator(ing_de, page):
240
  except Exception as e: return "執行失敗", "-", str(e)
241
 
242
  # ==========================================
243
- # 🇸🇪 瑞典 (改用 curl_cffi 抓取 FASS)
244
- # ==========================================
245
- def get_sweden_originator(ingredient, page=None):
246
- brands, companies = set(), set()
247
- try:
248
- session = curl_req.Session(impersonate="chrome120")
249
- # 直接對 FASS 發送搜尋請求
250
- res = session.get(f"https://www.fass.se/LIF/search?query={quote(ingredient)}", timeout=30, verify=False)
251
- soup = BeautifulSoup(res.text, 'html.parser')
252
-
253
- # FASS 的搜尋結果通常在 .search-result-item 中
254
- for item in soup.find_all('li', class_=re.compile(r'search-result-item')):
255
- title_tag = item.find('a', class_='product-name')
256
- comp_tag = item.find('span', class_='company-name')
257
- if title_tag and comp_tag:
258
- title = title_tag.get_text(strip=True)
259
- comp = comp_tag.get_text(strip=True)
260
- if not is_generic(title, comp, ingredient):
261
- brands.add(clean_brand_name(title))
262
- companies.add(comp)
263
-
264
- if brands: return ", ".join(brands), ", ".join(companies), "✅ 成功 (FASS)"
265
- return "查無原廠", "-", "❌ 查無資料"
266
- except Exception as e: return "執行失敗", "-", str(e)
267
-
268
- # ==========================================
269
- # 🚀 主執行中樞:併發與隔離
270
  # ==========================================
271
  def run_all_ten_countries(ing_en, ing_ja_manual, ing_de_manual):
272
  if not ing_en: return [["錯誤", "請輸入英文成分名", "-", ""]]
@@ -275,7 +253,7 @@ def run_all_ten_countries(ing_en, ing_ja_manual, ing_de_manual):
275
  ing_de = ing_de_manual if ing_de_manual else translate_lang(ing_en, 'de')
276
  results = []
277
 
278
- # 1. API 模組 (不需瀏覽器,極速)
279
  usa_b, usa_c, usa_log = get_usa_originator(ing_en)
280
  results.append(["🇺🇸 美國 (FDA)", usa_b, usa_c, usa_log])
281
 
@@ -291,7 +269,7 @@ def run_all_ten_countries(ing_en, ing_ja_manual, ing_de_manual):
291
  se_b, se_c, se_log = get_sweden_originator(ing_en)
292
  results.append(["🇸🇪 瑞典 (FASS)", se_b, se_c, se_log])
293
 
294
- # 2. 瀏覽器模組 (Playwright 分頁隔離)
295
  with sync_playwright() as p:
296
  browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
297
  context = browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0 Safari/537.36")
@@ -305,10 +283,8 @@ def run_all_ten_countries(ing_en, ing_ja_manual, ing_de_manual):
305
  run_pw(get_uk_originator, ing_en, "🇬🇧 英國 (eMC)")
306
  run_pw(get_canada_originator, ing_en, "🇨🇦 加拿大 (DPD)")
307
  run_pw(get_japan_originator, ing_ja, "🇯🇵 日本 (PMDA)")
308
- run_pw(get_australia_originator, ing_en, "🇦🇺 澳洲 (TGA)")
309
  run_pw(get_switzerland_originator, ing_de, "🇨🇭 瑞士 (Compendium)")
310
  run_pw(get_germany_originator, ing_de, "🇩🇪 德國 (Gelbe Liste)")
311
- run_pw(get_sweden_originator, ing_en, "🇸🇪 瑞典 (TLV)")
312
 
313
  browser.close()
314
 
@@ -318,7 +294,7 @@ def run_all_ten_countries(ing_en, ing_ja_manual, ing_de_manual):
318
  # 🎨 UI 介面
319
  # ==========================================
320
  with gr.Blocks(title="十國原廠商品名智能檢索器") as demo:
321
- gr.Markdown("## 🌐 跨國原廠商品名檢索器 (十國完整版)")
322
 
323
  with gr.Row():
324
  ing_en = gr.Textbox(label="🧪 英文成分名 (必填)", placeholder="例如: Semaglutide")
 
29
 
30
  generic_keywords = ['sandoz', 'teva', 'apotex', 'ratiopharm', 'jamp', 'mint', 'pharmascience', 'sanis', 'sivem',
31
  'auro', 'glenmark', 'taro', 'marcan', 'nora', 'mantra', 'reddy', 'mepha', 'axapharm',
32
+ 'helvepharm', 'zentiva', 'spirig', 'aliud', 'puren', 'stada', 'eg ', '- gé', 'biogaran', 'arrow', 'viatris', 'zydus']
33
 
34
  if b_lower.startswith(i_lower) or i_lower in b_lower: return True
35
  if any(gk in b_lower or gk in c_lower for gk in generic_keywords): return True
36
+ if '「' in brand_name or '(' in brand_name: return True
37
  return False
38
 
39
  def clean_brand_name(raw_name):
 
41
  return re.split(r'(皮下注|錠|カプセル|顆粒|シロップ|OD|細粒|液|\d+)', raw_name)[0].replace('®', '').strip()
42
 
43
  # ==========================================
44
+ # 🚀 模組 A:使用 curl_cffi 抓取 (美、比、法、澳、瑞典)
45
  # ==========================================
46
  def get_usa_originator(ingredient):
47
  log, brands, companies = [], set(), set()
 
53
  table = soup.find('table', id='example')
54
  if table:
55
  headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
56
+ brand_idx = next((i for i, h in enumerate(headers) if 'proprietary name' in h), 2)
57
+ rld_idx = next((i for i, h in enumerate(headers) if 'rld' in h), 8)
58
+ mfg_idx = next((i for i, h in enumerate(headers) if 'applicant holder' in h), 10)
59
  for tr in (table.find('tbody') or table).find_all('tr'):
60
  tds = tr.find_all('td')
61
  if len(tds) > max(rld_idx, brand_idx) and "RLD" in tds[rld_idx].get_text(strip=True).upper():
 
80
  return "查無原廠", "-", "❌ 皆為學名藥"
81
  except Exception as e: return "執行失敗", "-", str(e)
82
 
 
 
 
83
  def get_france_originator(ingredient):
84
  brands = set()
85
  try:
 
87
  res = session.get(f"https://base-donnees-publique.medicaments.gouv.fr/medicament/recherche/resultat?contains={quote(ingredient)}", timeout=30, verify=False)
88
  soup = BeautifulSoup(res.text, 'html.parser')
89
 
90
+ # 🟢 確保使用精確法國 DOM 解析邏輯
91
  for div in soup.find_all('div', class_='result drug'):
92
  info_div = div.find('div', class_='infos')
93
  if info_div and info_div.find('a'):
94
  title = info_div.find('a').get_text(strip=True)
 
 
95
  if not is_generic(title, "", ingredient) and '- gé' not in title.lower():
96
  brands.add(clean_brand_name(title))
97
 
 
99
  return "查無原廠", "-", "❌ 查無資料或皆為 Gé"
100
  except Exception as e: return "執行失敗", "-", str(e)
101
 
102
def get_australia_originator(ingredient):
    """Find the earliest-dated non-generic brand for *ingredient* on the TGA ARTG search page.

    The ARTG keyword search is fetched over plain HTTP (curl_cffi session
    impersonating Chrome) and each ``article.node--artg`` entry is inspected:
    the text before the ingredient name in the ``h3`` title is treated as the
    brand, and the ``datetime`` attribute of the entry's ``time`` tag is used
    to pick the earliest entry.

    Args:
        ingredient: Ingredient name used as the search keyword.

    Returns:
        A ``(brand, company, log)`` tuple of strings. On success ``company``
        is the fixed label "TGA資料庫" and ``log`` carries the first 10
        characters of the earliest ``datetime`` value. When nothing qualifies
        the tuple is ``("查無原廠", "-", "❌ 查無資料")``; on any exception it
        is ``("執行失敗", "-", <exception text>)``.
    """
    try:
        session = curl_req.Session(impersonate="chrome120")
        # URL-encode the keyword (spaces, '/', '+', ...) like the other
        # curl_cffi-based fetchers in this file do.
        # NOTE(review): verify=False disables TLS certificate checks; kept
        # as in the original, confirm this is intentional.
        res = session.get(
            f"https://www.tga.gov.au/resources/artg?keywords={quote(ingredient)}",
            timeout=30,
            verify=False,
        )
        soup = BeautifulSoup(res.text, 'html.parser')

        # Escape the ingredient so regex metacharacters in the name are
        # matched literally instead of raising or mis-splitting the title.
        splitter = re.compile(re.escape(ingredient), flags=re.IGNORECASE)

        cands = []
        for article in soup.find_all('article', class_='node--artg'):
            title_tag, time_tag = article.find('h3'), article.find('time')
            if not (title_tag and time_tag):
                continue
            date = time_tag.get('datetime')
            if not date:
                # Without a date the entry cannot be ranked; skipping it also
                # avoids comparing None with str when picking the earliest.
                continue
            parts = splitter.split(title_tag.get_text(strip=True))
            brand = parts[0].strip()
            # len(parts) > 1 means the ingredient actually occurs in the title.
            if len(parts) > 1 and brand and not is_generic(brand, "", ingredient):
                cands.append({"brand": brand, "date": date})

        if cands:
            # min() returns the first minimal element, matching the
            # stable-sort-then-take-first behaviour of the original.
            earliest = min(cands, key=lambda c: c['date'])
            return earliest['brand'], "TGA資料庫", f"✅ 最早註冊: {earliest['date'][:10]}"
        return "查無原廠", "-", "❌ 查無資料"
    except Exception as e:
        return "執行失敗", "-", str(e)
123
+
124
def get_sweden_originator(ingredient):
    """Collect non-generic brand and company names for *ingredient* from a FASS search.

    Args:
        ingredient: Ingredient name; URL-quoted into the FASS query string.

    Returns:
        A ``(brands, companies, log)`` tuple of strings. Brands and companies
        are comma-joined when at least one non-generic product is found;
        otherwise ``("查無原廠", "-", "❌ 查無資料")``. Any exception yields
        ``("執行失敗", "-", <exception text>)``.
    """
    brands, companies = set(), set()
    try:
        session = curl_req.Session(impersonate="chrome120")
        search_url = f"https://www.fass.se/LIF/search?query={quote(ingredient)}"
        response = session.get(search_url, timeout=30, verify=False)
        document = BeautifulSoup(response.text, 'html.parser')

        result_items = document.find_all('li', class_=re.compile(r'search-result-item'))
        for entry in result_items:
            name_link = entry.find('a', class_='product-name')
            company_span = entry.find('span', class_='company-name')
            # Both the product link and the company label must be present.
            if not (name_link and company_span):
                continue
            product = name_link.get_text(strip=True)
            company = company_span.get_text(strip=True)
            if is_generic(product, company, ingredient):
                continue
            brands.add(clean_brand_name(product))
            companies.add(company)

        if not brands:
            return "查無原廠", "-", "❌ 查無資料"
        return ", ".join(brands), ", ".join(companies), "✅ 成功 (FASS)"
    except Exception as e:
        return "執行失敗", "-", str(e)
140
+
141
  # ==========================================
142
+ # 🚀 模組 B:使用 Playwright 抓取 (英、加、日、德、瑞)
143
  # ==========================================
144
  def get_uk_originator(ingredient, page):
145
  brands, companies = set(), set()
 
189
  def get_japan_originator(ing_ja, page):
190
  brands, companies = set(), set()
191
  try:
192
+ # 🟢 修正:直接前往網頁,等待輸入框,輸入後按 Enter 提交
193
  page.goto("https://www.pmda.go.jp/PmdaSearch/iyakuSearch/", timeout=30000, wait_until="domcontentloaded")
194
+ page.wait_for_selector('input#txtName', timeout=15000)
195
+ page.fill('input#txtName', ing_ja)
196
+ page.press('input#txtName', 'Enter')
 
197
 
 
 
198
  page.wait_for_selector('table#ResultList', timeout=15000)
 
199
  for tr in BeautifulSoup(page.content(), 'html.parser').find('table', id='ResultList').find_all('tr'):
200
  tds = tr.find_all('td')
201
  if len(tds) >= 3:
 
207
  return "查無原廠", "-", "❌ 皆為學名藥"
208
  except Exception as e: return "執行失敗", "-", str(e)
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  def get_switzerland_originator(ing_de, page):
211
  brands, companies = set(), set()
212
  try:
213
  page.goto(f"https://compendium.ch/search?q={ing_de}", timeout=30000)
214
+ # 🟢 修正:Vue.js 需要時間渲染,改為等待網路請求靜止
215
+ page.wait_for_load_state('networkidle', timeout=20000)
216
+
217
  for card in BeautifulSoup(page.content(), 'html.parser').find_all('div', class_=re.compile('medicament-card')):
218
  h3, strong = card.find('h3'), card.find('strong', class_='info')
219
  if h3 and strong:
 
244
  except Exception as e: return "執行失敗", "-", str(e)
245
 
246
  # ==========================================
247
+ # 🚀 主執行中樞
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  # ==========================================
249
  def run_all_ten_countries(ing_en, ing_ja_manual, ing_de_manual):
250
  if not ing_en: return [["錯誤", "請輸入英文成分名", "-", ""]]
 
253
  ing_de = ing_de_manual if ing_de_manual else translate_lang(ing_en, 'de')
254
  results = []
255
 
256
+ # 1. API 模組 (澳洲與瑞典已移至此處)
257
  usa_b, usa_c, usa_log = get_usa_originator(ing_en)
258
  results.append(["🇺🇸 美國 (FDA)", usa_b, usa_c, usa_log])
259
 
 
269
  se_b, se_c, se_log = get_sweden_originator(ing_en)
270
  results.append(["🇸🇪 瑞典 (FASS)", se_b, se_c, se_log])
271
 
272
+ # 2. 瀏覽器模組 (Playwright 嚴格隔離)
273
  with sync_playwright() as p:
274
  browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
275
  context = browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0 Safari/537.36")
 
283
  run_pw(get_uk_originator, ing_en, "🇬🇧 英國 (eMC)")
284
  run_pw(get_canada_originator, ing_en, "🇨🇦 加拿大 (DPD)")
285
  run_pw(get_japan_originator, ing_ja, "🇯🇵 日本 (PMDA)")
 
286
  run_pw(get_switzerland_originator, ing_de, "🇨🇭 瑞士 (Compendium)")
287
  run_pw(get_germany_originator, ing_de, "🇩🇪 德國 (Gelbe Liste)")
 
288
 
289
  browser.close()
290
 
 
294
  # 🎨 UI 介面
295
  # ==========================================
296
  with gr.Blocks(title="十國原廠商品名智能檢索器") as demo:
297
+ gr.Markdown("## 🌐 跨國原廠商品名檢索器 (十國無重複修復版)")
298
 
299
  with gr.Row():
300
  ing_en = gr.Textbox(label="🧪 英文成分名 (必填)", placeholder="例如: Semaglutide")