"""Flask service that scrapes product metadata from a page's JSON-LD markup.

A headless Chrome instance (driven through Selenium) loads the requested URL,
and the rendered HTML is searched for a schema.org ``Product`` description.
"""

import json
import os
from urllib.parse import urlparse

import chromedriver_autoinstaller
from bs4 import BeautifulSoup
from flask import Flask, jsonify, render_template, request
from flask_cors import CORS
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

app = Flask(__name__)
CORS(app)  # Allow cross-origin requests.

# Names containing any of these substrings are considered too generic.
_BAD_NAME_KEYWORDS = (
    'product', 'item', 'sale', 'wts', 'wtb', 'for sale',
    'bunjang', 'global', 'sign', 'album', 'photocard',
)

# Only these URL schemes may be handed to the browser.
_ALLOWED_SCHEMES = ("http", "https")


def setup_driver():
    """Create a headless Chrome WebDriver.

    ChromeDriver is installed under ``/tmp`` via ``chromedriver_autoinstaller``.

    Returns:
        A ``selenium.webdriver.Chrome`` instance.

    Raises:
        RuntimeError: If installing ChromeDriver or starting Chrome fails.
    """
    try:
        # Install ChromeDriver into /tmp and remember where it ended up.
        chromedriver_path = chromedriver_autoinstaller.install(path="/tmp")

        # Configure Chrome options.
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")

        # Start the WebDriver with the explicitly located driver binary.
        service = Service(chromedriver_path)
        return webdriver.Chrome(service=service, options=options)
    except Exception as e:
        raise RuntimeError(f"WebDriver 啟動失敗: {e}") from e


def is_good_name(name):
    """Return whether a product name looks informative enough.

    A name is accepted when it is a string of 5 to 100 characters that does
    not contain any of the generic keywords in ``_BAD_NAME_KEYWORDS``.
    Keywords are matched as case-insensitive substrings, so e.g. ``"design"``
    is rejected because it contains ``"sign"``.

    Args:
        name: Candidate product name; anything that is not a non-empty
            string is rejected.

    Returns:
        True if the name passes both checks, False otherwise.
    """
    # JSON-LD may carry a non-string name (list, number); treat it as unusable
    # instead of failing on len()/lower().
    if not isinstance(name, str):
        return False
    if not 5 <= len(name) <= 100:
        return False

    # Reject generic or meaningless names.
    name_lower = name.lower()
    return not any(keyword in name_lower for keyword in _BAD_NAME_KEYWORDS)


def _is_http_url(url):
    """Return True when *url* is a string with an http(s) scheme and a host."""
    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url.strip())
    except ValueError:
        return False
    return parsed.scheme.lower() in _ALLOWED_SCHEMES and bool(parsed.netloc)


def _find_korean_title(soup):
    """Return the ``alt`` text of the first matching product image, or None."""
    img_tag = soup.find('img', {
        'fetchpriority': 'high',
        'data-nimg': 'fill',
        'class': 'osrq1v4',
        'alt': True,
    })
    return img_tag.get('alt') if img_tag is not None else None


def _iter_jsonld_nodes(data):
    """Yield every dict node of a decoded JSON-LD payload.

    Covers a single object, a top-level array, and nodes under ``@graph``.
    """
    if isinstance(data, list):
        for entry in data:
            yield from _iter_jsonld_nodes(entry)
    elif isinstance(data, dict):
        yield data
        graph = data.get("@graph")
        if isinstance(graph, list):
            yield from _iter_jsonld_nodes(graph)


def _is_product(node):
    """Return True when a JSON-LD node's ``@type`` is or includes ``Product``."""
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "Product" in node_type
    return node_type == "Product"


def _first_offer(offers):
    """Normalize the ``offers`` field to a single dict, or None if unusable."""
    if isinstance(offers, list):
        # Use the first offer; an empty list means there is none.
        offers = offers[0] if offers else None
    return offers if isinstance(offers, dict) else None


def _extract_product(soup):
    """Return the response payload for the first JSON-LD Product in *soup*.

    Returns:
        A dict with the keys ``image``, ``name``, ``korean_name``,
        ``description``, ``price``, ``currency`` and ``name_quality``, or
        None when no Product node is found.
    """
    # Prefer the image alt text as the original-language title.
    korean_title = _find_korean_title(soup)

    for script_tag in soup.find_all("script", type="application/ld+json"):
        raw = script_tag.string
        if not raw:
            # Empty tag, or one whose content is not a single string.
            continue
        try:
            data = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            continue

        for node in _iter_jsonld_nodes(data):
            if not _is_product(node):
                continue
            name = node.get("name")
            offer = _first_offer(node.get("offers"))
            return {
                "image": node.get("image"),
                "name": name,
                # Fall back to the JSON-LD name when no image alt was found.
                "korean_name": korean_title or name,
                "description": node.get("description"),
                "price": offer.get("price") if offer else None,
                "currency": offer.get("priceCurrency") if offer else None,
                "name_quality": is_good_name(name),
            }
    return None


@app.route('/')
def index():
    """Serve the scraper front-end page."""
    return render_template('product_scraper.html')


@app.route('/scrape', methods=['POST'])
def scrape_product():
    """Scrape product data from the URL given in the JSON request body.

    Expects a JSON object with a ``url`` field. Responds with the product
    payload on success, 400 for a missing or non-http(s) URL, 404 when the
    page has no JSON-LD Product, and 500 on any other failure.
    """
    # silent=True: a missing or malformed JSON body yields None instead of an
    # HTTP error raised by Flask, so it gets the same 400 JSON response.
    body = request.get_json(silent=True)
    url = body.get('url') if isinstance(body, dict) else None
    if not url:
        return jsonify({"error": "未提供網址"}), 400

    # The URL is untrusted input: keep the browser away from file://,
    # chrome:// and similar schemes.
    if not _is_http_url(url):
        return jsonify({"error": "網址必須以 http:// 或 https:// 開頭"}), 400

    driver = None
    try:
        driver = setup_driver()
        driver.get(url)

        # Wait for the page body to be present.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Parse the rendered HTML.
        soup = BeautifulSoup(driver.page_source, "html.parser")

        product = _extract_product(soup)
        if product is None:
            return jsonify({"error": "未找到商品資訊"}), 404
        return jsonify(product)
    except Exception as e:
        # Top-level boundary: log the traceback and report the failure.
        app.logger.exception("Scraping failed for %s", url)
        return jsonify({"error": str(e)}), 500
    finally:
        if driver:
            driver.quit()


if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))  # Hugging Face defaults to 7860.
    app.run(host="0.0.0.0", port=port)