Spaces:
Sleeping
Sleeping
| from flask import Flask, request, jsonify, render_template | |
| from flask_cors import CORS | |
| from selenium import webdriver | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| from bs4 import BeautifulSoup | |
| import chromedriver_autoinstaller | |
| from selenium.webdriver.chrome.service import Service | |
| from selenium.webdriver.chrome.options import Options | |
| from selenium import webdriver | |
| import json | |
| import os | |
| app = Flask(__name__) | |
| CORS(app) # Allow cross-origin requests to this app | |
def setup_driver():
    """Create a headless Chrome WebDriver backed by an auto-installed ChromeDriver.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.

    Raises:
        RuntimeError: If installing ChromeDriver or starting Chrome fails.
            The underlying exception is attached as ``__cause__`` so the
            original traceback is not lost.
    """
    try:
        # Install ChromeDriver with /tmp as the install path.
        chromedriver_path = chromedriver_autoinstaller.install(path="/tmp")

        # Configure Chrome command-line options.
        options = Options()
        for flag in (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
        ):
            options.add_argument(flag)

        # Start the WebDriver through an explicit Service for that binary.
        service = Service(chromedriver_path)
        return webdriver.Chrome(service=service, options=options)
    except Exception as e:
        # Chain explicitly so the root cause stays visible to callers/logs.
        raise RuntimeError(f"WebDriver 啟動失敗: {e}") from e
def is_good_name(name):
    """Return whether a product name looks informative enough to use.

    A name is accepted when all of the following hold:

    1. It is a non-blank string.
    2. Its length is between 5 and 100 characters (inclusive).
    3. It does not contain any of the generic keywords below. Matching is a
       case-insensitive substring test, so e.g. ``"sign"`` also rejects
       ``"Designer"``.

    Args:
        name: The candidate product name. Non-string values (including
            ``None``) are rejected rather than raising.

    Returns:
        ``True`` if the name passes every rule, otherwise ``False``.
    """
    # Scraped data is untrusted: the name may be missing or not a string.
    if not isinstance(name, str) or not name.strip():
        return False
    if not 5 <= len(name) <= 100:
        return False

    # Generic or uninformative words that disqualify a name.
    bad_keywords = (
        'product', 'item', 'sale', 'wts', 'wtb', 'for sale',
        'bunjang', 'global', 'sign', 'album', 'photocard',
    )
    name_lower = name.lower()
    return not any(keyword in name_lower for keyword in bad_keywords)
def index():
    """Render and return the scraper's front-end page template."""
    # NOTE(review): no route decorator is visible for this view in this
    # excerpt -- confirm it is registered with the Flask app.
    page = render_template('product_scraper.html')
    return page
def _iter_jsonld_products(payload):
    """Yield each dict in a decoded JSON-LD payload whose ``@type`` is Product."""
    if isinstance(payload, list):
        # A JSON-LD document may be an array of nodes.
        for entry in payload:
            yield from _iter_jsonld_products(entry)
    elif isinstance(payload, dict):
        node_type = payload.get("@type")
        if node_type == "Product" or (
            isinstance(node_type, list) and "Product" in node_type
        ):
            yield payload
        # Nodes may also be nested under "@graph".
        yield from _iter_jsonld_products(payload.get("@graph"))


def _first_offer(offers):
    """Return the first offer mapping of a JSON-LD ``offers`` value, or ``{}``."""
    if isinstance(offers, list):
        offers = offers[0] if offers else None
    return offers if isinstance(offers, dict) else {}


def scrape_product():
    """Scrape product details from the URL given in the JSON request body.

    Reads ``url`` from the request's JSON object, loads the page in a
    headless browser, and returns the first JSON-LD ``Product`` found.

    Returns:
        A Flask JSON response (optionally paired with a status code):

        * 200 -- ``image``, ``name``, ``korean_name``, ``description``,
          ``price``, ``currency`` and ``name_quality``.
        * 400 -- the body is missing/not a JSON object, ``url`` is absent,
          or ``url`` is not an http(s) URL.
        * 404 -- no JSON-LD Product was found on the page.
        * 500 -- any other failure; the error text is returned.

    NOTE(review): no route decorator is visible for this view in this
    excerpt -- confirm it is registered with the Flask app.
    """
    # silent=True: a missing or malformed JSON body becomes None instead of
    # raising before we can answer with our own JSON error.
    body = request.get_json(silent=True)
    url = body.get('url') if isinstance(body, dict) else None
    if not url:
        return jsonify({"error": "未提供網址"}), 400

    # The URL is untrusted input handed to a real browser: only allow web
    # schemes so file:, chrome:, data: etc. cannot be loaded.
    if not isinstance(url, str) or not url.strip().lower().startswith(
        ("http://", "https://")
    ):
        return jsonify({"error": "網址必須以 http:// 或 https:// 開頭"}), 400
    url = url.strip()

    driver = None
    try:
        driver = setup_driver()
        driver.get(url)

        # Wait (up to 10 s) for the page body to be present.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Parse the rendered HTML.
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Take the original-language title from the alt text of the image
        # matching these specific attributes, when such an image exists.
        img_tags = soup.find_all('img', {
            'fetchpriority': 'high',
            'data-nimg': 'fill',
            'class': 'osrq1v4',
            'alt': True
        })
        korean_title = img_tags[0].get('alt') if img_tags else None

        # Look through every JSON-LD script for a Product node.
        for script_tag in soup.find_all("script", type="application/ld+json"):
            raw = script_tag.string
            if not raw:
                # Empty script (or one without a single text child).
                continue
            try:
                decoded = json.loads(raw)
            except json.JSONDecodeError:
                continue

            for product_data in _iter_jsonld_products(decoded):
                name = product_data.get("name")
                offer = _first_offer(product_data.get("offers"))
                return jsonify({
                    "image": product_data.get("image"),
                    "name": name,
                    # Fall back to the JSON-LD name when no alt text was found.
                    "korean_name": korean_title or name,
                    "description": product_data.get("description"),
                    "price": offer.get("price"),
                    "currency": offer.get("priceCurrency"),
                    # Only rate real strings; anything else is not a good name.
                    "name_quality": isinstance(name, str) and is_good_name(name)
                })

        return jsonify({"error": "未找到商品資訊"}), 404
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    finally:
        # Always release the browser, even on error paths.
        if driver:
            driver.quit()
if __name__ == "__main__":
    # Listen on $PORT when set; otherwise 7860, the Hugging Face default.
    listen_port = int(os.getenv("PORT", 7860))
    app.run(port=listen_port, host="0.0.0.0")