Spaces:

Zaious
/

BunjangInfo

Sleeping

App Files Files Community

BunjangInfo / app.py

Zaious

Update app.py

dd07cc1 verified about 1 year ago

raw

history blame contribute delete

4.76 kB

	from flask import Flask, request, jsonify, render_template
	from flask_cors import CORS
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from bs4 import BeautifulSoup
	import chromedriver_autoinstaller
	from selenium.webdriver.chrome.service import Service
	from selenium.webdriver.chrome.options import Options
	from selenium import webdriver
	import json
	import os

	app = Flask(__name__)
	CORS(app) # 允許跨域請求


	def setup_driver():
	try:
	# 指定 ChromeDriver 的安裝路徑為 /tmp
	chromedriver_path = chromedriver_autoinstaller.install(path="/tmp")

	# 配置 Chrome 選項
	options = Options()
	options.add_argument("--headless")
	options.add_argument("--no-sandbox")
	options.add_argument("--disable-dev-shm-usage")
	options.add_argument("--disable-gpu")

	# 啟動 WebDriver，使用指定的 Service
	service = Service(chromedriver_path)
	return webdriver.Chrome(service=service, options=options)

	except Exception as e:
	raise RuntimeError(f"WebDriver 啟動失敗: {str(e)}")


	def is_good_name(name):
	"""
	判斷商品名稱是否具有足夠的資訊

	規則：
	1. 長度在 5-100 字之間
	2. 不是太通用的詞彙
	3. 包含具體描述
	"""
	if not name or len(name) < 5 or len(name) > 100:
	return False

	# 排除一些通用或無意義的名稱
	bad_keywords = [
	'product', 'item', 'sale', 'wts', 'wtb', 'for sale',
	'bunjang', 'global', 'sign', 'album', 'photocard'
	]

	name_lower = name.lower()
	if any(keyword in name_lower for keyword in bad_keywords):
	return False

	return True

	@app.route('/')
	def index():
	return render_template('product_scraper.html')

	@app.route('/scrape', methods=['POST'])
	def scrape_product():
	url = request.json.get('url')

	if not url:
	return jsonify({"error": "未提供網址"}), 400

	driver = None
	try:
	driver = setup_driver()
	driver.get(url)

	# 等待頁面加載
	WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

	# 獲取網頁內容
	html_content = driver.page_source

	# 使用 BeautifulSoup 解析 HTML
	soup = BeautifulSoup(html_content, "html.parser")

	# 先從 alt 屬性提取韓文原文
	korean_title = None

	# 尋找具有特定特徵的圖片
	img_tags = soup.find_all('img', {
	'fetchpriority': 'high',
	'data-nimg': 'fill',
	'class': 'osrq1v4',
	'alt': True
	})

	if img_tags:
	korean_title = img_tags[0].get('alt')

	# 查找 JSON-LD 資料
	script_tags = soup.find_all("script", type="application/ld+json")
	for script_tag in script_tags:
	try:
	product_data = json.loads(script_tag.string)

	# 確保資料包含所需字段
	if product_data.get("@type") == "Product":
	image_url = product_data.get("image")
	name = product_data.get("name")
	description = product_data.get("description")
	offers = product_data.get("offers")

	# 如果 offers 是列表，提取第一個元素
	if isinstance(offers, list):
	offers = offers[0]

	price = offers.get("price") if offers else None
	price_currency = offers.get("priceCurrency") if offers else None

	# 評估商品名稱的品質
	name_quality = is_good_name(name)

	return jsonify({
	"image": image_url,
	"name": name,
	"korean_name": korean_title or name, # 如果沒找到特定圖片的 alt，則使用原始 name
	"description": description,
	"price": price,
	"currency": price_currency,
	"name_quality": name_quality
	})
	except json.JSONDecodeError:
	continue

	return jsonify({"error": "未找到商品資訊"}), 404

	except Exception as e:
	return jsonify({"error": str(e)}), 500

	finally:
	if driver:
	driver.quit()

	if __name__ == "__main__":
	port = int(os.environ.get("PORT", 7860)) # 默認 Hugging Face 使用 7860
	app.run(host="0.0.0.0", port=port)