# ChieBukuro / scraper.py
# (migrated from another account; commit cd2975a, verified)
"""
Yahoo知恵袋スクレイピングモジュール
Selenium WebDriverを使用してYahoo知恵袋から質問と回答を取得
"""
import time
import logging
from typing import Dict, List, Optional
from datetime import datetime
from urllib.parse import urljoin, quote
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class YahooChiebukuroScraper:
    """Scraper for Yahoo! Chiebukuro (Yahoo Japan's Q&A service).

    Drives a (headless) Chrome/Chromium instance via Selenium to search for
    questions and to fetch question/answer details. The driver is created
    lazily on first use and must be released with :meth:`close_driver`.

    NOTE(review): ``search_questions`` fans detail fetches out to a thread
    pool while every worker shares this single ``self.driver``. Selenium
    WebDriver is not documented as thread-safe, so concurrent ``driver.get``
    calls may race; consider one driver per worker if results look garbled.
    """

    # Site endpoints.
    BASE_URL = "https://chiebukuro.yahoo.co.jp"
    SEARCH_URL = "https://chiebukuro.yahoo.co.jp/search"

    def __init__(self, headless: bool = True, wait_time: int = 10):
        """
        Initialize the scraper; the WebDriver itself is created lazily.

        Args:
            headless: Run the browser in headless mode.
            wait_time: Explicit-wait timeout in seconds for page elements.
        """
        self.headless = headless
        self.wait_time = wait_time
        self.driver = None  # webdriver.Chrome, created by setup_driver()
        self.wait = None    # WebDriverWait bound to self.driver

    def save_page_source(self, filename: str = "debug_page.html"):
        """Save the current page source to *filename* for debugging.

        No-op when no driver exists; write errors are logged, not raised.
        """
        if self.driver:
            try:
                with open(filename, "w", encoding="utf-8") as f:
                    f.write(self.driver.page_source)
                # BUG FIX: the message previously logged the literal
                # "(unknown)" instead of interpolating the file name.
                logger.info(f"Page source saved to {filename}")
            except Exception as e:
                logger.error(f"Failed to save page source: {e}")

    def setup_driver(self):
        """Create and configure the Chrome/Chromium WebDriver.

        Detects the Hugging Face Spaces environment (system Chromium /
        chromedriver) and falls back to webdriver-manager elsewhere.

        Returns:
            True on success, False when driver creation failed.
        """
        try:
            options = Options()
            # Headless-mode configuration.
            if self.headless:
                options.add_argument('--headless')
                options.add_argument('--disable-gpu')
            # Extra options required to run inside Hugging Face Spaces.
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            options.add_argument('--disable-blink-features=AutomationControlled')
            options.add_experimental_option("excludeSwitches", ["enable-automation"])
            options.add_experimental_option('useAutomationExtension', False)
            # Additional settings for the Hugging Face Spaces environment.
            options.add_argument('--disable-software-rasterizer')
            options.add_argument('--disable-extensions')
            options.add_argument('--disable-setuid-sandbox')
            options.add_argument('--single-process')
            # Present a regular desktop Chrome User-Agent.
            options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
            # Use system Chromium when running on Hugging Face Spaces.
            import os  # local import: only needed for these path checks
            if os.path.exists('/usr/bin/chromium'):
                # Hugging Face Spaces environment.
                options.binary_location = '/usr/bin/chromium'
                # Prefer the system chromedriver when present.
                if os.path.exists('/usr/bin/chromedriver'):
                    service = Service('/usr/bin/chromedriver')
                else:
                    # Fall back to webdriver-manager's chromium driver.
                    service = Service(ChromeDriverManager(chrome_type="chromium").install())
            else:
                # Regular (local) environment.
                service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=options)
            self.wait = WebDriverWait(self.driver, self.wait_time)
            logger.info("WebDriver setup completed")
            return True
        except Exception as e:
            logger.error(f"Failed to setup WebDriver: {e}")
            return False

    def close_driver(self):
        """Quit the WebDriver and clear the cached driver/wait handles."""
        if self.driver:
            self.driver.quit()
            self.driver = None
            self.wait = None
            logger.info("WebDriver closed")

    def _parse_search_result_element(self, element, include_views: bool = False) -> Optional[Dict]:
        """Extract the basic fields from one search-result ``<li>`` element.

        Shared by :meth:`search_questions` and :meth:`search_questions_fast`
        (previously duplicated in both).

        Args:
            element: Selenium WebElement for one result list item.
            include_views: Also extract the view count (fast mode only).

        Returns:
            Dict with title/url/content_preview/post_date/answer_count
            (plus views_count when requested), or None when the mandatory
            title link cannot be found.
        """
        try:
            # Title and link are mandatory; failure aborts this element.
            title_elem = element.find_element(
                By.CSS_SELECTOR,
                "h3.ListSearchResults_listSearchResults__heading__WGSq8 a"
            )
            title = title_elem.text.strip()
            url = title_elem.get_attribute("href")
            # Question body preview (optional).
            content_preview = ""
            try:
                content_elem = element.find_element(
                    By.CSS_SELECTOR,
                    "p.ListSearchResults_listSearchResults__summary__0897S"
                )
                content_preview = content_elem.text.strip()
            except NoSuchElementException:
                pass
            # Post date (optional; defaults to "不明" = unknown).
            post_date = "不明"
            try:
                date_elem = element.find_element(
                    By.CSS_SELECTOR,
                    "span.ListSearchResults_listSearchResults__informationDate__J4NVn span:last-child"
                )
                post_date = date_elem.text.strip()
            except NoSuchElementException:
                pass
            # Answer count (optional).
            answer_count = "0"
            try:
                answer_elem = element.find_element(
                    By.CSS_SELECTOR,
                    "span.ListSearchResults_listSearchResults__informationAnswers__64Dhv span:last-child"
                )
                answer_count = answer_elem.text.strip()
            except NoSuchElementException:
                pass
            data = {
                "title": title,
                "url": url,
                "content_preview": content_preview,
                "post_date": post_date,
                "answer_count": answer_count
            }
            if include_views:
                # View count (optional; fast mode only).
                views_count = "0"
                try:
                    views_elem = element.find_element(
                        By.CSS_SELECTOR,
                        "span.ListSearchResults_listSearchResults__informationViews__VivY6 span:last-child"
                    )
                    views_count = views_elem.text.strip()
                except NoSuchElementException:
                    pass
                data["views_count"] = views_count
            return data
        except Exception as e:
            logger.warning(f"Failed to parse question element: {e}")
            return None

    def search_questions(self, keyword: str, max_results: int = 20, debug: bool = False, max_workers: int = 5) -> List[Dict]:
        """
        Search questions by keyword and fetch each question's detail
        (parallelized across a thread pool).

        Args:
            keyword: Search keyword.
            max_results: Maximum number of results (default 20, capped at 20).
            debug: Debug mode (saves page sources to disk).
            max_workers: Maximum parallel workers (default 5, capped at 5).

        Returns:
            List of question dicts including detail information, in the
            original search-result order. Empty list on failure.
        """
        # Cap the result count at 20.
        max_results = min(max_results, 20)
        results = []
        try:
            if not self.driver:
                if not self.setup_driver():
                    return results
            # Open the search page.
            search_url = f"{self.SEARCH_URL}?p={quote(keyword)}&type=list"
            logger.info(f"Searching: {search_url}")
            self.driver.get(search_url)
            # Wait for the JS-rendered results to load.
            time.sleep(3)
            # Debug mode: dump the page source to a file.
            if debug:
                self.save_page_source(f"search_results_{keyword.replace(' ', '_')}.html")
            # Result items, identified by the current CSS class names.
            question_elements = self.driver.find_elements(
                By.CSS_SELECTOR,
                "li.ListSearchResults_listSearchResults__listItem__PurLr"
            )[:max_results]
            logger.info(f"Found {len(question_elements)} question elements with new selector")
            # Collect all list-page data up front, BEFORE any page navigation
            # invalidates the element references.
            questions_data = []
            for element in question_elements:
                parsed = self._parse_search_result_element(element)
                if parsed is not None:
                    questions_data.append(parsed)
            # BUG FIX: ThreadPoolExecutor(max_workers=0) raises ValueError;
            # bail out early when nothing was parsed.
            if not questions_data:
                logger.info("No questions parsed from search results")
                return results
            # Fetch details in parallel.
            logger.info(f"Starting parallel detail fetching with max {max_workers} workers...")
            # Cap the worker count at 5.
            max_workers = min(max_workers, 5)

            def fetch_detail_with_delay(idx_and_question):
                """Worker: fetch one question's detail, staggering late starts."""
                idx, question = idx_and_question
                try:
                    # The first `max_workers` fetches start immediately; later
                    # ones are staggered 2s apart to limit server load.
                    if idx >= max_workers:
                        time.sleep((idx - max_workers + 1) * 2)
                    logger.info(f"[Worker] Getting detail for question {idx+1}/{len(questions_data)}: {question['title'][:50]}...")
                    question_detail = self.get_question_detail_content(question['url'])
                    return {
                        "title": question['title'],
                        "url": question['url'],
                        "content_preview": question['content_preview'],
                        "full_content": question_detail.get("content", question['content_preview']) if question_detail else question['content_preview'],
                        "post_date": question['post_date'],
                        "answer_count": question['answer_count'],
                        "best_answer": question_detail.get("best_answer", None) if question_detail else None,
                        "searched_at": datetime.now().isoformat()
                    }
                except Exception as e:
                    logger.warning(f"[Worker] Failed to get detail for question: {e}")
                    # Keep the basic info even when the detail fetch fails.
                    return {
                        "title": question['title'],
                        "url": question['url'],
                        "content_preview": question['content_preview'],
                        "full_content": question['content_preview'],
                        "post_date": question['post_date'],
                        "answer_count": question['answer_count'],
                        "best_answer": None,
                        "searched_at": datetime.now().isoformat()
                    }

            # Fan out with a ThreadPoolExecutor.
            with ThreadPoolExecutor(max_workers=min(max_workers, len(questions_data))) as executor:
                # Submit with the original index attached.
                future_to_idx = {
                    executor.submit(fetch_detail_with_delay, (idx, q)): idx
                    for idx, q in enumerate(questions_data)
                }
                # Index -> result mapping so the original order can be restored.
                results_dict = {}
                # Consume completions as they arrive.
                for future in as_completed(future_to_idx):
                    idx = future_to_idx[future]
                    try:
                        result = future.result(timeout=30)  # 30-second timeout
                        results_dict[idx] = result
                        logger.info(f"[Worker] Completed {len(results_dict)}/{len(questions_data)} questions")
                    except Exception as e:
                        logger.error(f"[Worker] Exception for question {idx}: {e}")
                        # On error keep the basic info only.
                        question = questions_data[idx]
                        results_dict[idx] = {
                            "title": question['title'],
                            "url": question['url'],
                            "content_preview": question['content_preview'],
                            "full_content": question['content_preview'],
                            "post_date": question['post_date'],
                            "answer_count": question['answer_count'],
                            "best_answer": None,
                            "searched_at": datetime.now().isoformat()
                        }
            # Restore the original search-result order.
            results = [results_dict[i] for i in sorted(results_dict.keys())]
            logger.info(f"Successfully retrieved {len(results)} questions with details")
        except Exception as e:
            logger.error(f"Search failed: {e}")
            if debug:
                self.save_page_source("error_page.html")
        return results

    def search_questions_fast(self, keyword: str, max_results: int = 20, debug: bool = False) -> List[Dict]:
        """
        Search questions by keyword (fast mode: list-page info only, no
        per-question detail fetching).

        Args:
            keyword: Search keyword.
            max_results: Maximum number of results (capped at 20).
            debug: Debug mode (saves page sources to disk).

        Returns:
            List of question dicts with basic info only.
        """
        # Cap the result count at 20.
        max_results = min(max_results, 20)
        results = []
        try:
            if not self.driver:
                if not self.setup_driver():
                    return results
            # Open the search page.
            search_url = f"{self.SEARCH_URL}?p={quote(keyword)}&type=list"
            logger.info(f"Searching (fast mode): {search_url}")
            self.driver.get(search_url)
            # Wait for the JS-rendered results to load.
            time.sleep(3)
            # Debug mode: dump the page source to a file.
            if debug:
                self.save_page_source(f"search_results_{keyword.replace(' ', '_')}.html")
            # Result items, identified by the current CSS class names.
            question_elements = self.driver.find_elements(
                By.CSS_SELECTOR,
                "li.ListSearchResults_listSearchResults__listItem__PurLr"
            )[:max_results]
            logger.info(f"Found {len(question_elements)} question elements (fast mode)")
            for element in question_elements:
                # Fast mode also extracts the view count.
                parsed = self._parse_search_result_element(element, include_views=True)
                if parsed is None:
                    continue
                parsed["searched_at"] = datetime.now().isoformat()
                results.append(parsed)
            logger.info(f"Successfully retrieved {len(results)} questions (fast mode)")
        except Exception as e:
            logger.error(f"Search failed: {e}")
            if debug:
                self.save_page_source("error_page.html")
        return results

    def get_question_detail_content(self, question_url: str) -> Optional[Dict]:
        """
        Fetch only the question body and best answer (lightweight variant).

        Assumes the driver is already set up (it is called from the
        search_questions worker threads).

        Args:
            question_url: URL of the question page.

        Returns:
            Dict with "content" and "best_answer" keys, or None on failure.
        """
        try:
            logger.info(f"Getting question detail: {question_url}")
            self.driver.get(question_url)
            # Wait for the page to load.
            time.sleep(3)
            result = {}
            # Question body — the h1 tag carries the body text.
            content = ""
            selectors = [
                # h1 (title and body share the same element on some layouts)
                "h1.ClapLv1TextBlock_Chie-TextBlock__Text__etZbS",
                "h1[class*='TextBlock']",
                # Other possible locations
                "div.ClapLv1TextBlock_Chie-TextBlock__4j9Y9 h1",
                "article h1"
            ]
            # Try the selectors in order of specificity.
            for selector in selectors:
                try:
                    content_elem = self.driver.find_element(By.CSS_SELECTOR, selector)
                    content = content_elem.text.strip()
                    if content:
                        logger.info(f"Found question content with selector: {selector}")
                        break
                except NoSuchElementException:
                    continue
                except Exception as e:
                    logger.debug(f"Error with selector {selector}: {e}")
                    continue
            result["content"] = content if content else ""
            # Best answer.
            best_answer = None
            answer_selectors = [
                # Best-answer text block
                "div.ClapLv2AnswerItem_Chie-AnswerItem--Best__yJIDl div.ClapLv1TextBlock_Chie-TextBlock__Text__etZbS",
                "div[class*='AnswerItem--Best'] div[class*='TextBlock__Text']",
                # First regular answer (when there is no best answer)
                "div.ClapLv2AnswerItem_Chie-AnswerItem__CYXyb div.ClapLv1TextBlock_Chie-TextBlock__Text__etZbS"
            ]
            for selector in answer_selectors:
                try:
                    answer_elem = self.driver.find_element(By.CSS_SELECTOR, selector)
                    best_answer = answer_elem.text.strip()
                    if best_answer:
                        logger.info(f"Found best answer with selector: {selector}")
                        break
                except NoSuchElementException:
                    continue
                except Exception as e:
                    logger.debug(f"Error with answer selector {selector}: {e}")
                    continue
            result["best_answer"] = best_answer
            # Debug breadcrumb when the content selectors all missed.
            if not content:
                logger.warning(f"Could not find content for: {question_url}")
            return result
        except Exception as e:
            logger.warning(f"Failed to get question detail: {e}")
            return None

    def get_question_detail(self, question_url: str) -> Optional[Dict]:
        """
        Fetch the full detail of a question (title, body, category, date,
        best answer and up to five other answers).

        NOTE(review): uses older "ClapLv1" selectors than the ones in
        get_question_detail_content — they may be stale; verify against
        the live site.

        Args:
            question_url: URL of the question page.

        Returns:
            Dict of question details, or None on failure.
        """
        try:
            if not self.driver:
                if not self.setup_driver():
                    return None
            logger.info(f"Getting question detail: {question_url}")
            self.driver.get(question_url)
            # Wait for the page to load.
            time.sleep(2)
            # Question title.
            try:
                title_elem = self.wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "h1.ClapLv1QuestionItem__title")
                    )
                )
                title = title_elem.text.strip()
            except TimeoutException:
                title = "タイトル取得失敗"
            # Question body.
            try:
                content_elem = self.driver.find_element(
                    By.CSS_SELECTOR,
                    "div.ClapLv1QuestionItem__body"
                )
                content = content_elem.text.strip()
            except NoSuchElementException:
                content = "本文取得失敗"
            # Category.
            try:
                category_elem = self.driver.find_element(
                    By.CSS_SELECTOR,
                    "div.ClapLv1QuestionItem__category"
                )
                category = category_elem.text.strip()
            except NoSuchElementException:
                category = "不明"
            # Post date.
            try:
                date_elem = self.driver.find_element(
                    By.CSS_SELECTOR,
                    "span.ClapLv1QuestionItem__date"
                )
                post_date = date_elem.text.strip()
            except NoSuchElementException:
                post_date = "不明"
            # Best answer.
            best_answer = None
            try:
                best_answer_elem = self.driver.find_element(
                    By.CSS_SELECTOR,
                    "div.ClapLv1AnswerItem--best"
                )
                best_answer_content = best_answer_elem.find_element(
                    By.CSS_SELECTOR,
                    "div.ClapLv1AnswerItem__body"
                ).text.strip()
                best_answer = {
                    "content": best_answer_content,
                    "is_best": True
                }
            except NoSuchElementException:
                logger.info("No best answer found")
            # Other answers.
            other_answers = []
            try:
                answer_elements = self.driver.find_elements(
                    By.CSS_SELECTOR,
                    "div.ClapLv1AnswerItem:not(.ClapLv1AnswerItem--best)"
                )
                for answer_elem in answer_elements[:5]:  # at most 5 answers
                    try:
                        answer_content = answer_elem.find_element(
                            By.CSS_SELECTOR,
                            "div.ClapLv1AnswerItem__body"
                        ).text.strip()
                        other_answers.append({
                            "content": answer_content,
                            "is_best": False
                        })
                    except Exception as e:
                        logger.warning(f"Failed to parse answer: {e}")
                        continue
            except Exception as e:
                logger.warning(f"Failed to get other answers: {e}")
            # Combine all answers, best answer first.
            all_answers = []
            if best_answer:
                all_answers.append(best_answer)
            all_answers.extend(other_answers)
            return {
                "url": question_url,
                "title": title,
                "content": content,
                "category": category,
                "post_date": post_date,
                "answers": all_answers,
                "answer_count": len(all_answers),
                "scraped_at": datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"Failed to get question detail: {e}")
            return None

    def get_category_questions(self, category: str, max_results: int = 10) -> List[Dict]:
        """
        Fetch questions for a category.

        Implemented as a plain keyword search with a "カテゴリ:" prefix.

        Args:
            category: Category name.
            max_results: Maximum number of results.

        Returns:
            List of question dicts.
        """
        return self.search_questions(f"カテゴリ:{category}", max_results)
# Manual test entry point
def test_scraper():
    """
    Manual smoke test for the scraper.

    How to exercise the parallel fetching:
    1. Default search (parallel processing is the default)
       scraper.search_questions("Python", max_results=10)
       -> up to 5 concurrent detail fetches
    2. Vary the parallelism
       scraper.search_questions("Python", max_results=10, max_workers=3)
       -> up to 3 concurrent fetches
    3. Sequential (for comparison)
       scraper.search_questions("Python", max_results=10, max_workers=1)
       -> one at a time (equivalent to the old behaviour)
    4. Rough timings
       - max_workers=1: ~30 s for 10 questions
       - max_workers=5: ~6-10 s for 10 questions
    """
    # NOTE: uses the module-level `time` import; the previous function-local
    # `import time` shadowed it redundantly and has been removed.
    scraper = YahooChiebukuroScraper(headless=True)
    try:
        # Parallel-processing test (default: max_workers=5).
        print("=== 並列処理テスト開始 ===")
        start_time = time.time()
        results = scraper.search_questions(
            "Python プログラミング",
            max_results=10,  # fetch 10 questions
            max_workers=5    # 5 concurrent workers (default)
        )
        elapsed_time = time.time() - start_time
        print(f"並列処理(5 workers): {len(results)}件取得 - {elapsed_time:.1f}秒")
        # Optional comparison against sequential processing —
        # uncomment to measure:
        # start_time = time.time()
        # results_sequential = scraper.search_questions(
        #     "Python プログラミング",
        #     max_results=10,
        #     max_workers=1  # sequential
        # )
        # elapsed_time_seq = time.time() - start_time
        # print(f"逐次処理(1 worker): {len(results_sequential)}件取得 - {elapsed_time_seq:.1f}秒")
        if results:
            print(f"\n最初の質問: {results[0]['title']}")
            print(f"詳細情報: {'あり' if results[0].get('best_answer') else 'なし'}")
    finally:
        # Always release the browser, even if the search raised.
        scraper.close_driver()
        print("\n=== テスト完了 ===")
if __name__ == "__main__":
test_scraper()