"""
Yahoo知恵袋スクレイピングモジュール
Selenium WebDriverを使用してYahoo知恵袋から質問と回答を取得
"""

import time
import logging
from typing import Dict, List, Optional
from datetime import datetime
from urllib.parse import urljoin, quote
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Configure logging at INFO level when this module is imported, then create
# the module-level logger used by every function and method below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class YahooChiebukuroScraper:
    """Yahoo知恵袋スクレイピングクラス"""

    BASE_URL = "https://chiebukuro.yahoo.co.jp"
    SEARCH_URL = "https://chiebukuro.yahoo.co.jp/search"

    def __init__(self, headless: bool = True, wait_time: int = 10):
        """
        Store the scraper settings; no browser is started here.

        Args:
            headless: Whether Chrome should run in headless mode.
            wait_time: Timeout in seconds handed to WebDriverWait.
        """
        # Both handles stay empty until setup_driver() creates them.
        self.driver = self.wait = None
        self.wait_time = wait_time
        self.headless = headless

    def save_page_source(self, filename: str = "debug_page.html"):
        """Write the current page's HTML to *filename* (debugging aid).

        Does nothing when no driver is open. Any failure while writing is
        logged and swallowed, so this never raises to the caller.
        """
        if not self.driver:
            return

        try:
            # The file is opened before page_source is read, so a failed
            # read still leaves an (empty) file behind.
            with open(filename, "w", encoding="utf-8") as dump:
                dump.write(self.driver.page_source)
            logger.info(f"Page source saved to {filename}")
        except Exception as e:
            logger.error(f"Failed to save page source: {e}")

    def setup_driver(self):
        """Create the Chrome WebDriver and the WebDriverWait bound to it.

        Returns:
            True when ``self.driver`` and ``self.wait`` were created,
            False when any step raised (the error is logged).
        """
        try:
            import os

            options = Options()

            # Command-line switches, applied in this order. The two
            # headless-only switches come first when headless mode is on.
            switches = []
            if self.headless:
                switches.extend(['--headless', '--disable-gpu'])
            switches.extend([
                # Container-friendly switches (the original notes say these
                # target Hugging Face Spaces).
                '--no-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
                '--disable-software-rasterizer',
                '--disable-extensions',
                '--disable-setuid-sandbox',
                '--single-process',
                # Fixed desktop Chrome user agent.
                '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            ])
            for switch in switches:
                options.add_argument(switch)

            options.add_experimental_option("excludeSwitches", ["enable-automation"])
            options.add_experimental_option('useAutomationExtension', False)

            # Pick the driver binary: a system Chromium install wins,
            # otherwise webdriver-manager supplies a chromedriver.
            if not os.path.exists('/usr/bin/chromium'):
                driver_path = ChromeDriverManager().install()
            else:
                options.binary_location = '/usr/bin/chromium'
                if os.path.exists('/usr/bin/chromedriver'):
                    driver_path = '/usr/bin/chromedriver'
                else:
                    driver_path = ChromeDriverManager(chrome_type="chromium").install()
            service = Service(driver_path)

            self.driver = webdriver.Chrome(service=service, options=options)
            self.wait = WebDriverWait(self.driver, self.wait_time)

            logger.info("WebDriver setup completed")
            return True

        except Exception as e:
            logger.error(f"Failed to setup WebDriver: {e}")
            return False

    def close_driver(self):
        """Quit the WebDriver, if one is open, and clear the driver state.

        ``self.driver`` and ``self.wait`` are reset even when ``quit()``
        raises. Without that, a failed quit would leave a dead driver on the
        instance and every later call would skip ``setup_driver()`` and try
        to reuse it. The exception raised by ``quit()`` still propagates.
        """
        if not self.driver:
            return

        try:
            self.driver.quit()
        finally:
            self.driver = None
            self.wait = None

        # Only reached when quit() succeeded.
        logger.info("WebDriver closed")

    def search_questions(self, keyword: str, max_results: int = 20, debug: bool = False, max_workers: int = 5) -> List[Dict]:
        """
        Search by keyword and enrich every hit with its detail page.

        The result list is read with this instance's driver. Detail pages
        are then fetched on a thread pool in which every worker thread uses
        its own WebDriver, because a single driver cannot safely serve
        several threads. The already-open driver is handed to the first
        worker; each additional worker starts (and afterwards closes) its
        own browser.

        Args:
            keyword: Search keyword.
            max_results: Maximum number of hits to return (clamped to 0..20).
            debug: When True, page sources are saved for inspection.
            max_workers: Number of concurrent detail fetches (clamped to 1..5).

        Returns:
            One dict per hit, in search-result order, with the keys
            ``title``, ``url``, ``content_preview``, ``full_content``,
            ``post_date``, ``answer_count``, ``best_answer`` and
            ``searched_at``. Empty when nothing was found or the search
            failed (the failure is logged).
        """
        # Cap at 20 hits; a negative value would otherwise slice from the end.
        max_results = max(0, min(max_results, 20))
        results = []

        try:
            if not self.driver and not self.setup_driver():
                return results

            search_url = f"{self.SEARCH_URL}?p={quote(keyword)}&type=list"
            logger.info(f"Searching: {search_url}")
            self.driver.get(search_url)

            # Fixed pause to let the result list render.
            time.sleep(3)

            if debug:
                self.save_page_source(f"search_results_{keyword.replace(' ', '_')}.html")

            # Collect everything from the list page before any navigation
            # invalidates its elements.
            questions_data = self._collect_search_summaries(max_results)
            if not questions_data:
                # ThreadPoolExecutor rejects max_workers=0, so stop here
                # instead of reporting an empty search as a failure.
                logger.info("No questions found; skipping detail fetching")
                return results

            # At least one worker, at most five, never more than there are hits.
            max_workers = max(1, min(max_workers, 5, len(questions_data)))
            logger.info(f"Starting parallel detail fetching with max {max_workers} workers...")

            results = self._fetch_details_in_parallel(questions_data, max_workers)
            logger.info(f"Successfully retrieved {len(results)} questions with details")

        except Exception as e:
            logger.error(f"Search failed: {e}")
            if debug:
                self.save_page_source("error_page.html")

        return results

    def _collect_search_summaries(self, max_results: int) -> List[Dict]:
        """Read title, URL, preview, date and answer count of each hit on the current result page."""
        items = self.driver.find_elements(
            By.CSS_SELECTOR,
            "li.ListSearchResults_listSearchResults__listItem__PurLr"
        )[:max_results]
        logger.info(f"Found {len(items)} question elements with new selector")

        def optional_text(item, selector, default):
            # Optional fields fall back to a default when the element is absent.
            try:
                return item.find_element(By.CSS_SELECTOR, selector).text.strip()
            except NoSuchElementException:
                return default

        summaries = []
        for item in items:
            try:
                link = item.find_element(
                    By.CSS_SELECTOR,
                    "h3.ListSearchResults_listSearchResults__heading__WGSq8 a"
                )
                summaries.append({
                    "title": link.text.strip(),
                    "url": link.get_attribute("href"),
                    "content_preview": optional_text(
                        item,
                        "p.ListSearchResults_listSearchResults__summary__0897S",
                        ""
                    ),
                    "post_date": optional_text(
                        item,
                        "span.ListSearchResults_listSearchResults__informationDate__J4NVn span:last-child",
                        "不明"
                    ),
                    "answer_count": optional_text(
                        item,
                        "span.ListSearchResults_listSearchResults__informationAnswers__64Dhv span:last-child",
                        "0"
                    ),
                })
            except Exception as e:
                # A hit without a parsable title link is skipped.
                logger.warning(f"Failed to parse question element: {e}")

        return summaries

    @staticmethod
    def _merge_detail(question: Dict, detail: Optional[Dict]) -> Dict:
        """Build one result record; *detail* may be None when the detail fetch failed."""
        detail = detail or {}
        return {
            "title": question['title'],
            "url": question['url'],
            "content_preview": question['content_preview'],
            # The detail dict always carries "content" (possibly empty), so
            # fall back to the preview on an empty body, not only on a
            # missing key.
            "full_content": detail.get("content") or question['content_preview'],
            "post_date": question['post_date'],
            "answer_count": question['answer_count'],
            "best_answer": detail.get("best_answer"),
            "searched_at": datetime.now().isoformat()
        }

    def _fetch_details_in_parallel(self, questions_data: List[Dict], max_workers: int) -> List[Dict]:
        """Fetch detail pages on a thread pool with one WebDriver per worker thread; results keep input order."""
        total = len(questions_data)
        thread_state = threading.local()
        registry_lock = threading.Lock()
        # This instance's driver is idle while the pool runs, so exactly one
        # worker thread may claim it instead of launching another browser.
        unclaimed = [self]
        extra_scrapers = []

        def scraper_for_current_thread():
            scraper = getattr(thread_state, "scraper", None)
            if scraper is not None:
                return scraper
            with registry_lock:
                scraper = unclaimed.pop() if unclaimed else None
            if scraper is None:
                scraper = type(self)(headless=self.headless, wait_time=self.wait_time)
                if not scraper.setup_driver():
                    raise RuntimeError("could not start a WebDriver for this worker thread")
                with registry_lock:
                    extra_scrapers.append(scraper)
            thread_state.scraper = scraper
            return scraper

        def fetch_detail_with_delay(idx, question):
            try:
                # Tasks beyond the first batch wait an extra, growing delay
                # (2s, 4s, ...) to limit load on the server.
                if idx >= max_workers:
                    time.sleep((idx - max_workers + 1) * 2)

                logger.info(f"[Worker] Getting detail for question {idx+1}/{len(questions_data)}: {question['title'][:50]}...")
                detail = scraper_for_current_thread().get_question_detail_content(question['url'])
            except Exception as e:
                # Keep the basic information even when the detail fetch fails.
                logger.warning(f"[Worker] Failed to get detail for question: {e}")
                detail = None
            return self._merge_detail(question, detail)

        results_by_idx = {}
        try:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                future_to_idx = {
                    executor.submit(fetch_detail_with_delay, idx, question): idx
                    for idx, question in enumerate(questions_data)
                }

                for future in as_completed(future_to_idx):
                    idx = future_to_idx[future]
                    try:
                        results_by_idx[idx] = future.result()
                    except Exception as e:
                        logger.error(f"[Worker] Exception for question {idx}: {e}")
                        results_by_idx[idx] = self._merge_detail(questions_data[idx], None)
                    else:
                        logger.info(f"[Worker] Completed {len(results_by_idx)}/{total} questions")
        finally:
            # Browsers started for the pool are always shut down; this
            # instance's own driver stays open for the caller.
            for scraper in extra_scrapers:
                try:
                    scraper.close_driver()
                except Exception as e:
                    logger.warning(f"[Worker] Failed to close worker WebDriver: {e}")

        # Completion order is arbitrary; restore search-result order.
        return [results_by_idx[i] for i in sorted(results_by_idx)]

    def search_questions_fast(self, keyword: str, max_results: int = 20, debug: bool = False) -> List[Dict]:
        """
        Search by keyword using only the result list (no detail pages).

        Args:
            keyword: Search keyword.
            max_results: Maximum number of hits to return (capped at 20).
            debug: When True, page sources are saved for inspection.

        Returns:
            One dict per hit with the keys ``title``, ``url``,
            ``content_preview``, ``post_date``, ``answer_count``,
            ``views_count`` and ``searched_at``. Empty when the driver
            cannot start or the search fails (the failure is logged).
        """
        max_results = min(max_results, 20)
        results = []

        def text_or_default(parent, css, default):
            # Optional fields fall back to a default when the element is absent.
            try:
                return parent.find_element(By.CSS_SELECTOR, css).text.strip()
            except NoSuchElementException:
                return default

        try:
            if not self.driver and not self.setup_driver():
                return results

            search_url = f"{self.SEARCH_URL}?p={quote(keyword)}&type=list"
            logger.info(f"Searching (fast mode): {search_url}")
            self.driver.get(search_url)

            # Fixed pause to let the result list render.
            time.sleep(3)

            if debug:
                self.save_page_source(f"search_results_{keyword.replace(' ', '_')}.html")

            hits = self.driver.find_elements(
                By.CSS_SELECTOR,
                "li.ListSearchResults_listSearchResults__listItem__PurLr"
            )
            hits = hits[:max_results]

            logger.info(f"Found {len(hits)} question elements (fast mode)")

            for hit in hits:
                try:
                    # The title link is mandatory; a hit without one is skipped.
                    link = hit.find_element(
                        By.CSS_SELECTOR,
                        "h3.ListSearchResults_listSearchResults__heading__WGSq8 a"
                    )
                    record = {
                        "title": link.text.strip(),
                        "url": link.get_attribute("href"),
                        "content_preview": text_or_default(
                            hit,
                            "p.ListSearchResults_listSearchResults__summary__0897S",
                            ""
                        ),
                        "post_date": text_or_default(
                            hit,
                            "span.ListSearchResults_listSearchResults__informationDate__J4NVn span:last-child",
                            "不明"
                        ),
                        "answer_count": text_or_default(
                            hit,
                            "span.ListSearchResults_listSearchResults__informationAnswers__64Dhv span:last-child",
                            "0"
                        ),
                        "views_count": text_or_default(
                            hit,
                            "span.ListSearchResults_listSearchResults__informationViews__VivY6 span:last-child",
                            "0"
                        ),
                        "searched_at": datetime.now().isoformat()
                    }
                    results.append(record)
                except Exception as e:
                    logger.warning(f"Failed to parse question element: {e}")

            logger.info(f"Successfully retrieved {len(results)} questions (fast mode)")

        except Exception as e:
            logger.error(f"Search failed: {e}")
            if debug:
                self.save_page_source("error_page.html")

        return results

    def get_question_detail_content(self, question_url: str) -> Optional[Dict]:
        """
        Fetch only the question body and the best answer from a question page.

        Args:
            question_url: URL of the question page.

        Returns:
            A dict with ``content`` (question text, ``""`` when none was
            found) and ``best_answer`` (answer text, or None when no answer
            selector matched). None when the driver cannot be started or an
            unexpected error occurs (the error is logged).
        """
        try:
            # Start the driver on demand, as get_question_detail does, so
            # the method also works when called directly on a fresh instance.
            if not self.driver and not self.setup_driver():
                return None

            logger.info(f"Getting question detail: {question_url}")
            self.driver.get(question_url)

            # Fixed pause to let the page render.
            time.sleep(3)

            def first_text(selectors, found_message, error_message):
                # Try the selectors in order and stop at the first one that
                # yields non-empty text. Returns None when none matched at
                # all, or the last (empty) text when matches had no text.
                text = None
                for selector in selectors:
                    try:
                        text = self.driver.find_element(By.CSS_SELECTOR, selector).text.strip()
                        if text:
                            logger.info(f"{found_message}: {selector}")
                            return text
                    except NoSuchElementException:
                        continue
                    except Exception as e:
                        logger.debug(f"{error_message} {selector}: {e}")
                        continue
                return text

            # The question body is looked up in h1 elements.
            content = first_text(
                [
                    "h1.ClapLv1TextBlock_Chie-TextBlock__Text__etZbS",
                    "h1[class*='TextBlock']",
                    "div.ClapLv1TextBlock_Chie-TextBlock__4j9Y9 h1",
                    "article h1"
                ],
                "Found question content with selector",
                "Error with selector"
            ) or ""

            # Best answer first; the last selector falls back to the first
            # regular answer when no best answer is marked.
            best_answer = first_text(
                [
                    "div.ClapLv2AnswerItem_Chie-AnswerItem--Best__yJIDl div.ClapLv1TextBlock_Chie-TextBlock__Text__etZbS",
                    "div[class*='AnswerItem--Best'] div[class*='TextBlock__Text']",
                    "div.ClapLv2AnswerItem_Chie-AnswerItem__CYXyb div.ClapLv1TextBlock_Chie-TextBlock__Text__etZbS"
                ],
                "Found best answer with selector",
                "Error with answer selector"
            )

            if not content:
                logger.warning(f"Could not find content for: {question_url}")

            return {"content": content, "best_answer": best_answer}

        except Exception as e:
            logger.warning(f"Failed to get question detail: {e}")
            return None

    def get_question_detail(self, question_url: str) -> Optional[Dict]:
        """
        Fetch a question page and return its title, body, category, date and answers.

        Args:
            question_url: URL of the question page.

        Returns:
            A dict with ``url``, ``title``, ``content``, ``category``,
            ``post_date``, ``answers`` (best answer first, then up to five
            others), ``answer_count`` and ``scraped_at``. None when the
            driver cannot start or an unexpected error occurs.
        """
        try:
            if not self.driver and not self.setup_driver():
                return None

            logger.info(f"Getting question detail: {question_url}")
            self.driver.get(question_url)

            # Short fixed pause before the explicit wait below.
            time.sleep(2)

            def text_or(selector, fallback):
                # Page-level optional field: placeholder when the element is absent.
                try:
                    return self.driver.find_element(By.CSS_SELECTOR, selector).text.strip()
                except NoSuchElementException:
                    return fallback

            # The title is the only field read through the explicit wait.
            try:
                heading = self.wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "h1.ClapLv1QuestionItem__title")
                    )
                )
                title = heading.text.strip()
            except TimeoutException:
                title = "タイトル取得失敗"

            content = text_or("div.ClapLv1QuestionItem__body", "本文取得失敗")
            category = text_or("div.ClapLv1QuestionItem__category", "不明")
            post_date = text_or("span.ClapLv1QuestionItem__date", "不明")

            # Answers accumulate directly in their final order.
            answers = []

            try:
                best_block = self.driver.find_element(
                    By.CSS_SELECTOR,
                    "div.ClapLv1AnswerItem--best"
                )
                best_body = best_block.find_element(
                    By.CSS_SELECTOR,
                    "div.ClapLv1AnswerItem__body"
                )
                answers.append({
                    "content": best_body.text.strip(),
                    "is_best": True
                })
            except NoSuchElementException:
                logger.info("No best answer found")

            try:
                candidates = self.driver.find_elements(
                    By.CSS_SELECTOR,
                    "div.ClapLv1AnswerItem:not(.ClapLv1AnswerItem--best)"
                )
                # At most five non-best answers are kept.
                for candidate in candidates[:5]:
                    try:
                        body = candidate.find_element(
                            By.CSS_SELECTOR,
                            "div.ClapLv1AnswerItem__body"
                        )
                        answers.append({
                            "content": body.text.strip(),
                            "is_best": False
                        })
                    except Exception as e:
                        logger.warning(f"Failed to parse answer: {e}")
            except Exception as e:
                logger.warning(f"Failed to get other answers: {e}")

            return {
                "url": question_url,
                "title": title,
                "content": content,
                "category": category,
                "post_date": post_date,
                "answers": answers,
                "answer_count": len(answers),
                "scraped_at": datetime.now().isoformat()
            }

        except Exception as e:
            logger.error(f"Failed to get question detail: {e}")
            return None

    def get_category_questions(self, category: str, max_results: int = 10) -> List[Dict]:
        """
        Fetch questions for a category.

        This runs the ordinary keyword search with the category name
        embedded in the keyword.

        Args:
            category: Category name.
            max_results: Maximum number of questions to return.

        Returns:
            Whatever ``search_questions`` returns for that keyword.
        """
        category_keyword = f"カテゴリ:{category}"
        questions = self.search_questions(category_keyword, max_results)
        return questions


# テスト用関数
def test_scraper():
    """
    Manual smoke test: run one search with detail fetching and print the timing.

    Ways to exercise the worker setting:
    1. Default search (max_workers defaults to 5)
       scraper.search_questions("Python", max_results=10)

    2. Lower worker count
       scraper.search_questions("Python", max_results=10, max_workers=3)

    3. One worker, for comparison
       scraper.search_questions("Python", max_results=10, max_workers=1)

    4. Timings quoted by the original author for 10 results (not verified here)
       - max_workers=1: about 30 seconds
       - max_workers=5: about 6-10 seconds
    """
    scraper = YahooChiebukuroScraper(headless=True)

    try:
        print("=== 並列処理テスト開始 ===")
        started_at = time.time()

        results = scraper.search_questions(
            "Python プログラミング",
            max_results=10,
            max_workers=5
        )

        elapsed_time = time.time() - started_at
        print(f"並列処理（5 workers）: {len(results)}件取得 - {elapsed_time:.1f}秒")

        # To compare against a single worker, repeat the call above with
        # max_workers=1 and time it the same way.

        if results:
            first_hit = results[0]
            detail_flag = 'あり' if first_hit.get('best_answer') else 'なし'
            print(f"\n最初の質問: {first_hit['title']}")
            print(f"詳細情報: {detail_flag}")

    finally:
        # Always release the browser, even if the search raised.
        scraper.close_driver()
        print("\n=== テスト完了 ===")


# Run the manual smoke test only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    test_scraper()