Spaces:
Runtime error
Runtime error
| """ | |
| Yahoo知恵袋スクレイピングAPI | |
| Gradio UI + FastAPIによるWebアプリケーション | |
| """ | |
| import os | |
| import sys | |
| import logging | |
| import json | |
| from typing import Dict, List, Optional | |
| from datetime import datetime | |
| import pandas as pd | |
| from dotenv import load_dotenv | |
| import gradio as gr | |
| from fastapi import FastAPI, HTTPException, Query, Depends, Header | |
| from fastapi.responses import JSONResponse | |
| from pydantic import BaseModel | |
| from scraper import YahooChiebukuroScraper | |
| from auth_manager import AuthManager | |
# Load environment variables (python-dotenv) before anything reads them.
load_dotenv()

# Logging configuration.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application.
app = FastAPI(
    title="Yahoo知恵袋 Scraping API",
    description="Yahoo知恵袋の質問と回答を取得するAPI",
    version="1.0.0",
)

# Rate limit in seconds; taken from the RATE_LIMIT environment variable,
# falling back to 1.0 when it is not set.
RATE_LIMIT = float(os.environ.get("RATE_LIMIT", "1.0"))

# Authentication manager. API_AUTH_CONFIGURED stores the result of
# is_configured() once, at import time.
auth_manager = AuthManager()
API_AUTH_CONFIGURED = auth_manager.is_configured()
# Response models
class QuestionSummary(BaseModel):
    """Summary of a single question, as returned in a result list."""

    title: str
    url: str
    content_preview: str
    # Optional; defaults to an empty string when not supplied.
    category: Optional[str] = ""
    post_date: str
    # Declared as a string in this model.
    answer_count: str
    searched_at: str
class Answer(BaseModel):
    """A single answer to a question."""

    # Text of the answer.
    content: str
    # Flag marking the answer as the best answer.
    is_best: bool
class QuestionDetail(BaseModel):
    """Full detail of one question, including its answers."""

    url: str
    title: str
    content: str
    category: str
    post_date: str
    # Answers attached to the question.
    answers: List[Answer]
    # Declared as an integer in this model.
    answer_count: int
    scraped_at: str
class SearchResponse(BaseModel):
    """Response body for search-style requests."""

    # Keyword (or category label) the search was run with.
    keyword: str
    # Number of entries in ``results``.
    count: int
    results: List[QuestionSummary]
class ErrorResponse(BaseModel):
    """Error response body."""

    # Short error label.
    error: str
    # Longer description of the error.
    detail: str
# Authentication check
def verify_api_key(x_api_key: Optional[str] = Header(None)):
    """Reject the request unless it carries a valid API key.

    Args:
        x_api_key: API key read from the request headers via ``Header``;
            None when the header is absent.

    Returns:
        True when ``auth_manager.validate_api_key`` accepts the key.

    Raises:
        HTTPException: 401 when the key is not accepted.
    """
    key_accepted = auth_manager.validate_api_key(x_api_key)
    if key_accepted:
        return True
    raise HTTPException(status_code=401, detail="Invalid or missing API key")
# ========== FastAPI endpoints ==========
# NOTE(review): this handler was not registered on ``app`` under any path, so
# it could never be reached. The path below follows the ``/api/...`` prefix
# used by the endpoints listed in the UI footer — adjust if clients expect a
# different URL.
@app.get("/api/health")
async def health_check():
    """Report that the service is up.

    Returns:
        A dict with a constant ``"healthy"`` status, the current local time
        (naive ``datetime.now()``) in ISO 8601 format, and whether API
        authentication is configured.
    """
    return {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "api_auth": API_AUTH_CONFIGURED,
    }
@app.get("/api/search")
async def search_questions(
    keyword: str = Query(..., description="検索キーワード"),
    limit: int = Query(20, ge=1, le=20, description="取得件数"),
    authorized: bool = Depends(verify_api_key),
):
    """Search questions by keyword.

    Registered at ``/api/search``, the path advertised in the UI footer.

    Args:
        keyword: Search keyword.
        limit: Maximum number of results (1-20, enforced by ``Query``).
        authorized: Result of the ``verify_api_key`` dependency; present only
            so that the dependency runs.

    Returns:
        A ``SearchResponse`` holding the keyword, the number of results and
        the results themselves.

    Raises:
        HTTPException: 500 when scraping or building the response fails
            (401 is raised earlier by the auth dependency).
    """
    # Local import so this block does not rely on a new file-level import.
    from fastapi.concurrency import run_in_threadpool

    def _scrape():
        # Create, use and close the scraper inside one worker thread.
        scraper = YahooChiebukuroScraper(headless=True)
        try:
            return scraper.search_questions(keyword, max_results=limit)
        finally:
            scraper.close_driver()

    try:
        # The scraper API is synchronous; calling it directly inside an
        # ``async def`` handler would stall the event loop (and with it every
        # other request, including the mounted UI) until scraping finishes.
        results = await run_in_threadpool(_scrape)
        return SearchResponse(
            keyword=keyword,
            count=len(results),
            results=results,
        )
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception("Search failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
# NOTE(review): the handler was not registered on ``app``; the path below is
# chosen to match the ``/api/...`` prefix of the documented endpoints —
# adjust if clients expect a different URL.
@app.get("/api/question")
async def get_question_detail(
    url: str = Query(..., description="質問のURL"),
    authorized: bool = Depends(verify_api_key),
):
    """Fetch the detail of a single question.

    Args:
        url: URL of the question; must be an ``https`` URL whose host is
            ``chiebukuro.yahoo.co.jp`` or one of its subdomains.
        authorized: Result of the ``verify_api_key`` dependency; present only
            so that the dependency runs.

    Returns:
        Whatever ``YahooChiebukuroScraper.get_question_detail`` returns for
        the URL (passed through unchanged).

    Raises:
        HTTPException: 400 for a URL outside chiebukuro.yahoo.co.jp, 404 when
            the scraper returns nothing, 500 when scraping raises.
    """
    # Local imports so this block does not rely on new file-level imports.
    from urllib.parse import urlsplit

    from fastapi.concurrency import run_in_threadpool

    # The URL comes from the caller and is handed to a headless browser, so
    # the host is checked on the parsed URL rather than with a string prefix.
    try:
        parts = urlsplit(url)
        host = (parts.hostname or "").lower()
    except ValueError:
        parts, host = None, ""
    host_ok = host == "chiebukuro.yahoo.co.jp" or host.endswith(
        ".chiebukuro.yahoo.co.jp"
    )
    if parts is None or parts.scheme != "https" or not host_ok:
        raise HTTPException(
            status_code=400,
            detail="url must be an https URL on chiebukuro.yahoo.co.jp",
        )

    def _scrape():
        # Create, use and close the scraper inside one worker thread.
        scraper = YahooChiebukuroScraper(headless=True)
        try:
            return scraper.get_question_detail(url)
        finally:
            scraper.close_driver()

    try:
        # Run the synchronous scraper off the event loop.
        detail = await run_in_threadpool(_scrape)
    except Exception as e:
        logger.exception("Failed to get question detail: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e

    if not detail:
        raise HTTPException(
            status_code=404,
            detail="Question not found or failed to parse",
        )
    return detail
@app.get("/api/category")
async def get_category_questions(
    category: str = Query(..., description="カテゴリ名"),
    limit: int = Query(20, ge=1, le=20, description="取得件数"),
    authorized: bool = Depends(verify_api_key),
):
    """List questions of a category.

    Registered at ``/api/category``, the path advertised in the UI footer.

    Args:
        category: Category name.
        limit: Maximum number of results (1-20, enforced by ``Query``).
        authorized: Result of the ``verify_api_key`` dependency; present only
            so that the dependency runs.

    Returns:
        A ``SearchResponse`` whose ``keyword`` is ``"カテゴリ:<category>"``.

    Raises:
        HTTPException: 500 when scraping or building the response fails
            (401 is raised earlier by the auth dependency).
    """
    # Local import so this block does not rely on a new file-level import.
    from fastapi.concurrency import run_in_threadpool

    def _scrape():
        # Create, use and close the scraper inside one worker thread.
        scraper = YahooChiebukuroScraper(headless=True)
        try:
            return scraper.get_category_questions(category, max_results=limit)
        finally:
            scraper.close_driver()

    try:
        # The scraper API is synchronous; keep it off the event loop so other
        # requests are not stalled while it runs.
        results = await run_in_threadpool(_scrape)
        return SearchResponse(
            keyword=f"カテゴリ:{category}",
            count=len(results),
            results=results,
        )
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception("Category search failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
# ========== Gradio UI ==========
def check_ui_auth(api_key: str) -> bool:
    """Tell whether *api_key* is accepted for the Gradio UI.

    Delegates to ``auth_manager.validate_ui_key`` and returns its result.
    """
    is_valid = auth_manager.validate_ui_key(api_key)
    return is_valid
def _format_search_result(index: int, item: dict, with_details: bool) -> dict:
    """Turn one scraper result dict into the row shown in the UI's JSON output."""
    row = {
        "番号": index,
        "タイトル": item["title"],
        "投稿日": item["post_date"],
        "回答数": item["answer_count"],
        "URL": item["url"],
    }
    # Looked up with a default: a result that has "full_content" but no
    # "content_preview" must not raise KeyError.
    preview = item.get("content_preview", "")
    if with_details:
        row["質問内容"] = item.get("full_content", preview)
        row["ベストアンサー"] = item.get("best_answer", "なし")
    else:
        row["内容プレビュー"] = preview
    return row


def gradio_search(api_key: str, keyword: str, limit: int, get_details: bool = True):
    """Gradio UI handler: keyword search.

    Args:
        api_key: API key; checked only when authentication is configured.
        keyword: Search keyword. Surrounding whitespace is ignored.
        limit: Number of results to fetch; clamped to the range 1-20.
        get_details: When True use ``scraper.search_questions``; when False
            use ``scraper.search_questions_fast``.

    Returns:
        A ``(json_text, status_message)`` tuple. ``json_text`` is an empty
        string whenever the search was not performed, found nothing or failed.
        Errors are reported through ``status_message`` instead of being raised.
    """
    try:
        # Authentication check.
        if API_AUTH_CONFIGURED and not check_ui_auth(api_key):
            return "", "認証エラー: 有効なAPIキーを入力してください"

        keyword = (keyword or "").strip()
        if not keyword:
            return "", "エラー: キーワードを入力してください"

        # Clamp the requested count; the UI widget may deliver a float, and
        # nothing else guards against values below 1.
        limit = max(1, min(int(limit), 20))

        scraper = YahooChiebukuroScraper(headless=True)
        try:
            if get_details:
                results = scraper.search_questions(keyword, max_results=limit)
            else:
                # Fast variant that skips the detail fetch.
                results = scraper.search_questions_fast(keyword, max_results=limit)

            if not results:
                return "", f"検索結果が見つかりませんでした: '{keyword}'"

            rows = [
                _format_search_result(index, item, get_details)
                for index, item in enumerate(results, 1)
            ]
            # Uses the module-level ``json`` import.
            json_output = json.dumps(rows, ensure_ascii=False, indent=2)

            suffix = "(詳細情報付き)" if get_details else "(基本情報のみ)"
            status = f"✅ {len(results)}件の質問を取得しました{suffix}"
            return json_output, status
        finally:
            scraper.close_driver()
    except Exception as e:
        # Deliberate catch-all at the UI boundary: report, never crash the UI.
        logger.exception("Gradio search error: %s", e)
        return "", f"エラーが発生しました: {str(e)}"
def _is_chiebukuro_url(candidate: str) -> bool:
    """Return True when *candidate* is an https URL on chiebukuro.yahoo.co.jp.

    The host is compared after parsing, so lookalike hosts such as
    ``chiebukuro.yahoo.co.jp.evil.example`` are rejected. Subdomains are
    accepted — NOTE(review): confirm which hosts the scraper supports.
    """
    from urllib.parse import urlsplit

    try:
        parts = urlsplit(candidate)
        host = (parts.hostname or "").lower()
    except ValueError:
        return False
    if parts.scheme != "https":
        return False
    return host == "chiebukuro.yahoo.co.jp" or host.endswith(
        ".chiebukuro.yahoo.co.jp"
    )


def gradio_get_detail(api_key: str, question_url: str):
    """Gradio UI handler: fetch the detail of one question.

    Args:
        api_key: API key; checked only when authentication is configured.
        question_url: URL of the question. Surrounding whitespace is ignored.

    Returns:
        A ``(question_markdown, best_answer_markdown, other_answers_markdown,
        status_message)`` tuple. The first three are empty strings whenever
        the detail could not be produced; errors are reported through
        ``status_message`` instead of being raised.
    """
    try:
        # Authentication check.
        if API_AUTH_CONFIGURED and not check_ui_auth(api_key):
            return "", "", "", "認証エラー: 有効なAPIキーを入力してください"

        question_url = (question_url or "").strip()
        if not question_url:
            return "", "", "", "エラー: URLを入力してください"
        if not _is_chiebukuro_url(question_url):
            return "", "", "", "エラー: Yahoo知恵袋のURLを入力してください"

        # Scrape, and release the browser before doing any formatting.
        scraper = YahooChiebukuroScraper(headless=True)
        try:
            detail = scraper.get_question_detail(question_url)
        finally:
            scraper.close_driver()

        if not detail:
            return "", "", "", "質問の取得に失敗しました"

        # Blank lines between the parts: in Markdown a text line directly
        # followed by "---" is a setext heading, which would turn the
        # metadata lines into a heading instead of drawing a rule.
        question_info = "\n\n".join(
            (
                f"## {detail['title']}",
                f"**カテゴリ:** {detail['category']}",
                f"**投稿日:** {detail['post_date']}",
                f"**回答数:** {detail['answer_count']}",
                "---",
                "### 質問内容",
                f"{detail['content']}",
            )
        )

        # Separate the best answer from the rest; if several answers are
        # flagged, the last one wins.
        best_answer = ""
        other_answers = []
        for answer in detail['answers']:
            if answer['is_best']:
                best_answer = f"### ✨ ベストアンサー\n{answer['content']}"
            else:
                other_answers.append(answer['content'])

        other_answers_text = ""
        if other_answers:
            other_answers_text = "### その他の回答\n\n" + "".join(
                f"**回答 {number}:**\n{text}\n\n---\n\n"
                for number, text in enumerate(other_answers, 1)
            )

        status = f"✅ 質問詳細を取得しました(回答{detail['answer_count']}件)"
        return question_info, best_answer, other_answers_text, status
    except Exception as e:
        # Deliberate catch-all at the UI boundary: report, never crash the UI.
        logger.exception("Gradio detail error: %s", e)
        return "", "", "", f"エラーが発生しました: {str(e)}"
def gradio_search_category(api_key: str, category: str, limit: int, get_details: bool = True):
    """Gradio UI handler: category search.

    The category search is implemented on top of the keyword search, using
    the keyword ``"カテゴリ:<category>"``.

    Args:
        api_key: API key; checked only when authentication is configured.
        category: Category name. Surrounding whitespace is ignored.
        limit: Number of results to fetch.
        get_details: Whether detailed information should be fetched.

    Returns:
        The ``(json_text, status_message)`` tuple produced by
        ``gradio_search``, or an empty result with an error message when no
        category was given.
    """
    # Validate here: once the "カテゴリ:" prefix is added the keyword is never
    # empty, so the empty-input check in gradio_search cannot catch this case.
    category = (category or "").strip()
    if not category:
        return "", "エラー: カテゴリ名を入力してください"

    return gradio_search(api_key, f"カテゴリ:{category}", limit, get_details)
# Build the Gradio interface.
with gr.Blocks(title="Yahoo知恵袋 Scraper") as demo:
    gr.Markdown("""
    # Yahoo!知恵袋 スクレイピングツール
    Yahoo!知恵袋から質問と回答を取得します。
    **機能:**
    - キーワード検索(質問詳細と回答も取得)
    - カテゴリ検索
    - 最大20件まで取得可能
    **注意事項:**
    - 利用規約を遵守してください
    - サーバー負荷軽減のため、各質問の詳細取得には2秒の間隔を設けています
    - 個人利用の範囲でご使用ください
    """)

    # API key input: a visible password box when authentication is
    # configured, otherwise a hidden empty textbox so the handlers still
    # receive an api_key argument.
    api_key_input = (
        gr.Textbox(
            label="API Key",
            placeholder="APIキーを入力してください",
            type="password",
            interactive=True,
        )
        if API_AUTH_CONFIGURED
        else gr.Textbox(value="", visible=False)
    )

    with gr.Tabs():
        # Keyword search tab.
        with gr.TabItem("🔍 キーワード検索"):
            gr.Markdown("""
            ### 使い方
            1. 検索キーワードを入力
            2. 取得件数を選択(デフォルト: 20件、最大: 20件)
            3. 詳細取得オプションを選択(デフォルトON - 並列処理で高速化)
            4. 検索ボタンをクリック
            """)

            with gr.Row():
                with gr.Column(scale=2):
                    search_keyword = gr.Textbox(
                        label="検索キーワード",
                        placeholder="例: Python プログラミング",
                        interactive=True,
                    )
                with gr.Column(scale=1):
                    search_limit = gr.Slider(
                        label="取得件数",
                        minimum=1,
                        maximum=20,
                        value=20,
                        step=1,
                    )
                with gr.Column(scale=1):
                    get_details = gr.Checkbox(
                        label="詳細情報を取得",
                        value=True,
                        info="質問本文とベストアンサーを取得(並列処理で高速化)",
                    )

            search_button = gr.Button("検索", variant="primary")
            search_status = gr.Textbox(label="ステータス", interactive=False)
            search_results = gr.Textbox(
                label="検索結果(JSON形式)",
                lines=20,
                max_lines=30,
                interactive=False,
            )

            # gradio_search returns (json_text, status), in that order.
            search_button.click(
                fn=gradio_search,
                inputs=[api_key_input, search_keyword, search_limit, get_details],
                outputs=[search_results, search_status],
            )

        # Category search tab.
        with gr.TabItem("📁 カテゴリ検索"):
            with gr.Row():
                with gr.Column(scale=2):
                    category_input = gr.Textbox(
                        label="カテゴリ名",
                        placeholder="例: 健康、美容とファッション",
                        interactive=True,
                    )
                with gr.Column(scale=1):
                    category_limit = gr.Slider(
                        label="取得件数",
                        minimum=1,
                        maximum=20,
                        value=20,
                        step=1,
                    )
                with gr.Column(scale=1):
                    category_get_details = gr.Checkbox(
                        label="詳細情報を取得",
                        value=True,
                        info="質問本文とベストアンサーを取得(並列処理で高速化)",
                    )

            category_button = gr.Button("カテゴリで検索", variant="primary")
            category_status = gr.Textbox(label="ステータス", interactive=False)
            category_results = gr.Textbox(
                label="カテゴリ検索結果(JSON形式)",
                lines=20,
                max_lines=30,
                interactive=False,
            )

            # gradio_search_category returns (json_text, status), in that order.
            category_button.click(
                fn=gradio_search_category,
                inputs=[api_key_input, category_input, category_limit, category_get_details],
                outputs=[category_results, category_status],
            )

    gr.Markdown("""
    ---
    ### API エンドポイント
    - `/api/search` - キーワード検索
    - `/api/category` - カテゴリ検索
    詳細は [README](/) をご覧ください。
    """)
# Integrate FastAPI and Gradio: mount the Gradio UI at the root path and
# rebind ``app`` to the value returned by mount_gradio_app.
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    # Imported here so uvicorn is only needed when run as a script.
    import uvicorn

    # Listen on all interfaces, port 7860.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )