Spaces:
Runtime error
Runtime error
Migrated from another account
Browse files- .gitignore +171 -0
- README.md +323 -6
- app.py +510 -0
- auth_manager.py +77 -0
- packages.txt +2 -0
- requirements.txt +18 -0
- scraper.py +707 -0
.gitignore
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
*.manifest
|
| 31 |
+
*.spec
|
| 32 |
+
|
| 33 |
+
# Installer logs
|
| 34 |
+
pip-log.txt
|
| 35 |
+
pip-delete-this-directory.txt
|
| 36 |
+
|
| 37 |
+
# Unit test / coverage reports
|
| 38 |
+
htmlcov/
|
| 39 |
+
.tox/
|
| 40 |
+
.nox/
|
| 41 |
+
.coverage
|
| 42 |
+
.coverage.*
|
| 43 |
+
.cache
|
| 44 |
+
nosetests.xml
|
| 45 |
+
coverage.xml
|
| 46 |
+
*.cover
|
| 47 |
+
*.py,cover
|
| 48 |
+
.hypothesis/
|
| 49 |
+
.pytest_cache/
|
| 50 |
+
cover/
|
| 51 |
+
|
| 52 |
+
# Translations
|
| 53 |
+
*.mo
|
| 54 |
+
*.pot
|
| 55 |
+
|
| 56 |
+
# Django stuff:
|
| 57 |
+
*.log
|
| 58 |
+
local_settings.py
|
| 59 |
+
db.sqlite3
|
| 60 |
+
db.sqlite3-journal
|
| 61 |
+
|
| 62 |
+
# Flask stuff:
|
| 63 |
+
instance/
|
| 64 |
+
.webassets-cache
|
| 65 |
+
|
| 66 |
+
# Scrapy stuff:
|
| 67 |
+
.scrapy
|
| 68 |
+
|
| 69 |
+
# Sphinx documentation
|
| 70 |
+
docs/_build/
|
| 71 |
+
|
| 72 |
+
# PyBuilder
|
| 73 |
+
.pybuilder/
|
| 74 |
+
target/
|
| 75 |
+
|
| 76 |
+
# Jupyter Notebook
|
| 77 |
+
.ipynb_checkpoints
|
| 78 |
+
|
| 79 |
+
# IPython
|
| 80 |
+
profile_default/
|
| 81 |
+
ipython_config.py
|
| 82 |
+
|
| 83 |
+
# pyenv
|
| 84 |
+
.python-version
|
| 85 |
+
|
| 86 |
+
# pipenv
|
| 87 |
+
Pipfile.lock
|
| 88 |
+
|
| 89 |
+
# poetry
|
| 90 |
+
poetry.lock
|
| 91 |
+
|
| 92 |
+
# pdm
|
| 93 |
+
.pdm.toml
|
| 94 |
+
|
| 95 |
+
# PEP 582
|
| 96 |
+
__pypackages__/
|
| 97 |
+
|
| 98 |
+
# Celery stuff
|
| 99 |
+
celerybeat-schedule
|
| 100 |
+
celerybeat.pid
|
| 101 |
+
|
| 102 |
+
# SageMath parsed files
|
| 103 |
+
*.sage.py
|
| 104 |
+
|
| 105 |
+
# Environments
|
| 106 |
+
.env
|
| 107 |
+
.venv
|
| 108 |
+
env/
|
| 109 |
+
venv/
|
| 110 |
+
ENV/
|
| 111 |
+
env.bak/
|
| 112 |
+
venv.bak/
|
| 113 |
+
|
| 114 |
+
# Spyder project settings
|
| 115 |
+
.spyderproject
|
| 116 |
+
.spyproject
|
| 117 |
+
|
| 118 |
+
# Rope project settings
|
| 119 |
+
.ropeproject
|
| 120 |
+
|
| 121 |
+
# mkdocs documentation
|
| 122 |
+
/site
|
| 123 |
+
|
| 124 |
+
# mypy
|
| 125 |
+
.mypy_cache/
|
| 126 |
+
.dmypy.json
|
| 127 |
+
dmypy.json
|
| 128 |
+
|
| 129 |
+
# Pyre type checker
|
| 130 |
+
.pyre/
|
| 131 |
+
|
| 132 |
+
# pytype static type analyzer
|
| 133 |
+
.pytype/
|
| 134 |
+
|
| 135 |
+
# Cython debug symbols
|
| 136 |
+
cython_debug/
|
| 137 |
+
|
| 138 |
+
# VS Code
|
| 139 |
+
.vscode/
|
| 140 |
+
|
| 141 |
+
# PyCharm
|
| 142 |
+
.idea/
|
| 143 |
+
|
| 144 |
+
# macOS
|
| 145 |
+
.DS_Store
|
| 146 |
+
|
| 147 |
+
# Windows
|
| 148 |
+
Thumbs.db
|
| 149 |
+
ehthumbs.db
|
| 150 |
+
|
| 151 |
+
# Selenium WebDriver
|
| 152 |
+
chromedriver
|
| 153 |
+
geckodriver
|
| 154 |
+
*.log
|
| 155 |
+
|
| 156 |
+
# Temporary files
|
| 157 |
+
*.tmp
|
| 158 |
+
*.bak
|
| 159 |
+
*.swp
|
| 160 |
+
*~
|
| 161 |
+
|
| 162 |
+
# Data files
|
| 163 |
+
*.csv
|
| 164 |
+
*.json
|
| 165 |
+
*.xlsx
|
| 166 |
+
*.xls
|
| 167 |
+
|
| 168 |
+
# Scraped data
|
| 169 |
+
scraped_data/
|
| 170 |
+
downloads/
|
| 171 |
+
output/
|
README.md
CHANGED
|
@@ -1,12 +1,329 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Yahoo Chiebukuro Scraper
|
| 3 |
+
emoji: 🔍
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.19.2
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# Yahoo!知恵袋 スクレイピングAPI
|
| 14 |
+
|
| 15 |
+
Yahoo!知恵袋から質問と回答を高速で取得するWeb APIアプリケーションです。
|
| 16 |
+
|
| 17 |
+
## 🚀 主な特徴
|
| 18 |
+
|
| 19 |
+
- **並列処理対応**: 最大5件同時処理で高速化(デフォルト有効)
|
| 20 |
+
- **キーワード検索**: 指定したキーワードで質問を検索
|
| 21 |
+
- **質問詳細取得**: 特定の質問URLから詳細情報と回答を取得
|
| 22 |
+
- **カテゴリ検索**: カテゴリ別に質問を取得
|
| 23 |
+
- **REST API**: プログラムから利用可能なAPIエンドポイント
|
| 24 |
+
- **Gradio UI**: 使いやすいWebインターフェース
|
| 25 |
+
- **デフォルト設定**: 20件取得・詳細情報ONがデフォルト
|
| 26 |
+
|
| 27 |
+
## 🔧 環境変数
|
| 28 |
+
|
| 29 |
+
Spaceのシークレット設定で以下の環境変数を設定できます:
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
# API認証キー(オプション、設定しない場合は認証なし)
|
| 33 |
+
API_KEY=your_api_key_here
|
| 34 |
+
|
| 35 |
+
# レート制限(秒、デフォルト: 1.0)
|
| 36 |
+
RATE_LIMIT=1.0
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## 📡 APIリファレンス
|
| 40 |
+
|
| 41 |
+
### 1. ヘルスチェック
|
| 42 |
+
|
| 43 |
+
システムの状態を確認します。
|
| 44 |
+
|
| 45 |
+
**エンドポイント:**
|
| 46 |
+
```
|
| 47 |
+
GET /health
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
**レスポンス例:**
|
| 51 |
+
```json
|
| 52 |
+
{
|
| 53 |
+
"status": "healthy",
|
| 54 |
+
"timestamp": "2025-01-20T12:00:00.000000",
|
| 55 |
+
"api_auth": true
|
| 56 |
+
}
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
**cURLコマンド例:**
|
| 60 |
+
```bash
|
| 61 |
+
curl -X GET "https://your-space.hf.space/health"
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
---
|
| 65 |
+
|
| 66 |
+
### 2. キーワード検索
|
| 67 |
+
|
| 68 |
+
指定したキーワードで質問を検索し、詳細情報を取得します。
|
| 69 |
+
|
| 70 |
+
**エンドポイント:**
|
| 71 |
+
```
|
| 72 |
+
GET /api/search
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
**パラメータ:**
|
| 76 |
+
| パラメータ | 型 | デフォルト | 説明 |
|
| 77 |
+
|------------|-----|------------|------|
|
| 78 |
+
| keyword | string | 必須 | 検索キーワード |
|
| 79 |
+
| limit | integer | 20 | 取得件数(1-20) |
|
| 80 |
+
|
| 81 |
+
**ヘッダー(認証有効時):**
|
| 82 |
+
| ヘッダー | 説明 |
|
| 83 |
+
|----------|------|
|
| 84 |
+
| X-API-Key | APIキー |
|
| 85 |
+
|
| 86 |
+
**レスポンス例:**
|
| 87 |
+
```json
|
| 88 |
+
{
|
| 89 |
+
"keyword": "Python プログラミング",
|
| 90 |
+
"count": 2,
|
| 91 |
+
"results": [
|
| 92 |
+
{
|
| 93 |
+
"title": "Pythonでリストの重複を削除する方法",
|
| 94 |
+
"url": "https://chiebukuro.yahoo.co.jp/question/detail/q12345678",
|
| 95 |
+
"content_preview": "Pythonでリストから重複した要素を削除する...",
|
| 96 |
+
"category": "プログラミング",
|
| 97 |
+
"post_date": "2025/01/20",
|
| 98 |
+
"answer_count": "3",
|
| 99 |
+
"searched_at": "2025-01-20T12:00:00.000000"
|
| 100 |
+
},
|
| 101 |
+
{
|
| 102 |
+
"title": "Pythonの環境構築について",
|
| 103 |
+
"url": "https://chiebukuro.yahoo.co.jp/question/detail/q87654321",
|
| 104 |
+
"content_preview": "Python初心者です。環境構築の方法を...",
|
| 105 |
+
"category": "プログラミング",
|
| 106 |
+
"post_date": "2025/01/19",
|
| 107 |
+
"answer_count": "5",
|
| 108 |
+
"searched_at": "2025-01-20T12:00:01.000000"
|
| 109 |
+
}
|
| 110 |
+
]
|
| 111 |
+
}
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
**cURLコマンド例(認証なし):**
|
| 115 |
+
```bash
|
| 116 |
+
curl -X GET "https://your-space.hf.space/api/search?keyword=Python&limit=5"
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
**cURLコマンド例(認証あり):**
|
| 120 |
+
```bash
|
| 121 |
+
curl -X GET "https://your-space.hf.space/api/search?keyword=Python&limit=5" \
|
| 122 |
+
-H "X-API-Key: your_api_key_here"
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
**Pythonコード例:**
|
| 126 |
+
```python
|
| 127 |
+
import requests
|
| 128 |
+
|
| 129 |
+
# 認証なしの場合
|
| 130 |
+
response = requests.get(
|
| 131 |
+
"https://your-space.hf.space/api/search",
|
| 132 |
+
params={"keyword": "Python", "limit": 5}
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
# 認証ありの場合
|
| 136 |
+
response = requests.get(
|
| 137 |
+
"https://your-space.hf.space/api/search",
|
| 138 |
+
params={"keyword": "Python", "limit": 5},
|
| 139 |
+
headers={"X-API-Key": "your_api_key_here"}
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
data = response.json()
|
| 143 |
+
print(f"Found {data['count']} questions")
|
| 144 |
+
```
|
| 145 |
+
|
| 146 |
+
**JavaScriptコード例:**
|
| 147 |
+
```javascript
|
| 148 |
+
// 認証なしの場合
|
| 149 |
+
fetch('https://your-space.hf.space/api/search?keyword=Python&limit=5')
|
| 150 |
+
.then(response => response.json())
|
| 151 |
+
.then(data => console.log(data));
|
| 152 |
+
|
| 153 |
+
// 認証ありの場合
|
| 154 |
+
fetch('https://your-space.hf.space/api/search?keyword=Python&limit=5', {
|
| 155 |
+
headers: {
|
| 156 |
+
'X-API-Key': 'your_api_key_here'
|
| 157 |
+
}
|
| 158 |
+
})
|
| 159 |
+
.then(response => response.json())
|
| 160 |
+
.then(data => console.log(data));
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
---
|
| 164 |
+
|
| 165 |
+
### 3. 質問詳細取得
|
| 166 |
+
|
| 167 |
+
特定の質問URLから詳細情報と回答一覧(ベストアンサーを含む)を取得します。
|
| 168 |
+
|
| 169 |
+
**エンドポイント:**
|
| 170 |
+
```
|
| 171 |
+
GET /api/question
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
**パラメータ:**
|
| 175 |
+
| パラメータ | 型 | デフォルト | 説明 |
|
| 176 |
+
|------------|-----|------------|------|
|
| 177 |
+
| url | string | 必須 | Yahoo知恵袋の質問URL |
|
| 178 |
+
|
| 179 |
+
**レスポンス例:**
|
| 180 |
+
```json
|
| 181 |
+
{
|
| 182 |
+
"url": "https://chiebukuro.yahoo.co.jp/question/detail/q12345678",
|
| 183 |
+
"title": "Pythonでリストの重複を削除する方法",
|
| 184 |
+
"content": "Pythonでリストから重複した要素を削除する最も効率的な方法を教えてください。",
|
| 185 |
+
"category": "プログラミング",
|
| 186 |
+
"post_date": "2025/01/20",
|
| 187 |
+
"answers": [
|
| 188 |
+
{
|
| 189 |
+
"content": "set()を使う方法が最も簡単です。\nmy_list = [1, 2, 2, 3, 3, 4]\nunique_list = list(set(my_list))",
|
| 190 |
+
"is_best": true
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"content": "順序を保持したい場合は、dict.fromkeys()を使います。",
|
| 194 |
+
"is_best": false
|
| 195 |
+
}
|
| 196 |
+
],
|
| 197 |
+
"answer_count": 2,
|
| 198 |
+
"scraped_at": "2025-01-20T12:00:00.000000"
|
| 199 |
+
}
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
**cURLコマンド例:**
|
| 203 |
+
```bash
|
| 204 |
+
# URLパラメータはエンコードが必要
|
| 205 |
+
curl -X GET "https://your-space.hf.space/api/question?url=https%3A%2F%2Fchiebukuro.yahoo.co.jp%2Fquestion%2Fdetail%2Fq12345678" \
|
| 206 |
+
-H "X-API-Key: your_api_key_here"
|
| 207 |
+
```
|
| 208 |
+
|
| 209 |
+
**Pythonコード例:**
|
| 210 |
+
```python
|
| 211 |
+
import requests
|
| 212 |
+
from urllib.parse import quote
|
| 213 |
+
|
| 214 |
+
question_url = "https://chiebukuro.yahoo.co.jp/question/detail/q12345678"
|
| 215 |
+
|
| 216 |
+
response = requests.get(
|
| 217 |
+
"https://your-space.hf.space/api/question",
|
| 218 |
+
params={"url": question_url},
|
| 219 |
+
headers={"X-API-Key": "your_api_key_here"}
|
| 220 |
+
)
|
| 221 |
+
|
| 222 |
+
detail = response.json()
|
| 223 |
+
print(f"Title: {detail['title']}")
|
| 224 |
+
best = next((a for a in detail['answers'] if a['is_best']), None)
print(f"Best Answer: {best['content'] if best else 'なし'}")
|
| 225 |
+
```
|
| 226 |
+
|
| 227 |
+
---
|
| 228 |
+
|
| 229 |
+
### 4. カテゴリ検索
|
| 230 |
+
|
| 231 |
+
特定カテゴリの質問を取得します。
|
| 232 |
+
|
| 233 |
+
**エンドポイント:**
|
| 234 |
+
```
|
| 235 |
+
GET /api/category
|
| 236 |
+
```
|
| 237 |
+
|
| 238 |
+
**パラメータ:**
|
| 239 |
+
| パラメータ | 型 | デフォルト | 説明 |
|
| 240 |
+
|------------|-----|------------|------|
|
| 241 |
+
| category | string | 必須 | カテゴリ名 |
|
| 242 |
+
| limit | integer | 20 | 取得件数(1-20) |
|
| 243 |
+
|
| 244 |
+
**レスポンス例:**
|
| 245 |
+
```json
|
| 246 |
+
{
|
| 247 |
+
"keyword": "カテゴリ:健康",
|
| 248 |
+
"count": 3,
|
| 249 |
+
"results": [
|
| 250 |
+
{
|
| 251 |
+
"title": "睡眠時間について",
|
| 252 |
+
"url": "https://chiebukuro.yahoo.co.jp/question/detail/q11111111",
|
| 253 |
+
"content_preview": "理想的な睡眠時間は何時間でしょうか...",
|
| 254 |
+
"category": "健康、美容とファッション",
|
| 255 |
+
"post_date": "2025/01/20",
|
| 256 |
+
"answer_count": "4",
|
| 257 |
+
"searched_at": "2025-01-20T12:00:00.000000"
|
| 258 |
+
}
|
| 259 |
+
]
|
| 260 |
+
}
|
| 261 |
+
```
|
| 262 |
+
|
| 263 |
+
**cURLコマンド例:**
|
| 264 |
+
```bash
|
| 265 |
+
curl -X GET "https://your-space.hf.space/api/category?category=健康&limit=10" \
|
| 266 |
+
-H "X-API-Key: your_api_key_here"
|
| 267 |
+
```
|
| 268 |
+
|
| 269 |
+
---
|
| 270 |
+
|
| 271 |
+
## 🚄 並列処理について
|
| 272 |
+
|
| 273 |
+
本APIは最大5件の質問詳細を同時に取得する並列処理機能を実装しています:
|
| 274 |
+
|
| 275 |
+
- **処理速度**: 従来の逐次処理と比較して最大5倍高速
|
| 276 |
+
- **負荷分散**: 同時実行数を制限してサーバー負荷を適切に管理
|
| 277 |
+
- **タイムアウト**: 各質問の詳細取得には30秒のタイムアウトを設定
|
| 278 |
+
- **エラー処理**: 個別のエラーハンドリングで安定性を確保
|
| 279 |
+
|
| 280 |
+
### 処理時間の目安
|
| 281 |
+
|
| 282 |
+
| 取得件数 | 逐次処理(従来) | 並列処理(新) |
|
| 283 |
+
|----------|------------------|----------------|
|
| 284 |
+
| 5件 | 約15秒 | 約3-5秒 |
|
| 285 |
+
| 10件 | 約30秒 | 約6-10秒 |
|
| 286 |
+
| 20件 | 約60秒 | 約12-20秒 |
|
| 287 |
+
|
| 288 |
+
## 📊 エラーレスポンス
|
| 289 |
+
|
| 290 |
+
エラー時は以下の形式でレスポンスが返されます:
|
| 291 |
+
|
| 292 |
+
```json
|
| 293 |
+
{
|
| 294 |
+
"error": "エラータイプ",
|
| 295 |
+
"detail": "エラーの詳細説明"
|
| 296 |
+
}
|
| 297 |
+
```
|
| 298 |
+
|
| 299 |
+
### HTTPステータスコード
|
| 300 |
+
|
| 301 |
+
| コード | 説明 |
|
| 302 |
+
|--------|------|
|
| 303 |
+
| 200 | 成功 |
|
| 304 |
+
| 400 | リクエストパラメータエラー |
|
| 305 |
+
| 401 | 認証エラー(APIキーが無効) |
|
| 306 |
+
| 404 | 質問が見つからない |
|
| 307 |
+
| 500 | サーバー内部エラー |
|
| 308 |
+
|
| 309 |
+
## ⚠️ 注意事項
|
| 310 |
+
|
| 311 |
+
- Yahoo!知恵袋の利用規約を遵守してください
|
| 312 |
+
- 過度なアクセスは避けてください(並列処理でも適切な間隔を保っています)
|
| 313 |
+
- 個人利用の範囲でご使用ください
|
| 314 |
+
- スクレイピングが許可されているか確認が必要です
|
| 315 |
+
- CSSセレクタは定期的に変更される可能性があります
|
| 316 |
+
|
| 317 |
+
## 🔄 レート制限
|
| 318 |
+
|
| 319 |
+
- デフォルト: 1秒間隔
|
| 320 |
+
- 並列処理時も各リクエストに適切な遅延を設定
|
| 321 |
+
- 環境変数`RATE_LIMIT`で調整可能
|
| 322 |
+
|
| 323 |
+
## 📝 License
|
| 324 |
+
|
| 325 |
+
MIT License - 教育目的で作成されています。
|
| 326 |
+
|
| 327 |
+
## 🤝 Contributing
|
| 328 |
+
|
| 329 |
+
不具合報告や機能要望は、GitHubのIssuesにてお願いします。
|
app.py
ADDED
|
@@ -0,0 +1,510 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Yahoo知恵袋スクレイピングAPI
|
| 3 |
+
Gradio UI + FastAPIによるWebアプリケーション
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import logging
|
| 9 |
+
import json
|
| 10 |
+
from typing import Dict, List, Optional
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
import pandas as pd
|
| 13 |
+
from dotenv import load_dotenv
|
| 14 |
+
|
| 15 |
+
import gradio as gr
|
| 16 |
+
from fastapi import FastAPI, HTTPException, Query, Depends, Header
|
| 17 |
+
from fastapi.responses import JSONResponse
|
| 18 |
+
from pydantic import BaseModel
|
| 19 |
+
|
| 20 |
+
from scraper import YahooChiebukuroScraper
|
| 21 |
+
from auth_manager import AuthManager
|
| 22 |
+
|
| 23 |
+
# Pull settings from a local .env file (if any) into the environment.
load_dotenv()

# Logging: INFO level for the root logger, plus this module's own logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application that hosts the REST endpoints defined below.
app = FastAPI(
    title="Yahoo知恵袋 Scraping API",
    description="Yahoo知恵袋の質問と回答を取得するAPI",
    version="1.0.0",
)

# Rate limit in seconds, taken from the RATE_LIMIT variable (default "1.0").
RATE_LIMIT = float(os.environ.get("RATE_LIMIT", "1.0"))

# Authentication manager; the flag caches whether it reports being configured.
auth_manager = AuthManager()
API_AUTH_CONFIGURED = auth_manager.is_configured()
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# レスポンスモデル
|
| 46 |
+
class QuestionSummary(BaseModel):
    """One question entry as returned inside a search response."""

    # Question title and the URL of its page.
    title: str
    url: str
    # Preview text of the question body.
    content_preview: str
    # Category label; optional and defaults to an empty string.
    category: Optional[str] = ""
    # Posting date and answer count, both carried as strings.
    post_date: str
    answer_count: str
    # Timestamp string recorded for the search.
    searched_at: str
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class Answer(BaseModel):
    """A single answer attached to a question."""

    # Answer text.
    content: str
    # Whether this answer is flagged as the best answer.
    is_best: bool
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class QuestionDetail(BaseModel):
    """Full detail of one question, including its answers."""

    # URL of the question page and the question title.
    url: str
    title: str
    # Question body text.
    content: str
    # Category label and posting date, both as strings.
    category: str
    post_date: str
    # Answers attached to the question, and their count as an integer.
    answers: List[Answer]
    answer_count: int
    # Timestamp string recorded for the scrape.
    scraped_at: str
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class SearchResponse(BaseModel):
    """Response body shared by the keyword and category search endpoints."""

    # Keyword (or category label) the search was run for.
    keyword: str
    # Number of entries in ``results``.
    count: int
    # Matching question summaries.
    results: List[QuestionSummary]
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class ErrorResponse(BaseModel):
    """Error payload: an error type plus a detailed description."""

    # Error type.
    error: str
    # Human-readable explanation of the error.
    detail: str
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# 認証チェック
|
| 89 |
+
def verify_api_key(x_api_key: Optional[str] = Header(None)):
    """FastAPI dependency that validates the ``X-API-Key`` request header.

    Args:
        x_api_key: Value of the ``X-API-Key`` header, or ``None`` when the
            header is absent.

    Returns:
        ``True`` when ``auth_manager.validate_api_key`` accepts the key.

    Raises:
        HTTPException: 401 when the key is rejected.
    """
    key_accepted = auth_manager.validate_api_key(x_api_key)
    if key_accepted:
        return True

    raise HTTPException(status_code=401, detail="Invalid or missing API key")
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# ========== FastAPI エンドポイント ==========
|
| 101 |
+
|
| 102 |
+
@app.get("/health")
async def health_check():
    """Health check: report service status, current time and auth mode.

    Returns:
        A dict with ``status`` (always ``"healthy"``), ``timestamp`` (local
        time in ISO format) and ``api_auth`` (whether API authentication is
        configured).
    """
    current_time = datetime.now().isoformat()
    payload = {
        "status": "healthy",
        "timestamp": current_time,
        "api_auth": API_AUTH_CONFIGURED,
    }
    return payload
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
@app.get("/api/search", response_model=SearchResponse)
async def search_questions(
    keyword: str = Query(..., description="検索キーワード"),
    limit: int = Query(20, ge=1, le=20, description="取得件数"),
    authorized: bool = Depends(verify_api_key)
):
    """Search questions by keyword.

    The scraper call is synchronous, so it is executed in a worker thread to
    keep the event loop free for other requests while the scrape runs.

    Args:
        keyword: Search keyword.
        limit: Maximum number of results (1-20, enforced by ``Query``).
        authorized: Result of the ``verify_api_key`` dependency.

    Returns:
        A ``SearchResponse`` holding the keyword, the result count and the
        results returned by the scraper.

    Raises:
        HTTPException: 500 when the scraper or response construction fails.
    """
    # Local import: keeps this block self-contained (fastapi is already a
    # dependency of this module).
    from fastapi.concurrency import run_in_threadpool

    def _scrape():
        # Create, use and close the scraper inside the same worker thread.
        scraper = YahooChiebukuroScraper(headless=True)
        try:
            return scraper.search_questions(keyword, max_results=limit)
        finally:
            scraper.close_driver()

    try:
        results = await run_in_threadpool(_scrape)
        return SearchResponse(
            keyword=keyword,
            count=len(results),
            results=results
        )
    except Exception as e:
        logger.error(f"Search failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
@app.get("/api/question", response_model=QuestionDetail)
async def get_question_detail(
    url: str = Query(..., description="質問のURL"),
    authorized: bool = Depends(verify_api_key)
):
    """Fetch the detail of a single question.

    The URL is validated before it is handed to the scraper so that the
    service cannot be used to fetch arbitrary hosts. The synchronous scraper
    call runs in a worker thread to keep the event loop free.

    Args:
        url: Yahoo! Chiebukuro question URL (``https://chiebukuro.yahoo.co.jp/...``).
        authorized: Result of the ``verify_api_key`` dependency.

    Returns:
        The detail object returned by the scraper, validated against
        ``QuestionDetail`` by FastAPI.

    Raises:
        HTTPException: 400 when ``url`` is not an https Chiebukuro URL,
            404 when the scraper returns nothing, 500 on any other failure.
    """
    # Local imports keep this block self-contained.
    from urllib.parse import urlparse
    from fastapi.concurrency import run_in_threadpool

    # Compare the parsed scheme/host rather than a string prefix; a prefix
    # test accepts hosts such as "chiebukuro.yahoo.co.jp.example.com".
    try:
        parsed = urlparse(url)
        url_ok = (
            parsed.scheme == "https"
            and parsed.hostname == "chiebukuro.yahoo.co.jp"
        )
    except ValueError:
        url_ok = False
    if not url_ok:
        raise HTTPException(
            status_code=400,
            detail="url must be an https://chiebukuro.yahoo.co.jp question URL"
        )

    def _scrape():
        # Create, use and close the scraper inside the same worker thread.
        scraper = YahooChiebukuroScraper(headless=True)
        try:
            return scraper.get_question_detail(url)
        finally:
            scraper.close_driver()

    try:
        detail = await run_in_threadpool(_scrape)
    except Exception as e:
        logger.error(f"Failed to get question detail: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    if not detail:
        raise HTTPException(
            status_code=404,
            detail="Question not found or failed to parse"
        )

    return detail
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
@app.get("/api/category", response_model=SearchResponse)
async def get_category_questions(
    category: str = Query(..., description="カテゴリ名"),
    limit: int = Query(20, ge=1, le=20, description="取得件数"),
    authorized: bool = Depends(verify_api_key)
):
    """Fetch questions for a category.

    The scraper call is synchronous, so it is executed in a worker thread to
    keep the event loop free for other requests while the scrape runs.

    Args:
        category: Category name.
        limit: Maximum number of results (1-20, enforced by ``Query``).
        authorized: Result of the ``verify_api_key`` dependency.

    Returns:
        A ``SearchResponse`` whose ``keyword`` is ``"カテゴリ:<category>"``,
        with the result count and the results returned by the scraper.

    Raises:
        HTTPException: 500 when the scraper or response construction fails.
    """
    # Local import: keeps this block self-contained (fastapi is already a
    # dependency of this module).
    from fastapi.concurrency import run_in_threadpool

    def _scrape():
        # Create, use and close the scraper inside the same worker thread.
        scraper = YahooChiebukuroScraper(headless=True)
        try:
            return scraper.get_category_questions(category, max_results=limit)
        finally:
            scraper.close_driver()

    try:
        results = await run_in_threadpool(_scrape)
        return SearchResponse(
            keyword=f"カテゴリ:{category}",
            count=len(results),
            results=results
        )
    except Exception as e:
        logger.error(f"Category search failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
# ========== Gradio UI ==========
|
| 214 |
+
|
| 215 |
+
def check_ui_auth(api_key: str) -> bool:
    """Check a key entered in the Gradio UI.

    Args:
        api_key: Key typed into the UI.

    Returns:
        Whatever ``auth_manager.validate_ui_key`` reports for the key.
    """
    is_valid = auth_manager.validate_ui_key(api_key)
    return is_valid
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def gradio_search(api_key: str, keyword: str, limit: int, get_details: bool = True):
    """Gradio handler: keyword search rendered as a JSON string.

    Args:
        api_key: API key typed into the UI; only checked when API
            authentication is configured.
        keyword: Search keyword.
        limit: Requested number of results; coerced to an integer and clamped
            to the range 1-20.
        get_details: When true, run the detailed search and include the
            question body and best answer; otherwise run the fast search and
            include only the content preview.

    Returns:
        A ``(json_text, status_message)`` tuple. ``json_text`` is an empty
        string whenever authentication, validation or scraping fails, or when
        nothing was found.
    """
    try:
        # Authentication check (skipped when no API key is configured).
        if API_AUTH_CONFIGURED and not check_ui_auth(api_key):
            return "", "認証エラー: 有効なAPIキーを入力してください"

        if not keyword:
            return "", "エラー: キーワードを入力してください"

        # Keep the request within 1-20, matching the REST endpoints' bounds;
        # int() normalizes a float value coming from a UI widget.
        limit = max(1, min(int(limit), 20))

        scraper = YahooChiebukuroScraper(headless=True)

        try:
            if get_details:
                results = scraper.search_questions(keyword, max_results=limit)
            else:
                # Fast variant that skips fetching question details.
                results = scraper.search_questions_fast(keyword, max_results=limit)

            if not results:
                return "", f"検索結果が見つかりませんでした: '{keyword}'"

            # Build one display record per result, numbered from 1.
            formatted_results = []
            for idx, r in enumerate(results, 1):
                formatted_result = {
                    "番号": idx,
                    "タイトル": r["title"],
                    "投稿日": r["post_date"],
                    "回答数": r["answer_count"],
                    "URL": r["url"]
                }

                if get_details:
                    # Fall back to the preview when the full body is missing.
                    formatted_result["質問内容"] = r.get("full_content", r["content_preview"])
                    formatted_result["ベストアンサー"] = r.get("best_answer", "なし")
                else:
                    formatted_result["内容プレビュー"] = r.get("content_preview", "")

                formatted_results.append(formatted_result)

            # ensure_ascii=False keeps the Japanese text readable in the output.
            json_output = json.dumps(formatted_results, ensure_ascii=False, indent=2)

            status = f"✅ {len(results)}件の質問を取得しました"
            status += "(詳細情報付き)" if get_details else "(基本情報のみ)"

            return json_output, status

        finally:
            # Always release the scraper's driver, even on early return.
            scraper.close_driver()

    except Exception as e:
        logger.error(f"Gradio search error: {e}")
        return "", f"エラーが発生しました: {str(e)}"
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def gradio_get_detail(api_key: str, question_url: str):
    """Gradio handler: fetch one question and format it as Markdown.

    Args:
        api_key: API key typed into the UI; only checked when API
            authentication is configured.
        question_url: URL of the question; must be an
            ``https://chiebukuro.yahoo.co.jp`` URL.

    Returns:
        A ``(question_info, best_answer, other_answers, status)`` tuple of
        strings. The first three are empty strings when authentication,
        validation or scraping fails.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urlparse

    try:
        # Authentication check (skipped when no API key is configured).
        if API_AUTH_CONFIGURED and not check_ui_auth(api_key):
            return "", "", "", "認証エラー: 有効なAPIキーを入力してください"

        if not question_url:
            return "", "", "", "エラー: URLを入力してください"

        # Compare the parsed scheme/host rather than a string prefix; a prefix
        # test accepts hosts such as "chiebukuro.yahoo.co.jp.example.com".
        try:
            parsed = urlparse(question_url)
            url_ok = (
                parsed.scheme == "https"
                and parsed.hostname == "chiebukuro.yahoo.co.jp"
            )
        except ValueError:
            url_ok = False
        if not url_ok:
            return "", "", "", "エラー: Yahoo知恵袋のURLを入力してください"

        scraper = YahooChiebukuroScraper(headless=True)

        try:
            detail = scraper.get_question_detail(question_url)

            if not detail:
                return "", "", "", "質問の取得に失敗しました"

            # Question header and body.
            question_info = f"""## {detail['title']}

**カテゴリ:** {detail['category']}
**投稿日:** {detail['post_date']}
**回答数:** {detail['answer_count']}

---

### 質問内容
{detail['content']}"""

            # Split answers into the best answer and the rest; if several are
            # flagged as best, the last one wins (as before).
            best_answer = ""
            other_answers = []

            for answer in detail['answers']:
                if answer['is_best']:
                    best_answer = f"""### ✨ ベストアンサー
{answer['content']}"""
                else:
                    other_answers.append(answer['content'])

            # Remaining answers, numbered from 1.
            other_answers_text = ""
            if other_answers:
                other_answers_text = "### その他の回答\n\n" + "".join(
                    f"**回答 {i}:**\n{answer}\n\n---\n\n"
                    for i, answer in enumerate(other_answers, 1)
                )

            status = f"✅ 質問詳細を取得しました(回答{detail['answer_count']}件)"
            return question_info, best_answer, other_answers_text, status

        finally:
            # Always release the scraper's driver, even on early return.
            scraper.close_driver()

    except Exception as e:
        logger.error(f"Gradio detail error: {e}")
        return "", "", "", f"エラーが発生しました: {str(e)}"
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
def gradio_search_category(api_key: str, category: str, limit: int, get_details: bool = True):
    """
    Gradio UI handler: search by category.

    There is no dedicated category backend here; the category name is
    embedded in a keyword query and the call is forwarded to
    ``gradio_search``.

    Args:
        api_key: API key (only relevant when authentication is enabled).
        category: Category name.
        limit: Number of results to fetch.
        get_details: Whether to fetch detail information as well.

    Returns:
        Whatever ``gradio_search`` returns for the derived keyword query.
    """
    # Category search is implemented on top of the keyword search.
    category_query = f"カテゴリ:{category}"
    return gradio_search(api_key, category_query, limit, get_details)
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
# Gradio インターフェース作成
|
| 381 |
+
with gr.Blocks(title="Yahoo知恵袋 Scraper") as demo:
|
| 382 |
+
gr.Markdown("""
|
| 383 |
+
# Yahoo!知恵袋 スクレイピングツール
|
| 384 |
+
|
| 385 |
+
Yahoo!知恵袋から質問と回答を取得します。
|
| 386 |
+
|
| 387 |
+
**機能:**
|
| 388 |
+
- キーワード検索(質問詳細と回答も取得)
|
| 389 |
+
- カテゴリ検索
|
| 390 |
+
- 最大20件まで取得可能
|
| 391 |
+
|
| 392 |
+
**注意事項:**
|
| 393 |
+
- 利用規約を遵守してください
|
| 394 |
+
- サーバー負荷軽減のため、各質問の詳細取得には2秒の間隔を設けています
|
| 395 |
+
- 個人利用の範囲でご使用ください
|
| 396 |
+
""")
|
| 397 |
+
|
| 398 |
+
# API認証入力(認証が設定されている場合のみ表示)
|
| 399 |
+
if API_AUTH_CONFIGURED:
|
| 400 |
+
api_key_input = gr.Textbox(
|
| 401 |
+
label="API Key",
|
| 402 |
+
placeholder="APIキーを入力してください",
|
| 403 |
+
type="password",
|
| 404 |
+
interactive=True
|
| 405 |
+
)
|
| 406 |
+
else:
|
| 407 |
+
api_key_input = gr.Textbox(value="", visible=False)
|
| 408 |
+
|
| 409 |
+
with gr.Tabs():
|
| 410 |
+
# キーワード検索タブ
|
| 411 |
+
with gr.TabItem("🔍 キーワード検索"):
|
| 412 |
+
gr.Markdown("""
|
| 413 |
+
### 使い方
|
| 414 |
+
1. 検索キーワードを入力
|
| 415 |
+
2. 取得件数を選択(デフォルト: 20件、最大: 20件)
|
| 416 |
+
3. 詳細取得オプションを選択(デフォルトON - 並列処理で高速化)
|
| 417 |
+
4. 検索ボタンをクリック
|
| 418 |
+
""")
|
| 419 |
+
|
| 420 |
+
with gr.Row():
|
| 421 |
+
with gr.Column(scale=2):
|
| 422 |
+
search_keyword = gr.Textbox(
|
| 423 |
+
label="検索キーワード",
|
| 424 |
+
placeholder="例: Python プログラミング",
|
| 425 |
+
interactive=True
|
| 426 |
+
)
|
| 427 |
+
with gr.Column(scale=1):
|
| 428 |
+
search_limit = gr.Slider(
|
| 429 |
+
label="取得件数",
|
| 430 |
+
minimum=1,
|
| 431 |
+
maximum=20,
|
| 432 |
+
value=20,
|
| 433 |
+
step=1
|
| 434 |
+
)
|
| 435 |
+
with gr.Column(scale=1):
|
| 436 |
+
get_details = gr.Checkbox(
|
| 437 |
+
label="詳細情報を取得",
|
| 438 |
+
value=True,
|
| 439 |
+
info="質問本文とベストアンサーを取得(並列処理で高速化)"
|
| 440 |
+
)
|
| 441 |
+
|
| 442 |
+
search_button = gr.Button("検索", variant="primary")
|
| 443 |
+
search_status = gr.Textbox(label="ステータス", interactive=False)
|
| 444 |
+
search_results = gr.Textbox(
|
| 445 |
+
label="検索結果(JSON形式)",
|
| 446 |
+
lines=20,
|
| 447 |
+
max_lines=30,
|
| 448 |
+
interactive=False
|
| 449 |
+
)
|
| 450 |
+
|
| 451 |
+
search_button.click(
|
| 452 |
+
fn=gradio_search,
|
| 453 |
+
inputs=[api_key_input, search_keyword, search_limit, get_details],
|
| 454 |
+
outputs=[search_results, search_status]
|
| 455 |
+
)
|
| 456 |
+
|
| 457 |
+
# カテゴリ検索タブ
|
| 458 |
+
with gr.TabItem("📁 カテゴリ検索"):
|
| 459 |
+
with gr.Row():
|
| 460 |
+
with gr.Column(scale=2):
|
| 461 |
+
category_input = gr.Textbox(
|
| 462 |
+
label="カテゴリ名",
|
| 463 |
+
placeholder="例: 健康、美容とファッション",
|
| 464 |
+
interactive=True
|
| 465 |
+
)
|
| 466 |
+
with gr.Column(scale=1):
|
| 467 |
+
category_limit = gr.Slider(
|
| 468 |
+
label="取得件数",
|
| 469 |
+
minimum=1,
|
| 470 |
+
maximum=20,
|
| 471 |
+
value=20,
|
| 472 |
+
step=1
|
| 473 |
+
)
|
| 474 |
+
with gr.Column(scale=1):
|
| 475 |
+
category_get_details = gr.Checkbox(
|
| 476 |
+
label="詳細情報を取得",
|
| 477 |
+
value=True,
|
| 478 |
+
info="質問本文とベストアンサーを取得（並列処理で高速化）"
|
| 479 |
+
)
|
| 480 |
+
|
| 481 |
+
category_button = gr.Button("カテゴリで検索", variant="primary")
|
| 482 |
+
category_status = gr.Textbox(label="ステータス", interactive=False)
|
| 483 |
+
category_results = gr.Textbox(
|
| 484 |
+
label="カテゴリ検索結果(JSON形式)",
|
| 485 |
+
lines=20,
|
| 486 |
+
max_lines=30,
|
| 487 |
+
interactive=False
|
| 488 |
+
)
|
| 489 |
+
|
| 490 |
+
category_button.click(
|
| 491 |
+
fn=gradio_search_category,
|
| 492 |
+
inputs=[api_key_input, category_input, category_limit, category_get_details],
|
| 493 |
+
outputs=[category_results, category_status]
|
| 494 |
+
)
|
| 495 |
+
|
| 496 |
+
gr.Markdown("""
|
| 497 |
+
---
|
| 498 |
+
### API エンドポイント
|
| 499 |
+
- `/api/search` - キーワード検索
|
| 500 |
+
- `/api/category` - カテゴリ検索
|
| 501 |
+
|
| 502 |
+
詳細は [README](/) をご覧ください。
|
| 503 |
+
""")
|
| 504 |
+
|
| 505 |
+
# FastAPIとGradioの統合
|
| 506 |
+
app = gr.mount_gradio_app(app, demo, path="/")
|
| 507 |
+
|
| 508 |
+
if __name__ == "__main__":
|
| 509 |
+
import uvicorn
|
| 510 |
+
uvicorn.run(app, host="0.0.0.0", port=7860)
|
auth_manager.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
認証管理モジュール
|
| 3 |
+
API認証とUI認証を統一管理
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from typing import Optional
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class AuthManager:
    """Manage API-key authentication shared by the API and the UI."""

    def __init__(self):
        """Read the expected key from the ``API_KEY`` environment variable."""
        # An empty string means "authentication disabled".
        self.api_key = os.getenv("API_KEY", "")

    def is_configured(self) -> bool:
        """Return True when an API key has been configured."""
        return bool(self.api_key)

    def validate_api_key(self, provided_key: Optional[str]) -> bool:
        """
        Validate an API key supplied by a caller.

        Args:
            provided_key: The key supplied by the caller (may be None).

        Returns:
            True when authentication is disabled or the key matches the
            configured one; False otherwise.
        """
        # Imported locally so this class stays self-contained.
        import hmac

        # With no key configured, every request is accepted.
        if not self.is_configured():
            return True

        # Missing, empty or non-string keys can never match.
        if not provided_key or not isinstance(provided_key, str):
            return False

        # Constant-time comparison avoids leaking how many leading characters
        # matched; bytes are used because compare_digest rejects non-ASCII str.
        return hmac.compare_digest(
            provided_key.encode("utf-8"),
            self.api_key.encode("utf-8"),
        )

    def validate_ui_key(self, provided_key: str) -> bool:
        """
        Validate an API key entered in the UI.

        Args:
            provided_key: The key entered in the UI.

        Returns:
            Same result as ``validate_api_key`` for the same key.
        """
        # UI and API keys follow exactly the same rules.
        return self.validate_api_key(provided_key)

    def get_masked_key(self) -> str:
        """
        Return a masked form of the configured key (for debugging).

        Returns:
            "Not configured" when no key is set, "****" for keys of 8
            characters or fewer, otherwise the first and last two characters
            with the middle replaced by asterisks.
        """
        if not self.api_key:
            return "Not configured"

        # Short keys are masked completely so nothing useful is revealed.
        if len(self.api_key) <= 8:
            return "****"

        # Show only the first two and last two characters.
        return f"{self.api_key[:2]}{'*' * (len(self.api_key) - 4)}{self.api_key[-2:]}"
|
packages.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
chromium
|
| 2 |
+
chromium-driver
|
requirements.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies
|
| 2 |
+
gradio==4.19.2
|
| 3 |
+
fastapi>=0.115.0
|
| 4 |
+
uvicorn>=0.20.0
|
| 5 |
+
websockets>=11.0
|
| 6 |
+
|
| 7 |
+
# Selenium and web scraping
|
| 8 |
+
selenium>=4.0.0
|
| 9 |
+
webdriver-manager>=4.0.0
|
| 10 |
+
beautifulsoup4>=4.12.0
|
| 11 |
+
lxml>=4.9.0
|
| 12 |
+
|
| 13 |
+
# Data handling
|
| 14 |
+
pandas>=2.0.0
|
| 15 |
+
python-dotenv>=1.0.0
|
| 16 |
+
|
| 17 |
+
# Utilities
|
| 18 |
+
requests>=2.31.0
|
scraper.py
ADDED
|
@@ -0,0 +1,707 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Yahoo知恵袋スクレイピングモジュール
|
| 3 |
+
Selenium WebDriverを使用してYahoo知恵袋から質問と回答を取得
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import time
|
| 7 |
+
import logging
|
| 8 |
+
from typing import Dict, List, Optional
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from urllib.parse import urljoin, quote
|
| 11 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 12 |
+
import threading
|
| 13 |
+
|
| 14 |
+
from selenium import webdriver
|
| 15 |
+
from selenium.webdriver.common.by import By
|
| 16 |
+
from selenium.webdriver.chrome.options import Options
|
| 17 |
+
from selenium.webdriver.chrome.service import Service
|
| 18 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 19 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 20 |
+
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
| 21 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 22 |
+
from bs4 import BeautifulSoup
|
| 23 |
+
|
| 24 |
+
logging.basicConfig(level=logging.INFO)
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class YahooChiebukuroScraper:
|
| 29 |
+
"""Yahoo知恵袋スクレイピングクラス"""
|
| 30 |
+
|
| 31 |
+
BASE_URL = "https://chiebukuro.yahoo.co.jp"
|
| 32 |
+
SEARCH_URL = "https://chiebukuro.yahoo.co.jp/search"
|
| 33 |
+
|
| 34 |
+
def __init__(self, headless: bool = True, wait_time: int = 10):
|
| 35 |
+
"""
|
| 36 |
+
初期化
|
| 37 |
+
|
| 38 |
+
Args:
|
| 39 |
+
headless: ヘッドレスモードで実行するか
|
| 40 |
+
wait_time: 要素の待機時間(秒)
|
| 41 |
+
"""
|
| 42 |
+
self.headless = headless
|
| 43 |
+
self.wait_time = wait_time
|
| 44 |
+
self.driver = None
|
| 45 |
+
self.wait = None
|
| 46 |
+
|
| 47 |
+
def save_page_source(self, filename: str = "debug_page.html"):
    """
    Write the current page's HTML to a file for debugging.

    Does nothing when no driver is active. Any failure while writing is
    logged rather than raised.

    Args:
        filename: Path of the file to write (UTF-8).
    """
    if not self.driver:
        return

    try:
        with open(filename, "w", encoding="utf-8") as html_file:
            html_file.write(self.driver.page_source)
    except Exception as e:
        logger.error(f"Failed to save page source: {e}")
    else:
        logger.info(f"Page source saved to {filename}")
|
| 56 |
+
|
| 57 |
+
def setup_driver(self):
    """
    Create the Chrome/Chromium WebDriver and its WebDriverWait helper.

    Returns:
        True when the driver was created; False when any step raised (the
        error is logged).
    """
    try:
        import os

        options = Options()

        # Command-line switches, in the order they are passed to the browser.
        chrome_flags = []
        if self.headless:
            chrome_flags += ['--headless', '--disable-gpu']
        chrome_flags += [
            # Options for running on Hugging Face Spaces
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-blink-features=AutomationControlled',
            # Additional settings for the Hugging Face Spaces environment
            '--disable-software-rasterizer',
            '--disable-extensions',
            '--disable-setuid-sandbox',
            '--single-process',
            # User-Agent
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        ]
        for flag in chrome_flags:
            options.add_argument(flag)

        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)

        chromium_binary = '/usr/bin/chromium'
        system_chromedriver = '/usr/bin/chromedriver'

        if not os.path.exists(chromium_binary):
            # Regular environment: let webdriver-manager provide the driver.
            service = Service(ChromeDriverManager().install())
        else:
            # System Chromium is present (the Hugging Face Spaces case).
            options.binary_location = chromium_binary
            if os.path.exists(system_chromedriver):
                service = Service(system_chromedriver)
            else:
                # No system chromedriver: fall back to webdriver-manager.
                service = Service(ChromeDriverManager(chrome_type="chromium").install())

        self.driver = webdriver.Chrome(service=service, options=options)
        self.wait = WebDriverWait(self.driver, self.wait_time)

        logger.info("WebDriver setup completed")
        return True

    except Exception as e:
        logger.error(f"Failed to setup WebDriver: {e}")
        return False
|
| 108 |
+
|
| 109 |
+
def close_driver(self):
    """
    Quit the WebDriver, if one is active, and clear the stored references.

    A failure while quitting is logged instead of raised, and the
    ``driver``/``wait`` attributes are cleared either way so a later call
    starts from a clean state.
    """
    if not self.driver:
        return

    try:
        self.driver.quit()
    except Exception as e:
        # Cleanup is best-effort: a failing quit() should not mask the
        # result (or the original error) of the scrape that preceded it.
        logger.warning(f"Error while closing WebDriver: {e}")
    else:
        logger.info("WebDriver closed")
    finally:
        # Never keep a reference to a session that may already be dead.
        self.driver = None
        self.wait = None
|
| 116 |
+
|
| 117 |
+
def search_questions(self, keyword: str, max_results: int = 20, debug: bool = False, max_workers: int = 5) -> List[Dict]:
    """
    Search questions by keyword and add detail-page data to each hit.

    The result list page is parsed first; afterwards every question's detail
    page is visited through ``get_question_detail_content``. Detail jobs are
    scheduled on a thread pool, but the actual browser access is serialized
    with a lock because all jobs share the single ``self.driver`` instance.

    Args:
        keyword: Search keyword.
        max_results: Maximum number of results (capped at 20).
        debug: When True, page sources are saved to disk.
        max_workers: Thread-pool size for detail jobs (clamped to 1..5).

    Returns:
        A list of dicts with the keys ``title``, ``url``,
        ``content_preview``, ``full_content``, ``post_date``,
        ``answer_count``, ``best_answer`` and ``searched_at``, in list-page
        order. Empty when the search fails or nothing is found.
    """
    # Cap the number of results at 20.
    max_results = min(max_results, 20)
    # Keep the pool size within 1..5; ThreadPoolExecutor rejects values <= 0.
    max_workers = max(1, min(max_workers, 5))
    results = []

    def optional_text(element, selector, default):
        """Return the stripped text of a child element, or default if absent."""
        try:
            return element.find_element(By.CSS_SELECTOR, selector).text.strip()
        except NoSuchElementException:
            return default

    def basic_result(question):
        """Build a result record from list-page data only (no detail page)."""
        return {
            "title": question['title'],
            "url": question['url'],
            "content_preview": question['content_preview'],
            "full_content": question['content_preview'],
            "post_date": question['post_date'],
            "answer_count": question['answer_count'],
            "best_answer": None,
            "searched_at": datetime.now().isoformat()
        }

    try:
        if not self.driver:
            if not self.setup_driver():
                return results

        # Open the search result page.
        search_url = f"{self.SEARCH_URL}?p={quote(keyword)}&type=list"
        logger.info(f"Searching: {search_url}")
        self.driver.get(search_url)

        # Give the page time to load.
        time.sleep(3)

        # Debug mode: keep a copy of the page source.
        if debug:
            self.save_page_source(f"search_results_{keyword.replace(' ', '_')}.html")

        question_elements = self.driver.find_elements(
            By.CSS_SELECTOR,
            "li.ListSearchResults_listSearchResults__listItem__PurLr"
        )[:max_results]

        logger.info(f"Found {len(question_elements)} question elements with new selector")

        # Collect everything from the list page before navigating away;
        # the elements are no longer usable once the driver loads another page.
        questions_data = []
        for element in question_elements:
            try:
                title_elem = element.find_element(
                    By.CSS_SELECTOR,
                    "h3.ListSearchResults_listSearchResults__heading__WGSq8 a"
                )
                questions_data.append({
                    "title": title_elem.text.strip(),
                    "url": title_elem.get_attribute("href"),
                    "content_preview": optional_text(
                        element,
                        "p.ListSearchResults_listSearchResults__summary__0897S",
                        ""
                    ),
                    "post_date": optional_text(
                        element,
                        "span.ListSearchResults_listSearchResults__informationDate__J4NVn span:last-child",
                        "不明"
                    ),
                    "answer_count": optional_text(
                        element,
                        "span.ListSearchResults_listSearchResults__informationAnswers__64Dhv span:last-child",
                        "0"
                    )
                })
            except Exception as e:
                logger.warning(f"Failed to parse question element: {e}")
                continue

        # Nothing to enrich: return early instead of creating an empty pool.
        if not questions_data:
            logger.info("No questions parsed from the result page")
            return results

        logger.info(f"Starting parallel detail fetching with max {max_workers} workers...")

        # All jobs share self.driver, and a WebDriver instance must not be
        # driven from several threads at once: one job would navigate away
        # while another is still reading the page.
        driver_lock = threading.Lock()

        def fetch_detail_with_delay(idx_and_question):
            """Fetch one question's detail page and return its result record."""
            idx, question = idx_and_question
            record = basic_result(question)
            try:
                # The first max_workers jobs start immediately; later ones
                # wait in 2-second steps to limit load on the server.
                if idx >= max_workers:
                    time.sleep((idx - max_workers + 1) * 2)

                logger.info(f"[Worker] Getting detail for question {idx+1}/{len(questions_data)}: {question['title'][:50]}...")
                with driver_lock:
                    question_detail = self.get_question_detail_content(question['url'])

                if question_detail:
                    # Fall back to the preview when the detail page yielded
                    # no (or empty) question text.
                    record["full_content"] = question_detail.get("content") or question['content_preview']
                    record["best_answer"] = question_detail.get("best_answer", None)
            except Exception as e:
                # Keep the list-page data even if the detail fetch failed.
                logger.warning(f"[Worker] Failed to get detail for question: {e}")

            record["searched_at"] = datetime.now().isoformat()
            return record

        with ThreadPoolExecutor(max_workers=min(max_workers, len(questions_data))) as executor:
            future_to_idx = {
                executor.submit(fetch_detail_with_delay, (idx, q)): idx
                for idx, q in enumerate(questions_data)
            }

            # Keyed by list-page index so the original order can be restored.
            results_dict = {}

            for future in as_completed(future_to_idx):
                idx = future_to_idx[future]
                try:
                    results_dict[idx] = future.result()
                    logger.info(f"[Worker] Completed {len(results_dict)}/{len(questions_data)} questions")
                except Exception as e:
                    logger.error(f"[Worker] Exception for question {idx}: {e}")
                    # On error, return the list-page data only.
                    results_dict[idx] = basic_result(questions_data[idx])

            results = [results_dict[i] for i in sorted(results_dict)]

        logger.info(f"Successfully retrieved {len(results)} questions with details")

    except Exception as e:
        logger.error(f"Search failed: {e}")
        if debug:
            self.save_page_source("error_page.html")

    return results
|
| 302 |
+
|
| 303 |
+
def search_questions_fast(self, keyword: str, max_results: int = 20, debug: bool = False) -> List[Dict]:
    """
    Search questions by keyword using the result list page only.

    No detail pages are visited, so each record contains just what the list
    page shows.

    Args:
        keyword: Search keyword.
        max_results: Maximum number of results (capped at 20).
        debug: When True, page sources are saved to disk.

    Returns:
        A list of dicts with the keys ``title``, ``url``,
        ``content_preview``, ``post_date``, ``answer_count``,
        ``views_count`` and ``searched_at``. Empty when the search fails.
    """
    # Cap the number of results at 20.
    max_results = min(max_results, 20)
    results = []

    def child_text(parent, css, fallback):
        """Return the stripped text of a child element, or fallback if absent."""
        try:
            return parent.find_element(By.CSS_SELECTOR, css).text.strip()
        except NoSuchElementException:
            return fallback

    try:
        if not self.driver and not self.setup_driver():
            return results

        # Open the search result page.
        search_url = f"{self.SEARCH_URL}?p={quote(keyword)}&type=list"
        logger.info(f"Searching (fast mode): {search_url}")
        self.driver.get(search_url)

        # Give the page time to load.
        time.sleep(3)

        # Debug mode: keep a copy of the page source.
        if debug:
            self.save_page_source(f"search_results_{keyword.replace(' ', '_')}.html")

        list_items = self.driver.find_elements(
            By.CSS_SELECTOR,
            "li.ListSearchResults_listSearchResults__listItem__PurLr"
        )
        question_elements = list_items[:max_results]

        logger.info(f"Found {len(question_elements)} question elements (fast mode)")

        for item in question_elements:
            try:
                # The title link is mandatory; a missing one skips the item.
                link = item.find_element(
                    By.CSS_SELECTOR,
                    "h3.ListSearchResults_listSearchResults__heading__WGSq8 a"
                )
                title = link.text.strip()
                url = link.get_attribute("href")

                # The remaining fields are optional and fall back to defaults.
                content_preview = child_text(
                    item,
                    "p.ListSearchResults_listSearchResults__summary__0897S",
                    ""
                )
                post_date = child_text(
                    item,
                    "span.ListSearchResults_listSearchResults__informationDate__J4NVn span:last-child",
                    "不明"
                )
                answer_count = child_text(
                    item,
                    "span.ListSearchResults_listSearchResults__informationAnswers__64Dhv span:last-child",
                    "0"
                )
                views_count = child_text(
                    item,
                    "span.ListSearchResults_listSearchResults__informationViews__VivY6 span:last-child",
                    "0"
                )

                results.append({
                    "title": title,
                    "url": url,
                    "content_preview": content_preview,
                    "post_date": post_date,
                    "answer_count": answer_count,
                    "views_count": views_count,
                    "searched_at": datetime.now().isoformat()
                })

            except Exception as e:
                logger.warning(f"Failed to parse question element: {e}")
                continue

        logger.info(f"Successfully retrieved {len(results)} questions (fast mode)")

    except Exception as e:
        logger.error(f"Search failed: {e}")
        if debug:
            self.save_page_source("error_page.html")

    return results
|
| 420 |
+
|
| 421 |
+
def get_question_detail_content(self, question_url: str) -> Optional[Dict]:
    """
    Fetch only the question text and best answer from a question page.

    Each value is looked up with a list of CSS selectors tried in order; the
    first selector that yields non-empty text wins.

    Args:
        question_url: URL of the question page.

    Returns:
        A dict with the keys ``content`` (str, possibly empty) and
        ``best_answer`` (str or None), or None when loading or parsing the
        page raised.
    """
    # Selectors for the question text, which sits in an h1 element.
    question_selectors = (
        "h1.ClapLv1TextBlock_Chie-TextBlock__Text__etZbS",
        "h1[class*='TextBlock']",
        # Other possible layouts
        "div.ClapLv1TextBlock_Chie-TextBlock__4j9Y9 h1",
        "article h1",
    )
    # Selectors for the answer: best-answer blocks first, then the first
    # regular answer for pages without a best answer.
    best_answer_selectors = (
        "div.ClapLv2AnswerItem_Chie-AnswerItem--Best__yJIDl div.ClapLv1TextBlock_Chie-TextBlock__Text__etZbS",
        "div[class*='AnswerItem--Best'] div[class*='TextBlock__Text']",
        "div.ClapLv2AnswerItem_Chie-AnswerItem__CYXyb div.ClapLv1TextBlock_Chie-TextBlock__Text__etZbS",
    )

    try:
        logger.info(f"Getting question detail: {question_url}")
        self.driver.get(question_url)

        # Give the page time to load.
        time.sleep(3)

        content = ""
        for selector in question_selectors:
            try:
                content = self.driver.find_element(By.CSS_SELECTOR, selector).text.strip()
            except NoSuchElementException:
                continue
            except Exception as e:
                logger.debug(f"Error with selector {selector}: {e}")
                continue
            if content:
                logger.info(f"Found question content with selector: {selector}")
                break

        best_answer = None
        for selector in best_answer_selectors:
            try:
                best_answer = self.driver.find_element(By.CSS_SELECTOR, selector).text.strip()
            except NoSuchElementException:
                continue
            except Exception as e:
                logger.debug(f"Error with answer selector {selector}: {e}")
                continue
            if best_answer:
                logger.info(f"Found best answer with selector: {selector}")
                break

        # Debug aid: note pages where no question text could be located.
        if not content:
            logger.warning(f"Could not find content for: {question_url}")

        return {
            "content": content or "",
            "best_answer": best_answer,
        }

    except Exception as e:
        logger.warning(f"Failed to get question detail: {e}")
        return None
|
| 502 |
+
|
| 503 |
+
def get_question_detail(self, question_url: str) -> Optional[Dict]:
    """
    Load a question page and collect its details.

    Args:
        question_url: URL of the question page to open.

    Returns:
        A dict with the keys ``url``, ``title``, ``content``, ``category``,
        ``post_date``, ``answers``, ``answer_count`` and ``scraped_at``.
        ``answers`` holds the best answer first (when one is found) followed
        by at most five other answers. Returns ``None`` when the driver
        cannot be set up or an unexpected error occurs while scraping.
    """
    def text_or_default(css: str, fallback: str) -> str:
        # Stripped text of the first element matching `css`; the placeholder
        # is used when the page has no such element.
        try:
            return self.driver.find_element(By.CSS_SELECTOR, css).text.strip()
        except NoSuchElementException:
            return fallback

    try:
        # Lazily create the driver; give up if that fails.
        if not self.driver and not self.setup_driver():
            return None

        logger.info(f"Getting question detail: {question_url}")
        self.driver.get(question_url)

        # Fixed pause to let the page load before querying it.
        time.sleep(2)

        # Question title: waited for explicitly, with a placeholder on timeout.
        title_locator = (By.CSS_SELECTOR, "h1.ClapLv1QuestionItem__title")
        try:
            heading = self.wait.until(
                EC.presence_of_element_located(title_locator)
            )
            title = heading.text.strip()
        except TimeoutException:
            title = "タイトル取得失敗"

        # Question body, category and posting date share the same
        # look-up-or-placeholder pattern.
        content = text_or_default("div.ClapLv1QuestionItem__body", "本文取得失敗")
        category = text_or_default("div.ClapLv1QuestionItem__category", "不明")
        post_date = text_or_default("span.ClapLv1QuestionItem__date", "不明")

        # Best answer (optional).
        best_answer = None
        try:
            best_block = self.driver.find_element(
                By.CSS_SELECTOR, "div.ClapLv1AnswerItem--best"
            )
            best_body = best_block.find_element(
                By.CSS_SELECTOR, "div.ClapLv1AnswerItem__body"
            )
            best_answer = {"content": best_body.text.strip(), "is_best": True}
        except NoSuchElementException:
            logger.info("No best answer found")

        # Remaining answers, capped at five.
        other_answers = []
        try:
            candidates = self.driver.find_elements(
                By.CSS_SELECTOR,
                "div.ClapLv1AnswerItem:not(.ClapLv1AnswerItem--best)"
            )
            for candidate in candidates[:5]:
                try:
                    body_text = candidate.find_element(
                        By.CSS_SELECTOR, "div.ClapLv1AnswerItem__body"
                    ).text.strip()
                    other_answers.append({"content": body_text, "is_best": False})
                except Exception as e:
                    # One unparsable answer must not discard the others.
                    logger.warning(f"Failed to parse answer: {e}")
                    continue
        except Exception as e:
            logger.warning(f"Failed to get other answers: {e}")

        # Merge: best answer first, then the rest.
        all_answers = [best_answer] if best_answer else []
        all_answers.extend(other_answers)

        return {
            "url": question_url,
            "title": title,
            "content": content,
            "category": category,
            "post_date": post_date,
            "answers": all_answers,
            "answer_count": len(all_answers),
            "scraped_at": datetime.now().isoformat()
        }

    except Exception as e:
        logger.error(f"Failed to get question detail: {e}")
        return None
|
| 630 |
+
|
| 631 |
+
def get_category_questions(self, category: str, max_results: int = 10) -> List[Dict]:
    """
    Fetch questions for a category.

    Args:
        category: Category name.
        max_results: Maximum number of questions to fetch.

    Returns:
        The list returned by ``search_questions`` for the category query.
    """
    # Category lookup is delegated to the ordinary search, with the
    # category name embedded in the query string.
    category_query = "カテゴリ:{}".format(category)
    return self.search_questions(category_query, max_results)
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
# Test helper
def test_scraper():
    """
    Manual smoke test for the scraper.

    Ways to exercise parallel processing:
    1. Ordinary search (parallel, the default)
       scraper.search_questions("Python", max_results=10)
       -> up to 5 items processed at once

    2. Change the degree of parallelism
       scraper.search_questions("Python", max_results=10, max_workers=3)
       -> up to 3 items processed at once

    3. Sequential run (for comparison)
       scraper.search_questions("Python", max_results=10, max_workers=1)
       -> one item at a time (equivalent to the previous approach)

    4. Example timings quoted by the original author
       - max_workers=1: about 30 s (fetching 10 items)
       - max_workers=5: about 6-10 s (fetching 10 items)
    """
    import time

    scraper = YahooChiebukuroScraper(headless=True)

    try:
        # Parallel run (max_workers=5 is described as the default).
        print("=== 並列処理テスト開始 ===")
        started = time.time()
        results = scraper.search_questions(
            "Python プログラミング",
            max_results=10,
            max_workers=5,
        )
        elapsed = time.time() - started
        print(f"並列処理(5 workers): {len(results)}件取得 - {elapsed:.1f}秒")

        # Optional comparison: repeat the same search with max_workers=1,
        # time it the same way, and print
        # "逐次処理(1 worker): <count>件取得 - <seconds>秒".

        if results:
            first = results[0]
            print(f"\n最初の質問: {first['title']}")
            detail_flag = 'あり' if first.get('best_answer') else 'なし'
            print(f"詳細情報: {detail_flag}")

    finally:
        # Always release the browser, even if the search raised.
        scraper.close_driver()
        print("\n=== テスト完了 ===")
|
| 704 |
+
|
| 705 |
+
|
| 706 |
+
# Run the manual smoke test only when this module is executed as a script.
if __name__ == "__main__":
    test_scraper()
|