"""Fetch Wikipedia page content and tables via the MediaWiki parse API."""
import re
from io import StringIO
from typing import Any
import pandas as pd
import requests
from bs4 import BeautifulSoup
def process_list_element(list_element: Any, indent: int = 0) -> str:
    """Recursively render an HTML <ul>/<ol> element as plain-text lines.

    Args:
        list_element: A BeautifulSoup tag for a <ul> or <ol> element.
        indent: Current nesting depth; each level adds one leading space.

    Returns:
        The list rendered as newline-joined lines, numbered ("1. ", "2. ",
        ...) for ordered lists and bulleted ("* ") otherwise.
    """
    lines: list[str] = []
    ordered = list_element.name == "ol"
    for index, item in enumerate(list_element.find_all("li", recursive=False)):
        # Gather the item's own markup, excluding nested sub-lists,
        # then reduce it to plain text.
        own_html = "".join(
            str(child) for child in item.contents if child.name not in ("ul", "ol")
        )
        text = BeautifulSoup(own_html, "html.parser").get_text().strip()
        if text:
            marker = f"{index + 1}. " if ordered else "* "
            lines.append(" " * indent + marker + text)
        # Render nested lists one indent level deeper.
        for sublist in item.find_all(["ul", "ol"], recursive=False):
            rendered = process_list_element(sublist, indent + 1)
            if rendered:
                lines.append(rendered)
    return "\n".join(lines)
def get_wiki_content(title: str, language: str = "en") -> tuple[str, dict[str, pd.DataFrame]]:
    """
    Get Wikipedia page content and tables.

    Returns:
        A tuple containing the page content as a string and a dictionary of tables
        extracted from the page. The keys of the dictionary are "table_1", "table_2", etc.
        and the values are pandas DataFrames representing the tables.

    Example:
        content, tables = get_wiki_content("Python_(programming_language)")
        print(content)
        print(tables["table_1"])  # Access the first table

    Args:
        title: wikipedia page title (e.g., "Python_(programming_language)")
        language: wikipedia language (e.g., "en" for English, "ja" for Japanese)

    Raises:
        Exception: if the HTTP request fails or the API reports an error.
    """
    # MediaWiki parse API endpoint for the requested language edition.
    api_url = f"https://{language}.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "text",
        "disabletoc": True,
    }
    response = requests.get(api_url, params=params, timeout=30)  # type: ignore
    if response.status_code != 200:
        raise Exception(f"api error: {response.status_code} - {response.text}")

    data = response.json()
    if "error" in data:
        raise Exception(f"api error: {data['error']['info']}")
    if "parse" not in data:
        raise Exception("api error: No parse data found")

    # Rendered page HTML. Two independent parses are kept on purpose:
    # `soup` stays pristine for table extraction, while `content_soup`
    # is mutated (tables swapped for placeholders, chrome stripped)
    # before text extraction.
    html_content = data["parse"]["text"]["*"]
    soup = BeautifulSoup(html_content, "html.parser")
    content_soup = BeautifulSoup(html_content, "html.parser")

    # Collect target tables as (table_id, table_html) pairs, numbered
    # table_1, table_2, ... — infoboxes first, then wikitables.
    tables_dict: dict[str, pd.DataFrame] = {}
    table_ids: list[tuple[str, str]] = []
    table_index = 1
    infoboxes = soup.find_all("table", class_=lambda c: c and "infobox" in c)
    for table in infoboxes:
        table_ids.append((f"table_{table_index}", str(table)))
        table_index += 1
    wikitables = soup.find_all("table", class_="wikitable")
    for table in wikitables:
        table_ids.append((f"table_{table_index}", str(table)))
        table_index += 1

    # Parse each table with pandas; tables that fail to parse are simply
    # skipped (their ids never appear in tables_dict).
    for table_id, table_html in table_ids:
        try:
            dfs = pd.read_html(StringIO(table_html))
            if dfs:
                tables_dict[table_id] = dfs[0]
        except Exception:
            continue

    # Replace each successfully-parsed table in the content tree with a
    # "{{table_N}}" placeholder paragraph so it survives text extraction.
    table_placeholders: dict[str, str] = {}
    for i, table in enumerate(content_soup.find_all("table", class_=lambda c: c and "infobox" in c)):
        table_id = f"table_{i + 1}"
        if table_id in tables_dict:
            placeholder = f"{{{{{table_id}}}}}"
            table_placeholders[table_id] = placeholder
            table_placeholder_tag = content_soup.new_tag("p")
            table_placeholder_tag.string = placeholder
            table.replace_with(table_placeholder_tag)
    # Wikitable ids continue numbering after the infobox ids.
    wikitable_start_index = len(infoboxes) + 1
    for i, table in enumerate(content_soup.find_all("table", class_="wikitable")):
        table_id = f"table_{wikitable_start_index + i}"
        if table_id in tables_dict:
            placeholder = f"{{{{{table_id}}}}}"
            table_placeholders[table_id] = placeholder
            table_placeholder_tag = content_soup.new_tag("p")
            table_placeholder_tag.string = placeholder
            table.replace_with(table_placeholder_tag)

    # Strip page chrome before extracting text.
    # BUG FIX: find_all() matches tag names only, so the previous
    # find_all(["sup", "div.hatnote", ...]) silently ignored the
    # class-qualified entries — only <sup> was ever removed. The
    # CSS-style selectors need select().
    for element in content_soup.find_all("sup"):
        element.decompose()
    for element in content_soup.select("div.hatnote, div.navbox, span.mw-editsection"):
        element.decompose()

    # Walk headings, paragraphs and lists in document order.
    elements = content_soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol"])
    text_content = []
    for element in elements:
        if element.name and element.name.startswith("h"):  # type: ignore
            # Map <hN> to a markdown heading of the same level.
            level = int(element.name[1])  # type: ignore
            heading_text = element.get_text().strip()
            if heading_text:  # skip empty headings
                text_content.append("\n" + "#" * level + " " + heading_text)
        elif element.name == "p":  # type: ignore
            # Table placeholders ("{{table_N}}") pass through unchanged
            # like any other paragraph text.
            paragraph_text = element.get_text().strip()
            if paragraph_text:  # skip empty paragraphs
                text_content.append(paragraph_text)
        elif element.name in ["ul", "ol"] and element.parent.name not in ["li", "ul", "ol"]:  # type: ignore
            # Only top-level lists: nested ones are handled recursively
            # via their parent <li>.
            list_content = process_list_element(element)
            if list_content:
                text_content.append(list_content)

    content = "\n\n".join(text_content)
    return content, tables_dict