File size: 7,002 Bytes
ab9ff53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import re
from io import StringIO
from typing import Any

import pandas as pd
import requests
from bs4 import BeautifulSoup


def process_list_element(list_element: Any, indent: int = 0) -> str:
    """Recursively render a <ul>/<ol> element as plain-text bullet or numbered lines.

    Args:
        list_element: A BeautifulSoup tag for a ``ul`` or ``ol`` element.
        indent: Current nesting depth; each level adds two leading spaces.

    Returns:
        The list rendered as newline-joined text ("* " bullets for unordered
        lists, "1. "-style numbering for ordered lists).
    """
    lines: list[str] = []
    ordered = list_element.name == "ol"
    pad = "  " * indent

    for position, item in enumerate(list_element.find_all("li", recursive=False)):
        # Gather the item's own markup while skipping nested sub-lists,
        # then strip it down to plain text.
        own_html = "".join(
            str(child) for child in item.contents if child.name not in ["ul", "ol"]
        )
        own_text = BeautifulSoup(own_html, "html.parser").get_text().strip()

        # Ordered lists get a number prefix, unordered lists a bullet.
        marker = f"{position + 1}. " if ordered else "* "
        if own_text:
            lines.append(pad + marker + own_text)

        # Recurse into any nested lists one indent level deeper.
        for sublist in item.find_all(["ul", "ol"], recursive=False):
            rendered = process_list_element(sublist, indent + 1)
            if rendered:
                lines.append(rendered)

    return "\n".join(lines)


def get_wiki_content(title: str, language: str = "en") -> tuple[str, dict[str, pd.DataFrame]]:
    """
    Get Wikipedia page content and tables.

    Returns:
        A tuple containing the page content as a string and a dictionary of tables
        extracted from the page. The keys of the dictionary are "table_1", "table_2", etc.
        and the values are pandas DataFrames representing the tables.

    Example:
        content, tables = get_wiki_content("Python_(programming_language)")
        print(content)
        print(tables["table_1"])  # Access the first table

    Args:
        title: wikipedia page title (e.g., "Python_(programming_language)")
        language: wikipedia language (e.g., "en" for English, "ja" for Japanese)

    Raises:
        Exception: on a non-200 HTTP response or a MediaWiki API error payload.
    """
    # Build the MediaWiki parse-API endpoint for the requested language.
    api_url = f"https://{language}.wikipedia.org/w/api.php"

    # API parameters
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "text",
        "disabletoc": True,
    }

    # Send the request
    response = requests.get(api_url, params=params, timeout=30)  # type: ignore

    # Check the HTTP response
    if response.status_code != 200:
        raise Exception(f"api error: {response.status_code} - {response.text}")

    # Parse the JSON response
    data = response.json()

    # API-level error checks
    if "error" in data:
        raise Exception(f"api error: {data['error']['info']}")

    if "parse" not in data:
        raise Exception("api error: No parse data found")

    # Rendered page HTML lives under parse.text["*"]
    html_content = data["parse"]["text"]["*"]

    # Parse the HTML twice: `soup` stays pristine for table extraction,
    # `content_soup` gets mutated (placeholders inserted, chrome removed).
    soup = BeautifulSoup(html_content, "html.parser")
    content_soup = BeautifulSoup(html_content, "html.parser")

    # Collected tables, keyed "table_1", "table_2", ...
    tables_dict: dict[str, pd.DataFrame] = {}
    table_ids: list[tuple[str, str]] = []  # list of (table_id, table_html)

    # Target tables are infoboxes and wikitables; infoboxes are numbered first
    # so their order here must match the placeholder numbering below.
    table_index = 1

    # First, process infoboxes (biography-style tables)
    infoboxes = soup.find_all("table", class_=lambda c: c and "infobox" in c)
    for table in infoboxes:
        table_ids.append((f"table_{table_index}", str(table)))
        table_index += 1

    # Then, process wikitables
    wikitables = soup.find_all("table", class_="wikitable")
    for table in wikitables:
        table_ids.append((f"table_{table_index}", str(table)))
        table_index += 1

    # Parse the extracted table HTML with pandas
    for table_id, table_html in table_ids:
        try:
            dfs = pd.read_html(StringIO(table_html))
            if dfs:
                tables_dict[table_id] = dfs[0]
        except Exception:
            # Skip tables pandas cannot parse (e.g. layout-only tables)
            continue

    # Replace tables in the content with {{table_N}} placeholders.
    # Infoboxes first — numbering must mirror the extraction order above.
    for i, table in enumerate(content_soup.find_all("table", class_=lambda c: c and "infobox" in c)):
        table_id = f"table_{i + 1}"
        if table_id in tables_dict:
            placeholder_tag = content_soup.new_tag("p")
            placeholder_tag.string = f"{{{{{table_id}}}}}"
            table.replace_with(placeholder_tag)

    # Wikitables continue the numbering after the infoboxes.
    wikitable_start_index = len(infoboxes) + 1
    for i, table in enumerate(content_soup.find_all("table", class_="wikitable")):
        table_id = f"table_{wikitable_start_index + i}"
        if table_id in tables_dict:
            placeholder_tag = content_soup.new_tag("p")
            placeholder_tag.string = f"{{{{{table_id}}}}}"
            table.replace_with(placeholder_tag)

    # Strip page chrome before extracting text. Note: find_all matches tag
    # *names* only, so class-qualified elements (hatnotes, navboxes, edit
    # links) must be removed via a CSS selector instead.
    for element in content_soup.find_all("sup"):
        element.decompose()
    for element in content_soup.select("div.hatnote, div.navbox, span.mw-editsection"):
        element.decompose()

    # Collect headings, paragraphs, and lists in document order.
    elements = content_soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol"])
    text_content = []

    for element in elements:
        if element.name and element.name.startswith("h"):  # type: ignore
            # Map <hN> to a markdown heading of the same level.
            level = int(element.name[1])  # type: ignore
            heading_text = element.get_text().strip()
            if heading_text:  # skip empty headings
                text_content.append("\n" + "#" * level + " " + heading_text)
        elif element.name == "p":  # type: ignore
            # Paragraphs (including table placeholders) are appended verbatim.
            paragraph_text = element.get_text().strip()
            if paragraph_text:  # skip empty paragraphs
                text_content.append(paragraph_text)
        elif element.name in ["ul", "ol"] and element.parent.name not in ["li", "ul", "ol"]:  # type: ignore
            # Only top-level lists; nested ones are handled inside their parent <li>.
            list_content = process_list_element(element)
            if list_content:
                text_content.append(list_content)

    # Join all text fragments into the final document.
    content = "\n\n".join(text_content)

    return content, tables_dict