File size: 7,002 Bytes
ab9ff53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import re
from io import StringIO
from typing import Any

import pandas as pd
import requests
from bs4 import BeautifulSoup


def process_list_element(list_element: Any, indent: int = 0) -> str:
    """Recursively render a <ul>/<ol> element as plain-text bullet or numbered lines.

    Args:
        list_element: A BeautifulSoup tag for a ``ul`` or ``ol`` element.
        indent: Current nesting depth; each level adds two leading spaces.

    Returns:
        The list rendered as newline-joined text ("* " bullets for unordered
        lists, "1. "-style numbering for ordered lists).
    """
    lines: list[str] = []
    ordered = list_element.name == "ol"
    pad = "  " * indent

    for position, item in enumerate(list_element.find_all("li", recursive=False)):
        # Gather the item's own markup while skipping nested sub-lists,
        # then strip it down to plain text.
        own_html = "".join(
            str(child) for child in item.contents if child.name not in ["ul", "ol"]
        )
        own_text = BeautifulSoup(own_html, "html.parser").get_text().strip()

        # Ordered lists get a number prefix, unordered lists a bullet.
        marker = f"{position + 1}. " if ordered else "* "
        if own_text:
            lines.append(pad + marker + own_text)

        # Recurse into any nested lists one indent level deeper.
        for sublist in item.find_all(["ul", "ol"], recursive=False):
            rendered = process_list_element(sublist, indent + 1)
            if rendered:
                lines.append(rendered)

    return "\n".join(lines)


def get_wiki_content(title: str, language: str = "en") -> tuple[str, dict[str, pd.DataFrame]]:
    """
    Get Wikipedia page content and tables.

    Returns:
        A tuple containing the page content as a string and a dictionary of tables
        extracted from the page. The keys of the dictionary are "table_1", "table_2", etc.
        and the values are pandas DataFrames representing the tables.

    Example:
        content, tables = get_wiki_content("Python_(programming_language)")
        print(content)
        print(tables["table_1"])  # Access the first table

    Args:
        title: wikipedia page title (e.g., "Python_(programming_language)")
        language: wikipedia language (e.g., "en" for English, "ja" for Japanese)

    Raises:
        Exception: on a non-200 HTTP response or a MediaWiki API error payload.
    """
    # Build the MediaWiki parse-API endpoint for the requested language.
    api_url = f"https://{language}.wikipedia.org/w/api.php"

    # API parameters
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "text",
        "disabletoc": True,
    }

    # Send the request
    response = requests.get(api_url, params=params, timeout=30)  # type: ignore

    # Check the HTTP response
    if response.status_code != 200:
        raise Exception(f"api error: {response.status_code} - {response.text}")

    # Parse the JSON response
    data = response.json()

    # API-level error checks
    if "error" in data:
        raise Exception(f"api error: {data['error']['info']}")

    if "parse" not in data:
        raise Exception("api error: No parse data found")

    # Rendered page HTML lives under parse.text["*"]
    html_content = data["parse"]["text"]["*"]

    # Parse the HTML twice: `soup` stays pristine for table extraction,
    # `content_soup` gets mutated (placeholders inserted, chrome removed).
    soup = BeautifulSoup(html_content, "html.parser")
    content_soup = BeautifulSoup(html_content, "html.parser")

    # Collected tables, keyed "table_1", "table_2", ...
    tables_dict: dict[str, pd.DataFrame] = {}
    table_ids: list[tuple[str, str]] = []  # list of (table_id, table_html)

    # Target tables are infoboxes and wikitables; infoboxes are numbered first
    # so their order here must match the placeholder numbering below.
    table_index = 1

    # First, process infoboxes (biography-style tables)
    infoboxes = soup.find_all("table", class_=lambda c: c and "infobox" in c)
    for table in infoboxes:
        table_ids.append((f"table_{table_index}", str(table)))
        table_index += 1

    # Then, process wikitables
    wikitables = soup.find_all("table", class_="wikitable")
    for table in wikitables:
        table_ids.append((f"table_{table_index}", str(table)))
        table_index += 1

    # Parse the extracted table HTML with pandas
    for table_id, table_html in table_ids:
        try:
            dfs = pd.read_html(StringIO(table_html))
            if dfs:
                tables_dict[table_id] = dfs[0]
        except Exception:
            # Skip tables pandas cannot parse (e.g. layout-only tables)
            continue

    # Replace tables in the content with {{table_N}} placeholders.
    # Infoboxes first — numbering must mirror the extraction order above.
    for i, table in enumerate(content_soup.find_all("table", class_=lambda c: c and "infobox" in c)):
        table_id = f"table_{i + 1}"
        if table_id in tables_dict:
            placeholder_tag = content_soup.new_tag("p")
            placeholder_tag.string = f"{{{{{table_id}}}}}"
            table.replace_with(placeholder_tag)

    # Wikitables continue the numbering after the infoboxes.
    wikitable_start_index = len(infoboxes) + 1
    for i, table in enumerate(content_soup.find_all("table", class_="wikitable")):
        table_id = f"table_{wikitable_start_index + i}"
        if table_id in tables_dict:
            placeholder_tag = content_soup.new_tag("p")
            placeholder_tag.string = f"{{{{{table_id}}}}}"
            table.replace_with(placeholder_tag)

    # Strip page chrome before extracting text. Note: find_all matches tag
    # *names* only, so class-qualified elements (hatnotes, navboxes, edit
    # links) must be removed via a CSS selector instead.
    for element in content_soup.find_all("sup"):
        element.decompose()
    for element in content_soup.select("div.hatnote, div.navbox, span.mw-editsection"):
        element.decompose()

    # Collect headings, paragraphs, and lists in document order.
    elements = content_soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol"])
    text_content = []

    for element in elements:
        if element.name and element.name.startswith("h"):  # type: ignore
            # Map <hN> to a markdown heading of the same level.
            level = int(element.name[1])  # type: ignore
            heading_text = element.get_text().strip()
            if heading_text:  # skip empty headings
                text_content.append("\n" + "#" * level + " " + heading_text)
        elif element.name == "p":  # type: ignore
            # Paragraphs (including table placeholders) are appended verbatim.
            paragraph_text = element.get_text().strip()
            if paragraph_text:  # skip empty paragraphs
                text_content.append(paragraph_text)
        elif element.name in ["ul", "ol"] and element.parent.name not in ["li", "ul", "ol"]:  # type: ignore
            # Only top-level lists; nested ones are handled inside their parent <li>.
            list_content = process_list_element(element)
            if list_content:
                text_content.append(list_content)

    # Join all text fragments into the final document.
    content = "\n\n".join(text_content)

    return content, tables_dict