# Source: Hugging Face upload by ManojParvatham ("Upload 5 files", commit ab9ff53)
import re
from io import StringIO
from typing import Any
import pandas as pd
import requests
from bs4 import BeautifulSoup
def process_list_element(list_element: Any, indent: int = 0) -> str:
    """Recursively render a ``<ul>``/``<ol>`` tag as Markdown-style list lines.

    Ordered lists are numbered ("1. ", "2. ", ...); unordered lists use "* ".
    Nested lists are rendered one indent level deeper, each level adding one
    leading space. Returns the joined lines (empty string for an empty list).
    """
    lines: list[str] = []
    ordered = list_element.name == "ol"
    for index, item in enumerate(list_element.find_all("li", recursive=False)):
        # Gather the <li>'s own markup, excluding any nested lists, and
        # strip it down to plain text.
        own_html = "".join(
            str(child) for child in item.contents if child.name not in ("ul", "ol")
        )
        text = BeautifulSoup(own_html, "html.parser").get_text().strip()
        marker = f"{index + 1}. " if ordered else "* "
        if text:
            lines.append(" " * indent + marker + text)
        # Recurse into nested sublists one level deeper.
        for sublist in item.find_all(["ul", "ol"], recursive=False):
            rendered = process_list_element(sublist, indent + 1)
            if rendered:
                lines.append(rendered)
    return "\n".join(lines)
def get_wiki_content(title: str, language: str = "en") -> tuple[str, dict[str, pd.DataFrame]]:
    """
    Fetch a Wikipedia page via the MediaWiki parse API and return text plus tables.

    Args:
        title: wikipedia page title (e.g., "Python_(programming_language)")
        language: wikipedia language (e.g., "en" for English, "ja" for Japanese)

    Returns:
        A tuple containing the page content as a string and a dictionary of tables
        extracted from the page. The keys of the dictionary are "table_1", "table_2", etc.
        and the values are pandas DataFrames representing the tables. In the content
        string, each successfully parsed table is replaced by a ``{{table_N}}``
        placeholder at its original position.

    Raises:
        Exception: if the HTTP request fails or the API response reports an error.

    Example:
        content, tables = get_wiki_content("Python_(programming_language)")
        print(content)
        print(tables["table_1"])  # Access the first table
    """
    # Build the parse-API URL and request parameters.
    api_url = f"https://{language}.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "text",
        "disabletoc": True,
    }
    response = requests.get(api_url, params=params, timeout=30)  # type: ignore
    if response.status_code != 200:
        raise Exception(f"api error: {response.status_code} - {response.text}")

    data = response.json()
    if "error" in data:
        raise Exception(f"api error: {data['error']['info']}")
    if "parse" not in data:
        raise Exception("api error: No parse data found")

    # Rendered page HTML. Parse it twice: `soup` is only read (table
    # extraction); `content_soup` is mutated (placeholders, chrome removal).
    html_content = data["parse"]["text"]["*"]
    soup = BeautifulSoup(html_content, "html.parser")
    content_soup = BeautifulSoup(html_content, "html.parser")

    tables_dict: dict[str, pd.DataFrame] = {}
    table_ids: list[tuple[str, str]] = []  # list of (table_id, table_html)

    # Number infoboxes first, then wikitables, so the IDs here line up with
    # the placeholder pass over `content_soup` below (same ordering).
    table_index = 1
    infoboxes = soup.find_all("table", class_=lambda c: c and "infobox" in c)
    for table in infoboxes:
        table_ids.append((f"table_{table_index}", str(table)))
        table_index += 1
    for table in soup.find_all("table", class_="wikitable"):
        table_ids.append((f"table_{table_index}", str(table)))
        table_index += 1

    # Parse each extracted table with pandas. Tables that pandas cannot parse
    # are skipped; their IDs never enter tables_dict, so no placeholder is
    # inserted for them and the table is simply dropped from the output.
    for table_id, table_html in table_ids:
        try:
            dfs = pd.read_html(StringIO(table_html))
            if dfs:
                tables_dict[table_id] = dfs[0]
        except Exception:
            continue

    # Replace tables in the content tree with {{table_N}} placeholder <p> tags.
    for i, table in enumerate(content_soup.find_all("table", class_=lambda c: c and "infobox" in c)):
        table_id = f"table_{i + 1}"
        if table_id in tables_dict:
            placeholder_tag = content_soup.new_tag("p")
            placeholder_tag.string = f"{{{{{table_id}}}}}"
            table.replace_with(placeholder_tag)
    # Wikitable placeholder numbering continues after the infoboxes.
    wikitable_start_index = len(infoboxes) + 1
    for i, table in enumerate(content_soup.find_all("table", class_="wikitable")):
        table_id = f"table_{wikitable_start_index + i}"
        if table_id in tables_dict:
            placeholder_tag = content_soup.new_tag("p")
            placeholder_tag.string = f"{{{{{table_id}}}}}"
            table.replace_with(placeholder_tag)

    # Strip reference markers and page chrome. BUGFIX: find_all() with a list
    # matches tag *names* only, so entries like "div.hatnote" never matched
    # anything — hatnotes, navboxes, and edit links were left in the output.
    # Class-qualified elements need CSS selectors via select().
    for element in content_soup.find_all("sup"):
        element.decompose()
    for element in content_soup.select("div.hatnote, div.navbox, span.mw-editsection"):
        element.decompose()

    # Walk headings, paragraphs, and lists in document order.
    elements = content_soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol"])
    text_content = []
    for element in elements:
        if element.name and element.name.startswith("h"):  # type: ignore
            # Heading level maps to the number of leading '#' characters.
            level = int(element.name[1])  # type: ignore
            heading_text = element.get_text().strip()
            if heading_text:  # skip empty headings
                text_content.append("\n" + "#" * level + " " + heading_text)
        elif element.name == "p":  # type: ignore
            # Table placeholders were inserted as <p> tags, so they flow
            # through here unchanged along with ordinary paragraphs.
            paragraph_text = element.get_text().strip()
            if paragraph_text:  # skip empty paragraphs
                text_content.append(paragraph_text)
        elif element.name in ["ul", "ol"] and element.parent.name not in ["li", "ul", "ol"]:  # type: ignore
            # Top-level lists only; nested lists are rendered recursively by
            # process_list_element via their parent <li>.
            list_content = process_list_element(element)
            if list_content:
                text_content.append(list_content)

    return "\n\n".join(text_content), tables_dict