from io import StringIO
from typing import Iterator

from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document
import pandas as pd
import requests


class WikipediaTableLoader(BaseLoader):
    """Load every HTML table on a Wikipedia page as a separate Document.

    Each table is rendered to a markdown string (via ``DataFrame.to_markdown``)
    and wrapped in a ``Document`` whose metadata records the source URL, the
    page title, and the table's position on the page.
    """

    def __init__(self, url: str, title: str, timeout: float = 10.0):
        """Initialize the loader.

        Args:
            url: Full URL of the Wikipedia page to fetch.
            title: Human-readable page title stored in each Document's metadata.
            timeout: Seconds to wait for the HTTP response (default 10).
        """
        self.url = url
        self.title = title
        self.timeout = timeout

    def lazy_load(self) -> Iterator[Document]:
        """Yield one Document per table found on the page.

        Raises:
            requests.HTTPError: If the page fetch returns an error status.
            ValueError: If ``pd.read_html`` finds no tables in the page.
        """
        response = requests.get(self.url, timeout=self.timeout)
        # Fail loudly on HTTP errors instead of feeding an error page
        # into pd.read_html, which would raise a confusing parse error.
        response.raise_for_status()
        # StringIO wrapper avoids pandas' deprecation warning for passing
        # literal HTML strings to read_html.
        tables = pd.read_html(StringIO(response.text))
        for index, table in enumerate(tables):
            yield Document(
                page_content=table.to_markdown(index=False),
                metadata={
                    "source": self.url,
                    "title": self.title,
                    "table_index": index,
                },
            )

    def load(self) -> list[Document]:
        """Eagerly load all tables; see :meth:`lazy_load` for details."""
        return list(self.lazy_load())