Spaces:
Sleeping
Sleeping
| from io import StringIO | |
| from langchain_community.document_loaders.base import BaseLoader | |
| from langchain_core.documents import Document | |
| import pandas as pd | |
| import requests | |
class WikipediaTableLoader(BaseLoader):
    """Load every table on a Wikipedia page as a separate Document.

    The page at ``url`` is downloaded with ``requests``, its HTML tables are
    parsed with ``pandas.read_html``, and each table is rendered to markdown
    and wrapped in its own ``Document``.
    """

    def __init__(self, url: str, title: str):
        """Remember which page to fetch and how to label its tables.

        Args:
            url: Address requested by :meth:`load`; also stored as the
                ``"source"`` metadata value of every Document.
            title: Stored as the ``"title"`` metadata value of every Document.
        """
        self.url = url
        self.title = title

    def load(self):
        """Download the page and return one Document per parsed table.

        Returns:
            A list of ``Document`` objects in the order ``pandas.read_html``
            returned the tables. ``page_content`` is the table rendered by
            ``DataFrame.to_markdown(index=False)``; ``metadata`` holds
            ``"source"``, ``"title"`` and the zero-based ``"table_index"``.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        response = requests.get(self.url, timeout=10)
        # Fail loudly on an error status instead of parsing the error page
        # as though it were the requested article.
        response.raise_for_status()

        # read_html is handed a file-like object rather than the raw markup.
        # NOTE(review): pandas documents that read_html raises instead of
        # returning an empty list when the page has no tables -- confirm that
        # callers expect this.
        tables = pd.read_html(StringIO(response.text))

        return [
            Document(
                page_content=table.to_markdown(index=False),
                metadata={
                    "source": self.url,
                    "title": self.title,
                    "table_index": index,
                },
            )
            for index, table in enumerate(tables)
        ]