Spaces:
Sleeping
Sleeping
File size: 936 Bytes
1691123 707cf08 1691123 707cf08 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
from io import StringIO
from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document
import pandas as pd
import requests
class WikipediaTableLoader(BaseLoader):
    """Load every table on a Wikipedia page as a separate Document.

    Each HTML table that pandas can parse out of the fetched page becomes
    one ``Document`` whose content is that table rendered as Markdown.
    """

    def __init__(self, url: str, title: str):
        """Remember which page to fetch and the title to tag results with.

        Args:
            url: Address of the page to download.
            title: Title stored in the metadata of every produced Document.
        """
        self.url = url
        self.title = title

    def load(self) -> list:
        """Fetch the page and return one Document per table found on it.

        Returns:
            A list of ``Document`` objects, in the order
            ``pandas.read_html`` returned the tables. Each Document's
            metadata holds ``"source"`` (the URL), ``"title"`` and
            ``"table_index"`` (the table's position in that list).

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.RequestException: For other request failures, such as
                exceeding the 10-second timeout.
            ValueError: Propagated from ``pandas.read_html``, which raises
                rather than returning an empty list when it finds no tables.
        """
        response = requests.get(self.url, timeout=10)
        # Fail loudly on an HTTP error instead of parsing the error page's
        # HTML as though it were the requested article.
        response.raise_for_status()

        # Wrap the markup in StringIO so pandas treats it as a file-like
        # object rather than a literal HTML string.
        tables = pd.read_html(StringIO(response.text))

        return [
            Document(
                page_content=table.to_markdown(index=False),
                metadata={
                    "source": self.url,
                    "title": self.title,
                    "table_index": index,
                },
            )
            for index, table in enumerate(tables)
        ]