Final_Assignment / langchain_custom.py
neznib
updated
1691123
raw
history blame contribute delete
936 Bytes
from collections.abc import Iterator
from io import StringIO

import pandas as pd
import requests

from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document
class WikipediaTableLoader(BaseLoader):
    """Load every HTML table on a Wikipedia page as a separate Document.

    Each table is converted to Markdown with ``DataFrame.to_markdown``
    (which requires the ``tabulate`` package to be installed) and wrapped
    in a Document whose metadata records the source URL, the page title,
    and the table's index on the page.
    """

    def __init__(self, url: str, title: str) -> None:
        """
        Args:
            url: Full URL of the Wikipedia page to fetch.
            title: Page title to record in each Document's metadata.
        """
        self.url = url
        self.title = title

    def lazy_load(self) -> Iterator[Document]:
        """Yield one Document per table found on the page.

        Raises:
            requests.HTTPError: if the page fetch returns a 4xx/5xx status.
            ValueError: if the page contains no parseable tables
                (propagated from ``pandas.read_html``).
        """
        resp = requests.get(self.url, timeout=10)
        # Fail loudly on HTTP errors instead of handing an error page to
        # read_html, which would raise a misleading "No tables found".
        resp.raise_for_status()
        dfs = pd.read_html(StringIO(resp.text))  # grab the tables
        for i, df in enumerate(dfs):
            yield Document(
                page_content=df.to_markdown(index=False),
                metadata={
                    "source": self.url,
                    "title": self.title,
                    "table_index": i,
                },
            )

    def load(self) -> list[Document]:
        """Eagerly load all tables on the page as a list of Documents."""
        return list(self.lazy_load())