Spaces:
Sleeping
Sleeping
File size: 776 Bytes
# File revision 4694efc (taken from the file-viewer header)
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
class GridCodePDFLoader:
    """Load a PDF file and split its content into overlapping text chunks.

    The instance stores the PDF path and a preconfigured
    ``RecursiveCharacterTextSplitter``; the file itself is only read when
    ``load_and_split`` is called.
    """

    def __init__(self, pdf_path, *, chunk_size: int = 500, chunk_overlap: int = 50):
        """Remember the PDF location and build the text splitter.

        Args:
            pdf_path: Location of the PDF; handed unchanged to ``PyPDFLoader``.
            chunk_size: Forwarded to the splitter as ``chunk_size``.
                Defaults to 500, the value that was previously hard-coded.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
                Defaults to 50, the value that was previously hard-coded.
        """
        self.pdf_path = pdf_path
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Separator list is passed in this order: paragraph break, line
            # break, period, space, and finally the empty string.
            separators=["\n\n", "\n", ".", " ", ""],
        )

    def load_and_split(self):
        """Load the PDF and split it into chunks.

        Returns:
            Whatever ``self.text_splitter.split_documents`` returns for the
            documents produced by ``PyPDFLoader(self.pdf_path).load()``.
        """
        loader = PyPDFLoader(self.pdf_path)
        pages = loader.load()
        return self.text_splitter.split_documents(pages)

    def extract_metadata(self) -> None:
        """Extract metadata from the PDF (sections, tables, etc.).

        Not implemented yet: the method currently does nothing and
        returns ``None``.
        """
        # TODO: Implement metadata extraction
        return None