File size: 776 Bytes
4694efc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

class GridCodePDFLoader:
    """Load a PDF with ``PyPDFLoader`` and split it into text chunks.

    The chunking is delegated to a ``RecursiveCharacterTextSplitter`` that is
    built once in ``__init__`` and kept on ``self.text_splitter``.
    """

    # Separators handed to the splitter when the caller does not supply any:
    # paragraph break, line break, sentence end, space, then the empty string.
    # Stored as a tuple so the shared default can never be mutated in place.
    DEFAULT_SEPARATORS = ("\n\n", "\n", ".", " ", "")

    def __init__(
        self,
        pdf_path,
        *,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        separators=None,
    ):
        """Remember the PDF location and configure the text splitter.

        Args:
            pdf_path: Location of the PDF; passed unchanged to ``PyPDFLoader``
                when ``load_and_split`` is called.
            chunk_size: ``chunk_size`` forwarded to the splitter
                (default 500, the previously hard-coded value).
            chunk_overlap: ``chunk_overlap`` forwarded to the splitter
                (default 50, the previously hard-coded value).
            separators: Optional iterable of separator strings forwarded to
                the splitter. ``None`` selects ``DEFAULT_SEPARATORS``.
        """
        self.pdf_path = pdf_path
        if separators is None:
            separators = self.DEFAULT_SEPARATORS
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Always pass a fresh list so the splitter never shares state with
            # the class-level default or with the caller's own sequence.
            separators=list(separators),
        )

    def load_and_split(self):
        """Load the PDF and split it into chunks.

        Returns:
            Whatever ``self.text_splitter.split_documents`` returns for the
            documents produced by ``PyPDFLoader(self.pdf_path).load()``.
        """
        loader = PyPDFLoader(self.pdf_path)
        pages = loader.load()
        return self.text_splitter.split_documents(pages)

    def extract_metadata(self):
        """Extract metadata from the PDF such as sections and tables.

        Not implemented yet: this is a placeholder that always returns
        ``None``.
        """
        # TODO: Implement metadata extraction
        return None