from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from transformers import AutoTokenizer
| | class DocParsing: |
| |
|
| | chunk_size = 350 |
| | chunk_overlap = 50 |
| |
|
| | def __init__(self, file_path, model_name, max_model_tokens=384): |
| | """ |
| | Initialize the DocParsing class with the provided file path, model name, and maximum model tokens. |
| | |
| | Parameters: |
| | file_path (str): The path to the PDF file to be processed. |
| | model_name (str): The name of the transformer model to be used for tokenization. |
| | max_model_tokens (int, optional): The maximum number of tokens allowed for each chunk. Defaults to 384. |
| | |
| | Returns: |
| | None |
| | """ |
| | self.file_path = file_path |
| |
|
| | |
| | self.tokenizer = AutoTokenizer.from_pretrained(model_name) |
| |
|
| | self.max_model_tokens = max_model_tokens |
| |
|
| | def process_pdf(self): |
| | """ |
| | Process the PDF file by loading it, splitting it into chunks, and returning the chunks. |
| | |
| | This function first calls the `load_pdf` method to load the PDF file into a list of Document objects. |
| | Then, it calls the `create_chunks` method to split each Document into smaller chunks based on the specified |
| | chunk size and overlap. Finally, it returns the list of chunks. |
| | |
| | Parameters: |
| | None |
| | |
| | Returns: |
| | list: A list of Document objects, where each Document represents a chunk of the PDF file. |
| | """ |
| | self.load_pdf() |
| | self.create_chunks() |
| | return self.chunks |
| |
|
| | def load_pdf(self): |
| | """ |
| | Load the PDF file specified by the file_path attribute into a list of Document objects. |
| | |
| | This function uses the PyPDFLoader class from the langchain library to load the PDF file. |
| | The loaded Document objects are stored in the self.documents attribute. |
| | |
| | Parameters: |
| | None |
| | |
| | Returns: |
| | None |
| | |
| | Raises: |
| | FileNotFoundError: If the specified file_path does not exist or cannot be accessed. |
| | """ |
| | loader = PyPDFLoader(self.file_path) |
| | self.documents = loader.load() |
| |
|
| | def create_chunks(self): |
| | """ |
| | Split the loaded PDF documents into smaller chunks based on the specified chunk size and overlap. |
| | |
| | This function iterates through each Document object in the self.documents list and calls the |
| | token_split_document method to split the Document into smaller chunks. The resulting chunks are |
| | then appended to the self.chunks list. |
| | |
| | Parameters: |
| | None |
| | |
| | Returns: |
| | None |
| | |
| | Attributes: |
| | self.chunks (list): A list of Document objects, where each Document represents a chunk of the PDF file. |
| | """ |
| | self.chunks = [] |
| | for doc in self.documents: |
| | self.chunks.extend( |
| | self.token_split_document( |
| | doc, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap |
| | ) |
| | ) |
| |
|
| | def tokenize(self, text): |
| | """ |
| | Tokenize the input text using the transformer model's tokenizer. |
| | |
| | This method uses the tokenizer provided by the transformer model to encode the input text. |
| | The special tokens are not added to the encoded tokens. |
| | |
| | Parameters: |
| | text (str): The input text to be tokenized. |
| | |
| | Returns: |
| | list: A list of integers representing the tokenized input text. |
| | """ |
| | return self.tokenizer.encode(text, add_special_tokens=False) |
| |
|
| | def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50): |
| | """ |
| | Split a single Document into multiple chunks based on token length. |
| | |
| | This function tokenizes the input Document's page content, then splits the tokens into smaller chunks |
| | of specified size. Overlapping chunks are created by moving the start index forward by the difference |
| | between chunk size and overlap. Each chunk is then decoded back into text and a new Document is created |
| | with the same metadata but truncated text. |
| | |
| | Parameters: |
| | doc (Document): The input Document to be split into chunks. |
| | chunk_size (int, optional): The size of each chunk in tokens. Defaults to 350. |
| | chunk_overlap (int, optional): The overlap between chunks in tokens. Defaults to 50. |
| | |
| | Returns: |
| | list: A list of Document objects, where each Document represents a chunk of the input Document. |
| | """ |
| | tokens = self.tokenize(doc.page_content) |
| | chunks = [] |
| | start = 0 |
| | while start < len(tokens): |
| | end = min(start + chunk_size, len(tokens)) |
| | chunk_tokens = tokens[start:end] |
| | chunk_text = self.tokenizer.decode(chunk_tokens) |
| | |
| | chunk_doc = Document(page_content=chunk_text, metadata=doc.metadata) |
| | chunks.append(chunk_doc) |
| | |
| | start += chunk_size - chunk_overlap |
| | return chunks |
| |
|