import glob

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


class DataExtractor:
    """Load the PDF files found in one directory and split them into chunks.

    Typical use is ``extract_text()`` followed by
    ``clean_and_split_text(extractor.pdf_text)``.
    """

    def __init__(self, pdf_directory):
        """Remember the directory to scan and set up empty result holders.

        Args:
            pdf_directory: Directory searched (non-recursively) for ``*.pdf``
                files by ``extract_text``. It is interpolated into a glob
                pattern as-is.
        """
        self.pdf_directory = pdf_directory
        # One entry per loaded PDF; each entry is whatever
        # ``PyPDFLoader(...).load()`` returned for that file.
        self.pdf_text = []
        # Not written by any method of this class.
        self.docs = []
        # Result of the most recent ``clean_and_split_text`` call
        # (``None`` until that method has been called).
        self.split_docs = None

    def extract_text(self):
        """Load every ``*.pdf`` directly inside ``self.pdf_directory``.

        Each file is loaded with ``PyPDFLoader`` and the loader's result is
        appended to ``self.pdf_text``. Results accumulate: calling this method
        again appends the files a second time rather than replacing them.

        Returns:
            list: ``self.pdf_text`` itself (not a copy), one entry per PDF
            loaded so far.
        """
        print(f'Extracting text from pdf files in {self.pdf_directory}')
        # glob returns matches in arbitrary, filesystem-dependent order;
        # sorting makes the load order (and any downstream chunk order)
        # reproducible between runs and machines.
        pdf_files = sorted(glob.glob(f'{self.pdf_directory}/*.pdf'))
        print(pdf_files)
        for pdf_file in pdf_files:
            print(pdf_file)
            self.pdf_text.append(PyPDFLoader(pdf_file).load())
        return self.pdf_text

    def clean_and_split_text(self, documents):
        """Split groups of documents into chunks with a recursive splitter.

        Despite the name, no text cleaning is performed here; the documents
        are only split.

        Args:
            documents: A sized collection (``len()`` is called on it) whose
                items are each passed to the splitter's ``split_documents``,
                e.g. the list returned by ``extract_text``.

        Returns:
            list: All chunks from all items, in input order. The same list is
            also stored on ``self.split_docs``.
        """
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        split_docs = []
        print(f'Cleaning and splitting text from {len(documents)} documents')
        for doc in documents:
            split_docs.extend(splitter.split_documents(doc))
        print(f'Number of documents after splitting: {len(split_docs)}')
        # Keep the latest result on the instance so the attribute declared in
        # __init__ is actually populated.
        self.split_docs = split_docs
        return split_docs