documind / loader.py
Aaravkumar's picture
Update loader.py
fb4c7b3 verified
Raw
History Blame Contribute Delete
708 Bytes
import fitz
class Loader:
    """Load the text content of a PDF file, page by page.

    The file is opened with ``fitz`` each time :meth:`load` is called;
    nothing is read at construction time.
    """

    def __init__(self, file_path):
        """Remember the location of the PDF to read.

        Args:
            file_path: Path of the PDF file; passed unchanged to ``fitz.open``.
        """
        self.file = file_path

    @staticmethod
    def _clean_page_text(page_text):
        """Strip every line of *page_text* and drop lines that end up empty."""
        stripped_lines = (line.strip() for line in page_text.split("\n"))
        return "\n".join(line for line in stripped_lines if line)

    def load(self):
        """Extract the text of every page and return it as one string.

        Each page is extracted in plain ``"text"`` mode, its lines are
        stripped and blank lines removed, and the pages are then joined
        with a blank line between them. Progress is printed to stdout as a
        1-based page counter.

        Returns:
            The cleaned text of all pages, separated by ``"\\n\\n"``. An
            empty string if the document has no pages.
        """
        text_chunks = []
        # The context manager guarantees the document is closed even when
        # extraction of some page raises.
        with fitz.open(self.file) as doc:
            for display_number, page in enumerate(doc, start=1):
                print(f"Loading for {display_number} ")
                text_chunks.append(self._clean_page_text(page.get_text("text")))
        return "\n\n".join(text_chunks)