QuestionAnswering / extract_text.py
Vlad Bastina
default changes
22c5be7
from pypdf import PdfReader
def extract_text_pypdf(pdf_path) -> str:
    """Return the plain-mode text of every page in a PDF as one string.

    Args:
        pdf_path: The PDF to read; handed unchanged to ``PdfReader``.

    Returns:
        The extracted text of all pages in document order. Every page's
        text, including the last one, is followed by two newline
        characters. A document with no pages yields an empty string.
    """
    reader = PdfReader(pdf_path)
    # Build the result with a single join rather than repeated "+=" so the
    # cost stays linear in the total text size for large documents.
    return "".join(
        page.extract_text(extraction_mode='plain') + "\n\n"
        for page in reader.pages
    )
if __name__ == "__main__":
    # Demo run: extract one PDF, keep a copy on disk, then echo the text.
    source_pdf = "CFR-2019-title21-vol2.pdf"  # Point this at your own PDF.
    extracted = extract_text_pypdf(source_pdf)

    # Persist the result as UTF-8 text.
    with open("output.txt", "w", encoding="utf-8") as out_file:
        out_file.write(extracted)

    # Also dump the full extracted text to stdout.
    print(extracted)