mpebtraining / extract_pdf.py
jeyanthangj2004's picture
Upload 22 files
a1fc81e verified
raw
history blame contribute delete
411 Bytes
from pypdf import PdfReader
# Open the source PDF and collect the extracted text of every page, with a
# newline appended after each page's text.
reader = PdfReader("1-s2.0-S2405844024055324-main.pdf")
text = "".join(page.extract_text() + "\n" for page in reader.pages)

# Save the full text to a file instead of printing it: the content can be
# large, and a file can be read back in chunks afterwards.
with open("paper_content.txt", "w", encoding="utf-8") as f:
    f.write(text)

print("PDF content extracted to paper_content.txt")