import tempfile

import pytesseract
from pdf2image import convert_from_path


def extract_text_from_pdf(pdf_path, *, dpi=300):
    """Run OCR over every page of a PDF and return the combined text.

    The PDF is rendered to page images with ``pdf2image.convert_from_path``
    (using a temporary directory as the output folder), each image is passed
    to ``pytesseract.image_to_string``, and the per-page results are joined
    with a single newline.

    Args:
        pdf_path: Path to the PDF file, forwarded unchanged to
            ``convert_from_path``.
        dpi: Rendering resolution forwarded to ``convert_from_path``.
            Defaults to 300, the value that was previously hard-coded.

    Returns:
        The OCR text of all pages, in the order the images are returned by
        ``convert_from_path``, separated by ``"\n"``. An empty string if no
        page images are produced.

    Raises:
        Any exception raised by ``convert_from_path`` or
        ``pytesseract.image_to_string`` propagates to the caller.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        images = convert_from_path(pdf_path, dpi=dpi, output_folder=tempdir)
        try:
            return "\n".join(
                pytesseract.image_to_string(img) for img in images
            )
        finally:
            # The rendered pages live in ``tempdir``; close every image (also
            # when OCR fails part-way) so no file handle is still open when
            # the directory is removed on leaving the ``with`` block.
            for img in images:
                img.close()