File size: 410 Bytes
a50fdc7
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
from pdf2image import convert_from_path
import pytesseract
import tempfile

def extract_text_from_pdf(pdf_path):
    """Return the OCR text of every page in a PDF as a single string.

    The PDF is rendered to page images at 300 DPI via
    ``pdf2image.convert_from_path``, each image is passed through
    ``pytesseract.image_to_string``, and the per-page results are joined
    with a newline in page order.

    Args:
        pdf_path: Path to the PDF file, forwarded unchanged to
            ``convert_from_path``.

    Returns:
        The recognised text of all pages separated by ``"\n"``; an empty
        string if no page images were produced.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        pages = convert_from_path(pdf_path, dpi=300, output_folder=scratch_dir)
        # Run OCR while the temporary directory still exists, since the
        # rendered pages were written there (output_folder=scratch_dir).
        page_texts = [pytesseract.image_to_string(page) for page in pages]
    return "\n".join(page_texts)