pavansuresh committed on
Commit
a50fdc7
·
verified ·
1 Parent(s): 0c1f04f

Create ocr_utils.py

Browse files
Files changed (1) hide show
  1. ocr_utils.py +16 -0
ocr_utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdf2image import convert_from_path
2
+ import pytesseract
3
+ import tempfile
4
+
5
def extract_text_from_pdf(pdf_path, *, dpi=300, lang=None):
    """Extract text from a scanned PDF using OCR (Tesseract).

    The PDF is rendered to one image per page with pdf2image, each page
    image is passed to pytesseract, and the per-page results are joined.

    Args:
        pdf_path: Path of the PDF file to read.
        dpi: Rendering resolution forwarded to ``convert_from_path``.
            Defaults to 300, the value that used to be hard-coded.
        lang: Tesseract language code(s) forwarded to
            ``pytesseract.image_to_string`` (for example ``"eng"``).
            ``None`` leaves the choice to pytesseract's own default.

    Returns:
        A single string holding the OCR text of every page in page order,
        with a newline between consecutive pages. A PDF that yields no
        page images produces an empty string.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        # Page images are rendered into tempdir, so all OCR work has to
        # finish before this ``with`` block exits and removes the folder.
        pages = convert_from_path(pdf_path, dpi=dpi, output_folder=tempdir)
        page_texts = []
        for page in pages:
            try:
                page_texts.append(
                    pytesseract.image_to_string(page, lang=lang)
                )
            finally:
                # Release the image (file handle and pixel data) as soon
                # as the page is done, rather than holding every page
                # until the function returns.
                page.close()
        return "\n".join(page_texts)