import json
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from pypdf import PdfReader
def read_pdf(file_name: str) -> str:
    """Extract the text of every page of a PDF, labelling each page.

    Args:
        file_name (str): Path to the PDF file; passed straight to ``PdfReader``.

    Returns:
        str: For each page, in document order, the marker ``--- Page N ---``
            (N is 1-based), a blank line, and the text extracted from that
            page. An empty string when the document has no pages.
    """
    reader = PdfReader(file_name)
    # Nothing is inserted between one page's text and the next page's marker,
    # so the pieces are joined with an empty separator to keep the output
    # format exactly as existing callers receive it.
    return "".join(
        f"--- Page {page_num} ---" + "\n\n" + page.extract_text()
        for page_num, page in enumerate(reader.pages, start=1)
    )
def pdf_to_text_ocr(pdf_path: str, dpi: int = 300, lang: str = "eng") -> str:
    """
    Run OCR over a scanned/image-based PDF file and return the result as JSON.

    Args:
        pdf_path (str): Path to the PDF file.
        dpi (int): Resolution handed to the PDF-to-image conversion (default 300).
        lang (str): Language code handed to the OCR engine (default 'eng').

    Returns:
        str: A JSON array (serialized with ``indent=1``) containing one
            recognized-text string per image produced by the conversion,
            in the order the images were returned.
    """
    page_images = convert_from_path(pdf_path, dpi=dpi)
    recognized = [
        pytesseract.image_to_string(
            page_image, lang=lang, output_type=pytesseract.Output.STRING
        )
        for page_image in page_images
    ]
    return json.dumps(recognized, indent=1)
def pdf_bytes_to_text_ocr(pdf_bytes: bytes, dpi: int = 300, lang: str = "eng") -> str:
    """
    Run OCR over a scanned/image-based PDF supplied as raw bytes.

    Args:
        pdf_bytes (bytes): The PDF document's content.
        dpi (int): Resolution handed to the PDF-to-image conversion (default 300).
        lang (str): Language code handed to the OCR engine (default 'eng').

    Returns:
        str: The recognized text of every converted image, joined with
            newlines, with leading and trailing whitespace stripped.
    """
    page_images = convert_from_bytes(pdf_bytes, dpi=dpi)
    recognized = (
        pytesseract.image_to_string(page_image, lang=lang)
        for page_image in page_images
    )
    combined = "\n".join(recognized)
    return combined.strip()
|