| | import json |
| | from pdf2image import convert_from_path, convert_from_bytes |
| | import pytesseract |
| | from pypdf import PdfReader |
| |
|
def read_pdf(file_name: str) -> str:
    """Extract the embedded text of a PDF, page by page.

    Every page contributes a ``--- Page N ---`` header (N is 1-based),
    a blank line, and then the text returned by ``page.extract_text()``.

    Args:
        file_name: Path of the PDF file, handed to ``PdfReader``.

    Returns:
        The concatenation of all headers and page texts; an empty string
        when the document has no pages.
    """
    reader = PdfReader(file_name)

    # NOTE: no separator is inserted between the end of one page's text and
    # the next page's header; this matches the historical output format, so
    # callers that parse the result keep working.
    return "".join(
        # `or ""` keeps a page without extractable text from breaking the
        # concatenation.
        f"--- Page {page_num} ---" + "\n\n" + (page.extract_text() or "")
        for page_num, page in enumerate(reader.pages, start=1)
    )
| |
|
def pdf_to_text_ocr(pdf_path: str, dpi: int = 300, lang: str = "eng") -> str:
    """Run OCR over every page of a scanned/image-based PDF file.

    Args:
        pdf_path: Path to the PDF file.
        dpi: Resolution for the PDF-to-image conversion (default 300).
        lang: Language code passed to the OCR engine (default 'eng').

    Returns:
        A JSON-encoded array (``indent=1``) with one string of recognised
        text per rendered page, in the order the images are returned by
        ``convert_from_path``. Use ``json.loads`` to get the list back.
        Non-ASCII characters are ASCII-escaped (``json.dumps`` default).
    """
    images = convert_from_path(pdf_path, dpi=dpi)

    # One OCR pass per rendered page image.
    page_texts = [
        pytesseract.image_to_string(
            img, lang=lang, output_type=pytesseract.Output.STRING
        )
        for img in images
    ]

    return json.dumps(page_texts, indent=1)
| |
|
def pdf_bytes_to_text_ocr(pdf_bytes: bytes, dpi: int = 300, lang: str = "eng") -> str:
    """OCR an in-memory scanned/image-based PDF and return its text.

    Args:
        pdf_bytes: Raw content of the PDF document.
        dpi: Resolution for the PDF-to-image conversion (default 300).
        lang: Language code passed to the OCR engine (default 'eng').

    Returns:
        The recognised text of all pages joined with a newline, with
        leading and trailing whitespace stripped from the overall result.
    """
    rendered_pages = convert_from_bytes(pdf_bytes, dpi=dpi)

    # Recognise each page image separately, then stitch the results together.
    recognised = [
        pytesseract.image_to_string(page_image, lang=lang)
        for page_image in rendered_pages
    ]
    combined = "\n".join(recognised)
    return combined.strip()
| |
|