File size: 1,971 Bytes
7f0cd9b
2cade03
 
0cfb077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2cade03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f0cd9b
2cade03
 
7f0cd9b
2cade03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import json
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from pypdf import PdfReader

def read_pdf(file_name: str) -> str:
    """
    Extract the embedded text of a PDF, page by page.

    Each page contributes a ``--- Page N ---`` header (N starts at 1),
    a blank line, and then whatever ``page.extract_text()`` returns.
    Nothing is appended after a page's text, so the next page's header
    follows it directly.

    Args:
        file_name (str): Path to the PDF file, handed to ``PdfReader``.

    Returns:
        str: The concatenated headers and page texts; an empty string when
            the reader reports no pages.
    """
    reader = PdfReader(file_name)

    # Build every page chunk and join once instead of growing a string
    # with += inside the loop.
    return "".join(
        f"--- Page {page_num} ---" + "\n\n" + page.extract_text()
        for page_num, page in enumerate(reader.pages, start=1)
    )

def pdf_to_text_ocr(pdf_path: str, dpi: int = 300, lang: str = "eng") -> str:
    """
    Convert a scanned/image-based PDF to text using OCR.

    The PDF is rendered to images with ``convert_from_path`` and each image
    is passed to ``pytesseract.image_to_string``.

    Note that the result is a JSON document rather than plain text: the
    per-image OCR strings are kept separate and serialized as a JSON array.
    Use ``json.loads`` on the return value to get the list back.

    Args:
        pdf_path (str): Path to the PDF file.
        dpi (int): Resolution for PDF to image conversion (default 300).
        lang (str): Language code for OCR (default 'eng').

    Returns:
        str: A JSON array (serialized with ``indent=1``) containing one
            OCR text string per image returned by ``convert_from_path``,
            in the same order.
    """
    images = convert_from_path(pdf_path, dpi=dpi)

    # One OCR result per rendered image; the index is not needed.
    page_texts = [
        pytesseract.image_to_string(
            img, lang=lang, output_type=pytesseract.Output.STRING
        )
        for img in images
    ]

    return json.dumps(page_texts, indent=1)

def pdf_bytes_to_text_ocr(pdf_bytes: bytes, dpi: int = 300, lang: str = "eng") -> str:
    """
    Run OCR over an in-memory scanned/image-based PDF.

    The bytes are rendered to images with ``convert_from_bytes``; each image
    is then passed to ``pytesseract.image_to_string``.

    Args:
        pdf_bytes (bytes): Raw content of the PDF.
        dpi (int): Resolution used when rendering the PDF to images
            (default 300).
        lang (str): Language code given to the OCR engine (default 'eng').

    Returns:
        str: The per-image OCR results joined with a newline, with leading
            and trailing whitespace stripped from the combined string.
    """
    rendered_pages = convert_from_bytes(pdf_bytes, dpi=dpi)

    # OCR each rendered image in order; join() consumes the generator.
    recognized = (
        pytesseract.image_to_string(page_image, lang=lang)
        for page_image in rendered_pages
    )
    combined = "\n".join(recognized)

    return combined.strip()