import json

import pytesseract
from pdf2image import convert_from_bytes, convert_from_path
from pypdf import PdfReader


def read_pdf(file_name: str) -> str:
    """Extract the embedded text of every page of a PDF using pypdf.

    Each page's text is preceded by a ``--- Page N ---`` header (N is
    1-based) and a blank line. Nothing is inserted after a page's text,
    so the next header follows it directly.

    Args:
        file_name (str): Path to the PDF file.

    Returns:
        str: The headers and page texts concatenated in page order; an
        empty string when the document has no pages.
    """
    reader = PdfReader(file_name)
    # Build all sections first and join once instead of growing a string
    # with += inside the loop.
    return "".join(
        f"--- Page {page_number} ---" + "\n\n" + page.extract_text()
        for page_number, page in enumerate(reader.pages, start=1)
    )


def pdf_to_text_ocr(pdf_path: str, dpi: int = 300, lang: str = "eng") -> str:
    """
    Convert a scanned/image-based PDF to text using OCR.

    Every page is rendered to an image with pdf2image and passed to
    Tesseract through pytesseract.

    Args:
        pdf_path (str): Path to the PDF file.
        dpi (int): Resolution for PDF to image conversion (default 300).
        lang (str): Language code for OCR (default 'eng').

    Returns:
        str: A JSON-encoded array (``indent=1``) holding one string per
        page, in page order. Note that this is a JSON document, not plain
        text: decode it with ``json.loads`` to get the per-page strings.
    """
    images = convert_from_path(pdf_path, dpi=dpi)
    page_texts = [
        pytesseract.image_to_string(
            img, lang=lang, output_type=pytesseract.Output.STRING
        )
        for img in images
    ]
    return json.dumps(page_texts, indent=1)


def pdf_bytes_to_text_ocr(pdf_bytes: bytes, dpi: int = 300, lang: str = "eng") -> str:
    """
    Convert a scanned/image-based PDF (from bytes) to text using OCR.

    Every page is rendered to an image with pdf2image and passed to
    Tesseract through pytesseract.

    Args:
        pdf_bytes (bytes): PDF content in bytes.
        dpi (int): Resolution for PDF to image conversion (default 300).
        lang (str): Language code for OCR (default 'eng').

    Returns:
        str: The OCR text of all pages joined with a newline, with leading
        and trailing whitespace stripped from the combined result.
    """
    images = convert_from_bytes(pdf_bytes, dpi=dpi)
    return "\n".join(
        pytesseract.image_to_string(img, lang=lang) for img in images
    ).strip()