Spaces:

Fuzure
/

sheami

Sleeping

App Files Files Community

sheami / pdf_reader.py

vikramvasudevan

Upload folder using huggingface_hub

7f0cd9b verified 7 months ago

raw

history blame contribute delete

1.97 kB

	import json
	from pdf2image import convert_from_path, convert_from_bytes
	import pytesseract
	from pypdf import PdfReader

	def read_pdf(file_name: str) -> str:
	    """
	    Extract the embedded text of every page of a PDF file.

	    Each page is emitted as a ``--- Page N ---`` header (N starting at 1),
	    a blank line, and the text pypdf extracts for that page. Page sections
	    are separated by a blank line so the end of one page never runs into
	    the header of the next.

	    Args:
	        file_name (str): Path to the PDF file, handed to ``PdfReader``.

	    Returns:
	        str: The concatenated page sections; an empty string when the
	        document has no pages.
	    """
	    reader = PdfReader(file_name)

	    sections = []
	    for page_number, page in enumerate(reader.pages, start=1):
	        # Treat a missing extraction result as empty text so building the
	        # section cannot fail on a page without a text layer.
	        text = page.extract_text() or ""
	        sections.append(f"--- Page {page_number} ---\n\n{text}")

	    # Join once at the end instead of growing a string inside the loop.
	    return "\n\n".join(sections)

	def pdf_to_text_ocr(pdf_path: str, dpi: int = 300, lang: str = "eng") -> str:
	    """
	    OCR every page of a scanned/image-based PDF file.

	    The PDF is rendered to one image per page with ``convert_from_path``,
	    and each image is passed to ``pytesseract.image_to_string``.

	    Args:
	        pdf_path (str): Path to the PDF file.
	        dpi (int): Resolution used when rendering pages to images
	            (default 300).
	        lang (str): Language code given to the OCR engine (default 'eng').

	    Returns:
	        str: A JSON array (serialized with ``indent=1``) holding one string
	        of recognized text per page, in page order.

	    NOTE(review): pdf_bytes_to_text_ocr returns newline-joined plain text
	    rather than JSON; confirm the difference is intentional.
	    """
	    page_images = convert_from_path(pdf_path, dpi=dpi)

	    page_texts = [
	        pytesseract.image_to_string(
	            image, lang=lang, output_type=pytesseract.Output.STRING
	        )
	        for image in page_images
	    ]

	    return json.dumps(page_texts, indent=1)

	def pdf_bytes_to_text_ocr(pdf_bytes: bytes, dpi: int = 300, lang: str = "eng") -> str:
	    """
	    OCR every page of a scanned/image-based PDF supplied as raw bytes.

	    The bytes are rendered to one image per page with
	    ``convert_from_bytes``, and each image is passed to
	    ``pytesseract.image_to_string``.

	    Args:
	        pdf_bytes (bytes): PDF content in bytes.
	        dpi (int): Resolution used when rendering pages to images
	            (default 300).
	        lang (str): Language code given to the OCR engine (default 'eng').

	    Returns:
	        str: The recognized text of all pages joined with newlines, with
	        leading and trailing whitespace stripped.
	    """
	    page_images = convert_from_bytes(pdf_bytes, dpi=dpi)

	    combined = "\n".join(
	        pytesseract.image_to_string(image, lang=lang) for image in page_images
	    )

	    return combined.strip()