Spaces:

TorchLLM
/

GeminiRAG

Build error

App Files Files Community

GeminiRAG / src /notebooks /temp.py

TorchLLM

Initial commit for deploying the project

d9e3edb about 1 year ago

raw

history blame contribute delete

1.81 kB

	import fitz # PyMuPDF
	import pytesseract
	from PIL import Image


	# Define a function to extract text from all pages of a PDF
	def extract_text_from_pdf(pdf_path, dpi=300):
	    """OCR every page of a PDF and return the recognized text per page.

	    Each page is rendered to a raster image with PyMuPDF at the requested
	    resolution, and the image is passed to Tesseract through ``pytesseract``.

	    Args:
	        pdf_path (str): Path to the PDF file.
	        dpi (int): Resolution used when rendering pages to images (default: 300).

	    Returns:
	        dict: Maps 1-based page numbers to the text recognized on that page.
	    """
	    extracted_text = {}

	    # The context manager closes the document even if rendering or OCR raises.
	    with fitz.open(pdf_path) as pdf_document:
	        for page_number, page in enumerate(pdf_document, start=1):
	            # alpha=False (with the default RGB colorspace) keeps the samples
	            # laid out as plain RGB, matching the mode passed to PIL below.
	            pixmap = page.get_pixmap(dpi=dpi, alpha=False)

	            # Wrap the raw pixel buffer in an in-memory PIL image; nothing is
	            # written to disk.
	            image = Image.frombytes(
	                "RGB", (pixmap.width, pixmap.height), pixmap.samples
	            )

	            # Run Tesseract on the rendered page and store the result.
	            extracted_text[page_number] = pytesseract.image_to_string(image)

	    return extracted_text


	# Usage example
	if __name__ == "__main__":
	    import sys

	    # An optional first command-line argument overrides the sample PDF path.
	    default_pdf_path = "c:/Abhi-MTech/Sem-1/AI/AI Technical.pdf"
	    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path

	    try:
	        all_text = extract_text_from_pdf(pdf_path)
	    except Exception as e:
	        # Top-level boundary: report on stderr and exit non-zero so that a
	        # failed run is distinguishable from a successful one.
	        print(f"Error: {e}", file=sys.stderr)
	        sys.exit(1)

	    for page_num, text in all_text.items():
	        print(f"Page {page_num} Text:")
	        print(text)
	        print("-" * 80)  # Separator for readability