import fitz  # PyMuPDF
import pytesseract
from PIL import Image


def extract_text_from_pdf(pdf_path, dpi=300):
    """
    Extract text from all pages of a PDF using OCR.

    Each page is rendered to a pixmap with PyMuPDF at the requested
    resolution, wrapped in a PIL image, and passed to Tesseract via
    ``pytesseract.image_to_string``.

    Args:
        pdf_path (str): Path to the PDF file.
        dpi (int): Resolution for converting PDF pages to images (default: 300).

    Returns:
        dict: A dictionary where keys are page numbers (1-based) and values
        are the text extracted from that page.

    Raises:
        Any exception raised while opening the document, rendering a page,
        or running OCR propagates to the caller; the document is closed
        before the exception leaves this function.
    """
    extracted_text = {}

    # The context manager guarantees the document is closed even when
    # rendering or OCR fails part-way through.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            # Render the page to a raster image at the requested resolution.
            pixmap = page.get_pixmap(dpi=dpi)

            # Build an in-memory PIL image from the raw pixel buffer (nothing
            # is written to disk).
            # NOTE(review): the "RGB" mode relies on get_pixmap() producing
            # RGB samples without an alpha channel by default - confirm
            # against the PyMuPDF version in use.
            image = Image.frombytes(
                "RGB", (pixmap.width, pixmap.height), pixmap.samples
            )

            # Run Tesseract on the image and store the text under the
            # 1-based page number.
            extracted_text[page_number] = pytesseract.image_to_string(image)

    return extracted_text


def main():
    """Run OCR on a sample PDF and print the text of every page."""
    # Alternative input:
    # pdf_path = "c:/Abhi-MTech/Sem-1/AI/Books/Artificial.Intelligence.A.Modern.Approach.4th.Edition.Peter.Norvig. Stuart.Russell.Pearson.9780134610993.EBooksWorld.ir.pdf"
    pdf_path = "c:/Abhi-MTech/Sem-1/AI/AI Technical.pdf"  # Path to your PDF file

    # Top-level boundary: report any failure instead of showing a traceback.
    try:
        all_text = extract_text_from_pdf(pdf_path)
        for page_num, text in all_text.items():
            print(f"Page {page_num} Text:")
            print(text)
            print("-" * 80)  # Separator for readability
    except Exception as e:
        print(f"Error: {e}")


# Usage example
if __name__ == "__main__":
    main()