Spaces:

Deadmon
/

ocr-pdf

Sleeping

App Files Files Community

Deadmon committed on Mar 9, 2025

Commit

36ada58

verified ·

1 Parent(s): d1e4811

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -74

app.py CHANGED Viewed

@@ -2,9 +2,7 @@ import os
 from pathlib import Path
 import fitz  # PyMuPDF for PDF handling
 from PIL import Image
-import pytesseract  # For OCR
 from transformers import BlipProcessor, BlipForConditionalGeneration  # For image captioning
-import io
 import torch
 import gradio as gr
@@ -12,56 +10,59 @@ import gradio as gr
 OUTPUT_DIR = Path("outputs")
 OUTPUT_DIR.mkdir(exist_ok=True)
def pdf_to_images(pdf_path):
    """
    Convert every page of a PDF into an appropriately sized PNG image.

    Each page is scaled so that its longest side is approximately 2000
    pixels, saved as ``OUTPUT_DIR / "page_<n>.png"`` (1-based ``n``), and
    also kept in memory as a PIL image.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of ``(image_path, image)`` tuples in page order, or an empty
        list if any step fails (the error is printed, not raised).
    """
    try:
        images = []
        # The context manager closes the document even when rendering a page
        # raises; a trailing close() call would be skipped on that path.
        with fitz.open(pdf_path) as pdf_document:
            for page_number, page in enumerate(pdf_document, start=1):
                # Pick the zoom from the page size so the longest side ends
                # up at roughly 2000 pixels regardless of the page format.
                rect = page.rect
                zoom = 2000 / max(rect.width, rect.height)
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                # Convert the rendered pixmap to a PIL image.
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                image_path = OUTPUT_DIR / f"page_{page_number}.png"
                img.save(image_path, "PNG")
                images.append((image_path, img))
        return images
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with [].
        print(f"Error converting PDF to images: {str(e)}")
        return []
def extract_text_from_image(image):
    """
    Extract text from an image using pytesseract OCR.

    Args:
        image: The image handed to ``pytesseract.image_to_string``.

    Returns:
        The recognized text with surrounding whitespace stripped, or an
        empty string if OCR fails (the error is printed, not raised).
    """
    try:
        ocr_output = pytesseract.image_to_string(image)
        cleaned = ocr_output.strip()
        return cleaned
    except Exception as e:
        print(f"Error during OCR: {str(e)}")
        return ""
 def analyze_image(image_path):
@@ -91,43 +92,53 @@ def process_pdf(pdf_path, output_txt_path):
     """
     Main function to process the PDF and generate output
     """
-    # Convert PDF to images
-    print("Converting PDF to images...")
-    images = pdf_to_images(pdf_path)
-    if not images:
-        print("No images were generated from the PDF.")
-        return
-    # Prepare output file
-    with open(output_txt_path, 'w', encoding='utf-8') as f:
-        f.write(f"Analysis of {os.path.basename(pdf_path)}\n")
-        f.write("=" * 50 + "\n\n")
-        # Process each page
-        for page_num, (image_path, image) in enumerate(images, 1):
-            print(f"Processing page {page_num}...")
-            # Write page header
-            f.write(f"Page {page_num}\n")
-            f.write("-" * 30 + "\n\n")
-            # Extract and write text
-            text = extract_text_from_image(image)
-            if text:
-                f.write("Extracted Text:\n")
-                f.write(text)
-                f.write("\n\n")
-            else:
-                f.write("No text could be extracted from this page.\n\n")
-            # Analyze image and write description
-            description = analyze_image(image_path)
-            f.write("Image Description:\n")
-            f.write(f"{description}\n")
-            f.write("\n" + "=" * 50 + "\n\n")
-    print(f"Processing complete. Results saved to {output_txt_path}")
 def process_uploaded_pdf(pdf_file):
     if pdf_file is None:
@@ -148,7 +159,7 @@ interface = gr.Interface(
     inputs=gr.File(label="Upload PDF"),
     outputs=gr.Textbox(label="Analysis Results"),
     title="PDF Analyzer",
-    description="Upload a PDF file to extract text and analyze images."
 )
 interface.launch()

 from pathlib import Path
 import fitz  # PyMuPDF for PDF handling
 from PIL import Image
 from transformers import BlipProcessor, BlipForConditionalGeneration  # For image captioning
 import torch
 import gradio as gr
 OUTPUT_DIR = Path("outputs")
 OUTPUT_DIR.mkdir(exist_ok=True)
def generate_page_image(pdf_path, page_num):
    """
    Generate a PNG image from a specific PDF page for analysis.

    The page is scaled so that its longest side is approximately 2000
    pixels and written to ``OUTPUT_DIR / "page_<page_num + 1>.png"``.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to render.

    Returns:
        The path of the saved PNG, or None if any step fails (the error is
        printed, not raised).
    """
    try:
        # The context manager closes the document even when rendering raises;
        # a trailing close() call would be skipped on the error path.
        with fitz.open(pdf_path) as pdf_document:
            page = pdf_document[page_num]
            # Pick the zoom from the page size so the longest side ends up
            # at roughly 2000 pixels regardless of the page format.
            rect = page.rect
            zoom = 2000 / max(rect.width, rect.height)
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            # Convert the rendered pixmap to a PIL image and save it.
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image_path = OUTPUT_DIR / f"page_{page_num + 1}.png"
            img.save(image_path, "PNG")
        return image_path
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with None.
        print(f"Error generating image for page {page_num + 1}: {str(e)}")
        return None
def extract_text_from_pdf(pdf_path, page_num):
    """
    Extract text directly from a specific PDF page.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to read.

    Returns:
        The page's text with surrounding whitespace stripped, or an empty
        string if extraction fails (the error is printed, not raised).
    """
    try:
        # The context manager closes the document even when the page lookup
        # or text extraction raises; a trailing close() would be skipped then.
        with fitz.open(pdf_path) as pdf_document:
            text = pdf_document[page_num].get_text("text")
        return text.strip()
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with "".
        print(f"Error extracting text from page {page_num + 1}: {str(e)}")
        return ""
 def analyze_image(image_path):
     """
     Main function to process the PDF and generate output
     """
+    try:
+        # Open the PDF to get page count
+        pdf_document = fitz.open(pdf_path)
+        num_pages = len(pdf_document)
+        pdf_document.close()
+        if num_pages == 0:
+            print("The PDF is empty.")
+            return
+        # Prepare output file
+        with open(output_txt_path, 'w', encoding='utf-8') as f:
+            f.write(f"Analysis of {os.path.basename(pdf_path)}\n")
+            f.write("=" * 50 + "\n\n")
+            # Process each page
+            for page_num in range(num_pages):
+                print(f"Processing page {page_num + 1}...")
+                # Write page header
+                f.write(f"Page {page_num + 1}\n")
+                f.write("-" * 30 + "\n\n")
+                # Extract and write text
+                text = extract_text_from_pdf(pdf_path, page_num)
+                if text:
+                    f.write("Extracted Text:\n")
+                    f.write(text)
+                    f.write("\n\n")
+                else:
+                    f.write("No text could be extracted from this page.\n\n")
+                # Generate image for analysis and write description
+                image_path = generate_page_image(pdf_path, page_num)
+                if image_path:
+                    description = analyze_image(image_path)
+                    f.write("Image Description:\n")
+                    f.write(f"{description}\n")
+                    f.write("\n" + "=" * 50 + "\n\n")
+                else:
+                    f.write("Image Description:\n")
+                    f.write("Could not generate image for analysis.\n")
+                    f.write("\n" + "=" * 50 + "\n\n")
+        print(f"Processing complete. Results saved to {output_txt_path}")
+    except Exception as e:
+        print(f"Error processing PDF: {str(e)}")
 def process_uploaded_pdf(pdf_file):
     if pdf_file is None:
     inputs=gr.File(label="Upload PDF"),
     outputs=gr.Textbox(label="Analysis Results"),
     title="PDF Analyzer",
+    description="Upload a PDF file to extract text directly and analyze images."
 )
 interface.launch()