# PDF_to_JSON / app.py
# Author: Biifruu
# Commit e9cb6b1 (verified): "Update app.py"
import io
import base64
import numpy as np
import cv2
import fitz # PyMuPDF
import pytesseract
from PIL import Image
import gradio as gr
def text_area_ratio(image):
    """
    Estimate the fraction of an image covered by letter-sized blobs.

    The image is converted to grayscale and inverse-thresholded at 150 so
    that dark marks become foreground. Each external contour whose bounding
    box satisfies 8 < height < 40 and 5 < width < 100 (pixels) is counted as
    letter-like. The summed bounding-box area divided by the total pixel
    count is returned, or 0 when the image has no pixels.
    """
    gray = np.array(image.convert("L"))
    _, inverted = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(
        inverted, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    boxes = (cv2.boundingRect(contour) for contour in contours)
    letter_area = sum(
        width * height
        for _x, _y, width, height in boxes
        if 8 < height < 40 and 5 < width < 100
    )

    pixel_count = gray.shape[0] * gray.shape[1]
    if pixel_count > 0:
        return letter_area / pixel_count
    return 0
def has_significant_text(image):
    """
    Report whether letter-like contours cover more than 25% of the image.
    """
    coverage = text_area_ratio(image)
    return coverage > 0.25
def is_primarily_text(image, ocr_threshold=30):
    """
    Decide whether a crop is mostly text, using contours first and OCR second.

    OCR (English + Spanish) is only run when the contour analysis already
    reports significant text. The crop counts as primarily textual when the
    stripped OCR output is longer than 'ocr_threshold' characters.
    """
    # Cheap contour check first; skip the expensive OCR call when it fails.
    if not has_significant_text(image):
        return False
    recognized = pytesseract.image_to_string(image, lang="eng+spa").strip()
    return len(recognized) > ocr_threshold
def is_likely_photo(crop):
    """
    Evaluate whether a crop is likely an image (photo or diagram)
    based on tonal variation and the number of distinct gray levels.

    The crop is normalised to RGB before the grayscale conversion so that
    grayscale, RGBA or palette images are handled instead of crashing in
    cv2.cvtColor. An empty crop is never considered a photo.

    Returns True when the grayscale standard deviation exceeds 25 and more
    than 50 distinct gray levels are present.
    """
    rgb = np.array(crop.convert("RGB"))
    if rgb.size == 0:
        # Nothing to measure; cv2.cvtColor would raise on an empty array.
        return False
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    std_dev = np.std(gray)
    unique_levels = len(np.unique(gray))
    # bool() so callers get a plain Python bool rather than a NumPy scalar.
    return bool(std_dev > 25 and unique_levels > 50)
def extract_visual_regions(image):
    """
    Extract regions from the image that resemble embedded images.

    The page is converted to RGB, inverse-thresholded at 220, morphologically
    closed with a 15x15 rectangle, and split into connected components.
    Components with area > 2000 px and aspect ratio strictly between 0.3 and
    3.5 are cropped and kept when they:
    - are visual (is_likely_photo),
    - have less than 25% text area,
    - and are not considered primarily text (is_primarily_text).

    Returns a list of (bounding_box, crop) pairs, where bounding_box is
    (left, top, right, bottom) as plain ints and crop is an RGB PIL image.
    """
    # Crop from the RGB copy so the helpers always receive 3-channel input,
    # whatever the mode of the incoming image is.
    rgb_image = image.convert("RGB")
    np_img = np.array(rgb_image)
    gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
    _, binary = cv2.threshold(gray, 220, 255, cv2.THRESH_BINARY_INV)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15))
    closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    num_labels, _, stats, _ = cv2.connectedComponentsWithStats(
        closed, connectivity=8
    )

    results = []
    for label in range(1, num_labels):  # label 0 is the background
        # Plain ints keep the returned bbox free of NumPy scalar types.
        x, y, w, h, area = (int(v) for v in stats[label])
        if area <= 2000 or not (0.3 < w / h < 3.5):
            continue

        bbox = (x, y, x + w, y + h)
        crop = rgb_image.crop(bbox)

        # Cheapest test first: the contour scan only runs for photo-like crops.
        if not is_likely_photo(crop):
            continue
        if text_area_ratio(crop) >= 0.25:
            continue
        # NOTE(review): is_primarily_text only runs OCR when the text ratio
        # is > 0.25, so with the thresholds as they stand it cannot reject a
        # crop that reached this point. Kept so the documented OCR gate takes
        # effect if either threshold is changed.
        if is_primarily_text(crop):
            continue

        results.append((bbox, crop))
    return results
def pdf_to_images_from_bytes(pdf_bytes, dpi=200):
    """
    Convert a PDF (as bytes) into a list of PIL images, one per page.

    Args:
        pdf_bytes: Raw contents of the PDF file.
        dpi: Rendering resolution passed to PyMuPDF (default 200).

    Returns:
        A list of RGB PIL images in page order.
    """
    images = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images
def extract_text_from_pdf_bytes(pdf_bytes):
    """
    Extract and concatenate the text from all pages in a PDF.

    Args:
        pdf_bytes: Raw contents of the PDF file.

    Returns:
        The text of every page joined by newlines, with leading and trailing
        whitespace stripped.
    """
    # The context manager closes the document even if text extraction fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_texts = [page.get_text() for page in doc]
    # Joining once avoids repeated string reallocation; the trailing newline
    # the old loop appended was removed by strip() anyway.
    return "\n".join(page_texts).strip()
def pil_to_base64(img):
    """
    Serialise a PIL image as PNG and return it as a base64 text string.
    """
    with io.BytesIO() as png_buffer:
        img.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode("utf-8")
def process_pdf(pdf_file):
    """
    Main function that processes the PDF.

    Args:
        pdf_file: Either an object with a ``read()`` method returning the PDF
            bytes, or a filesystem path to the PDF.

    Returns:
        A dict with two keys: "text" (the text of all pages) and "images"
        (a list of base64-encoded PNG crops of the detected visual regions).

    Raises:
        ValueError: If no file was provided (``pdf_file`` is None).
    """
    if pdf_file is None:
        # Without this, None would fall through to open(None) and surface as
        # a confusing TypeError.
        raise ValueError("No PDF file was provided.")

    # Only the attribute lookup is guarded, so an AttributeError raised
    # inside read() itself is not mistaken for "this is a path".
    try:
        read = pdf_file.read
    except AttributeError:
        with open(pdf_file, "rb") as f:
            pdf_bytes = f.read()
    else:
        pdf_bytes = read()

    text = extract_text_from_pdf_bytes(pdf_bytes)
    page_images = pdf_to_images_from_bytes(pdf_bytes)
    crops = [
        crop
        for page_image in page_images
        for _, crop in extract_visual_regions(page_image)
    ]
    images_base64 = [pil_to_base64(crop) for crop in crops]
    return {"text": text, "images": images_base64}
# Gradio UI: one PDF upload as input, the process_pdf result rendered as JSON.
iface = gr.Interface(
    process_pdf,
    gr.File(label="Upload a PDF"),
    "json",
    title="PDF Processor",
    description=(
        "Extracts text and image crops from a PDF. "
        "Output is a JSON with 'text' and 'images' (base64-encoded)."
    ),
)
iface.launch()