Spaces:
Running
Running
| import io | |
| import base64 | |
| import numpy as np | |
| import cv2 | |
| import fitz # PyMuPDF | |
| import pytesseract | |
| from PIL import Image | |
| import gradio as gr | |
def text_area_ratio(image):
    """Return the fraction of the image area covered by letter-sized contours.

    Args:
        image: PIL image; converted to grayscale internally.

    Returns:
        float in [0, 1]: summed area of letter-like bounding boxes divided
        by the total pixel count (0 for an empty image).
    """
    gray = np.array(image.convert("L"))
    # Dark marks on a light background become white blobs for contour detection.
    _, mask = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # Keep only contours whose bounding boxes look like individual glyphs/words.
    letter_area = sum(
        w * h
        for x, y, w, h in (cv2.boundingRect(cnt) for cnt in contours)
        if 8 < h < 40 and 5 < w < 100
    )
    total_pixels = gray.shape[0] * gray.shape[1]
    return letter_area / total_pixels if total_pixels > 0 else 0
def has_significant_text(image):
    """Report whether letter-like contours cover more than 25% of the image."""
    coverage = text_area_ratio(image)
    return coverage > 0.25
def is_primarily_text(image, ocr_threshold=30):
    """Decide via OCR whether a crop is mostly text.

    OCR only runs when the cheap contour check already suggests text is
    present; the crop counts as textual when the recognized string
    (English + Spanish models) exceeds ``ocr_threshold`` characters.
    """
    if not has_significant_text(image):
        return False
    recognized = pytesseract.image_to_string(image, lang="eng+spa").strip()
    return len(recognized) > ocr_threshold
def is_likely_photo(crop):
    """Heuristically decide whether a crop is a photo/diagram rather than text.

    A photographic region tends to show wide tonal variation (grayscale
    standard deviation > 25) and many distinct gray levels (> 50); flat
    text regions fail both checks.

    Fix: the original ran ``cv2.cvtColor(..., COLOR_RGB2GRAY)`` directly on
    ``np.array(crop)``, which raises for grayscale or RGBA PIL images.
    Converting through PIL's "L" mode (same ITU-R 601 luma weights) accepts
    any input mode.

    Args:
        crop: PIL image in any mode.

    Returns:
        bool: True when the crop looks photographic.
    """
    gray = np.asarray(crop.convert("L"))
    std_dev = float(np.std(gray))
    unique_levels = len(np.unique(gray))
    return std_dev > 25 and unique_levels > 50
def extract_visual_regions(image):
    """Locate sub-regions of a page image that look like embedded pictures.

    Pipeline: binarize the non-white content, close small gaps with a 15x15
    morphological kernel, then label connected components. A component is
    kept when it is large enough (area > 2000 px), has a plausible aspect
    ratio (0.3-3.5), looks photographic (is_likely_photo), has under 25%
    letter coverage, and is not classified as primarily text by OCR.

    Returns:
        list of ((x0, y0, x1, y1), crop) pairs, crops as PIL images.
    """
    rgb = np.array(image.convert("RGB"))
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    _, inked = cv2.threshold(gray, 220, 255, cv2.THRESH_BINARY_INV)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15))
    merged = cv2.morphologyEx(inked, cv2.MORPH_CLOSE, kernel)

    label_count, _, stats, _ = cv2.connectedComponentsWithStats(merged, connectivity=8)
    regions = []
    for label in range(1, label_count):  # label 0 is the page background
        x, y, w, h, area = stats[label]
        if area <= 2000:
            continue
        aspect = w / float(h)
        if not 0.3 < aspect < 3.5:
            continue
        box = (x, y, x + w, y + h)
        crop = image.crop(box)
        if (
            is_likely_photo(crop)
            and text_area_ratio(crop) < 0.25
            and not is_primarily_text(crop)
        ):
            regions.append((box, crop))
    return regions
def pdf_to_images_from_bytes(pdf_bytes):
    """Render every page of a PDF (given as bytes) to a PIL image.

    Pages are rasterized at 200 DPI. The document handle is managed with a
    context manager so it is closed even if rendering a page raises (the
    original leaked the handle on exceptions).

    Args:
        pdf_bytes: raw PDF file contents.

    Returns:
        list of PIL RGB images, one per page, in page order.
    """
    pages = []
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            # get_pixmap defaults to alpha=False, so samples are packed RGB.
            pix = page.get_pixmap(dpi=200)
            pages.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
    return pages
def extract_text_from_pdf_bytes(pdf_bytes):
    """Extract and concatenate the plain text of all pages in a PDF.

    Fixes over the original: the document is opened with a context manager
    so the handle is released even on error, and page texts are collected
    once and joined instead of quadratic string concatenation in a loop.
    The result is identical: one newline between consecutive page texts,
    outer whitespace stripped.

    Args:
        pdf_bytes: raw PDF file contents.

    Returns:
        str: the concatenated page text.
    """
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_texts = [page.get_text() for page in doc]
    return "\n".join(page_texts).strip()
def pil_to_base64(img):
    """Encode a PIL image as the base64 string of its PNG representation."""
    with io.BytesIO() as buffer:
        img.save(buffer, format="PNG")
        raw_png = buffer.getvalue()
    return base64.b64encode(raw_png).decode("utf-8")
def process_pdf(pdf_file):
    """Process an uploaded PDF: extract its text and photo-like crops.

    Accepts either an open file-like object (read directly) or a filesystem
    path (opened and read — Gradio may supply either form).

    Returns:
        dict with "text" (concatenated page text) and "images" (list of
        base64-encoded PNG crops).
    """
    try:
        # File-like object: read its bytes directly.
        pdf_bytes = pdf_file.read()
    except AttributeError:
        # Otherwise treat the input as a path on disk.
        with open(pdf_file, "rb") as handle:
            pdf_bytes = handle.read()

    extracted_text = extract_text_from_pdf_bytes(pdf_bytes)

    encoded_crops = []
    for page_image in pdf_to_images_from_bytes(pdf_bytes):
        for _bbox, crop in extract_visual_regions(page_image):
            encoded_crops.append(pil_to_base64(crop))

    return {"text": extracted_text, "images": encoded_crops}
# Gradio UI: upload a PDF, receive a JSON payload with the extracted text
# and the base64-encoded image crops.
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload a PDF"),
    outputs="json",
    title="PDF Processor",
    description="Extracts text and image crops from a PDF. Output is a JSON with 'text' and 'images' (base64-encoded)."
)
iface.launch()