Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,7 +9,7 @@ import gradio as gr
|
|
| 9 |
|
| 10 |
def text_area_ratio(image):
|
| 11 |
"""
|
| 12 |
-
|
| 13 |
"""
|
| 14 |
np_img = np.array(image.convert("L"))
|
| 15 |
_, thresh = cv2.threshold(np_img, 150, 255, cv2.THRESH_BINARY_INV)
|
|
@@ -24,15 +24,15 @@ def text_area_ratio(image):
|
|
| 24 |
|
| 25 |
def has_significant_text(image):
|
| 26 |
"""
|
| 27 |
-
|
| 28 |
"""
|
| 29 |
return text_area_ratio(image) > 0.25
|
| 30 |
|
| 31 |
def is_primarily_text(image, ocr_threshold=30):
|
| 32 |
"""
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
"""
|
| 37 |
if has_significant_text(image):
|
| 38 |
ocr_result = pytesseract.image_to_string(image, lang="eng+spa")
|
|
@@ -42,8 +42,8 @@ def is_primarily_text(image, ocr_threshold=30):
|
|
| 42 |
|
| 43 |
def is_likely_photo(crop):
|
| 44 |
"""
|
| 45 |
-
|
| 46 |
-
|
| 47 |
"""
|
| 48 |
np_crop = np.array(crop)
|
| 49 |
gray = cv2.cvtColor(np_crop, cv2.COLOR_RGB2GRAY)
|
|
@@ -53,11 +53,11 @@ def is_likely_photo(crop):
|
|
| 53 |
|
| 54 |
def extract_visual_regions(image):
|
| 55 |
"""
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
-
|
| 59 |
-
-
|
| 60 |
-
-
|
| 61 |
"""
|
| 62 |
np_img = np.array(image.convert("RGB"))
|
| 63 |
gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
|
|
@@ -67,7 +67,7 @@ def extract_visual_regions(image):
|
|
| 67 |
|
| 68 |
num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(closed, connectivity=8)
|
| 69 |
results = []
|
| 70 |
-
for i in range(1, num_labels): #
|
| 71 |
x, y, w, h, area = stats[i]
|
| 72 |
aspect_ratio = w / float(h)
|
| 73 |
if area > 2000 and 0.3 < aspect_ratio < 3.5:
|
|
@@ -80,7 +80,7 @@ def extract_visual_regions(image):
|
|
| 80 |
|
| 81 |
def pdf_to_images_from_bytes(pdf_bytes):
|
| 82 |
"""
|
| 83 |
-
|
| 84 |
"""
|
| 85 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 86 |
images = []
|
|
@@ -93,7 +93,7 @@ def pdf_to_images_from_bytes(pdf_bytes):
|
|
| 93 |
|
| 94 |
def extract_text_from_pdf_bytes(pdf_bytes):
|
| 95 |
"""
|
| 96 |
-
|
| 97 |
"""
|
| 98 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 99 |
all_text = ""
|
|
@@ -104,7 +104,7 @@ def extract_text_from_pdf_bytes(pdf_bytes):
|
|
| 104 |
|
| 105 |
def pil_to_base64(img):
|
| 106 |
"""
|
| 107 |
-
|
| 108 |
"""
|
| 109 |
buffered = io.BytesIO()
|
| 110 |
img.save(buffered, format="PNG")
|
|
@@ -112,12 +112,11 @@ def pil_to_base64(img):
|
|
| 112 |
|
| 113 |
def process_pdf(pdf_file):
|
| 114 |
"""
|
| 115 |
-
|
| 116 |
-
|
| 117 |
"""
|
| 118 |
-
# Si pdf_file tiene el método read(), lo usamos, de lo contrario asumimos que es una ruta de archivo.
|
| 119 |
try:
|
| 120 |
-
pdf_bytes = pdf_file.read() #
|
| 121 |
except AttributeError:
|
| 122 |
with open(pdf_file, "rb") as f:
|
| 123 |
pdf_bytes = f.read()
|
|
@@ -132,13 +131,13 @@ def process_pdf(pdf_file):
|
|
| 132 |
images_base64 = [pil_to_base64(img) for img in crops]
|
| 133 |
return {"text": text, "images": images_base64}
|
| 134 |
|
| 135 |
-
#
|
| 136 |
iface = gr.Interface(
|
| 137 |
fn=process_pdf,
|
| 138 |
-
inputs=gr.File(label="
|
| 139 |
outputs="json",
|
| 140 |
-
title="
|
| 141 |
-
description="
|
| 142 |
)
|
| 143 |
|
| 144 |
iface.launch()
|
|
|
|
| 9 |
|
| 10 |
def text_area_ratio(image):
|
| 11 |
"""
|
| 12 |
+
Calculates the proportion of the area occupied by text based on letter contours.
|
| 13 |
"""
|
| 14 |
np_img = np.array(image.convert("L"))
|
| 15 |
_, thresh = cv2.threshold(np_img, 150, 255, cv2.THRESH_BINARY_INV)
|
|
|
|
| 24 |
|
| 25 |
def has_significant_text(image):
    """
    Report whether letter-like contours cover a significant share of the image.

    The decision is delegated to text_area_ratio(): the image counts as
    having significant text when that ratio is strictly greater than 0.25.
    """
    ratio = text_area_ratio(image)
    return ratio > 0.25
|
| 30 |
|
| 31 |
def is_primarily_text(image, ocr_threshold=30):
|
| 32 |
"""
|
| 33 |
+
Uses OCR to determine if the crop contains mostly text.
|
| 34 |
+
If contour analysis suggests text presence and OCR returns
|
| 35 |
+
more than 'ocr_threshold' characters, it is considered mostly textual.
|
| 36 |
"""
|
| 37 |
if has_significant_text(image):
|
| 38 |
ocr_result = pytesseract.image_to_string(image, lang="eng+spa")
|
|
|
|
| 42 |
|
| 43 |
def is_likely_photo(crop):
|
| 44 |
"""
|
| 45 |
+
Evaluates whether a crop is likely an image (photo or diagram)
|
| 46 |
+
based on tonal variation and color count.
|
| 47 |
"""
|
| 48 |
np_crop = np.array(crop)
|
| 49 |
gray = cv2.cvtColor(np_crop, cv2.COLOR_RGB2GRAY)
|
|
|
|
| 53 |
|
| 54 |
def extract_visual_regions(image):
|
| 55 |
"""
|
| 56 |
+
Extracts regions from the image that resemble embedded images.
|
| 57 |
+
Returns a list of (bounding_box, crop) pairs that meet the following:
|
| 58 |
+
- Are visual (is_likely_photo),
|
| 59 |
+
- Have less than 25% text area,
|
| 60 |
+
- And are not considered primarily text by OCR.
|
| 61 |
"""
|
| 62 |
np_img = np.array(image.convert("RGB"))
|
| 63 |
gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
|
|
|
|
| 67 |
|
| 68 |
num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(closed, connectivity=8)
|
| 69 |
results = []
|
| 70 |
+
for i in range(1, num_labels): # skip background
|
| 71 |
x, y, w, h, area = stats[i]
|
| 72 |
aspect_ratio = w / float(h)
|
| 73 |
if area > 2000 and 0.3 < aspect_ratio < 3.5:
|
|
|
|
| 80 |
|
| 81 |
def pdf_to_images_from_bytes(pdf_bytes):
|
| 82 |
"""
|
| 83 |
+
Converts a PDF (as bytes) into a list of PIL images.
|
| 84 |
"""
|
| 85 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 86 |
images = []
|
|
|
|
| 93 |
|
| 94 |
def extract_text_from_pdf_bytes(pdf_bytes):
|
| 95 |
"""
|
| 96 |
+
Extracts and concatenates the text from all pages in a PDF.
|
| 97 |
"""
|
| 98 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 99 |
all_text = ""
|
|
|
|
| 104 |
|
| 105 |
def pil_to_base64(img):
|
| 106 |
"""
|
| 107 |
+
Converts a PIL image to a base64-encoded PNG string.
|
| 108 |
"""
|
| 109 |
buffered = io.BytesIO()
|
| 110 |
img.save(buffered, format="PNG")
|
|
|
|
| 112 |
|
| 113 |
def process_pdf(pdf_file):
|
| 114 |
"""
|
| 115 |
+
Main function that processes the PDF.
|
| 116 |
+
Extracts text and image crops.
|
| 117 |
"""
|
|
|
|
| 118 |
try:
|
| 119 |
+
pdf_bytes = pdf_file.read() # file object
|
| 120 |
except AttributeError:
|
| 121 |
with open(pdf_file, "rb") as f:
|
| 122 |
pdf_bytes = f.read()
|
|
|
|
| 131 |
images_base64 = [pil_to_base64(img) for img in crops]
|
| 132 |
return {"text": text, "images": images_base64}
|
| 133 |
|
| 134 |
+
# Gradio front end: a single PDF upload goes in, and the dict returned by
# process_pdf is rendered as JSON.
_interface_options = {
    "fn": process_pdf,
    "inputs": gr.File(label="Upload a PDF"),
    "outputs": "json",
    "title": "PDF Processor",
    "description": "Extracts text and image crops from a PDF. Output is a JSON with 'text' and 'images' (base64-encoded).",
}
iface = gr.Interface(**_interface_options)

# Start the web server as soon as this module is executed.
iface.launch()
|