Spaces:

Biifruu
/

PDF_to_JSON

Runtime error

App Files Community

Biifruu committed on Jun 23, 2025

Commit

e9cb6b1

verified ·

1 Parent(s): 0345a7c

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -24

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ import gradio as gr
 def text_area_ratio(image):
     """
-    Calcula la proporción del área ocupada por texto basado en contornos de letras.
     """
     np_img = np.array(image.convert("L"))
     _, thresh = cv2.threshold(np_img, 150, 255, cv2.THRESH_BINARY_INV)
@@ -24,15 +24,15 @@ def text_area_ratio(image):
 def has_significant_text(image):
     """
-    Determina si una imagen presenta abundantes contornos compatibles con letras.
     """
     return text_area_ratio(image) > 0.25
 def is_primarily_text(image, ocr_threshold=30):
     """
-    Usa OCR para determinar si el recorte contiene principalmente texto.
-    Si el análisis de contornos indica presencia de texto y el OCR devuelve
-    más de 'ocr_threshold' caracteres, se considera principalmente textual.
     """
     if has_significant_text(image):
         ocr_result = pytesseract.image_to_string(image, lang="eng+spa")
@@ -42,8 +42,8 @@ def is_primarily_text(image, ocr_threshold=30):
 def is_likely_photo(crop):
     """
-    Evalúa si un recorte es probablemente una imagen (foto o diagrama)
-    basándose en la variación tonal y la cantidad de colores.
     """
     np_crop = np.array(crop)
     gray = cv2.cvtColor(np_crop, cv2.COLOR_RGB2GRAY)
@@ -53,11 +53,11 @@ def is_likely_photo(crop):
 def extract_visual_regions(image):
     """
-    Extrae recortes de la imagen que se asemejan a imágenes embebidas.
-    Devuelve una lista de pares (bounding_box, crop) aceptados si:
-      - Son visuales (is_likely_photo),
-      - Tienen menos del 25% de área ocupada por texto,
-      - Y no se consideran principalmente texto según OCR.
     """
     np_img = np.array(image.convert("RGB"))
     gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
@@ -67,7 +67,7 @@ def extract_visual_regions(image):
     num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(closed, connectivity=8)
     results = []
-    for i in range(1, num_labels):  # se omite el fondo
         x, y, w, h, area = stats[i]
         aspect_ratio = w / float(h)
         if area > 2000 and 0.3 < aspect_ratio < 3.5:
@@ -80,7 +80,7 @@ def extract_visual_regions(image):
 def pdf_to_images_from_bytes(pdf_bytes):
     """
-    Convierte un PDF (en bytes) en una lista de imágenes PIL.
     """
     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
     images = []
@@ -93,7 +93,7 @@ def pdf_to_images_from_bytes(pdf_bytes):
 def extract_text_from_pdf_bytes(pdf_bytes):
     """
-    Extrae y concatena el texto de todas las páginas de un PDF.
     """
     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
     all_text = ""
@@ -104,7 +104,7 @@ def extract_text_from_pdf_bytes(pdf_bytes):
 def pil_to_base64(img):
     """
-    Convierte una imagen PIL a una cadena base64 codificada en PNG.
     """
     buffered = io.BytesIO()
     img.save(buffered, format="PNG")
@@ -112,12 +112,11 @@ def pil_to_base64(img):
 def process_pdf(pdf_file):
     """
-    Función principal que procesa el PDF.
-    Extrae el texto y los recortes de imagen.
     """
-    # Si pdf_file tiene el método read(), lo usamos, de lo contrario asumimos que es una ruta de archivo.
     try:
-        pdf_bytes = pdf_file.read()  # si es objeto file
     except AttributeError:
         with open(pdf_file, "rb") as f:
             pdf_bytes = f.read()
@@ -132,13 +131,13 @@ def process_pdf(pdf_file):
     images_base64 = [pil_to_base64(img) for img in crops]
     return {"text": text, "images": images_base64}
-# Configuramos la interfaz de Gradio para devolver JSON.
 iface = gr.Interface(
     fn=process_pdf,
-    inputs=gr.File(label="Sube un PDF"),
     outputs="json",
-    title="Procesador de PDFs",
-    description="Extrae el texto y los recortes de imagen de un PDF. La salida es un JSON con 'text' e 'images' (imagenes en base64)."
 )
 iface.launch()

 def text_area_ratio(image):
     """
+    Calculates the proportion of the area occupied by text based on letter contours.
     """
     np_img = np.array(image.convert("L"))
     _, thresh = cv2.threshold(np_img, 150, 255, cv2.THRESH_BINARY_INV)
 def has_significant_text(image):
     """
+    Determines whether an image contains significant letter-like contours.
     """
     return text_area_ratio(image) > 0.25
 def is_primarily_text(image, ocr_threshold=30):
     """
+    Uses OCR to determine if the crop contains mostly text.
+    If contour analysis suggests text presence and OCR returns
+    more than 'ocr_threshold' characters, it is considered mostly textual.
     """
     if has_significant_text(image):
         ocr_result = pytesseract.image_to_string(image, lang="eng+spa")
 def is_likely_photo(crop):
     """
+    Evaluates whether a crop is likely an image (photo or diagram)
+    based on tonal variation and color count.
     """
     np_crop = np.array(crop)
     gray = cv2.cvtColor(np_crop, cv2.COLOR_RGB2GRAY)
 def extract_visual_regions(image):
     """
+    Extracts regions from the image that resemble embedded images.
+    Returns a list of (bounding_box, crop) pairs that meet the following:
+      - Are visual (is_likely_photo),
+      - Have less than 25% text area,
+      - And are not considered primarily text by OCR.
     """
     np_img = np.array(image.convert("RGB"))
     gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
     num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(closed, connectivity=8)
     results = []
+    for i in range(1, num_labels):  # skip background
         x, y, w, h, area = stats[i]
         aspect_ratio = w / float(h)
         if area > 2000 and 0.3 < aspect_ratio < 3.5:
 def pdf_to_images_from_bytes(pdf_bytes):
     """
+    Converts a PDF (as bytes) into a list of PIL images.
     """
     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
     images = []
 def extract_text_from_pdf_bytes(pdf_bytes):
     """
+    Extracts and concatenates the text from all pages in a PDF.
     """
     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
     all_text = ""
 def pil_to_base64(img):
     """
+    Converts a PIL image to a base64-encoded PNG string.
     """
     buffered = io.BytesIO()
     img.save(buffered, format="PNG")
 def process_pdf(pdf_file):
     """
+    Main function that processes the PDF.
+    Extracts text and image crops.
     """
     try:
+        pdf_bytes = pdf_file.read()  # file object
     except AttributeError:
         with open(pdf_file, "rb") as f:
             pdf_bytes = f.read()
     images_base64 = [pil_to_base64(img) for img in crops]
     return {"text": text, "images": images_base64}
+# Configure Gradio interface to return JSON.
 iface = gr.Interface(
     fn=process_pdf,
+    inputs=gr.File(label="Upload a PDF"),
     outputs="json",
+    title="PDF Processor",
+    description="Extracts text and image crops from a PDF. Output is a JSON with 'text' and 'images' (base64-encoded)."
 )
 iface.launch()