Biifruu committed on
Commit
e9cb6b1
·
verified ·
1 Parent(s): 0345a7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -24
app.py CHANGED
@@ -9,7 +9,7 @@ import gradio as gr
9
 
10
  def text_area_ratio(image):
11
  """
12
- Calcula la proporción del área ocupada por texto basado en contornos de letras.
13
  """
14
  np_img = np.array(image.convert("L"))
15
  _, thresh = cv2.threshold(np_img, 150, 255, cv2.THRESH_BINARY_INV)
@@ -24,15 +24,15 @@ def text_area_ratio(image):
24
 
25
  def has_significant_text(image):
26
  """
27
- Determina si una imagen presenta abundantes contornos compatibles con letras.
28
  """
29
  return text_area_ratio(image) > 0.25
30
 
31
  def is_primarily_text(image, ocr_threshold=30):
32
  """
33
- Usa OCR para determinar si el recorte contiene principalmente texto.
34
- Si el análisis de contornos indica presencia de texto y el OCR devuelve
35
- más de 'ocr_threshold' caracteres, se considera principalmente textual.
36
  """
37
  if has_significant_text(image):
38
  ocr_result = pytesseract.image_to_string(image, lang="eng+spa")
@@ -42,8 +42,8 @@ def is_primarily_text(image, ocr_threshold=30):
42
 
43
  def is_likely_photo(crop):
44
  """
45
- Evalúa si un recorte es probablemente una imagen (foto o diagrama)
46
- basándose en la variación tonal y la cantidad de colores.
47
  """
48
  np_crop = np.array(crop)
49
  gray = cv2.cvtColor(np_crop, cv2.COLOR_RGB2GRAY)
@@ -53,11 +53,11 @@ def is_likely_photo(crop):
53
 
54
  def extract_visual_regions(image):
55
  """
56
- Extrae recortes de la imagen que se asemejan a imágenes embebidas.
57
- Devuelve una lista de pares (bounding_box, crop) aceptados si:
58
- - Son visuales (is_likely_photo),
59
- - Tienen menos del 25% de área ocupada por texto,
60
- - Y no se consideran principalmente texto según OCR.
61
  """
62
  np_img = np.array(image.convert("RGB"))
63
  gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
@@ -67,7 +67,7 @@ def extract_visual_regions(image):
67
 
68
  num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(closed, connectivity=8)
69
  results = []
70
- for i in range(1, num_labels): # se omite el fondo
71
  x, y, w, h, area = stats[i]
72
  aspect_ratio = w / float(h)
73
  if area > 2000 and 0.3 < aspect_ratio < 3.5:
@@ -80,7 +80,7 @@ def extract_visual_regions(image):
80
 
81
  def pdf_to_images_from_bytes(pdf_bytes):
82
  """
83
- Convierte un PDF (en bytes) en una lista de imágenes PIL.
84
  """
85
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
86
  images = []
@@ -93,7 +93,7 @@ def pdf_to_images_from_bytes(pdf_bytes):
93
 
94
  def extract_text_from_pdf_bytes(pdf_bytes):
95
  """
96
- Extrae y concatena el texto de todas las páginas de un PDF.
97
  """
98
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
99
  all_text = ""
@@ -104,7 +104,7 @@ def extract_text_from_pdf_bytes(pdf_bytes):
104
 
105
  def pil_to_base64(img):
106
  """
107
- Convierte una imagen PIL a una cadena base64 codificada en PNG.
108
  """
109
  buffered = io.BytesIO()
110
  img.save(buffered, format="PNG")
@@ -112,12 +112,11 @@ def pil_to_base64(img):
112
 
113
  def process_pdf(pdf_file):
114
  """
115
- Función principal que procesa el PDF.
116
- Extrae el texto y los recortes de imagen.
117
  """
118
- # Si pdf_file tiene el método read(), lo usamos, de lo contrario asumimos que es una ruta de archivo.
119
  try:
120
- pdf_bytes = pdf_file.read() # si es objeto file
121
  except AttributeError:
122
  with open(pdf_file, "rb") as f:
123
  pdf_bytes = f.read()
@@ -132,13 +131,13 @@ def process_pdf(pdf_file):
132
  images_base64 = [pil_to_base64(img) for img in crops]
133
  return {"text": text, "images": images_base64}
134
 
135
- # Configuramos la interfaz de Gradio para devolver JSON.
136
  iface = gr.Interface(
137
  fn=process_pdf,
138
- inputs=gr.File(label="Sube un PDF"),
139
  outputs="json",
140
- title="Procesador de PDFs",
141
- description="Extrae el texto y los recortes de imagen de un PDF. La salida es un JSON con 'text' e 'images' (imagenes en base64)."
142
  )
143
 
144
  iface.launch()
 
9
 
10
  def text_area_ratio(image):
11
  """
12
+ Calculates the proportion of the area occupied by text based on letter contours.
13
  """
14
  np_img = np.array(image.convert("L"))
15
  _, thresh = cv2.threshold(np_img, 150, 255, cv2.THRESH_BINARY_INV)
 
24
 
25
  def has_significant_text(image):
26
  """
27
+ Determines whether an image contains significant letter-like contours.
28
  """
29
  return text_area_ratio(image) > 0.25
30
 
31
  def is_primarily_text(image, ocr_threshold=30):
32
  """
33
+ Uses OCR to determine if the crop contains mostly text.
34
+ If contour analysis suggests text presence and OCR returns
35
+ more than 'ocr_threshold' characters, it is considered mostly textual.
36
  """
37
  if has_significant_text(image):
38
  ocr_result = pytesseract.image_to_string(image, lang="eng+spa")
 
42
 
43
  def is_likely_photo(crop):
44
  """
45
+ Evaluates whether a crop is likely an image (photo or diagram)
46
+ based on tonal variation and color count.
47
  """
48
  np_crop = np.array(crop)
49
  gray = cv2.cvtColor(np_crop, cv2.COLOR_RGB2GRAY)
 
53
 
54
  def extract_visual_regions(image):
55
  """
56
+ Extracts regions from the image that resemble embedded images.
57
+ Returns a list of (bounding_box, crop) pairs that meet the following:
58
+ - Are visual (is_likely_photo),
59
+ - Have less than 25% text area,
60
+ - And are not considered primarily text by OCR.
61
  """
62
  np_img = np.array(image.convert("RGB"))
63
  gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
 
67
 
68
  num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(closed, connectivity=8)
69
  results = []
70
+ for i in range(1, num_labels): # skip background
71
  x, y, w, h, area = stats[i]
72
  aspect_ratio = w / float(h)
73
  if area > 2000 and 0.3 < aspect_ratio < 3.5:
 
80
 
81
  def pdf_to_images_from_bytes(pdf_bytes):
82
  """
83
+ Converts a PDF (as bytes) into a list of PIL images.
84
  """
85
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
86
  images = []
 
93
 
94
  def extract_text_from_pdf_bytes(pdf_bytes):
95
  """
96
+ Extracts and concatenates the text from all pages in a PDF.
97
  """
98
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
99
  all_text = ""
 
104
 
105
  def pil_to_base64(img):
106
  """
107
+ Converts a PIL image to a base64-encoded PNG string.
108
  """
109
  buffered = io.BytesIO()
110
  img.save(buffered, format="PNG")
 
112
 
113
  def process_pdf(pdf_file):
114
  """
115
+ Main function that processes the PDF.
116
+ Extracts text and image crops.
117
  """
 
118
  try:
119
+ pdf_bytes = pdf_file.read() # file object
120
  except AttributeError:
121
  with open(pdf_file, "rb") as f:
122
  pdf_bytes = f.read()
 
131
  images_base64 = [pil_to_base64(img) for img in crops]
132
  return {"text": text, "images": images_base64}
133
 
134
+ # Configure Gradio interface to return JSON.
135
  iface = gr.Interface(
136
  fn=process_pdf,
137
+ inputs=gr.File(label="Upload a PDF"),
138
  outputs="json",
139
+ title="PDF Processor",
140
+ description="Extracts text and image crops from a PDF. Output is a JSON with 'text' and 'images' (base64-encoded)."
141
  )
142
 
143
  iface.launch()