Spaces:

Biifruu
/

pdf_extractor

Sleeping

App Files Files Community

Biifruu committed on Jul 24, 2025

Commit

755a0ae

verified ·

1 Parent(s): c180972

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -28

app.py CHANGED Viewed

@@ -7,8 +7,7 @@ import pytesseract
 import base64
 import os
 import unicodedata
-# NUEVO: Traducción
 from transformers import pipeline
 # Inicializa el pipeline de traducción EN->ES una sola vez
@@ -22,12 +21,43 @@ def clean_ocr_text(text):
     cleaned_lines = [line.strip() for line in lines if line.strip()]
     return "\n".join(cleaned_lines)
 def translate_text(text):
-    """
-    Traduce texto del inglés al español si está en inglés (siempre lo traduce para simplificar)
-    """
-    # Para hacerlo robusto podrías agregar detección de idioma (langdetect),
-    # pero para este ejemplo traducimos siempre
     if len(text.strip()) < 5:
         return text
     chunks = [text[i:i+500] for i in range(0, len(text), 500)]
@@ -66,7 +96,6 @@ def extract_visual_regions(image):
     gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
     _, binary = cv2.threshold(gray, 220, 255, cv2.THRESH_BINARY_INV)
     closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15)))
     num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(closed, connectivity=8)
     results = []
     for i in range(1, num_labels):
@@ -80,17 +109,6 @@ def extract_visual_regions(image):
 # ---------- Extracción de texto + imágenes ----------
# Single characters that OCR / PDF extraction emits in place of a list bullet.
# They are all mapped to "-" in one pass by str.translate.
_BULLET_TO_DASH = str.maketrans({ch: "-" for ch in "@•*·–"})


def clean_bullet_line(text):
    """Normalize bullet-like glyphs in a line of extracted text to ``-``.

    The text is first NFKC-normalized, then the two-character sequence
    ``e@`` and each of the characters ``@``, ``•``, ``*``, ``·`` and ``–``
    are replaced by ``-``. Finally, runs of whitespace are collapsed to a
    single space and leading/trailing whitespace is removed.

    Args:
        text: One line of text.

    Returns:
        The cleaned line.
    """
    text = unicodedata.normalize("NFKC", text)
    # "e@" must be handled before the single-character mapping, otherwise
    # the "e" would survive and only the "@" would become a dash.
    text = text.replace("e@", "-")
    text = text.translate(_BULLET_TO_DASH)
    return " ".join(text.split())
 def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
     markdown_output = f"\n## Página {page_index + 1}\n\n"
     image_counter = 1
@@ -104,7 +122,7 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
             for line in b["lines"]:
                 line_y = line["bbox"][1]
                 line_text = " ".join([span["text"] for span in line["spans"]]).strip()
-                line_text = clean_bullet_line(line_text)
                 max_font_size = max([span.get("size", 10) for span in line["spans"]])
                 if line_text:
                     elements.append((line_y, line_text, max_font_size))
@@ -135,7 +153,9 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
         is_header = font_size >= 14
         if previous_y is not None and abs(y - previous_y) > 10:
             markdown_output += "\n"
-        translated = translate_text(text.strip())
         markdown_output += f"\n### {translated}\n" if is_header else translated + "\n"
         previous_y = y
@@ -156,11 +176,9 @@ def convert(pdf_file):
         text = page.get_text("text").strip()
         if len(text) > 30:
-            # Texto nativo del PDF
             extracted = extract_text_markdown([page], image_paths, page_num, seen_xrefs)
             markdown_output += extracted + "\n"
         else:
-            # Página "escaneada" -> OCR
             markdown_output += f"\n## Página {page_num + 1}\n\n"
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
@@ -170,12 +188,13 @@ def convert(pdf_file):
             markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
             try:
-                ocr_text = pytesseract.image_to_string(img, lang="eng+spa")
             except pytesseract.TesseractError:
                 ocr_text = ""
-            ocr_text_clean = clean_ocr_text(ocr_text)
-            translated_ocr = translate_text(ocr_text_clean)
-            markdown_output += translated_ocr + "\n"
             crops = extract_visual_regions(img)
             for i, crop in enumerate(crops):
@@ -205,4 +224,4 @@ with gr.Blocks() as demo:
     submit_btn.click(fn=convert, inputs=[pdf_input], outputs=[markdown_output, gallery_output, download_md])
-demo.launch()

 import base64
 import os
 import unicodedata
+import re
 from transformers import pipeline
 # Inicializa el pipeline de traducción EN->ES una sola vez
     cleaned_lines = [line.strip() for line in lines if line.strip()]
     return "\n".join(cleaned_lines)
# Known OCR misreadings and their replacements, applied in this order.
_OCR_WORD_FIXES = (
    (re.compile(r"\bposibl\b", re.IGNORECASE), "posible"),
    (re.compile(r"\binstatar\b"), "instalar"),
    (re.compile(r"\bfuncionación\b"), "función taller"),
    (re.compile(r"\boptar\b"), "opta"),
    # Anchored on both sides: without the leading \b the already-correct
    # word "APLICACIONES" was rewritten to "APLAPLICACIONES".
    (re.compile(r"\bICACIONES\b"), "APLICACIONES"),
    # NOTE(review): no leading \b here, so any token ending in "Lar" is
    # rewritten; left as in the original because the intent is unclear.
    (re.compile(r"Lar\b"), "la"),
)

# A word split across lines with a trailing hyphen. [^\W\d_] matches any
# Unicode letter, so accented letters and "ñ" are joined too.
_HYPHENATED_BREAK = re.compile(r"([^\W\d_])-\n([^\W\d_])")


def fix_common_ocr_errors(text):
    """Apply a fixed list of textual corrections to OCR output.

    Steps, in order:
      1. A standalone ``" e "`` is replaced by ``" • "`` (the OCR engine
         tends to read bullet glyphs as the letter "e").
      2. Each entry of ``_OCR_WORD_FIXES`` is substituted.
      3. Words hyphenated across a line break (``letter-\\nletter``) are
         joined back together.

    Args:
        text: Raw text, possibly multi-line.

    Returns:
        The corrected text.
    """
    # NOTE(review): this also rewrites the legitimate Spanish conjunction
    # "e" ("padres e hijos"); kept because it is the original heuristic.
    text = text.replace(" e ", " • ")
    for pattern, replacement in _OCR_WORD_FIXES:
        text = pattern.sub(replacement, text)
    return _HYPHENATED_BREAK.sub(r"\1\2", text)
# (label pattern, Markdown heading, always emit a blank line after heading).
# Each pattern consumes the label itself so the rest of the line can be
# kept as body text.
_SECTION_HEADINGS = (
    (re.compile(r"^(posible causa|causa):", re.IGNORECASE),
     "### 🛑 Posible causa", True),
    (re.compile(r"^(posible solución|solución):", re.IGNORECASE),
     "### ✅ Posible solución", True),
    (re.compile(r"^descripción del problema\s*:?", re.IGNORECASE),
     "### 📝 Descripción del problema", False),
)
_NUMBERED_ITEM = re.compile(r"^\d+\.")
_BULLET_PREFIX = re.compile(r"^•\s*")


def format_text_to_markdown(text):
    """Turn plain extracted text into lightly structured Markdown.

    Each non-blank line is stripped and then handled as follows:
      * A line starting with a known section label ("causa:",
        "posible causa:", "solución:", "posible solución:",
        "descripción del problema") becomes a level-3 heading; any text
        following the label on the same line is kept below the heading.
      * A line starting with digits and a dot is prefixed with ``- ``.
      * A line starting with ``•`` becomes a ``- `` list item with the
        bullet glyph removed.
      * Any other line is kept as is.

    Blank lines in the input are dropped.

    Args:
        text: Text to format, possibly multi-line.

    Returns:
        The formatted lines joined with ``\\n``.
    """
    final_lines = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        for pattern, heading, blank_after in _SECTION_HEADINGS:
            match = pattern.match(line)
            if match is None:
                continue
            remainder = line[match.end():].strip()
            final_lines.append(heading)
            if blank_after or remainder:
                final_lines.append("")
            if remainder:
                # Keep the text after the label instead of discarding it
                # (or appending an empty entry when there is none).
                final_lines.append(remainder)
            break
        else:
            if _NUMBERED_ITEM.match(line):
                final_lines.append("- " + line)
            elif _BULLET_PREFIX.match(line):
                # Drop the original glyph so the item is "- text",
                # not "- • text".
                final_lines.append("- " + _BULLET_PREFIX.sub("", line))
            else:
                final_lines.append(line)
    return "\n".join(final_lines)
 def translate_text(text):
     if len(text.strip()) < 5:
         return text
     chunks = [text[i:i+500] for i in range(0, len(text), 500)]
     gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
     _, binary = cv2.threshold(gray, 220, 255, cv2.THRESH_BINARY_INV)
     closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15)))
     num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(closed, connectivity=8)
     results = []
     for i in range(1, num_labels):
 # ---------- Extracción de texto + imágenes ----------
 def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
     markdown_output = f"\n## Página {page_index + 1}\n\n"
     image_counter = 1
             for line in b["lines"]:
                 line_y = line["bbox"][1]
                 line_text = " ".join([span["text"] for span in line["spans"]]).strip()
+                line_text = clean_ocr_text(line_text)
                 max_font_size = max([span.get("size", 10) for span in line["spans"]])
                 if line_text:
                     elements.append((line_y, line_text, max_font_size))
         is_header = font_size >= 14
         if previous_y is not None and abs(y - previous_y) > 10:
             markdown_output += "\n"
+        fixed = fix_common_ocr_errors(text.strip())
+        formatted = format_text_to_markdown(fixed)
+        translated = translate_text(formatted)
         markdown_output += f"\n### {translated}\n" if is_header else translated + "\n"
         previous_y = y
         text = page.get_text("text").strip()
         if len(text) > 30:
             extracted = extract_text_markdown([page], image_paths, page_num, seen_xrefs)
             markdown_output += extracted + "\n"
         else:
             markdown_output += f"\n## Página {page_num + 1}\n\n"
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
             markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
             try:
+                ocr_text = pytesseract.image_to_string(img, lang="spa", config="--oem 3 --psm 6")
             except pytesseract.TesseractError:
                 ocr_text = ""
+            ocr_text_clean = clean_ocr_text(fix_common_ocr_errors(ocr_text))
+            formatted = format_text_to_markdown(ocr_text_clean)
+            translated = translate_text(formatted)
+            markdown_output += translated + "\n"
             crops = extract_visual_regions(img)
             for i, crop in enumerate(crops):
     submit_btn.click(fn=convert, inputs=[pdf_input], outputs=[markdown_output, gallery_output, download_md])
+demo.launch()