Spaces:

Biifruu
/

pdf_extractor

Sleeping

App Files Community

Biifruu committed on Jul 24, 2025

Commit

52b9ecf

verified ·

1 Parent(s): 3be3327

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -32

app.py CHANGED Viewed

@@ -33,45 +33,47 @@ def fix_common_ocr_errors(text):
     return text
 def format_text_to_markdown(text):
     lines = text.splitlines()
-    final_lines = []
-    buffer = []
-    def flush_buffer():
-        if buffer:
-            final_lines.append(" ".join(buffer).strip())
-            final_lines.append("")  # salto de línea para nuevo párrafo
-            buffer.clear()
     for line in lines:
         line = line.strip()
         if not line:
-            flush_buffer()
             continue
-        if re.match(r"^(posible causa|causa):", line, re.IGNORECASE):
-            flush_buffer()
-            final_lines.append("### 🛑 Posible causa")
-            final_lines.append("")
-            final_lines.append(re.sub(r"^(posible causa|causa):", "", line, flags=re.IGNORECASE).strip())
-        elif re.match(r"^(posible solución|solución):", line, re.IGNORECASE):
-            flush_buffer()
-            final_lines.append("### ✅ Posible solución")
-            final_lines.append("")
-            final_lines.append(re.sub(r"^(posible solución|solución):", "", line, flags=re.IGNORECASE).strip())
-        elif re.match(r"^descripción del problema", line, re.IGNORECASE):
-            flush_buffer()
-            final_lines.append("### 📝 Descripción del problema")
         elif re.match(r"^\d+\.", line):
-            flush_buffer()
-            final_lines.append("- " + line)
-        elif re.match(r"^•\s*", line):
-            flush_buffer()
-            final_lines.append("- " + line)
         else:
-            buffer.append(line)
-    flush_buffer()
-    return "\n".join(final_lines)
 def translate_text(text):
     if len(text.strip()) < 5:

     return text
 def format_text_to_markdown(text):
+    # Corrige errores comunes
+    text = fix_common_ocr_errors(text)
+    # Línea por línea
     lines = text.splitlines()
+    markdown_lines = []
     for line in lines:
         line = line.strip()
         if not line:
             continue
+        # Encabezados conocidos
+        if re.search(r"\b(posible causa)\b", line, re.IGNORECASE):
+            markdown_lines.append("### 🛑 Posible causa\n")
+            continue
+        elif re.search(r"\b(posible solución)\b", line, re.IGNORECASE):
+            markdown_lines.append("### ✅ Posible solución\n")
+            continue
+        elif re.search(r"\b(descripción del problema)\b", line, re.IGNORECASE):
+            markdown_lines.append("### 📝 Descripción del problema\n")
+            continue
         elif re.match(r"^\d+\.", line):
+            markdown_lines.append(f"- {line}")
+            continue
+        elif re.match(r"^[•*-]", line):
+            markdown_lines.append(f"- {line[1:].strip()}")
+            continue
         else:
+            # Detecta si la línea es basura (letras sin sentido, símbolos)
+            if len(line) < 5:
+                continue
+            if re.search(r"[^\w\s.,:;¡!¿?\-()]", line):
+                symbols = re.findall(r"[^\w\s.,:;¡!¿?\-()]", line)
+                if len(symbols) > 3:
+                    continue  # basura
+            markdown_lines.append(line)
+    # Une líneas y separa párrafos correctamente
+    formatted_text = "\n\n".join(markdown_lines)
+    return formatted_text
 def translate_text(text):
     if len(text.strip()) < 5: