Biifruu committed on
Commit
755a0ae
·
verified ·
1 Parent(s): c180972

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -28
app.py CHANGED
@@ -7,8 +7,7 @@ import pytesseract
7
  import base64
8
  import os
9
  import unicodedata
10
-
11
- # NUEVO: Traducción
12
  from transformers import pipeline
13
 
14
  # Inicializa el pipeline de traducción EN->ES una sola vez
@@ -22,12 +21,43 @@ def clean_ocr_text(text):
22
  cleaned_lines = [line.strip() for line in lines if line.strip()]
23
  return "\n".join(cleaned_lines)
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def translate_text(text):
26
- """
27
- Traduce texto del inglés al español si está en inglés (siempre lo traduce para simplificar)
28
- """
29
- # Para hacerlo robusto podrías agregar detección de idioma (langdetect),
30
- # pero para este ejemplo traducimos siempre
31
  if len(text.strip()) < 5:
32
  return text
33
  chunks = [text[i:i+500] for i in range(0, len(text), 500)]
@@ -66,7 +96,6 @@ def extract_visual_regions(image):
66
  gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
67
  _, binary = cv2.threshold(gray, 220, 255, cv2.THRESH_BINARY_INV)
68
  closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15)))
69
-
70
  num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(closed, connectivity=8)
71
  results = []
72
  for i in range(1, num_labels):
@@ -80,17 +109,6 @@ def extract_visual_regions(image):
80
 
81
  # ---------- Extracción de texto + imágenes ----------
82
 
83
def clean_bullet_line(text):
    """Normalize bullet-like glyphs in a line of text to plain hyphens.

    The text is first NFKC-normalized. The two-character sequence "e@" and
    each of the characters "@", "•", "*", "·" and "–" are replaced with "-",
    wherever they appear in the line. Finally every run of whitespace is
    collapsed to a single space and the ends are trimmed.

    Args:
        text: The line to clean.

    Returns:
        The cleaned line (possibly an empty string).
    """
    normalized = unicodedata.normalize("NFKC", text)
    # "e@" has to be consumed before the lone "@", otherwise the stray "e"
    # would be left behind in front of the hyphen.
    normalized = normalized.replace("e@", "-")
    # One pass over the string for all remaining single-character markers.
    bullet_table = str.maketrans("@•*·–", "-----")
    hyphenated = normalized.translate(bullet_table)
    words = hyphenated.split()
    return " ".join(words)
93
-
94
  def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
95
  markdown_output = f"\n## Página {page_index + 1}\n\n"
96
  image_counter = 1
@@ -104,7 +122,7 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
104
  for line in b["lines"]:
105
  line_y = line["bbox"][1]
106
  line_text = " ".join([span["text"] for span in line["spans"]]).strip()
107
- line_text = clean_bullet_line(line_text)
108
  max_font_size = max([span.get("size", 10) for span in line["spans"]])
109
  if line_text:
110
  elements.append((line_y, line_text, max_font_size))
@@ -135,7 +153,9 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
135
  is_header = font_size >= 14
136
  if previous_y is not None and abs(y - previous_y) > 10:
137
  markdown_output += "\n"
138
- translated = translate_text(text.strip())
 
 
139
  markdown_output += f"\n### {translated}\n" if is_header else translated + "\n"
140
  previous_y = y
141
 
@@ -156,11 +176,9 @@ def convert(pdf_file):
156
  text = page.get_text("text").strip()
157
 
158
  if len(text) > 30:
159
- # Texto nativo del PDF
160
  extracted = extract_text_markdown([page], image_paths, page_num, seen_xrefs)
161
  markdown_output += extracted + "\n"
162
  else:
163
- # Página "escaneada" -> OCR
164
  markdown_output += f"\n## Página {page_num + 1}\n\n"
165
  pix = page.get_pixmap(dpi=300)
166
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
@@ -170,12 +188,13 @@ def convert(pdf_file):
170
  markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
171
 
172
  try:
173
- ocr_text = pytesseract.image_to_string(img, lang="eng+spa")
174
  except pytesseract.TesseractError:
175
  ocr_text = ""
176
- ocr_text_clean = clean_ocr_text(ocr_text)
177
- translated_ocr = translate_text(ocr_text_clean)
178
- markdown_output += translated_ocr + "\n"
 
179
 
180
  crops = extract_visual_regions(img)
181
  for i, crop in enumerate(crops):
@@ -205,4 +224,4 @@ with gr.Blocks() as demo:
205
 
206
  submit_btn.click(fn=convert, inputs=[pdf_input], outputs=[markdown_output, gallery_output, download_md])
207
 
208
- demo.launch()
 
7
  import base64
8
  import os
9
  import unicodedata
10
+ import re
 
11
  from transformers import pipeline
12
 
13
  # Inicializa el pipeline de traducción EN->ES una sola vez
 
21
  cleaned_lines = [line.strip() for line in lines if line.strip()]
22
  return "\n".join(cleaned_lines)
23
 
24
def fix_common_ocr_errors(text):
    """Apply a fixed list of textual corrections to OCR output.

    The corrections are hard-coded substitutions for specific misreadings
    (a lone "e" standing in for a bullet, a handful of misspelled Spanish
    words) plus rejoining of words that were hyphenated across a line break.

    Every word-level pattern is anchored with ``\\b`` on both sides so that
    a correction never rewrites the tail of a longer, already-correct word
    (for example "APLICACIONES" must not become "APLAPLICACIONES").

    Args:
        text: Raw text, possibly spanning several lines.

    Returns:
        The text with the known substitutions applied.
    """
    # When OCR confuses a bullet glyph with a standalone "e".
    text = text.replace(" e ", " • ")
    text = re.sub(r'\bposibl\b', 'posible', text, flags=re.IGNORECASE)
    text = re.sub(r'\binstatar\b', 'instalar', text)
    text = re.sub(r'\bfuncionación\b', 'función taller', text)
    text = re.sub(r'\boptar\b', 'opta', text)
    # Leading \b: only a truncated standalone "ICACIONES" is repaired.
    text = re.sub(r'\bICACIONES\b', 'APLICACIONES', text)
    text = re.sub(r'\bLar\b', 'la', text)
    # Rejoin words split by a hyphen at a line break. [^\W\d_] matches any
    # Unicode letter, so accented vowels and "ñ" are handled as well.
    text = re.sub(r'([^\W\d_])-\n([^\W\d_])', r'\1\2', text)
    return text
34
+
35
def format_text_to_markdown(text):
    """Turn plain text lines into simple Markdown.

    Each line is stripped and blank lines are dropped. The remaining lines
    are rewritten as follows (all label matching is case-insensitive):

    * "posible causa:" / "causa:" -> a "Posible causa" heading, a blank
      line, then the rest of the line.
    * "posible solución:" / "solución:" -> a "Posible solución" heading,
      a blank line, then the rest of the line.
    * "descripción del problema" -> a "Descripción del problema" heading;
      any text following the label on the same line is kept below it
      instead of being discarded.
    * Lines starting with digits and a dot are prefixed with "- ".
    * Lines starting with "•" become "- " list items, with the bullet
      glyph removed so the marker is not duplicated.
    * Anything else is passed through unchanged.

    Args:
        text: Text to format; may contain several lines.

    Returns:
        The formatted lines joined with newlines.
    """
    final_lines = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        if re.match(r"^(posible causa|causa):", line, re.IGNORECASE):
            final_lines.append("### 🛑 Posible causa")
            final_lines.append("")
            final_lines.append(re.sub(r"^(posible causa|causa):", "", line, flags=re.IGNORECASE).strip())
        elif re.match(r"^(posible solución|solución):", line, re.IGNORECASE):
            final_lines.append("### ✅ Posible solución")
            final_lines.append("")
            final_lines.append(re.sub(r"^(posible solución|solución):", "", line, flags=re.IGNORECASE).strip())
        elif re.match(r"^descripción del problema", line, re.IGNORECASE):
            final_lines.append("### 📝 Descripción del problema")
            # Keep whatever follows the label (after an optional colon) so
            # the description text itself is not lost.
            remainder = re.sub(r"^descripción del problema[\s:]*", "", line, flags=re.IGNORECASE).strip()
            if remainder:
                final_lines.append("")
                final_lines.append(remainder)
        elif re.match(r"^\d+\.", line):
            final_lines.append("- " + line)
        elif line.startswith("•"):
            # Replace the bullet glyph with the Markdown marker rather than
            # emitting both ("- • item").
            final_lines.append("- " + line[1:].lstrip())
        else:
            final_lines.append(line)
    return "\n".join(final_lines)
59
+
60
  def translate_text(text):
 
 
 
 
 
61
  if len(text.strip()) < 5:
62
  return text
63
  chunks = [text[i:i+500] for i in range(0, len(text), 500)]
 
96
  gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
97
  _, binary = cv2.threshold(gray, 220, 255, cv2.THRESH_BINARY_INV)
98
  closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15)))
 
99
  num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(closed, connectivity=8)
100
  results = []
101
  for i in range(1, num_labels):
 
109
 
110
  # ---------- Extracción de texto + imágenes ----------
111
 
 
 
 
 
 
 
 
 
 
 
 
112
  def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
113
  markdown_output = f"\n## Página {page_index + 1}\n\n"
114
  image_counter = 1
 
122
  for line in b["lines"]:
123
  line_y = line["bbox"][1]
124
  line_text = " ".join([span["text"] for span in line["spans"]]).strip()
125
+ line_text = clean_ocr_text(line_text)
126
  max_font_size = max([span.get("size", 10) for span in line["spans"]])
127
  if line_text:
128
  elements.append((line_y, line_text, max_font_size))
 
153
  is_header = font_size >= 14
154
  if previous_y is not None and abs(y - previous_y) > 10:
155
  markdown_output += "\n"
156
+ fixed = fix_common_ocr_errors(text.strip())
157
+ formatted = format_text_to_markdown(fixed)
158
+ translated = translate_text(formatted)
159
  markdown_output += f"\n### {translated}\n" if is_header else translated + "\n"
160
  previous_y = y
161
 
 
176
  text = page.get_text("text").strip()
177
 
178
  if len(text) > 30:
 
179
  extracted = extract_text_markdown([page], image_paths, page_num, seen_xrefs)
180
  markdown_output += extracted + "\n"
181
  else:
 
182
  markdown_output += f"\n## Página {page_num + 1}\n\n"
183
  pix = page.get_pixmap(dpi=300)
184
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
 
188
  markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
189
 
190
  try:
191
+ ocr_text = pytesseract.image_to_string(img, lang="spa", config="--oem 3 --psm 6")
192
  except pytesseract.TesseractError:
193
  ocr_text = ""
194
+ ocr_text_clean = clean_ocr_text(fix_common_ocr_errors(ocr_text))
195
+ formatted = format_text_to_markdown(ocr_text_clean)
196
+ translated = translate_text(formatted)
197
+ markdown_output += translated + "\n"
198
 
199
  crops = extract_visual_regions(img)
200
  for i, crop in enumerate(crops):
 
224
 
225
  submit_btn.click(fn=convert, inputs=[pdf_input], outputs=[markdown_output, gallery_output, download_md])
226
 
227
+ demo.launch()