Biifruu committed on
Commit
fabee1a
·
verified ·
1 Parent(s): 8428bca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +188 -106
app.py CHANGED
@@ -1,115 +1,197 @@
1
- import fitz # PyMuPDF
2
- import pytesseract
3
- import io
4
  import os
 
 
5
  from PIL import Image
6
  import gradio as gr
7
- import tempfile
8
- import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
def _is_gibberish(text):
    """Return True when more than 40% of the lines in *text* look like noise.

    A line counts as noise when it holds fewer than five letters.
    """
    lines = text.splitlines()
    # str.isalpha() also counts accented Spanish letters (á, é, ñ, ...), which
    # an ASCII-only [a-zA-Z] pattern would ignore.
    bad_lines = [line for line in lines if sum(ch.isalpha() for ch in line) < 5]
    return len(bad_lines) / max(1, len(lines)) > 0.4


def extract_text_from_pdf(pdf_path):
    """Render every page of a PDF to a JPEG and OCR it with Tesseract (Spanish).

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A list with one dict per page, holding "page_num" (1-based),
        "image_path" (the rendered JPEG under /tmp) and "text" (the OCR
        output, or "" when the page was judged to be gibberish).
    """
    text_output = []

    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            # Render page to an image
            pix = page.get_pixmap(dpi=300)
            image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
            pix.save(image_path)

            # Run OCR on the image; the context manager releases the file handle.
            with Image.open(image_path) as image:
                raw_text = pytesseract.image_to_string(image, lang='spa')

            # Basic clean-up: remove gibberish if most lines are bad
            if _is_gibberish(raw_text):
                raw_text = ""

            text_output.append({
                "page_num": page_num + 1,
                "image_path": image_path,
                "text": raw_text
            })

    return text_output
42
-
43
def generate_markdown(text_data, extra_image_path):
    """Build the Markdown report for the OCR'd pages.

    For every entry of ``text_data`` a page heading and the scanned page image
    are emitted, followed by a fixed, hard-coded Spanish write-up. The "text"
    field of the entries is not used here.

    Args:
        text_data: Iterable of dicts with at least "page_num" and "image_path".
        extra_image_path: Path inserted as the "Imagen relevante" picture.

    Returns:
        The assembled Markdown as a single string.
    """
    md = ""
    for page in text_data:
        md += f"## Página {page['page_num']}\n\n"
        md += f"![Pagina Escaneada]({page['image_path']})\n\n"
        # Static report body; it does not depend on the OCR result.
        md += ("### Detalle del error de carga\n\n"
               "**Marca / Modelo:** VAG \n"
               "**Año:** 2014 \n"
               "**Código de Motor:** EV/híbrido\n\n"
               "**Síntoma / Código de Falla:**\n\n"
               "> La carga de la batería de alto voltaje se interrumpe al cabo de aproximadamente 1 minuto. Sin embargo, se puede cargar cuando el automóvil está fuera de línea. \n"
               "> **No se almacenan códigos de falla.**\n\n"
               "---\n\n"
               "### Posible causa\n\n"
               "El cliente ha instalado una aplicación de terceros que interrumpe la carga. \n"
               "Revisar el smartphone del cliente, que actúa como Master del coche. \n"
               "Las aplicaciones que estén conectadas al coche deben estar desconectadas.\n\n"
               "Algunas aplicaciones pueden ajustar la carga para que consuma energía en los momentos en que la energía es más barata (por ejemplo, de noche o con energía solar).\n\n"
               "Cuando la aplicación determina que no es beneficioso cargar, interrumpe el proceso.\n\n"
               "---\n\n"
               "### Solución sugerida\n\n"
               "1. Poner el coche en **modo Offline** desde la función de taller. \n"
               " El símbolo del globo cambiará de color:\n"
               " - **Globo gris** = modo sin conexión \n"
               " - **Globo blanco** = modo online\n\n"
               "2. Si al estar Offline el coche carga normalmente, es señal de que la aplicación es la causa.\n\n"
               "3. **No basta con desinstalar la aplicación**: \n"
               " Se debe **desvincular el coche** de ella por completo.\n\n"
               "---\n\n"
               "### Reinstalación (opcional)\n\n"
               "El cliente puede optar por eliminar y reinstalar la aplicación para probar si una nueva conexión resuelve el problema.\n\n"
               "---\n\n"
               "### Aplicaciones conocidas que causan este problema\n\n"
               "- Aplicación de coche eléctrico \n"
               "- Evcc \n"
               "- gridio \n"
               "- Github WeConnect-cli \n"
               "- tronidad \n"
               "- Elli Naturstrom \n\n")

        # NOTE(review): the diff view lost indentation; these three statements
        # are assumed to run once per page (inside the loop) — confirm.
        md += f"## Imagen relevante\n\n"
        md += f"![Indicadores de carga e interfaz de enchufe]({extra_image_path})\n\n"
        md += "---\n\n"
    return md
87
-
88
def ocr_app(file, extra_image):
    """Gradio callback: OCR an uploaded PDF and write the Markdown report.

    Args:
        file: Uploaded PDF; must expose ``read()`` returning the PDF bytes.
        extra_image: Uploaded image; its ``name`` attribute is used as the
            image path embedded in the report.

    Returns:
        ``(markdown_text, markdown_file_path)``. When either upload is
        missing, a short message and ``None`` are returned instead.
    """
    if file is None or extra_image is None:
        return "No file uploaded", None

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(file.read())
        pdf_path = tmp_file.name

    try:
        text_data = extract_text_from_pdf(pdf_path)
    finally:
        # delete=False leaves the copy on disk; remove it once OCR is done.
        os.unlink(pdf_path)

    markdown_result = generate_markdown(text_data, extra_image.name)

    output_md_path = "/tmp/resultado.md"
    # The report contains accented Spanish text, so the encoding is pinned
    # instead of relying on the platform default.
    with open(output_md_path, "w", encoding="utf-8") as f:
        f.write(markdown_result)

    return markdown_result, output_md_path
101
-
102
# Input widgets: the PDF to OCR and one accompanying image.
_pdf_input = gr.File(label="Sube tu PDF", file_types=[".pdf"])
_image_input = gr.File(label="Imagen correcta (solo una)", file_types=[".png", ".jpg", ".jpeg"])

# Output widgets: the rendered Markdown and the downloadable .md file.
_markdown_view = gr.Markdown(label="Texto Extraído")
_markdown_download = gr.File(label="Descargar Markdown")

demo = gr.Interface(
    fn=ocr_app,
    inputs=[_pdf_input, _image_input],
    outputs=[_markdown_view, _markdown_download],
    title="OCR PDF - Extracción Limpia",
)

demo.launch()
 
 
 
 
1
  import os
2
+ import unicodedata
3
+ import fitz
4
  from PIL import Image
5
  import gradio as gr
6
+ import numpy as np
7
+ import cv2
8
+ from dotenv import load_dotenv
9
+ import easyocr
10
+ import pytesseract
11
+
12
# Load variables from a .env file (python-dotenv) before anything else runs.
load_dotenv()

# Module-level EasyOCR reader for Spanish and English, created once at import.
reader = easyocr.Reader(['es', 'en'])
15
+
16
def clean_text(text):
    """Normalize *text* to NFC, strip every line and drop the blank ones.

    The surviving lines are joined back together with single newlines.
    """
    normalized = unicodedata.normalize("NFC", text)
    stripped = (raw.strip() for raw in normalized.splitlines())
    return "\n".join(piece for piece in stripped if piece)
21
+
22
def clean_ocr_lines(text):
    """Collapse runs of whitespace inside each line and drop empty lines.

    Returns the remaining lines joined with single newlines.
    """
    collapsed = (" ".join(raw.split()) for raw in text.splitlines())
    return "\n".join(row for row in collapsed if row)
31
+
32
def preprocess_for_ocr(pil_image):
    """Binarize a PIL image for OCR.

    The image is converted to 8-bit grayscale and thresholded with Sauvola's
    method from scikit-image. If that import or computation fails, OpenCV's
    Gaussian adaptive threshold is used instead.

    Args:
        pil_image: Source ``PIL.Image.Image``.

    Returns:
        A new PIL image built from the 0/255 binary array.
    """
    gray = pil_image.convert('L')
    np_img = np.array(gray)
    try:
        from skimage.filters import threshold_sauvola
        window_size = 25
        thresh_sauvola = threshold_sauvola(np_img, window_size=window_size)
        binary = (np_img > thresh_sauvola).astype("uint8") * 255
    except Exception:
        # Best-effort fallback (scikit-image missing or Sauvola failed).
        # "except Exception" keeps KeyboardInterrupt/SystemExit propagating,
        # which a bare "except:" would swallow.
        binary = cv2.adaptiveThreshold(np_img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                       cv2.THRESH_BINARY, 31, 10)
    return Image.fromarray(binary)
44
 
45
def run_easyocr(image_path):
    """OCR the image at *image_path* with the module-level EasyOCR reader.

    The image is binarized with ``preprocess_for_ocr`` and passed to EasyOCR
    in memory, so the file on disk is left untouched.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized text, one detection per line, cleaned with
        ``clean_ocr_lines``.
    """
    # The context manager closes the file once the pixels have been processed.
    with Image.open(image_path) as img:
        processed = preprocess_for_ocr(img)
    # Hand EasyOCR the pixel array directly instead of overwriting the
    # caller's image with the binarized version.
    results = reader.readtext(np.array(processed), detail=0, paragraph=False, decoder='greedy')
    text = "\n".join(results)
    return clean_ocr_lines(text)
52
+
53
def run_tesseract_ocr(pil_image):
    """Binarize *pil_image* and OCR it with Tesseract (Spanish + English).

    Returns the recognized text after ``clean_ocr_lines`` has been applied.
    """
    tesseract_config = '--oem 3 --psm 6 -l spa+eng'
    binarized = preprocess_for_ocr(pil_image)
    raw = pytesseract.image_to_string(binarized, config=tesseract_config)
    return clean_ocr_lines(raw)
58
+
59
def extract_embedded_images(page, page_number, seen_xrefs):
    """Write the images embedded in *page* to /tmp, skipping known xrefs.

    Args:
        page: Page object providing ``get_images(full=True)`` and a ``parent``
            document with ``extract_image(xref)``.
        page_number: Zero-based page index; used (plus one) in the file name.
        seen_xrefs: Set of xrefs already exported; updated in place.

    Returns:
        ``(blocks, image_paths)``: the Markdown image lines and the paths of
        the files written, in matching order.
    """
    markdown_blocks = []
    saved_paths = []
    for position, image_info in enumerate(page.get_images(full=True), start=1):
        xref = image_info[0]
        if xref not in seen_xrefs:
            seen_xrefs.add(xref)
            extracted = page.parent.extract_image(xref)
            payload = extracted["image"]
            suffix = extracted["ext"]
            # The index in the name is the position within the page listing,
            # so skipped duplicates leave gaps in the numbering.
            target = f"/tmp/embedded_p{page_number + 1}_{position}.{suffix}"
            with open(target, "wb") as handle:
                handle.write(payload)
            saved_paths.append(target)
            markdown_blocks.append(f"![Imagen_Embedded]({target})\n")
    return markdown_blocks, saved_paths
76
+
77
def extract_visual_regions(image, page_number):
    """Find figure-like regions of a page image and save them as JPEG crops.

    Pixels at or below gray level 220 are treated as foreground, merged with
    a 15x15 morphological closing and grouped into connected components. A
    component is kept when it is large enough (area > 5000, width and height
    > 50), has an aspect ratio between 0.3 and 3.5, and Tesseract reads
    between 3 and 19 words in it.

    Args:
        image: Page as a ``PIL.Image.Image``.
        page_number: Zero-based page index; used (plus one) in the file names.

    Returns:
        List of paths of the crops that were saved under /tmp.
    """
    results = []
    np_img = np.array(image.convert("RGB"))
    gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
    _, binary = cv2.threshold(gray, 220, 255, cv2.THRESH_BINARY_INV)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15))
    closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    num_labels, _, stats, _ = cv2.connectedComponentsWithStats(closed, connectivity=8)

    # Label 0 is the background component, so start at 1.
    for i in range(1, num_labels):
        x, y, w, h, area = (int(value) for value in stats[i])
        if not (area > 5000 and h > 50 and w > 50 and 0.3 < (w / float(h)) < 3.5):
            continue
        crop = image.crop((x, y, x + w, y + h))
        text_crop = run_tesseract_ocr(crop)
        word_count = len(text_crop.split())
        if 2 < word_count < 20:
            # Write the file only for crops that are kept, so rejected
            # candidates do not pile up in /tmp.
            crop_path = f"/tmp/visual_crop_p{page_number + 1}_{i}.jpg"
            # JPEG cannot store alpha/palette modes; normalize before saving.
            crop.convert("RGB").save(crop_path)
            results.append(crop_path)
    return results
97
+
98
def is_scanned_page(page):
    """Return True when *page* has fewer than 30 characters of text.

    The text comes from ``page.get_text("text")``; a missing or empty result
    also counts as scanned.
    """
    raw = page.get_text("text")
    if not raw:
        return True
    return len(raw.strip()) < 30
101
+
102
def _page_text(page):
    """Return the text layer of a page, one output line per text line."""
    lines = []
    for block in page.get_text("dict")["blocks"]:
        # Blocks without a "lines" entry carry no text lines to collect.
        for line in block.get("lines", []):
            parts = [span["text"].strip() for span in line["spans"] if span["text"].strip()]
            if parts:
                lines.append(" ".join(parts))
    return "\n".join(lines)


def _ocr_scanned_page(page, page_index, all_images):
    """Render a scanned page, OCR it and return its Markdown fragments.

    Paths of the page image and of any detected region crops are appended to
    *all_images*.
    """
    fragments = []
    pix = page.get_pixmap(dpi=300)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    image_path = f"/tmp/ocr_page_{page_index + 1}.jpg"
    img.save(image_path)
    all_images.append(image_path)
    fragments.append(f"![Pagina_Scaneada]({image_path})\n\n")
    ocr_text = run_tesseract_ocr(img)
    fragments.append(f"{clean_text(ocr_text)}\n")
    for crop_path in extract_visual_regions(img, page_index):
        all_images.append(crop_path)
        fragments.append(f"![Region_Detectada]({crop_path})\n")
    return fragments


def process_document(input_file):
    """Gradio callback: turn an uploaded PDF or image into Markdown.

    Images (.png/.jpg/.jpeg) are OCR'd directly. For PDFs, pages with a text
    layer contribute that text, while scanned pages are rendered at 300 dpi,
    OCR'd and searched for figure-like regions. Embedded images are exported
    for every page.

    Args:
        input_file: Uploaded file object exposing a ``name`` path, or a falsy
            value when nothing was uploaded.

    Returns:
        ``(markdown_text, image_paths, markdown_file_path)``, in the order of
        the textbox, gallery and download outputs. The file path is ``None``
        for image uploads and when no file was given.
    """
    if not input_file:
        # The message belongs in the first (textbox) slot, not the gallery slot.
        return "No file uploaded", [], None

    temp_path = input_file.name
    ext = os.path.splitext(temp_path)[-1].lower()

    if ext in (".png", ".jpg", ".jpeg"):
        with Image.open(temp_path) as image:
            text = run_tesseract_ocr(image)
        return f"## Resultado OCR\n\n{clean_text(text)}\n", [], None

    fragments = []
    all_images = []
    seen_xrefs = set()

    # The context manager closes the PDF even if a page fails to process.
    with fitz.open(temp_path) as doc:
        for i, page in enumerate(doc):
            fragments.append(f"\n## Página {i + 1}\n\n")

            if not is_scanned_page(page):
                fragments.append(f"{clean_text(_page_text(page))}\n")
            else:
                fragments.extend(_ocr_scanned_page(page, i, all_images))

            blocks, embedded_images = extract_embedded_images(page, i, seen_xrefs)
            fragments.extend(blocks)
            all_images.extend(embedded_images)
            fragments.append("\n---\n\n")

    markdown_output = "".join(fragments)

    markdown_path = "/tmp/resultado.md"
    with open(markdown_path, "w", encoding="utf-8") as f:
        f.write(markdown_output)

    return markdown_output.strip(), all_images, markdown_path
159
+
160
# UI

theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="rose", neutral_hue="stone")

# Styling for the Markdown textbox, matched through elem_id="markdown_scrollbox".
CUSTOM_CSS = """
#markdown_scrollbox textarea {
    overflow-y: auto !important;
    max-height: 600px;
    resize: vertical;
    font-family: monospace;
}
"""

# CSS is passed to the constructor instead of being assigned to demo.css
# after the Blocks object has been built.
with gr.Blocks(theme=theme, css=CUSTOM_CSS) as demo:
    gr.Markdown("# OCR Preciso + Extracción Inteligente de Imágenes del PDF")

    with gr.Row():
        # Left column: upload widget and trigger button.
        with gr.Column(scale=1):
            input_file = gr.File(label="Sube PDF o Imagen", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
            run_button = gr.Button("Ejecutar OCR")
        # Right column: results.
        # NOTE(review): the diff view lost indentation; gallery and download
        # are assumed to sit in this column — confirm the intended layout.
        with gr.Column(scale=2):
            markdown_output = gr.Textbox(
                label="Markdown Generado",
                lines=25,
                max_lines=1000,
                interactive=True,
                elem_id="markdown_scrollbox"
            )
            # NOTE(review): verify that type="file" is accepted by the
            # installed Gradio version's Gallery component.
            gallery_output = gr.Gallery(label="Imágenes Extraídas", type="file")
            download_md = gr.File(label="Descargar Markdown")

    # Output order must match the tuple returned by process_document.
    run_button.click(
        fn=process_document,
        inputs=[input_file],
        outputs=[markdown_output, gallery_output, download_md]
    )

demo.launch()