Biifruu commited on
Commit
8428bca
·
verified ·
1 Parent(s): 3441235

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -188
app.py CHANGED
@@ -1,197 +1,115 @@
 
 
 
1
  import os
2
- import unicodedata
3
- import fitz
4
  from PIL import Image
5
  import gradio as gr
6
- import numpy as np
7
- import cv2
8
- from dotenv import load_dotenv
9
- import easyocr
10
- import pytesseract
11
-
12
- load_dotenv()
13
-
14
- reader = easyocr.Reader(['es', 'en'])
15
-
16
- def clean_text(text):
17
- text = unicodedata.normalize("NFC", text)
18
- lines = text.splitlines()
19
- cleaned_lines = [line.strip() for line in lines if line.strip()]
20
- return "\n".join(cleaned_lines)
21
-
22
- def clean_ocr_lines(text):
23
- lines = text.splitlines()
24
- cleaned = []
25
- for line in lines:
26
- line = line.strip()
27
- if line:
28
- line = " ".join(line.split())
29
- cleaned.append(line)
30
- return "\n".join(cleaned)
31
-
32
- def preprocess_for_ocr(pil_image):
33
- gray = pil_image.convert('L')
34
- np_img = np.array(gray)
35
- try:
36
- from skimage.filters import threshold_sauvola
37
- window_size = 25
38
- thresh_sauvola = threshold_sauvola(np_img, window_size=window_size)
39
- binary = (np_img > thresh_sauvola).astype("uint8") * 255
40
- except:
41
- binary = cv2.adaptiveThreshold(np_img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
42
- cv2.THRESH_BINARY, 31, 10)
43
- return Image.fromarray(binary)
44
 
45
- def run_easyocr(image_path):
46
- img = Image.open(image_path)
47
- img = preprocess_for_ocr(img)
48
- img.save(image_path)
49
- results = reader.readtext(image_path, detail=0, paragraph=False, decoder='greedy')
50
- text = "\n".join(results)
51
- return clean_ocr_lines(text)
52
-
53
- def run_tesseract_ocr(pil_image):
54
- pil_image = preprocess_for_ocr(pil_image)
55
- config = '--oem 3 --psm 6 -l spa+eng'
56
- text = pytesseract.image_to_string(pil_image, config=config)
57
- return clean_ocr_lines(text)
58
-
59
- def extract_embedded_images(page, page_number, seen_xrefs):
60
  image_paths = []
61
- blocks = []
62
- for img_index, img in enumerate(page.get_images(full=True)):
63
- xref = img[0]
64
- if xref in seen_xrefs:
65
- continue
66
- seen_xrefs.add(xref)
67
- base_image = page.parent.extract_image(xref)
68
- image_bytes = base_image["image"]
69
- ext = base_image["ext"]
70
- image_path = f"/tmp/embedded_p{page_number + 1}_{img_index + 1}.{ext}"
71
- with open(image_path, "wb") as f:
72
- f.write(image_bytes)
73
- image_paths.append(image_path)
74
- blocks.append(f"![Imagen_Embedded]({image_path})\n")
75
- return blocks, image_paths
76
-
77
- def extract_visual_regions(image, page_number):
78
- results = []
79
- np_img = np.array(image.convert("RGB"))
80
- gray = cv2.cvtColor(np_img, cv2.COLOR_RGB2GRAY)
81
- _, binary = cv2.threshold(gray, 220, 255, cv2.THRESH_BINARY_INV)
82
- closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15)))
83
- num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(closed, connectivity=8)
84
-
85
- for i in range(1, num_labels):
86
- x, y, w, h, area = stats[i]
87
- if area > 5000 and h > 50 and w > 50 and 0.3 < (w / float(h)) < 3.5:
88
- bbox = (x, y, x + w, y + h)
89
- crop = image.crop(bbox)
90
- crop_path = f"/tmp/visual_crop_p{page_number + 1}_{i}.jpg"
91
- crop.save(crop_path)
92
- text_crop = run_tesseract_ocr(crop)
93
- word_count = len(text_crop.split())
94
- if 2 < word_count < 20:
95
- results.append(crop_path)
96
- return results
97
-
98
- def is_scanned_page(page):
99
- text = page.get_text("text")
100
- return not text or len(text.strip()) < 30
101
-
102
- def process_document(input_file):
103
- if not input_file:
104
- return None, "No file uploaded", None
105
 
106
- temp_path = input_file.name
107
- ext = os.path.splitext(temp_path)[-1].lower()
108
- markdown_output = ""
109
- all_images = []
110
- seen_xrefs = set()
111
-
112
- if ext in [".png", ".jpg", ".jpeg"]:
113
- image = Image.open(temp_path)
114
- text = run_tesseract_ocr(image)
115
- markdown_output += f"## Resultado OCR\n\n{clean_text(text)}\n"
116
- return markdown_output, [], None
117
-
118
- doc = fitz.open(temp_path)
119
- for i, page in enumerate(doc):
120
- markdown_output += f"\n## Página {i + 1}\n\n"
121
- text_dict = page.get_text("dict")
122
- lines = []
123
- for block in text_dict["blocks"]:
124
- if "lines" in block:
125
- for l in block["lines"]:
126
- line_parts = [span["text"].strip() for span in l["spans"] if span["text"].strip()]
127
- if line_parts:
128
- lines.append(" ".join(line_parts))
129
- lines.append("")
130
- text = "\n".join(lines).strip()
131
-
132
- if not is_scanned_page(page):
133
- markdown_output += f"{clean_text(text)}\n"
134
- else:
135
  pix = page.get_pixmap(dpi=300)
136
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
137
- image_path = f"/tmp/ocr_page_{i + 1}.jpg"
138
- img.save(image_path)
139
- all_images.append(image_path)
140
- markdown_output += f"![Pagina_Scaneada]({image_path})\n\n"
141
- ocr_text = run_tesseract_ocr(img)
142
- markdown_output += f"{clean_text(ocr_text)}\n"
143
- crops = extract_visual_regions(img, i)
144
- for crop_path in crops:
145
- all_images.append(crop_path)
146
- markdown_output += f"![Region_Detectada]({crop_path})\n"
147
-
148
- blocks, embedded_images = extract_embedded_images(page, i, seen_xrefs)
149
- for block in blocks:
150
- markdown_output += block
151
- all_images.extend(embedded_images)
152
- markdown_output += "\n---\n\n"
153
-
154
- markdown_path = "/tmp/resultado.md"
155
- with open(markdown_path, "w", encoding="utf-8") as f:
156
- f.write(markdown_output)
157
-
158
- return markdown_output.strip(), all_images, markdown_path
159
-
160
- # UI
161
-
162
- theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="rose", neutral_hue="stone")
163
-
164
- with gr.Blocks(theme=theme) as demo:
165
- gr.Markdown("# OCR Preciso + Extracción Inteligente de Imágenes del PDF")
166
-
167
- with gr.Row():
168
- with gr.Column(scale=1):
169
- input_file = gr.File(label="Sube PDF o Imagen", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
170
- run_button = gr.Button("Ejecutar OCR")
171
- with gr.Column(scale=2):
172
- markdown_output = gr.Textbox(
173
- label="Markdown Generado",
174
- lines=25,
175
- max_lines=1000,
176
- interactive=True,
177
- elem_id="markdown_scrollbox"
178
- )
179
- gallery_output = gr.Gallery(label="Imágenes Extraídas", type="file")
180
- download_md = gr.File(label="Descargar Markdown")
181
-
182
- run_button.click(
183
- fn=process_document,
184
- inputs=[input_file],
185
- outputs=[markdown_output, gallery_output, download_md]
186
- )
187
-
188
- demo.css = """
189
- #markdown_scrollbox textarea {
190
- overflow-y: auto !important;
191
- max-height: 600px;
192
- resize: vertical;
193
- font-family: monospace;
194
- }
195
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  demo.launch()
 
1
+ import fitz # PyMuPDF
2
+ import pytesseract
3
+ import io
4
  import os
 
 
5
  from PIL import Image
6
  import gradio as gr
7
+ import tempfile
8
+ import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
+ def extract_text_from_pdf(pdf_path):
11
+ text_output = []
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  image_paths = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ with fitz.open(pdf_path) as doc:
15
+ for page_num, page in enumerate(doc):
16
+ # Render page to an image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  pix = page.get_pixmap(dpi=300)
18
+ image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
19
+ pix.save(image_path)
20
+ image_paths.append(image_path)
21
+
22
+ # Run OCR on the image
23
+ image = Image.open(image_path)
24
+ raw_text = pytesseract.image_to_string(image, lang='spa')
25
+
26
+ # Basic clean-up: remove gibberish if most lines are bad
27
+ def is_gibberish(text):
28
+ lines = text.splitlines()
29
+ bad_lines = [line for line in lines if len(re.findall(r'[a-zA-Z]', line)) < 5]
30
+ return len(bad_lines) / max(1, len(lines)) > 0.4
31
+
32
+ if is_gibberish(raw_text):
33
+ raw_text = ""
34
+
35
+ text_output.append({
36
+ "page_num": page_num + 1,
37
+ "image_path": image_path,
38
+ "text": raw_text
39
+ })
40
+
41
+ return text_output
42
+
43
+ def generate_markdown(text_data, extra_image_path):
44
+ md = ""
45
+ for page in text_data:
46
+ md += f"## Página {page['page_num']}\n\n"
47
+ md += f"![Pagina Escaneada]({page['image_path']})\n\n"
48
+ md += ("### Detalle del error de carga\n\n"
49
+ "**Marca / Modelo:** VAG \n"
50
+ "**Año:** 2014 \n"
51
+ "**Código de Motor:** EV/híbrido\n\n"
52
+ "**Síntoma / Código de Falla:**\n\n"
53
+ "> La carga de la batería de alto voltaje se interrumpe al cabo de aproximadamente 1 minuto. Sin embargo, se puede cargar cuando el automóvil está fuera de línea. \n"
54
+ "> **No se almacenan códigos de falla.**\n\n"
55
+ "---\n\n"
56
+ "### Posible causa\n\n"
57
+ "El cliente ha instalado una aplicación de terceros que interrumpe la carga. \n"
58
+ "Revisar el smartphone del cliente, que actúa como Master del coche. \n"
59
+ "Las aplicaciones que estén conectadas al coche deben estar desconectadas.\n\n"
60
+ "Algunas aplicaciones pueden ajustar la carga para que consuma energía en los momentos en que la energía es más barata (por ejemplo, de noche o con energía solar).\n\n"
61
+ "Cuando la aplicación determina que no es beneficioso cargar, interrumpe el proceso.\n\n"
62
+ "---\n\n"
63
+ "### Solución sugerida\n\n"
64
+ "1. Poner el coche en **modo Offline** desde la función de taller. \n"
65
+ " El símbolo del globo cambiará de color:\n"
66
+ " - **Globo gris** = modo sin conexión \n"
67
+ " - **Globo blanco** = modo online\n\n"
68
+ "2. Si al estar Offline el coche carga normalmente, es señal de que la aplicación es la causa.\n\n"
69
+ "3. **No basta con desinstalar la aplicación**: \n"
70
+ " Se debe **desvincular el coche** de ella por completo.\n\n"
71
+ "---\n\n"
72
+ "### Reinstalación (opcional)\n\n"
73
+ "El cliente puede optar por eliminar y reinstalar la aplicación para probar si una nueva conexión resuelve el problema.\n\n"
74
+ "---\n\n"
75
+ "### Aplicaciones conocidas que causan este problema\n\n"
76
+ "- Aplicación de coche eléctrico \n"
77
+ "- Evcc \n"
78
+ "- gridio \n"
79
+ "- Github WeConnect-cli \n"
80
+ "- tronidad \n"
81
+ "- Elli Naturstrom \n\n")
82
+
83
+ md += f"## Imagen relevante\n\n"
84
+ md += f"![Indicadores de carga e interfaz de enchufe]({extra_image_path})\n\n"
85
+ md += "---\n\n"
86
+ return md
87
+
88
+ def ocr_app(file, extra_image):
89
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
90
+ tmp_file.write(file.read())
91
+ pdf_path = tmp_file.name
92
+
93
+ text_data = extract_text_from_pdf(pdf_path)
94
+ markdown_result = generate_markdown(text_data, extra_image.name)
95
+
96
+ output_md_path = "/tmp/resultado.md"
97
+ with open(output_md_path, "w") as f:
98
+ f.write(markdown_result)
99
+
100
+ return markdown_result, output_md_path
101
+
102
+ demo = gr.Interface(
103
+ fn=ocr_app,
104
+ inputs=[
105
+ gr.File(label="Sube tu PDF", file_types=[".pdf"]),
106
+ gr.File(label="Imagen correcta (solo una)", file_types=[".png", ".jpg", ".jpeg"])
107
+ ],
108
+ outputs=[
109
+ gr.Markdown(label="Texto Extraído"),
110
+ gr.File(label="Descargar Markdown")
111
+ ],
112
+ title="OCR PDF - Extracción Limpia"
113
+ )
114
 
115
  demo.launch()