Vladt-Tempest committed on
Commit
a798166
·
1 Parent(s): 63cd59d
Files changed (5) hide show
  1. app.py +176 -40
  2. coordinates_HAWB.json +60 -0
  3. escalar.py +49 -0
  4. hawb_processing.py +93 -0
  5. test1.py +25 -0
app.py CHANGED
@@ -4,6 +4,7 @@ import tempfile
4
  from pdf2image import convert_from_path
5
  import logging
6
  from commercial_invoice import process_invoice_batch, logger
 
7
 
8
  # Configurar handler para capturar logs
9
  class GradioHandler(logging.Handler):
@@ -15,7 +16,7 @@ class GradioHandler(logging.Handler):
15
  log_entry = self.format(record)
16
  self.log_history.append(log_entry)
17
 
18
- def procesar_pdf(pdf_path, progress=gr.Progress()):
19
  """Procesa un archivo PDF y extrae información de las facturas"""
20
  log_history = []
21
  handler = GradioHandler(log_history)
@@ -104,6 +105,84 @@ def procesar_pdf(pdf_path, progress=gr.Progress()):
104
  finally:
105
  logger.removeHandler(handler)
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  def crear_interfaz():
108
  with gr.Blocks(css="""
109
  .message { margin-bottom: 20px; }
@@ -116,50 +195,96 @@ def crear_interfaz():
116
  max-height: 200px;
117
  overflow-y: auto;
118
  }
 
 
 
119
  """) as demo:
120
- gr.Markdown("# Extractor de Información de Facturas")
121
-
122
- with gr.Row():
123
- pdf_input = gr.File(
124
- label="Cargar PDF (el nombre debe empezar con 'ci145')",
125
- file_types=[".pdf"]
126
- )
127
-
128
- with gr.Row():
129
- procesar_btn = gr.Button(
130
- "Procesar PDF",
131
- variant="primary",
132
- elem_classes=["custom-button"]
133
- )
134
-
135
- with gr.Row():
136
- output_text = gr.HTML(
137
- label="Resultados"
138
- )
139
-
140
- with gr.Row():
141
- log_output = gr.Textbox(
142
- label="Logs del proceso",
143
- elem_classes=["logs"],
144
- lines=10,
145
- max_lines=10,
146
- show_label=True
147
- )
148
 
149
- with gr.Row():
150
- files_output = gr.File(
151
- label="Descargar archivos CSV",
152
- file_count="multiple",
153
- interactive=False,
154
- visible=False
155
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
- def process_and_return(pdf_path):
158
- message, csv_files, logs = procesar_pdf(pdf_path)
159
  return message, csv_files if csv_files else None, logs
160
 
161
- procesar_btn.click(
162
- fn=process_and_return,
 
 
 
 
 
163
  inputs=[pdf_input],
164
  outputs=[output_text, files_output, log_output],
165
  show_progress=True
@@ -168,6 +293,17 @@ def crear_interfaz():
168
  None,
169
  [files_output]
170
  )
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  return demo
173
 
 
4
  from pdf2image import convert_from_path
5
  import logging
6
  from commercial_invoice import process_invoice_batch, logger
7
+ from hawb_processing import process_hawb_batch
8
 
9
  # Configurar handler para capturar logs
10
  class GradioHandler(logging.Handler):
 
16
  log_entry = self.format(record)
17
  self.log_history.append(log_entry)
18
 
19
+ def procesar_pdf_invoice(pdf_path, progress=gr.Progress()):
20
  """Procesa un archivo PDF y extrae información de las facturas"""
21
  log_history = []
22
  handler = GradioHandler(log_history)
 
105
  finally:
106
  logger.removeHandler(handler)
107
 
108
def procesar_guia_aerea(pdf_path, progress=gr.Progress()):
    """Process an air waybill (HAWB) PDF and export the extracted fields to CSV.

    The PDF is rendered to one JPEG per page inside a temporary directory,
    each page is passed to ``process_hawb_batch`` together with
    ``./coordinates_HAWB.json``, and the resulting table is written to
    ``data/hawb_procesados.csv``.

    Args:
        pdf_path: Path of the uploaded PDF. Its base name must start with
            ``hawb145`` (case-insensitive), otherwise nothing is processed.
        progress: Gradio progress callback, called with a fraction and a
            description at each stage.

    Returns:
        A ``(html_message, csv_path, logs)`` tuple. ``csv_path`` is ``None``
        when the file is rejected or an error occurs; ``logs`` is the text of
        every record captured on ``logger`` during this call, one per line.
    """
    # Local import keeps this block self-contained; only the error path needs it.
    import html

    log_history = []
    handler = GradioHandler(log_history)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    try:
        # Validate that the file name starts with "hawb145".
        filename = os.path.basename(pdf_path).lower()
        if not filename.startswith("hawb145"):
            error_msg = """
            <div style='color: red; font-weight: bold; padding: 10px; border: 1px solid red; border-radius: 5px; background-color: #ffe6e6;'>
                ⚠️ El archivo debe comenzar con "hawb145" para ser procesado como guía aérea.
            </div>
            """
            logger.error("Archivo incompatible: " + filename)
            return error_msg, None, "\n".join(log_history)

        progress(0.1, desc="Iniciando procesamiento...")

        # The page images only need to exist while OCR runs, so they live in
        # a temporary directory that is removed when the block exits.
        with tempfile.TemporaryDirectory() as temp_dir:
            logger.info(f"Convirtiendo PDF a imágenes: {pdf_path}")
            progress(0.2, desc="Convirtiendo PDF a imágenes...")

            images = convert_from_path(pdf_path)
            image_paths = []

            # Saving pages covers the 0.3-0.6 range so the bar never moves
            # backwards when the extraction stage starts at 0.6.
            for i, image in enumerate(images):
                progress((0.3 + (i/len(images) * 0.3)), desc=f"Procesando página {i+1} de {len(images)}...")
                image_path = os.path.join(temp_dir, f'hawb_pagina_{i+1}.jpg')
                image.save(image_path, 'JPEG')
                image_paths.append(image_path)

            logger.info(f"Se generaron {len(image_paths)} imágenes en directorio temporal")

            # The field extraction is still pending here, so report that stage
            # instead of announcing completion.
            progress(0.6, desc="Extrayendo datos de las guías aéreas...")

            coordinates_json = "./coordinates_HAWB.json"
            results_df = process_hawb_batch(image_paths, coordinates_json)

        output_dir = "data"
        os.makedirs(output_dir, exist_ok=True)
        csv_path = os.path.join(output_dir, 'hawb_procesados.csv')
        results_df.to_csv(csv_path, index=False)
        logger.info(f"Resultados guardados en: {csv_path}")

        progress(0.9, desc="Finalizando...")

        # Styled summary shown in the results panel.
        mensaje = f"""
        <div style='padding: 10px; border: 1px solid #4CAF50; border-radius: 5px; background-color: #f1f8e9;'>
            <h3 style='color: #2E7D32; margin-top: 0;'>✅ Procesamiento completado</h3>
            <ul style='list-style-type: none; padding-left: 0;'>
                <li>📄 Páginas procesadas: {len(image_paths)}</li>
                <li>📁 Imágenes procesadas en memoria temporal</li>
            </ul>
        </div>
        """

        return mensaje, csv_path, "\n".join(log_history)

    except Exception as e:
        # The exception text is rendered as HTML, so escape it. It may carry
        # user-controlled text (e.g. the uploaded file name) - NOTE(review):
        # confirm which upstream errors echo the path.
        error_msg = f"""
        <div style='color: red; font-weight: bold; padding: 10px; border: 1px solid red; border-radius: 5px; background-color: #ffe6e6;'>
            ⚠️ Error durante el procesamiento: {html.escape(str(e))}
        </div>
        """
        # Log the plain message rather than the HTML block so the log panel
        # stays readable.
        logger.error(f"Error durante el procesamiento: {str(e)}")
        return error_msg, None, "\n".join(log_history)
    finally:
        # Always detach the per-call handler so records do not accumulate
        # across calls.
        logger.removeHandler(handler)
185
+
186
  def crear_interfaz():
187
  with gr.Blocks(css="""
188
  .message { margin-bottom: 20px; }
 
195
  max-height: 200px;
196
  overflow-y: auto;
197
  }
198
+ .tab-selected {
199
+ border-bottom: 2px solid #2196F3 !important;
200
+ }
201
  """) as demo:
202
+ gr.Markdown("# Extractor de Información de Facturas y Guías Aéreas")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
+ with gr.Tabs() as tabs:
205
+ with gr.TabItem("Facturas Comerciales", id=1):
206
+ with gr.Row():
207
+ pdf_input = gr.File(
208
+ label="Cargar PDF de facturas (el nombre debe empezar con 'ci145')",
209
+ file_types=[".pdf"]
210
+ )
211
+
212
+ with gr.Row():
213
+ procesar_facturas_btn = gr.Button(
214
+ "Procesar PDF de Facturas",
215
+ variant="primary",
216
+ elem_classes=["custom-button"]
217
+ )
218
+
219
+ with gr.Row():
220
+ output_text = gr.HTML(
221
+ label="Resultados"
222
+ )
223
+
224
+ with gr.Row():
225
+ log_output = gr.Textbox(
226
+ label="Logs del proceso",
227
+ elem_classes=["logs"],
228
+ lines=10,
229
+ max_lines=10,
230
+ show_label=True
231
+ )
232
+
233
+ with gr.Row():
234
+ files_output = gr.File(
235
+ label="Descargar archivos CSV",
236
+ file_count="multiple",
237
+ interactive=False,
238
+ visible=False
239
+ )
240
+
241
+ with gr.TabItem("Guías Aéreas", id=2):
242
+ with gr.Row():
243
+ guia_aerea_input = gr.File(
244
+ label="Cargar PDF de guía aérea. (El nombre debe empezar con 'hawb145')",
245
+ file_types=[".pdf"]
246
+ )
247
+
248
+ with gr.Row():
249
+ procesar_guia_btn = gr.Button(
250
+ "Procesar PDF de Guías Aéreas",
251
+ variant="primary",
252
+ elem_classes=["custom-button"]
253
+ )
254
+
255
+ with gr.Row():
256
+ guia_output_text = gr.HTML(
257
+ label="Resultados"
258
+ )
259
+
260
+ with gr.Row():
261
+ guia_log_output = gr.Textbox(
262
+ label="Logs del proceso",
263
+ elem_classes=["logs"],
264
+ lines=10,
265
+ max_lines=10,
266
+ show_label=True
267
+ )
268
+
269
+ with gr.Row():
270
+ guia_files_output = gr.File(
271
+ label="Descargar archivos procesados",
272
+ file_count="multiple",
273
+ interactive=False,
274
+ visible=False
275
+ )
276
 
277
+ def process_and_return_invoices(pdf_path):
278
+ message, csv_files, logs = procesar_pdf_invoice(pdf_path)
279
  return message, csv_files if csv_files else None, logs
280
 
281
+ def process_and_return_guia(pdf_path):
282
+ message, files, logs = procesar_guia_aerea(pdf_path)
283
+ return message, files if files else None, logs
284
+
285
+ # Eventos de los botones
286
+ procesar_facturas_btn.click(
287
+ fn=process_and_return_invoices,
288
  inputs=[pdf_input],
289
  outputs=[output_text, files_output, log_output],
290
  show_progress=True
 
293
  None,
294
  [files_output]
295
  )
296
+
297
+ procesar_guia_btn.click(
298
+ fn=process_and_return_guia,
299
+ inputs=[guia_aerea_input],
300
+ outputs=[guia_output_text, guia_files_output, guia_log_output],
301
+ show_progress=True
302
+ ).then(
303
+ lambda: gr.update(visible=True),
304
+ None,
305
+ [guia_files_output]
306
+ )
307
 
308
  return demo
309
 
coordinates_HAWB.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boxes": [
3
+ {
4
+ "id": "1",
5
+ "label": "Shipper_name",
6
+ "x": "590.00",
7
+ "y": "155.00",
8
+ "width": "900.00",
9
+ "height": "60.00",
10
+ "confidence": null
11
+ },
12
+ {
13
+ "id": "2",
14
+ "label": "hawb_number",
15
+ "x": "1396.00",
16
+ "y": "163.00",
17
+ "width": "190.00",
18
+ "height": "50.00",
19
+ "confidence": null
20
+ },
21
+ {
22
+ "id": "3",
23
+ "label": "hawb_date",
24
+ "x": "2162.00",
25
+ "y": "231.00",
26
+ "width": "180.00",
27
+ "height": "62.00",
28
+ "confidence": null
29
+ },
30
+ {
31
+ "id": "4",
32
+ "label": "number_pieces",
33
+ "x": "196.00",
34
+ "y": "1592.00",
35
+ "width": "110.00",
36
+ "height": "50.00",
37
+ "confidence": null
38
+ },
39
+ {
40
+ "id": "5",
41
+ "label": "gross_weight",
42
+ "x": "500.00",
43
+ "y": "1592.00",
44
+ "width": "130.00",
45
+ "height": "62.00",
46
+ "confidence": null
47
+ },
48
+ {
49
+ "id": "6",
50
+ "label": "kg_lb",
51
+ "x": "716.00",
52
+ "y": "1592.00",
53
+ "width": "50.00",
54
+ "height": "62.00",
55
+ "confidence": null
56
+ }
57
+ ],
58
+ "height": 3509,
59
+ "width": 2480
60
+ }
escalar.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ import os
3
+
4
def resize_image(page_number=1):
    """Resize one page image to the reference size and save it as the model image.

    Reads ``hawb/hawb_pagina_<page_number>.jpg``, resizes it to 2480x3509
    pixels when its size differs, and writes the result to
    ``hawb/hawb_model.jpg`` (overwriting any previous file).

    Args:
        page_number: Number used to build the input file name.

    Returns:
        ``True`` when the image was saved; ``False`` when the input file does
        not exist or any error occurred (a message is printed in both cases).
    """
    # Target dimensions; adjust as needed.
    img_width = 2480
    img_height = 3509
    target_size = (img_width, img_height)

    input_path = os.path.join('hawb', f'hawb_pagina_{page_number}.jpg')
    output_path = os.path.join('hawb', 'hawb_model.jpg')

    try:
        if not os.path.exists(input_path):
            print(f"No se encontró la imagen: {input_path}")
            return False

        # The context manager closes the underlying file even if resizing or
        # saving fails; a bare Image.open() would leave the handle open.
        with Image.open(input_path) as source:
            if source.size != target_size:
                result = source.resize(target_size)
            else:
                result = source
            result.save(output_path)

        print(f"Imagen procesada exitosamente. Guardada como: {output_path}")
        return True

    except Exception as e:
        # Script-level boundary: report the problem and signal failure.
        print(f"Error al procesar la imagen: {str(e)}")
        return False
34
+
35
def main():
    """Interactively resize page images one after another.

    Starts at page 1 and calls ``resize_image`` for each page. After every
    successful page the user is asked whether to continue; any answer other
    than ``s`` (case-insensitive) stops the loop. The loop also stops as soon
    as ``resize_image`` reports failure.
    """
    page_number = 1
    while True:
        # Guard clause: a failed resize ends the run.
        if not resize_image(page_number):
            print("No hay más imágenes para procesar.")
            break

        respuesta = input("¿Desea continuar con la siguiente iteración? (s/n): ")
        if respuesta.lower() != 's':
            print("Proceso finalizado.")
            break

        page_number += 1


if __name__ == "__main__":
    main()
hawb_processing.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from PIL import Image
3
+ import pytesseract
4
+ import pandas as pd
5
+ import os
6
+ from pathlib import Path
7
+ import logging
8
+
9
# Logging setup: records at INFO level and above are sent both to the file
# hawb_processing.log and to the console stream. This runs at import time.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.FileHandler('hawb_processing.log'), logging.StreamHandler()],
)
logger = logging.getLogger(__name__)
19
+
20
def load_field_areas(coordinates_json):
    """Load the field boxes from a coordinates JSON file.

    Each entry of ``data['boxes']`` is converted into a pixel rectangle. The
    ``x``/``y`` values are treated as the centre of the box: half the width
    and height is subtracted/added to obtain the corners.
    NOTE(review): confirm the annotation tool exports centre coordinates
    rather than top-left ones.

    Args:
        coordinates_json: Path to the JSON file with a ``boxes`` list and
            top-level ``width`` and ``height`` entries.

    Returns:
        A ``(field_areas, width, height)`` tuple where ``field_areas`` maps
        each box label to a dict with integer ``x1``, ``y1``, ``x2``, ``y2``
        keys, and ``width``/``height`` are the values stored in the file.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(coordinates_json, 'r', encoding='utf-8') as f:
        data = json.load(f)

    field_areas = {}
    for box in data['boxes']:
        # Values are stored as strings such as "590.00".
        x = float(box['x'])
        y = float(box['y'])
        width = float(box['width'])
        height = float(box['height'])
        field_areas[box['label']] = {
            "x1": int(x - width/2),
            "y1": int(y - height/2),
            "x2": int(x + width/2),
            "y2": int(y + height/2)
        }
    return field_areas, data['width'], data['height']
37
+
38
def extract_text_from_area(image, area, margin=10):
    """Run OCR on one rectangle of the image, enlarged by a tolerance margin.

    Args:
        image: Image object exposing ``width``, ``height`` and ``crop``.
        area: Dict with ``x1``, ``y1``, ``x2``, ``y2`` pixel coordinates.
        margin: Pixels added on every side of the rectangle; the enlarged
            rectangle is clamped to the image bounds.

    Returns:
        The recognised text with leading and trailing whitespace removed.
    """
    # Enlarge the rectangle by the margin, clamped to the image edges.
    left = max(0, area["x1"] - margin)
    top = max(0, area["y1"] - margin)
    right = min(image.width, area["x2"] + margin)
    bottom = min(image.height, area["y2"] + margin)

    region = image.crop((left, top, right, bottom))
    raw_text = pytesseract.image_to_string(
        region,
        lang='eng',
        config=r'--oem 3 --psm 6',
    )
    return raw_text.strip()
48
+
49
def process_hawb(image_path, coordinates_json, margin=5):
    """Extract the main HAWB fields from one page image.

    The image is resized to the page size stored in the coordinates file when
    its size differs, then OCR is run on the rectangle of each known field.

    Args:
        image_path: Path of the page image.
        coordinates_json: Path of the coordinates JSON read by
            ``load_field_areas``.
        margin: Tolerance margin in pixels passed to
            ``extract_text_from_area``.

    Returns:
        A dict with ``filename`` (base name of ``image_path``) plus one entry
        per field; a field missing from the coordinates file maps to ``""``.
    """
    logger.info(f"Procesando HAWB: {image_path}")
    campos = ["Shipper_name", "hawb_number", "hawb_date", "number_pieces", "gross_weight", "kg_lb"]
    extracted = {"filename": os.path.basename(image_path)}

    # The context manager closes the image file once all fields are read;
    # without it every page of a batch would keep a file handle open.
    with Image.open(image_path) as source:
        field_areas, img_width, img_height = load_field_areas(coordinates_json)

        # Bring the page to the size the coordinates were defined for.
        if source.size != (img_width, img_height):
            image = source.resize((img_width, img_height))
        else:
            image = source

        for campo in campos:
            if campo in field_areas:
                extracted[campo] = extract_text_from_area(image, field_areas[campo], margin)
            else:
                extracted[campo] = ""
    return extracted
66
+
67
def process_hawb_batch(image_paths, coordinates_json):
    """Process a list of HAWB page images and collect the results in a table.

    Args:
        image_paths: Paths of the page images to process.
        coordinates_json: Path of the coordinates JSON handed to
            ``process_hawb``.

    Returns:
        A ``pandas.DataFrame`` with one row per image. When processing an
        image raises, the error is logged and its row holds only ``filename``.
    """
    logger.info(f"Iniciando procesamiento de {len(image_paths)} HAWB")
    rows = []
    for path in image_paths:
        try:
            row = process_hawb(path, coordinates_json)
        except Exception as exc:
            # Best effort: one failing page must not abort the whole batch.
            logger.error(f"Error procesando {path}: {str(exc)}")
            row = {'filename': os.path.basename(path)}
        rows.append(row)
    return pd.DataFrame(rows)
80
+
81
+
82
def main(hawb_dir="./hawb", data_dir="./data", coordinates_json="./coordinates_HAWB.json"):
    """Process every image found in a directory of HAWB pages.

    Args:
        hawb_dir: Directory scanned for ``.jpg``, ``.jpeg`` and ``.png``
            files (extension match is case-insensitive).
        data_dir: Directory created if missing. This function does not write
            anything into it.
        coordinates_json: Path of the coordinates JSON handed to
            ``process_hawb_batch``.

    Returns:
        The DataFrame produced by ``process_hawb_batch``.
    """
    Path(data_dir).mkdir(parents=True, exist_ok=True)
    # os.listdir returns entries in arbitrary order; sorting (lexicographic)
    # makes the row order of the result reproducible between runs.
    image_paths = sorted(
        os.path.join(hawb_dir, f)
        for f in os.listdir(hawb_dir)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    df = process_hawb_batch(image_paths, coordinates_json)
    return df


if __name__ == "__main__":
    main()
test1.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from hawb_processing import process_hawb
3
+
4
def test_single_hawb():
    """Run ``process_hawb`` on one sample page and print every extracted field.

    Uses ``hawb/hawb_pagina_5.jpg`` with ``coordinates_HAWB.json`` and a
    margin of 5. Any exception is caught and printed instead of propagating.
    """
    coordinates_json = "coordinates_HAWB.json"
    hawb_path = os.path.join("hawb", "hawb_pagina_5.jpg")

    print("Iniciando prueba con GUIA individual...")
    print(f"Procesando guía: {hawb_path}")

    try:
        # Process the waybill page.
        results = process_hawb(hawb_path, coordinates_json, margin=5)

        print("\nVerificando resultados:")
        # Dump the result dictionary, one field per line.
        for campo, texto in results.items():
            print(f"{campo}: {texto}")

    except Exception as e:
        print(f"Error en la prueba: {str(e)}")


if __name__ == "__main__":
    test_single_hawb()