Spaces:

Vladt-Tempest
/

PDFXtrc

Sleeping

App Files Community

Vladt-Tempest committed on Apr 14, 2025

Commit

63995e0

1 Parent(s): c4fb207

app y commercial_invoice están funcionando, trayendo información hasta el producto 1

Browse files

Files changed (6) hide show

.gitignore +8 -1
app.py +6 -0
commercial_invoice.py +125 -56
coordinates_CI.json +6 -142
packages.txt +2 -1
test.py +72 -0

.gitignore CHANGED Viewed

@@ -7,4 +7,11 @@ ENV/
 pyvenv.cfg
 # Carpeta de trabajo
-invoices/

 pyvenv.cfg
 # Carpeta de trabajo
+invoices/
+# Log de errores
+*.log
+# archivos de resultados
+data/

app.py CHANGED Viewed

@@ -10,6 +10,12 @@ def procesar_pdf(pdf_archivo):
     if not os.path.exists(carpeta_salida):
         os.makedirs(carpeta_salida)
     # Convertir el PDF a imágenes
     paginas = convert_from_path(pdf_archivo, dpi=300)

     if not os.path.exists(carpeta_salida):
         os.makedirs(carpeta_salida)
+    # Si en carpeta_salida ya hay archivos, eliminarlos
+    for archivo in os.listdir(carpeta_salida):
+        archivo_path = os.path.join(carpeta_salida, archivo)
+        if os.path.isfile(archivo_path):
+            os.remove(archivo_path)
     # Convertir el PDF a imágenes
     paginas = convert_from_path(pdf_archivo, dpi=300)

commercial_invoice.py CHANGED Viewed

@@ -1,78 +1,147 @@
 import json
 from PIL import Image
 import pytesseract
 def load_field_areas(coordinates_json):
     """Carga y procesa las coordenadas desde el archivo JSON"""
-    with open(coordinates_json, 'r') as f:
-        data = json.load(f)
-    field_areas = {}
-    for box in data['boxes']:
-        x = float(box['x'])
-        y = float(box['y'])
-        width = float(box['width'])
-        height = float(box['height'])
-        field_areas[box['label']] = {
-            "x1": int(x - width/2),
-            "y1": int(y - height/2),
-            "x2": int(x + width/2),
-            "y2": int(y + height/2)
-        }
-    return field_areas, data['width'], data['height']
 def extract_text_from_area(image, area, margin=10):
     """Extrae texto de un área específica de la imagen con margen de tolerancia"""
-    # Aplicar margen a las coordenadas
-    x1 = max(0, area["x1"] - margin)
-    y1 = max(0, area["y1"] - margin)
-    x2 = min(image.width, area["x2"] + margin)
-    y2 = min(image.height, area["y2"] + margin)
-    # Recortar la imagen al área especificada
-    crop = image.crop((x1, y1, x2, y2))
-    # Configurar parámetros de OCR para mejor precisión
-    custom_config = r'--oem 3 --psm 6'
-    text = pytesseract.image_to_string(crop, lang='eng', config=custom_config).strip()
-    return text
 def process_invoice(image_path, coordinates_json, margin=10):
     """Procesa la factura y extrae los campos con margen de tolerancia"""
-    # Cargar imagen
-    image = Image.open(image_path)
-    # Cargar áreas de los campos
-    field_areas, img_width, img_height = load_field_areas(coordinates_json)
-    # Ajustar imagen si es necesario
-    if image.size != (img_width, img_height):
-        image = image.resize((img_width, img_height))
-    # Extraer texto de cada área
-    extracted_fields = {}
-    for label, area in field_areas.items():
-        text = extract_text_from_area(image, area, margin)
-        if text:
-            extracted_fields[label] = text
-        else:
-            # Si no se encuentra texto, intentar con un margen mayor
-            text = extract_text_from_area(image, area, margin * 2)
             if text:
-                extracted_fields[label] = text
-    return extracted_fields
 if __name__ == "__main__":
-    # Rutas de archivos
-    image_path = "./invoices/pagina_9.jpg"
     coordinates_json = "./coordinates_CI.json"
-    # Procesar factura
-    results = process_invoice(image_path, coordinates_json, margin=10)
-    # Imprimir resultados
-    print("\nCampos encontrados:")
-    for field, value in results.items():
-        print(f"{field}: {value}")

 import json
 from PIL import Image
 import pytesseract
+import pandas as pd
+import os
+from pathlib import Path
+import logging
+# Configurar logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('invoice_processing.log'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
 def load_field_areas(coordinates_json):
     """Carga y procesa las coordenadas desde el archivo JSON"""
+    logger.debug(f"Cargando coordenadas desde: {coordinates_json}")
+    try:
+        with open(coordinates_json, 'r') as f:
+            data = json.load(f)
+        field_areas = {}
+        for box in data['boxes']:
+            x = float(box['x'])
+            y = float(box['y'])
+            width = float(box['width'])
+            height = float(box['height'])
+            field_areas[box['label']] = {
+                "x1": int(x - width/2),
+                "y1": int(y - height/2),
+                "x2": int(x + width/2),
+                "y2": int(y + height/2)
+            }
+        logger.debug(f"Se cargaron {len(field_areas)} áreas de campos")
+        return field_areas, data['width'], data['height']
+    except Exception as e:
+        logger.error(f"Error al cargar coordenadas: {str(e)}")
+        raise
 def extract_text_from_area(image, area, margin=10):
     """Extrae texto de un área específica de la imagen con margen de tolerancia"""
+    logger.debug(f"Extrayendo texto del área: {area} con margen: {margin}")
+    try:
+        # Aplicar margen a las coordenadas
+        x1 = max(0, area["x1"] - margin)
+        y1 = max(0, area["y1"] - margin)
+        x2 = min(image.width, area["x2"] + margin)
+        y2 = min(image.height, area["y2"] + margin)
+        # Recortar la imagen al área especificada
+        crop = image.crop((x1, y1, x2, y2))
+        # Configurar parámetros de OCR para mejor precisión
+        custom_config = r'--oem 3 --psm 6'
+        text = pytesseract.image_to_string(crop, lang='eng', config=custom_config).strip()
+        if not text:
+            logger.debug("No se encontró texto en el área")
+        else:
+            logger.debug(f"Texto extraído: {text[:50]}...")
+        return text
+    except Exception as e:
+        logger.error(f"Error al extraer texto: {str(e)}")
+        return ""
 def process_invoice(image_path, coordinates_json, margin=10):
     """Procesa la factura y extrae los campos con margen de tolerancia"""
+    logger.info(f"Procesando factura: {image_path}")
+    try:
+        # Cargar imagen
+        image = Image.open(image_path)
+        # Cargar áreas de los campos
+        field_areas, img_width, img_height = load_field_areas(coordinates_json)
+        # Ajustar imagen si es necesario
+        if image.size != (img_width, img_height):
+            logger.debug(f"Redimensionando imagen de {image.size} a ({img_width}, {img_height})")
+            image = image.resize((img_width, img_height))
+        # Extraer texto de cada área
+        extracted_fields = {}
+        for label, area in field_areas.items():
+            logger.debug(f"Procesando campo: {label}")
+            text = extract_text_from_area(image, area, margin)
             if text:
+                extracted_fields[label] = text
+            else:
+                logger.debug(f"Reintentando {label} con margen mayor")
+                text = extract_text_from_area(image, area, margin * 2)
+                if text:
+                    extracted_fields[label] = text
+        # Agregar el nombre del archivo como identificador
+        extracted_fields['filename'] = os.path.basename(image_path)
+        logger.info(f"Extracción completada: {len(extracted_fields)} campos encontrados")
+        return extracted_fields
+    except Exception as e:
+        logger.error(f"Error procesando factura {image_path}: {str(e)}")
+        return {'filename': os.path.basename(image_path)}
 if __name__ == "__main__":
+    logger.info("Iniciando procesamiento de facturas")
+    # Configurar directorios
+    invoice_dir = "./invoices"
+    data_dir = "./data"
     coordinates_json = "./coordinates_CI.json"
+    # Crear directorio data si no existe
+    Path(data_dir).mkdir(parents=True, exist_ok=True)
+    logger.debug(f"Directorio de datos creado: {data_dir}")
+    # Lista para almacenar los resultados de todas las facturas
+    all_results = []
+    # Procesar todas las imágenes en el directorio
+    total_files = len([f for f in os.listdir(invoice_dir)
+                      if f.endswith(('.jpg', '.jpeg', '.png'))])
+    logger.info(f"Se encontraron {total_files} archivos para procesar")
+    for filename in os.listdir(invoice_dir):
+        if filename.endswith(('.jpg', '.jpeg', '.png')):
+            image_path = os.path.join(invoice_dir, filename)
+            results = process_invoice(image_path, coordinates_json, margin=5)
+            all_results.append(results)
+    # Crear DataFrame con todos los resultados
+    df = pd.DataFrame(all_results)
+    # Reordenar columnas (filename al inicio)
+    cols = ['filename'] + [col for col in df.columns if col != 'filename']
+    df = df[cols]
+    # Guardar resultados en CSV
+    csv_path = os.path.join(data_dir, 'ci_data.csv')
+    df.to_csv(csv_path, index=False)
+    logger.info(f"Proceso completado. Resultados guardados en: {csv_path}")
+    logger.info(f"Total de facturas procesadas: {len(all_results)}")

coordinates_CI.json CHANGED Viewed

@@ -66,9 +66,9 @@
         {
             "id": "8",
             "label": "Client_city_country",
-            "x": "621.67",
-            "y": "761.67",
-            "width": "390.00",
             "height": "56.67",
             "confidence": null
         },
@@ -93,9 +93,9 @@
         {
             "id": "B",
             "label": "invoice_number",
-            "x": "1868.33",
             "y": "255.00",
-            "width": "223.33",
             "height": "50.00",
             "confidence": null
         },
@@ -175,7 +175,7 @@
             "id": "K",
             "label": "Tariff_number_01",
             "x": "1318.33",
-            "y": "1053.33",
             "width": "283.33",
             "height": "60.00",
             "confidence": null
@@ -206,144 +206,8 @@
             "width": "206.67",
             "height": "50.00",
             "confidence": null
-        },
-        {
-            "id": "O",
-            "label": "Boxes_02",
-            "x": "210.00",
-            "y": "1108.33",
-            "width": "186.67",
-            "height": "43.33",
-            "confidence": null
-        },
-        {
-            "id": "P",
-            "label": "Pieces_02",
-            "x": "446.67",
-            "y": "1108.33",
-            "width": "260.00",
-            "height": "50.00",
-            "confidence": null
-        },
-        {
-            "id": "Q",
-            "label": "Product_02",
-            "x": "886.67",
-            "y": "1103.33",
-            "width": "540.00",
-            "height": "46.67",
-            "confidence": null
-        },
-        {
-            "id": "R",
-            "label": "Tariff_number_02",
-            "x": "1316.67",
-            "y": "1110.00",
-            "width": "273.33",
-            "height": "46.67",
-            "confidence": null
-        },
-        {
-            "id": "S",
-            "label": "Stems_02",
-            "x": "1781.67",
-            "y": "1106.67",
-            "width": "143.33",
-            "height": "46.67",
-            "confidence": null
-        },
-        {
-            "id": "T",
-            "label": "Unit_price_02",
-            "x": "1978.33",
-            "y": "1105.00",
-            "width": "196.67",
-            "height": "50.00",
-            "confidence": null
-        },
-        {
-            "id": "U",
-            "label": "Extended_price_02",
-            "x": "2211.67",
-            "y": "1106.67",
-            "width": "216.67",
-            "height": "40.00",
-            "confidence": null
-        },
-        {
-            "id": "V",
-            "label": "Boxes_03",
-            "x": "208.33",
-            "y": "1161.67",
-            "width": "183.33",
-            "height": "50.00",
-            "confidence": null
-        },
-        {
-            "id": "W",
-            "label": "Pieces_03",
-            "x": "446.67",
-            "y": "1165.00",
-            "width": "260.00",
-            "height": "56.67",
-            "confidence": null
-        },
-        {
-            "id": "X",
-            "label": "Product_03",
-            "x": "888.33",
-            "y": "1161.67",
-            "width": "543.33",
-            "height": "50.00",
-            "confidence": null
-        },
-        {
-            "id": "Y",
-            "label": "Tarif_number_03",
-            "x": "1318.33",
-            "y": "1166.67",
-            "width": "270.00",
-            "height": "53.33",
-            "confidence": null
-        },
-        {
-            "id": "Z",
-            "label": "Stems_03",
-            "x": "1781.67",
-            "y": "1158.33",
-            "width": "143.33",
-            "height": "50.00",
-            "confidence": null
-        },
-        {
-            "id": "a",
-            "label": "Unit_price_03",
-            "x": "1985.00",
-            "y": "1158.33",
-            "width": "203.33",
-            "height": "50.00",
-            "confidence": null
-        },
-        {
-            "id": "b",
-            "label": "Extended_price_03",
-            "x": "2216.67",
-            "y": "1158.33",
-            "width": "226.67",
-            "height": "43.33",
-            "confidence": null
-        },
-        {
-            "id": "c",
-            "label": "Forwarder",
-            "x": "1786.67",
-            "y": "1486.67",
-            "width": "1086.67",
-            "height": "426.67",
-            "confidence": null
         }
     ],
     "height": 3509,
-    "key": "pagina_1.jpg",
     "width": 2480
 }

         {
             "id": "8",
             "label": "Client_city_country",
+            "x": "951.33",
+            "y": "750.00",
+            "width": "1056.00",
             "height": "56.67",
             "confidence": null
         },
         {
             "id": "B",
             "label": "invoice_number",
+            "x": "1888.33",
             "y": "255.00",
+            "width": "238.00",
             "height": "50.00",
             "confidence": null
         },
             "id": "K",
             "label": "Tariff_number_01",
             "x": "1318.33",
+            "y": "1048.33",
             "width": "283.33",
             "height": "60.00",
             "confidence": null
             "width": "206.67",
             "height": "50.00",
             "confidence": null
         }
     ],
     "height": 3509,
     "width": 2480
 }

packages.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 tesseract-ocr-all
-poppler-utils

 tesseract-ocr-all
+poppler-utils
+pandas

test.py ADDED Viewed

@@ -0,0 +1,72 @@

+import json
+from PIL import Image
+import pytesseract
+def extract_text_from_area(image, area, margin=5):
+    """Extrae texto de un área específica de la imagen"""
+    try:
+        # Aplicar margen a las coordenadas
+        x1 = max(0, area["x1"] - margin)
+        y1 = max(0, area["y1"] - margin)
+        x2 = min(image.width, area["x2"] + margin)
+        y2 = min(image.height, area["y2"] + margin)
+        # Recortar la imagen al área especificada
+        crop = image.crop((x1, y1, x2, y2))
+        # Configurar OCR
+        custom_config = r'--oem 3 --psm 6'
+        return pytesseract.image_to_string(crop, lang='eng', config=custom_config).strip()
+    except Exception as e:
+        print(f"Error al extraer texto: {str(e)}")
+        return ""
+def test_single_invoice():
+    """Prueba la extracción de campos en una sola factura"""
+    try:
+        # Rutas de archivos
+        image_path = "./invoices/pagina_1.jpg"
+        json_path = "./coordinates_CI.json"
+        # Cargar imagen
+        print(f"\nProcesando imagen: {image_path}")
+        image = Image.open(image_path)
+        # Cargar coordenadas
+        print("Cargando coordenadas...")
+        with open(json_path, 'r') as f:
+            data = json.load(f)
+        # Procesar cada campo
+        print("\nCampos encontrados:")
+        print("-" * 50)
+        for box in data['boxes']:
+            # Calcular coordenadas
+            x = float(box['x'])
+            y = float(box['y'])
+            width = float(box['width'])
+            height = float(box['height'])
+            area = {
+                "x1": int(x - width/2),
+                "y1": int(y - height/2),
+                "x2": int(x + width/2),
+                "y2": int(y + height/2)
+            }
+            # Extraer texto
+            text = extract_text_from_area(image, area)
+            if text:
+                print(f"{box['label']}: {text}")
+    except FileNotFoundError:
+        print("Error: No se encontró el archivo de imagen o coordenadas")
+    except Exception as e:
+        print(f"Error inesperado: {str(e)}")
+if __name__ == "__main__":
+    test_single_invoice()