Spaces:

sgonzalezu
/

ocr_service

Sleeping

App Files Files Community

Sebastian Gonzalez committed on Jan 10

Commit

0a6b0fb

1 Parent(s): d4f735f

Deploy OCR Service via Script

Browse files

Files changed (8) hide show

.gitignore +5 -0
Dockerfile +31 -0
app.py +67 -0
azure_ocr_processor.py +315 -0
dollar_correction.py +169 -0
ocr_processors.py +352 -0
requirements.txt +9 -0
unified_extractors.py +1478 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+__pycache__/
+*.pyc
+.DS_Store
+.env
+venv/

Dockerfile ADDED Viewed

	@@ -0,0 +1,31 @@

+FROM python:3.9-slim
+# Install system dependencies for OpenCV and Tesseract.
+# libgl1 is used instead of libgl1-mesa-glx: the latter is a transitional
+# package that newer Debian base images no longer provide, which breaks the
+# build when the floating python:3.9-slim tag moves to a newer release.
+RUN apt-get update && apt-get install -y \
+    tesseract-ocr \
+    libtesseract-dev \
+    libgl1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+# Set working directory
+WORKDIR /app
+# Copy requirements first to leverage cache
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Create a user to run the app (Hugging Face Spaces requirement for security)
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+# Copy the rest of the application
+COPY --chown=user . .
+# Expose the port (Hugging Face Spaces expects 7860)
+EXPOSE 7860
+# Command to run the application
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,67 @@

+from fastapi import FastAPI, HTTPException, Body
+from pydantic import BaseModel
+import numpy as np
+import cv2
+import base64
+from typing import Dict, List, Any
+import os
+import sys
+# Add current directory to path to ensure imports work
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from ocr_processors import OCRManager
+from unified_extractors import Vendor, VendorSchemaManager
+# ASGI application object; uvicorn serves it as "app:app".
+app = FastAPI(title="OCR Service")
+# Initialize managers globally
+# NOTE: OCRManager() constructs its EasyOCR reader in its constructor, so that
+# work happens once, when this module is imported, not per request.
+ocr_manager = OCRManager()
+schema_manager = VendorSchemaManager()
+# Request body accepted by POST /process.
+class OCRRequest(BaseModel):
+    image: str  # Base64 encoded image
+    vendor_id: str  # Passed to Vendor(...); process_image falls back to Vendor.DEFAULT when it is not a valid value
+@app.get("/")
+def health_check():
+    return {"status": "ok", "service": "OCR Service"}
+@app.post("/process")
+def process_image(request: OCRRequest):
+    try:
+        # Decode image
+        image_data = base64.b64decode(request.image)
+        nparr = np.frombuffer(image_data, np.uint8)
+        image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+        if image is None:
+            raise HTTPException(status_code=400, detail="Invalid image data")
+        # Resolve vendor
+        try:
+            vendor = Vendor(request.vendor_id)
+        except ValueError:
+            # Fallback for unknown vendors if necessary, or error
+            # For now, let's assume valid vendor or default
+            vendor = Vendor.DEFAULT
+        # Extract text using the EXACT same logic as the original app
+        # The OCRManager inside this service is the original code
+        results = ocr_manager.extract_text_with_positions(
+            image,
+            vendor,
+            schema_manager
+        )
+        return {"status": "success", "text_blocks": results}
+    except Exception as e:
+        print(f"ERROR in OCR Service: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=str(e))
+# Allows running the service directly with `python app.py`, listening on all
+# interfaces on port 7860.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)

azure_ocr_processor.py ADDED Viewed

	@@ -0,0 +1,315 @@

+# azure_ocr_processor.py
+# Procesador OCR usando Azure Document Intelligence
+import json
+import numpy as np
+from io import BytesIO
+from typing import Dict, List
+try:
+    from azure.core.credentials import AzureKeyCredential
+    from azure.ai.documentintelligence import DocumentIntelligenceClient
+    AZURE_AVAILABLE = True
+except ImportError:
+    AZURE_AVAILABLE = False
+    print("ADVERTENCIA: azure-ai-documentintelligence no está disponible.")
+class AzureOCRProcessor:
+    """Procesador usando Azure Document Intelligence"""
+    def __init__(self, endpoint: str = None, key: str = None):
+        if not AZURE_AVAILABLE:
+            raise RuntimeError("Azure Document Intelligence no está disponible")
+        # Usar credenciales desde variables de entorno o parámetros
+        import os
+        # Prioridad: parámetros > variables de entorno > valores por defecto
+        self.endpoint = endpoint or os.environ.get(
+            "AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT",
+            "https://invoicerecog.cognitiveservices.azure.com/"
+        )
+        self.key = key or os.environ.get(
+            "AZURE_DOCUMENT_INTELLIGENCE_KEY",
+            "BnvYqZbBSscFxbxZurfTEj9H6ZP4anDzvE2gQTB8fvau0wzlAk0TJQQJ99BKACYeBjFXJ3w3AAALACOGyauB"
+        )
+        if not self.endpoint or not self.key:
+            raise ValueError(
+                "Se requieren credenciales de Azure. "
+                "Define las variables de entorno AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT "
+                "y AZURE_DOCUMENT_INTELLIGENCE_KEY, o pásalas como parámetros."
+            )
+        print(f"INFO: Inicializando Azure Document Intelligence")
+        print(f"INFO: Endpoint: {self.endpoint}")
+        self.client = DocumentIntelligenceClient(
+            endpoint=self.endpoint,
+            credential=AzureKeyCredential(self.key)
+        )
+    def process(self, image: np.ndarray, ocr_config: Dict) -> List[Dict]:
+        """
+        Procesa la imagen usando Azure Document Intelligence.
+        Retorna text_blocks simulando el formato de otros OCR pero con datos estructurados.
+        """
+        model = ocr_config.get("model", "prebuilt-invoice")
+        print(f"INFO: Procesando con Azure Document Intelligence, modelo: {model}")
+        # === NUEVO: COMPRESIÓN DE IMAGEN PARA AZURE (PROCESO INDEPENDIENTE) ===
+        # Esta compresión se ejecuta antes del procesamiento normal y no afecta la funcionalidad original
+        image_to_process = self._compress_image_for_azure(image)
+        # === FIN COMPRESIÓN ===
+        # Convertir numpy array a bytes (formato PNG) - CÓDIGO ORIGINAL INTACTO
+        import cv2
+        success, encoded_image = cv2.imencode('.png', image_to_process)
+        if not success:
+            raise RuntimeError("No se pudo codificar la imagen")
+        image_bytes = encoded_image.tobytes()
+        print(f"INFO: Imagen codificada: {len(image_bytes)} bytes")
+        # Analizar con Azure - CÓDIGO ORIGINAL INTACTO
+        try:
+            print("INFO: Enviando imagen a Azure Document Intelligence...")
+            poller = self.client.begin_analyze_document(
+                model,
+                body=BytesIO(image_bytes),
+                content_type="image/png"
+            )
+            print("INFO: Esperando respuesta de Azure...")
+            result = poller.result()
+            print(f"INFO: Análisis completado. Documentos encontrados: {len(result.documents) if result.documents else 0}")
+            # Convertir resultado de Azure a formato de texto estructurado
+            formatted_text = self._format_azure_result_as_text(result)
+            # Retornar como un único text_block con flag especial
+            return [{
+                'text': formatted_text,
+                'x': 0,
+                'y': 0,
+                'width': 0,
+                'height': 0,
+                'confidence': 95.0,
+                'engine': 'azure',
+                'is_azure_structured': True
+            }]
+        except Exception as e:
+            print(f"ERROR en Azure Document Intelligence: {e}")
+            import traceback
+            traceback.print_exc()
+            raise
+    def _compress_image_for_azure(self, image: np.ndarray) -> np.ndarray:
+        """
+        COMPRESIÓN INDEPENDIENTE: Comprime la imagen para Azure sin afectar el procesamiento original.
+        Esta función es completamente independiente y no modifica la lógica existente.
+        """
+        import cv2
+        # Obtener información de la imagen original
+        height, width = image.shape[:2]
+        original_size_mb = image.nbytes / (1024 * 1024)
+        print(f"INFO: Compresión Azure - Imagen original: {width}x{height}, {original_size_mb:.2f}MB")
+        # Si la imagen ya es pequeña, no comprimir
+        if original_size_mb <= 4.5:
+            print("INFO: Compresión Azure - Imagen ya está dentro del límite, no se requiere compresión")
+            return image
+        print("INFO: Compresión Azure - Aplicando compresión...")
+        # Redimensionar si es muy grande (manteniendo relación de aspecto)
+        max_dimension = 2000
+        if width > max_dimension or height > max_dimension:
+            if width > height:
+                new_width = max_dimension
+                new_height = int((max_dimension / width) * height)
+            else:
+                new_height = max_dimension
+                new_width = int((max_dimension / height) * width)
+            print(f"INFO: Compresión Azure - Redimensionando a {new_width}x{new_height}")
+            compressed_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
+            compressed_size_mb = compressed_image.nbytes / (1024 * 1024)
+            print(f"INFO: Compresión Azure - Después de redimensionar: {compressed_size_mb:.2f}MB")
+            # Verificar si después de redimensionar ya está dentro del límite
+            if compressed_size_mb <= 4.5:
+                return compressed_image
+        else:
+            compressed_image = image
+        # Si aún es grande después de redimensionar, aplicar compresión JPEG temporal
+        temp_quality = 85
+        while temp_quality >= 50:
+            # Codificar temporalmente como JPEG para ver el tamaño
+            success, jpeg_encoded = cv2.imencode('.jpg', compressed_image, [cv2.IMWRITE_JPEG_QUALITY, temp_quality])
+            if success:
+                jpeg_size_mb = len(jpeg_encoded.tobytes()) / (1024 * 1024)
+                print(f"INFO: Compresión Azure - Calidad {temp_quality}: {jpeg_size_mb:.2f}MB")
+                if jpeg_size_mb <= 4.5:
+                    print(f"INFO: Compresión Azure - Calidad {temp_quality} aceptada")
+                    # Decodificar de vuelta a numpy array para mantener compatibilidad
+                    decoded_image = cv2.imdecode(jpeg_encoded, cv2.IMREAD_COLOR)
+                    if decoded_image is not None:
+                        final_size_mb = decoded_image.nbytes / (1024 * 1024)
+                        print(f"INFO: Compresión Azure - Imagen final: {final_size_mb:.2f}MB")
+                        return decoded_image
+            temp_quality -= 10
+        # Si llegamos aquí, usar la imagen redimensionada sin compresión JPEG
+        print("INFO: Compresión Azure - Usando imagen redimensionada sin compresión JPEG adicional")
+        return compressed_image
+    def _format_azure_result_as_text(self, result) -> str:
+        """
+        Convierte el resultado de Azure a un texto formateado limpio (sin líneas de confianza).
+        """
+        output_lines = []
+        if not result.documents:
+            return "ERROR: No se encontraron documentos en la factura"
+        # Procesar el primer documento
+        document = result.documents[0]
+        fields = document.fields
+        output_lines.append("-------- Análisis de Azure Document Intelligence --------")
+        output_lines.append("")
+        # Información del proveedor
+        vendor_name = fields.get("VendorName")
+        if vendor_name:
+            output_lines.append(f"Proveedor: {vendor_name.content}")
+        vendor_address = fields.get("VendorAddress")
+        if vendor_address:
+            output_lines.append(f"Dirección: {vendor_address.content}")
+        vendor_tax = fields.get("VendorTaxId")
+        if vendor_tax:
+            output_lines.append(f"GST/HST: {vendor_tax.content}")
+        output_lines.append("")
+        # Información de la factura
+        invoice_id = fields.get("InvoiceId")
+        if invoice_id:
+            output_lines.append(f"Invoice ID: {invoice_id.content}")
+        invoice_date = fields.get("InvoiceDate")
+        if invoice_date:
+            output_lines.append(f"Fecha: {invoice_date.content}")
+        customer_name = fields.get("CustomerName")
+        if customer_name:
+            output_lines.append(f"Cliente: {customer_name.content}")
+        output_lines.append("")
+        output_lines.append("=" * 60)
+        output_lines.append("ÍTEMS DE LA FACTURA")
+        output_lines.append("=" * 60)
+        output_lines.append("")
+        # Extraer items
+        items_field = fields.get("Items")
+        total_items = 0
+        if items_field and hasattr(items_field, "value_array"):
+            total_items = len(items_field.value_array)
+            print(f"INFO: Procesando {total_items} items...")
+            for item_idx, item in enumerate(items_field.value_array):
+                item_obj = item.value_object if hasattr(item, "value_object") else {}
+                output_lines.append(f"--- Ítem #{item_idx + 1} ---")
+                # Código de producto
+                product_code = item_obj.get("ProductCode")
+                if product_code and product_code.content:
+                    output_lines.append(f"Código: {product_code.content}")
+                # Descripción
+                description = item_obj.get("Description")
+                if description and description.content:
+                    output_lines.append(f"Descripción: {description.content}")
+                # Cantidad
+                quantity = item_obj.get("Quantity")
+                if quantity and quantity.content:
+                    output_lines.append(f"Cantidad: {quantity.content}")
+                # Precio unitario
+                unit_price = item_obj.get("UnitPrice")
+                if unit_price and unit_price.content:
+                    output_lines.append(f"Precio unitario: {unit_price.content}")
+                # Impuesto por ítem - SOLO si es > 0
+                tax = item_obj.get("Tax")
+                if tax and tax.content:
+                    try:
+                        # Extraer el valor numérico del tax
+                        tax_value_str = tax.content.replace('$', '').replace(',', '').strip()
+                        tax_value = float(tax_value_str)
+                        # Solo incluir si es mayor a 0
+                        if tax_value > 0:
+                            output_lines.append(f"Impuesto (H): {tax.content}")
+                    except (ValueError, AttributeError):
+                        pass
+                # Total por ítem
+                amount = item_obj.get("Amount")
+                if amount and amount.content:
+                    output_lines.append(f"Total por ítem: {amount.content}")
+                output_lines.append("")
+        else:
+            output_lines.append("No se encontraron items en la factura")
+        # Totales
+        output_lines.append("=" * 60)
+        output_lines.append("TOTALES")
+        output_lines.append("=" * 60)
+        output_lines.append("")
+        subtotal = fields.get("SubTotal")
+        if subtotal and subtotal.content:
+            output_lines.append(f"Subtotal: {subtotal.content}")
+        total_tax = fields.get("TotalTax")
+        if total_tax and total_tax.content:
+            output_lines.append(f"Total impuestos: {total_tax.content}")
+        invoice_total = fields.get("InvoiceTotal")
+        if invoice_total and invoice_total.content:
+            output_lines.append(f"Total de la factura: {invoice_total.content}")
+        output_lines.append("")
+        output_lines.append("=" * 60)
+        output_lines.append(f"Total de items extraídos: {total_items}")
+        output_lines.append("=" * 60)
+        formatted_text = "\n".join(output_lines)
+        print(f"\n{'='*60}")
+        print("TEXTO FORMATEADO GENERADO:")
+        print(f"{'='*60}")
+        print(formatted_text[:800] + "..." if len(formatted_text) > 800 else formatted_text)
+        print(f"{'='*60}\n")
+        return formatted_text

dollar_correction.py ADDED Viewed

	@@ -0,0 +1,169 @@

+# dollar_correction.py
+# Proceso independiente para corrección de confusión $ vs 8
+import re
+from typing import Dict, List
+class DollarSignCorrectionProcessor:
+    """
+    Proceso independiente para corregir confusiones del OCR entre $ y 8.
+    Similar al proceso multilinea, puede ser aplicado a cualquier proveedor.
+    """
+    def __init__(self, config: Dict = None):
+        """
+        Args:
+            config: Configuración del procesador
+                - aggressive: bool - Si True, aplica correcciones más agresivas
+                - context_aware: bool - Si True, usa contexto para decidir correcciones
+                - min_confidence: float - Confianza mínima para aplicar corrección
+        """
+        self.config = config or {}
+        self.aggressive = self.config.get("aggressive", False)
+        self.context_aware = self.config.get("context_aware", True)
+        self.min_confidence = self.config.get("min_confidence", 0.7)
+    def process(self, text_blocks: List[Dict]) -> List[Dict]:
+        """
+        Procesa los bloques de texto y corrige confusiones entre $ y 8.
+        Args:
+            text_blocks: Lista de bloques de texto del OCR
+        Returns:
+            Lista de bloques de texto corregidos
+        """
+        corrected_blocks = []
+        corrections_made = 0
+        for block in text_blocks:
+            original_text = block['text']
+            corrected_text = self._correct_text(original_text, block)
+            if corrected_text != original_text:
+                corrections_made += 1
+                print(f"DEBUG: Corrección $ vs 8: '{original_text}' -> '{corrected_text}'")
+                # Crear nuevo bloque con texto corregido
+                corrected_block = block.copy()
+                corrected_block['text'] = corrected_text
+                corrected_block['was_corrected'] = True
+                corrected_block['original_text'] = original_text
+                corrected_blocks.append(corrected_block)
+            else:
+                corrected_blocks.append(block)
+        print(f"INFO: Correcciones $ vs 8 aplicadas: {corrections_made} de {len(text_blocks)} bloques")
+        return corrected_blocks
+    def _correct_text(self, text: str, block: Dict) -> str:
+        """
+        Aplica correcciones al texto basándose en patrones y contexto.
+        Args:
+            text: Texto a corregir
+            block: Bloque de texto con metadata (posición, confianza, etc.)
+        Returns:
+            Texto corregido
+        """
+        corrected = text
+        # Patrón 1: "8" seguido de números (probablemente es "$")
+        # Ejemplo: "8 12.99" -> "$ 12.99"
+        # Ejemplo: "812.99" -> "$12.99"
+        corrected = re.sub(
+            r'\b8\s*(\d+\.?\d*)\b',
+            lambda m: f"$ {m.group(1)}" if self._is_likely_price(m.group(1)) else m.group(0),
+            corrected
+        )
+        # Patrón 2: "8" al inicio de línea seguido de espacio y números
+        # Ejemplo: "8 Total" -> "$ Total"
+        if self.context_aware:
+            corrected = re.sub(
+                r'^8\s+(Total|Subtotal|HST|Tax|Amount|Price)',
+                r'$ \1',
+                corrected,
+                flags=re.IGNORECASE
+            )
+        # Patrón 3: "8" en contexto de moneda (después de palabras clave)
+        # Ejemplo: "Total 8 123.45" -> "Total $ 123.45"
+        corrected = re.sub(
+            r'(Total|Subtotal|HST|Tax|Amount|Price|Cost)\s+8\s*(\d+\.?\d*)',
+            r'\1 $ \2',
+            corrected,
+            flags=re.IGNORECASE
+        )
+        # Patrón 4: Múltiples "8" en secuencia (probablemente "$")
+        # Ejemplo: "88" -> "$$" (raro pero posible)
+        if self.aggressive:
+            corrected = re.sub(r'88', '$$', corrected)
+        # Patrón 5: "8" entre espacios y números decimales
+        # Ejemplo: "Item 8 12.99 8 24.98" -> "Item $ 12.99 $ 24.98"
+        corrected = re.sub(
+            r'\s8\s+(\d+\.\d{2})\b',
+            r' $ \1',
+            corrected
+        )
+        # Patrón 6: "8" al final de palabra seguido de números
+        # Ejemplo: "Price8123.45" -> "Price$123.45"
+        corrected = re.sub(
+            r'([a-zA-Z])8(\d+\.?\d*)',
+            lambda m: f"{m.group(1)}${m.group(2)}" if self._is_likely_price(m.group(2)) else m.group(0),
+            corrected
+        )
+        # Patrón 7: "8" solo seguido de espacio y dígitos con decimales
+        # Ejemplo: "8 1.99" -> "$ 1.99"
+        corrected = re.sub(
+            r'\b8\s+(\d+\.\d{2})\b',
+            r'$ \1',
+            corrected
+        )
+        # Patrón 8: Líneas que empiezan con "8" y tienen formato de precio
+        # Ejemplo: "8123.45" -> "$123.45"
+        corrected = re.sub(
+            r'^8(\d+\.\d{2})\b',
+            r'$\1',
+            corrected,
+            flags=re.MULTILINE
+        )
+        return corrected
+    def _is_likely_price(self, number_str: str) -> bool:
+        """
+        Determina si un número es probablemente un precio.
+        Args:
+            number_str: String con el número
+        Returns:
+            True si parece un precio
+        """
+        try:
+            value = float(number_str)
+            # Precios típicos: entre 0.01 y 10000
+            if value < 0.01 or value > 10000:
+                return False
+            # Si tiene 2 decimales, muy probable que sea precio
+            if '.' in number_str and len(number_str.split('.')[1]) == 2:
+                return True
+            # Si es un número redondo pequeño, menos probable
+            if value < 10 and '.' not in number_str:
+                return False
+            return True
+        except ValueError:
+            return False

ocr_processors.py ADDED Viewed

	@@ -0,0 +1,352 @@

+# ocr_processors.py
+# Procesadores OCR independientes y su gestor
+import cv2
+import numpy as np
+import easyocr
+from typing import Dict, List
+from dollar_correction import DollarSignCorrectionProcessor
+from unified_extractors import Vendor, VendorSchemaManager
+try:
+    import pytesseract
+    from pytesseract import Output
+    PYTESSERACT_AVAILABLE = True
+except ImportError:
+    PYTESSERACT_AVAILABLE = False
+    print("ADVERTENCIA: pytesseract no está disponible. Usando EasyOCR por defecto.")
+from azure_ocr_processor import AzureOCRProcessor, AZURE_AVAILABLE
+class OCRProcessor:
+    """Base class for OCR processors; subclasses implement process()."""
+    def __init__(self):
+        # No shared state at this level; subclasses call this via super().__init__()
+        pass
+    def process(self, image: np.ndarray, ocr_config: Dict) -> List[Dict]:
+        """Process the image and return text blocks.
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError
+class EasyOCRProcessor(OCRProcessor):
+    """Procesador usando EasyOCR"""
+    def __init__(self):
+        super().__init__()
+        self.reader = easyocr.Reader(['en', 'fr'], gpu=False)
+    def process(self, image: np.ndarray, ocr_config: Dict) -> List[Dict]:
+        """Extrae texto usando EasyOCR"""
+        results = self.reader.readtext(
+            image,
+            contrast_ths=0.05,
+            adjust_contrast=0.7,
+            low_text=0.3,
+            detail=1
+        )
+        text_blocks = []
+        for (bbox, text, confidence) in results:
+            if confidence > 0.3:
+                x_coords = [point[0] for point in bbox]
+                y_coords = [point[1] for point in bbox]
+                text_blocks.append({
+                    'text': text.strip(),
+                    'x': min(x_coords),
+                    'y': min(y_coords),
+                    'width': max(x_coords) - min(x_coords),
+                    'height': max(y_coords) - min(y_coords),
+                    'confidence': confidence * 100,
+                    'engine': 'easyocr'
+                })
+        return sorted(text_blocks, key=lambda b: (b['y'], b['x']))
+class PytesseractOCRProcessor(OCRProcessor):
+    """Procesador usando Pytesseract con soporte para tablas"""
+    def __init__(self):
+        super().__init__()
+        if not PYTESSERACT_AVAILABLE:
+            raise RuntimeError("Pytesseract no está disponible")
+    def process(self, image: np.ndarray, ocr_config: Dict) -> List[Dict]:
+        """Extrae texto usando Pytesseract"""
+        mode = ocr_config.get("mode", "block")
+        # Preprocesar imagen
+        processed_image = self._preprocess_image(image, ocr_config)
+        if mode == "table":
+            text_blocks = self._extract_table_structure(processed_image, ocr_config)
+            # Si se requiere reconstrucción multilinea
+            if ocr_config.get("requires_reconstruction", False):
+                reconstructed_text = self._reconstruct_multiline_text(text_blocks, ocr_config)
+                if reconstructed_text:
+                    text_blocks.append({
+                        'text': f"TEXTO_RECONSTRUIDO:\n{reconstructed_text}",
+                        'x': 0,
+                        'y': 0,
+                        'width': 100,
+                        'height': 100,
+                        'confidence': 100,
+                        'engine': 'reconstructed',
+                        'is_reconstructed': True
+                    })
+        else:
+            text_blocks = self._extract_block_structure(processed_image)
+        return text_blocks
+    def _preprocess_image(self, image: np.ndarray, ocr_config: Dict) -> np.ndarray:
+        """Preprocesa la imagen según configuración"""
+        preprocessing = ocr_config.get("preprocessing", {})
+        # Convertir a escala de grises
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image
+        # Aplicar denoising si está configurado
+        if preprocessing.get("denoise", False):
+            gray = cv2.medianBlur(gray, 3)
+        # Aplicar enhancement si está configurado
+        if preprocessing.get("enhance", False):
+            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
+            gray = clahe.apply(gray)
+        # Aplicar binarización si está configurado
+        if preprocessing.get("binarize", False):
+            gray = cv2.adaptiveThreshold(
+                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                cv2.THRESH_BINARY, 15, 8
+            )
+            # Limpieza morfológica
+            kernel = np.ones((2,2), np.uint8)
+            gray = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, kernel)
+            gray = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel)
+        return gray
+    def _extract_table_structure(self, image: np.ndarray, ocr_config: Dict) -> List[Dict]:
+        """Extrae estructura de tabla"""
+        custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
+        table_data = pytesseract.image_to_data(image, output_type=Output.DICT, config=custom_config)
+        text_blocks = []
+        n_boxes = len(table_data['text'])
+        for i in range(n_boxes):
+            text = table_data['text'][i].strip()
+            confidence = int(table_data['conf'][i])
+            if text and confidence > 20:
+                text_blocks.append({
+                    'text': text,
+                    'x': table_data['left'][i],
+                    'y': table_data['top'][i],
+                    'width': table_data['width'][i],
+                    'height': table_data['height'][i],
+                    'confidence': confidence,
+                    'block_num': table_data['block_num'][i],
+                    'par_num': table_data['par_num'][i],
+                    'line_num': table_data['line_num'][i],
+                    'word_num': table_data['word_num'][i],
+                    'engine': 'pytesseract'
+                })
+        # Si hay muy pocos bloques, intentar con métodos alternativos
+        if len(text_blocks) < 10:
+            return self._extract_with_alternative_methods(image)
+        return text_blocks
+    def _extract_with_alternative_methods(self, image: np.ndarray) -> List[Dict]:
+        """Intenta extraer con múltiples configuraciones"""
+        configs = [
+            r'--oem 3 --psm 4',
+            r'--oem 3 --psm 6',
+            r'--oem 3 --psm 8',
+            r'--oem 3 --psm 11',
+        ]
+        all_blocks = []
+        for config in configs:
+            try:
+                data = pytesseract.image_to_data(image, output_type=Output.DICT, config=config)
+                for i in range(len(data['text'])):
+                    text = data['text'][i].strip()
+                    if text and int(data['conf'][i]) > 10:
+                        all_blocks.append({
+                            'text': text,
+                            'x': data['left'][i],
+                            'y': data['top'][i],
+                            'width': data['width'][i],
+                            'height': data['height'][i],
+                            'confidence': int(data['conf'][i]),
+                            'engine': 'pytesseract_alt'
+                        })
+            except Exception as e:
+                print(f"ADVERTENCIA: Falló configuración {config}: {e}")
+        # Eliminar duplicados
+        unique_blocks = []
+        seen_positions = set()
+        for block in all_blocks:
+            position_key = (block['x'], block['y'], block['text'])
+            if position_key not in seen_positions:
+                seen_positions.add(position_key)
+                unique_blocks.append(block)
+        return sorted(unique_blocks, key=lambda b: (b['y'], b['x']))
+    def _extract_block_structure(self, image: np.ndarray) -> List[Dict]:
+        """Extrae estructura de bloques"""
+        custom_config = r'--oem 3 --psm 1'
+        data = pytesseract.image_to_data(image, output_type=Output.DICT, config=custom_config)
+        text_blocks = []
+        n_boxes = len(data['text'])
+        for i in range(n_boxes):
+            text = data['text'][i].strip()
+            confidence = int(data['conf'][i])
+            if text and confidence > 30:
+                text_blocks.append({
+                    'text': text,
+                    'x': data['left'][i],
+                    'y': data['top'][i],
+                    'width': data['width'][i],
+                    'height': data['height'][i],
+                    'confidence': confidence,
+                    'engine': 'pytesseract'
+                })
+        return sorted(text_blocks, key=lambda b: (b['y'], b['x']))
+    def _reconstruct_multiline_text(self, text_blocks: List[Dict], ocr_config: Dict) -> str:
+        """Reconstruye texto multilinea para proveedores que lo requieren"""
+        # Filtrar bloques reconstruidos previos
+        original_blocks = [block for block in text_blocks if not block.get('is_reconstructed')]
+        if not original_blocks:
+            return ""
+        # Agrupar en líneas
+        line_threshold = ocr_config.get("line_threshold", 20)
+        lines = self._group_into_lines(original_blocks, line_threshold)
+        # Reconstruir texto
+        reconstructed_text = ""
+        for line_blocks in lines:
+            line_blocks.sort(key=lambda b: b['x'])
+            line_text = ' '.join(block['text'].strip() for block in line_blocks)
+            if line_text.strip():
+                reconstructed_text += line_text + "\n"
+        return reconstructed_text
+    def _group_into_lines(self, sorted_blocks: List[Dict], line_threshold: int = 20) -> List[List[Dict]]:
+        """Agrupa bloques en líneas"""
+        if not sorted_blocks:
+            return []
+        sorted_blocks = sorted(sorted_blocks, key=lambda b: b['y'])
+        lines = []
+        current_line = [sorted_blocks[0]]
+        current_y = sorted_blocks[0]['y']
+        for block in sorted_blocks[1:]:
+            y_diff = abs(block['y'] - current_y)
+            if y_diff <= line_threshold:
+                current_line.append(block)
+                current_y = sum(b['y'] for b in current_line) / len(current_line)
+            else:
+                current_line.sort(key=lambda b: b['x'])
+                lines.append(current_line)
+                current_line = [block]
+                current_y = block['y']
+        if current_line:
+            current_line.sort(key=lambda b: b['x'])
+            lines.append(current_line)
+        return lines
+# Modificar la clase OCRManager:
+class OCRManager:
+    """Gestiona los diferentes procesadores OCR según el proveedor"""
+    def __init__(self):
+        self.processors = {
+            'easyocr': EasyOCRProcessor(),
+            'pytesseract': PytesseractOCRProcessor() if PYTESSERACT_AVAILABLE else None,
+            'azure': None  # Se inicializará bajo demanda
+        }
+    def _get_azure_processor(self):
+        """Inicializa el procesador Azure bajo demanda"""
+        if self.processors['azure'] is None and AZURE_AVAILABLE:
+            try:
+                self.processors['azure'] = AzureOCRProcessor()
+                print("INFO: Procesador Azure Document Intelligence inicializado")
+            except Exception as e:
+                print(f"ERROR al inicializar Azure: {e}")
+                return None
+        return self.processors['azure']
+    def extract_text_with_positions(self, image: np.ndarray, vendor: Vendor, schema_manager: VendorSchemaManager) -> List[Dict]:
+        """Extrae texto usando el procesador apropiado para el proveedor"""
+        # Obtener configuración OCR del proveedor
+        ocr_config = schema_manager.get_ocr_config(vendor)
+        engine = ocr_config.get("engine", "easyocr")
+        print(f"INFO: Usando engine '{engine}' para proveedor {vendor.value}")
+        print(f"INFO: Configuración OCR: {ocr_config}")
+        # Seleccionar procesador
+        if engine == 'azure':
+            processor = self._get_azure_processor()
+            if processor is None:
+                print("ADVERTENCIA: Azure no disponible, usando EasyOCR como fallback")
+                processor = self.processors['easyocr']
+                ocr_config = {"engine": "easyocr", "mode": "block"}
+        else:
+            processor = self.processors.get(engine)
+            if processor is None:
+                print(f"ADVERTENCIA: Engine '{engine}' no disponible, usando EasyOCR")
+                processor = self.processors['easyocr']
+                ocr_config = {"engine": "easyocr", "mode": "block"}
+        # Procesar imagen
+        try:
+            text_blocks = processor.process(image, ocr_config)
+            print(f"INFO: Extraídos {len(text_blocks)} bloques de texto con {engine}")
+            # NO aplicar corrección $ vs 8 para Azure (ya viene procesado)
+            if engine != 'azure':
+                dollar_correction_config = ocr_config.get("dollar_sign_correction", {})
+                if dollar_correction_config.get("enabled", False):
+                    print(f"INFO: Aplicando corrección $ vs 8 para {vendor.value}")
+                    corrector = DollarSignCorrectionProcessor(dollar_correction_config)
+                    text_blocks = corrector.process(text_blocks)
+            return text_blocks
+        except Exception as e:
+            print(f"ERROR en procesamiento OCR con {engine}: {e}")
+            # Fallback a EasyOCR
+            if engine != 'easyocr':
+                print("INFO: Intentando con EasyOCR como fallback...")
+                return self.processors['easyocr'].process(image, {"engine": "easyocr"})
+            raise

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+fastapi
+uvicorn
+python-multipart
+numpy
+opencv-python-headless
+easyocr
+pytesseract
+requests
+pydantic

unified_extractors.py ADDED Viewed

	@@ -0,0 +1,1478 @@

+"""
+Sistema unificado de extracción de facturas basado en patrones regex y reglas
+Incluye configuración de proveedores y esquemas
+Sin dependencia de LLMs - más rápido y confiable
+"""
+import re
+import json
+from typing import Dict, List, Optional, Tuple
+from datetime import datetime
+from dataclasses import dataclass, asdict
+from enum import Enum
+# ==== CONFIGURACIÓN DE PROVEEDORES ====
+class Vendor(Enum):
+    """
+    Vendors supported by the system.
+    The enum value is used as the vendor ID in the URL and in the schema system.
+    """
+    A1 = "A1 Cash and Carry_Fisico"
+    COSTCO = "Costco_Formato1"
+    COSTCO2 = "Costco_Formato2"
+    # Catch-all vendor; VENDOR_OCR_CONFIG maps it to the Azure engine
+    DEFAULT = "Default"
+# ==== PER-VENDOR OCR CONFIGURATION ====
+# Each vendor can carry its own OCR settings: "engine" selects the OCR processor,
+# the remaining keys are passed along to that processor and to the extractors.
+VENDOR_OCR_CONFIG = {
+    Vendor.A1: {
+        "engine": "easyocr",
+        "mode": "block",
+        # Have BasePatternExtractor rebuild the text from the blocks and insert a
+        # blank line whenever the vertical gap between blocks exceeds the threshold
+        "add_blank_lines_on_spacing": True,
+        "spacing_threshold": 1,
+        # Settings for the "$" vs "8" OCR confusion correction
+        "dollar_sign_correction": {
+            "enabled": True,
+            "aggressive": False,
+            "context_aware": False,
+            "min_confidence": 0.05
+        }
+    },
+    Vendor.COSTCO: {
+        "engine": "pytesseract",
+        "mode": "table",
+        "columns": 7,
+        "multiline": True,
+        "requires_reconstruction": True,
+        "line_threshold": 20,
+        "preprocessing": {
+            "denoise": True,
+            "enhance": True,
+            "binarize": True
+        }
+    },
+    # Same settings as Vendor.COSTCO, kept as a separate entry for the second format
+    Vendor.COSTCO2: {
+        "engine": "pytesseract",
+        "mode": "table",
+        "columns": 7,
+        "multiline": True,
+        "requires_reconstruction": True,
+        "line_threshold": 20,
+        "preprocessing": {
+            "denoise": True,
+            "enhance": True,
+            "binarize": True
+        }
+    },
+    Vendor.DEFAULT: {
+        "engine": "azure",  # Dedicated engine name for Azure
+        "mode": "document_intelligence",
+        "model": "prebuilt-invoice"  # Azure model to use
+    }
+}
+# ==== CLASES DE DATOS ====
+@dataclass
+class InvoiceItem:
+    """One line item of an invoice.
+    Only ``description`` and ``amount`` are required; every other field has a
+    neutral default.
+    """
+    description: str
+    amount: float  # the A1 extractor stores the item's line total here
+    quantity: float = 1.0
+    unit_price: float = 0.0
+    sku: Optional[str] = None
+    unit_of_measure: Optional[str] = None
+    discount: float = 0.0
+    tax_code: Optional[str] = None  # e.g. "H", as set by the A1 extractor
+    category: Optional[str] = None
+@dataclass
+class Invoice:
+    """Structured result of extracting one invoice.
+    ``vendor`` and ``issuer`` are required. ``items`` may be omitted or passed as
+    None, in which case ``__post_init__`` replaces it with a fresh empty list.
+    """
+    vendor: str
+    issuer: str
+    date: str = ""
+    transaction_id: str = ""
+    items: Optional[List[InvoiceItem]] = None  # normalised to [] in __post_init__
+    subtotal: float = 0.0
+    hst: Optional[float] = None
+    total: float = 0.0
+    raw_text: str = ""
+    confidence: float = 0.0
+    issuer_address: Optional[str] = None
+    gst_hst_number: Optional[str] = None
+    invoice_number: str = ""
+    customer_name: Optional[str] = None
+    # Additional bookkeeping fields
+    invoice_id: str = ""
+    status: str = "procesado"
+    created_at: str = ""
+    file_path: str = ""
+    job_id: str = ""
+    def __post_init__(self):
+        # None is the "not provided" sentinel; give every instance its own list
+        if self.items is None:
+            self.items = []
+# ==== CLASE BASE PARA EXTRACTORES ====
+class BasePatternExtractor:
+    """Clase base para extractores de patrones"""
+    def __init__(self, raw_text: str, text_blocks: List[Dict] = None, ocr_config: Dict = None):
+        self.raw_text = raw_text
+        self.text_blocks = text_blocks or []
+        self.ocr_config = ocr_config or {}
+        # Aplicar proceso de inserción de líneas en blanco si está habilitado
+        if self.ocr_config.get("add_blank_lines_on_spacing", False):
+            processed_text = self._add_blank_lines_on_spacing(raw_text, text_blocks)
+            self.raw_text = processed_text
+        self.lines = [line.strip() for line in self.raw_text.split('\n') if line.strip()]
+    def _add_blank_lines_on_spacing(self, raw_text: str, text_blocks: List[Dict]) -> str:
+        """
+        Inserta líneas en blanco cuando detecta espacios significativos entre renglones consecutivos.
+        Este proceso es independiente y reutilizable para cualquier proveedor.
+        """
+        if not text_blocks:
+            return raw_text
+        spacing_threshold = self.ocr_config.get("spacing_threshold", 15)
+        # Ordenar bloques por posición Y y X
+        sorted_blocks = sorted(text_blocks, key=lambda b: (b.get('page_number', 1), b['y'], b['x']))
+        # Construir texto con líneas en blanco insertadas
+        processed_lines = []
+        prev_block = None
+        for block in sorted_blocks:
+            current_y = block['y']
+            current_height = block.get('height', 0)
+            current_page = block.get('page_number', 1)
+            # Si hay un bloque anterior, calcular el espacio entre renglones
+            if prev_block is not None:
+                prev_y = prev_block['y']
+                prev_height = prev_block.get('height', 0)
+                prev_page = prev_block.get('page_number', 1)
+                # Si cambiamos de página, resetear
+                if current_page != prev_page:
+                    processed_lines.append("")  # Línea en blanco entre páginas
+                else:
+                    # Calcular el espacio vertical entre el final del bloque anterior y el inicio del actual
+                    prev_bottom = prev_y + prev_height
+                    vertical_gap = current_y - prev_bottom
+                    # Si el espacio supera el threshold, insertar línea en blanco
+                    if vertical_gap > spacing_threshold:
+                        processed_lines.append("")
+                        print(f"DEBUG: Línea en blanco insertada (gap vertical de {vertical_gap:.1f}px entre renglones)")
+            # Agregar el texto del bloque actual
+            processed_lines.append(block['text'])
+            prev_block = block
+        return '\n'.join(processed_lines)
+    def extract_date(self, patterns: List[str]) -> str:
+        """Extrae fecha usando múltiples patrones"""
+        for pattern in patterns:
+            match = re.search(pattern, self.raw_text, re.IGNORECASE)
+            if match:
+                date_str = match.group(1).strip()
+                try:
+                    for fmt in ['%m/%d/%Y', '%d/%m/%Y', '%Y-%m-%d', '%d %b %Y', '%d %B %Y']:
+                        try:
+                            dt = datetime.strptime(date_str, fmt)
+                            return dt.strftime('%Y-%m-%d')
+                        except:
+                            continue
+                    return date_str
+                except:
+                    return date_str
+        return datetime.now().strftime('%Y-%m-%d')
+    def extract_amount(self, patterns: List[str], multiline: bool = False) -> Optional[float]:
+        """Extrae montos monetarios"""
+        text = self.raw_text if multiline else ' '.join(self.lines)
+        for pattern in patterns:
+            match = re.search(pattern, text, re.IGNORECASE | (re.MULTILINE if multiline else 0))
+            if match:
+                amount_str = match.group(1).replace('$', '').replace(',', '').strip()
+                try:
+                    return float(amount_str)
+                except:
+                    continue
+        return None
+    def extract_text(self, patterns: List[str]) -> Optional[str]:
+        """Extrae texto usando patrones"""
+        for pattern in patterns:
+            match = re.search(pattern, self.raw_text, re.IGNORECASE | re.MULTILINE)
+            if match:
+                return match.group(1).strip()
+        return None
+    def extract_invoice(self) -> Invoice:
+        """Método principal - debe ser implementado por cada extractor"""
+        raise NotImplementedError
+class A1PatternExtractor(BasePatternExtractor):
+    """Extractor ultra-optimizado para Burlington Cash and Carry"""
+    def extract_invoice(self) -> Invoice:
+        issuer = self.extract_text([
+            r'(Burlington\s+Cash\s+and\s+Carry)',
+            r'(burlington\s*icashandcarry)',
+            r'(A1\s*Cash\s*and\s*Carry)',
+        ]) or "Burlington Cash and Carry"
+        gst_hst = self.extract_text([
+            r'GST/HST\s*[:\s]*([0-9\s]+RT\s+[0-9]+)',
+            r'HST\s*#?\s*[:\s]*([0-9\s]+)',
+        ])
+        date = self.extract_date([
+            r'Date[:\s]*(\d{1,2}/\d{1,2}/\d{4})',
+            r'(\d{1,2}/\d{1,2}/\d{4})',
+        ])
+        transaction_id = self.extract_text([
+            r'Transaction\s*#?\s*[:\s]*(BL\s*\w+)',
+            r'(L\d{12})',
+            r'Transaction\s*#?\s*[:\s]*([A-Z0-9]+)',
+        ]) or ""
+        customer_name = self.extract_text([
+            r'Customer\s*[:\s]*([A-Za-z\s]+)',
+            r'(Familia\s+Fine\s+Foods)',
+        ]) or "FAMILIA FINE FOODS"
+        address = self.extract_text([
+            r'(\d+\s*[\'#]?\s*Service\s+Rd)',
+            r'(\d+\s+[A-Za-z\s]+Rd)',
+        ]) or "3495 Service Rd Burlington"
+        customer_code = self.extract_text([
+            r'Customer\s*[:\s]*[A-Za-z\s]+\s+([A-Z0-9]{7,})',
+            r'(C\d{6,})',
+        ])
+        items = self._extract_a1_items_ultra()
+        # Buscar totales con el patrón correcto
+        subtotal, hst, total = self._extract_totals_sequential()
+        print(f"DEBUG TOTALES FINALES: Subtotal=${subtotal}, HST=${hst}, Total=${total}")
+        return Invoice(
+            vendor="A1",
+            issuer=issuer,
+            date=date,
+            transaction_id=transaction_id,
+            customer_name=customer_name,
+            issuer_address=address,
+            gst_hst_number=gst_hst,
+            invoice_number=customer_code or transaction_id,
+            items=items,
+            subtotal=subtotal,
+            hst=hst,
+            total=total,
+            raw_text=self.raw_text,
+            confidence=95.0 if len(items) > 0 else 85.0
+        )
+    def _clean_amount(self, amount_str: str) -> float:
+        """Limpia y convierte montos con errores OCR"""
+        if not amount_str:
+            return 0.0
+        # Eliminar espacios y símbolos de dólar
+        cleaned = amount_str.replace(' ', '').replace('$', '')
+        # Manejar casos como "17,.01" o "176 .85" o "21,99"
+        if ',' in cleaned and '.' in cleaned:
+            cleaned = cleaned.replace(',', '')
+        elif ',' in cleaned:
+            parts = cleaned.split(',')
+            if len(parts) == 2 and len(parts[1]) <= 2:
+                cleaned = cleaned.replace(',', '.')
+            else:
+                cleaned = cleaned.replace(',', '')
+        try:
+            return float(cleaned)
+        except (ValueError, TypeError):
+            print(f"DEBUG: No se pudo convertir '{amount_str}' a float")
+            return 0.0
+    def _extract_totals_sequential(self) -> tuple:
+        """Extrae Subtotal, HST y Total según su patrón de ubicación"""
+        subtotal = 0.0
+        hst = 0.0
+        total = 0.0
+        # Trabajar con las últimas 30 líneas
+        end_lines = self.lines[-30:] if len(self.lines) > 30 else self.lines
+        # Buscar SUBTOTAL: valor está en la línea ANTERIOR a "Sub Total"
+        for i, line in enumerate(end_lines):
+            if re.search(r'Sub\s*Total', line, re.IGNORECASE):
+                print(f"DEBUG: Línea 'Sub Total' encontrada en índice {i}: '{line.strip()}'")
+                if i > 0:
+                    prev_line = end_lines[i - 1]
+                    print(f"DEBUG: Buscando subtotal en línea anterior: '{prev_line.strip()}'")
+                    amount_match = re.search(r'\$?\s*([\d,\s\.]+)', prev_line)
+                    if amount_match:
+                        subtotal = self._clean_amount(amount_match.group(1))
+                        print(f"DEBUG: ✓ Subtotal encontrado: ${subtotal}")
+                break
+        # Buscar HST: valor está en la línea POSTERIOR a "HST"
+        for i, line in enumerate(end_lines):
+            if re.search(r'^HST\s*$', line.strip(), re.IGNORECASE):
+                print(f"DEBUG: Línea 'HST' encontrada en índice {i}: '{line.strip()}'")
+                if i + 1 < len(end_lines):
+                    next_line = end_lines[i + 1]
+                    print(f"DEBUG: Buscando HST en línea siguiente: '{next_line.strip()}'")
+                    amount_match = re.search(r'\$?\s*([\d,\s\.]+)', next_line)
+                    if amount_match:
+                        hst = self._clean_amount(amount_match.group(1))
+                        print(f"DEBUG: ✓ HST encontrado: ${hst}")
+                break
+        # Buscar TOTAL: valor está en la línea ANTERIOR a "Total"
+        for i, line in enumerate(end_lines):
+            if re.search(r'^Total\s*$', line.strip(), re.IGNORECASE) or re.search(r'^[Tt]ota[l1]\s*$', line.strip()):
+                print(f"DEBUG: Línea 'Total' encontrada en índice {i}: '{line.strip()}'")
+                if i > 0:
+                    prev_line = end_lines[i - 1]
+                    print(f"DEBUG: Buscando total en línea anterior: '{prev_line.strip()}'")
+                    amount_match = re.search(r'\$?\s*([\d,\s\.]+)', prev_line)
+                    if amount_match:
+                        total = self._clean_amount(amount_match.group(1))
+                        print(f"DEBUG: ✓ Total encontrado: ${total}")
+                break
+        # Validación
+        if subtotal > 0 and hst > 0 and total == 0:
+            total = subtotal + hst
+            print(f"DEBUG: Total calculado: ${total}")
+        return subtotal, hst, total
+    def _is_sku_line(self, line: str) -> str:
+        """Determina si una línea es un SKU y lo retorna normalizado"""
+        line_stripped = line.strip()
+        # Debe tener entre 5 y 10 caracteres
+        if not (5 <= len(line_stripped) <= 10):
+            return ""
+        # Debe contener al menos una letra y un número
+        has_letter = bool(re.search(r'[A-Za-z]', line_stripped))
+        has_number = bool(re.search(r'\d', line_stripped))
+        if not (has_letter and has_number):
+            return ""
+        # No debe contener símbolos de dinero, espacios múltiples, o palabras clave
+        if re.search(r'\$|:|\s{2,}', line_stripped):
+            return ""
+        if re.search(r'^(Total|Sub|HST|Change|Details|Customer|Date|Transaction)', line_stripped, re.IGNORECASE):
+            return ""
+        # Patrones específicos conocidos
+        patterns = [
+            r'^[A-Z]{2,}[0-9]{2,}$',  # ALU104, FLRO58, ST0221, BAGO10
+            r'^[A-Z][a-z][A-Z][a-z][0-9]{2}$',  # HaTo67
+            r'^[A-Z][a-z][A-Z][a-z0-9]{2}[0-9]{2}$',  # WaTo66
+            r'^[A-Z]{2}[0-9]{4}$',  # KS1598
+            r'^[A-Z]{4}[0-9]{2}$',  # WRPO4A
+        ]
+        for pat in patterns:
+            if re.match(pat, line_stripped):
+                return line_stripped.upper()
+        # Patrón genérico: combinación de letras y números
+        # Debe empezar con letra
+        if re.match(r'^[A-Z][A-Za-z0-9]{4,9}$', line_stripped, re.IGNORECASE):
+            # Verificar que no sea solo letras
+            if not line_stripped.isalpha():
+                return line_stripped.upper()
+        return ""
+    def _extract_a1_items_ultra(self) -> List[InvoiceItem]:
+        """Extractor ultra-robusto para items de A1/Burlington Cash and Carry
+        Patrón esperado:
+        1. SKU (línea sola)
+        2. Descripción (una o más líneas)
+        3. Precio unitario con/sin H
+        4. Precio total con/sin H
+        5. Cantidad de unidades compradas
+        6. Cantidad por unidad de empaque (última línea antes del espacio)
+        """
+        items = []
+        item_matches = []
+        # Encontrar inicio y fin del área de items
+        start_idx = 0
+        end_idx = len(self.lines)
+        for i, line in enumerate(self.lines):
+            if re.search(r'^(Details|SKU)\s*$', line.strip(), re.IGNORECASE):
+                start_idx = i + 1
+                print(f"DEBUG: Inicio de items en línea {start_idx}")
+                break
+        for i, line in enumerate(self.lines[start_idx:], start=start_idx):
+            if re.search(r'Sub\s*Total', line, re.IGNORECASE):
+                end_idx = i
+                print(f"DEBUG: Fin de items en línea {end_idx}")
+                break
+        print(f"\nDEBUG: Escaneando líneas {start_idx} a {end_idx} buscando SKUs...")
+        print(f"{'='*70}\n")
+        # Buscar TODOS los SKUs usando el método robusto
+        for i in range(start_idx, end_idx):
+            line = self.lines[i]
+            sku = self._is_sku_line(line)
+            if sku:
+                item_matches.append({
+                    'line_index': i,
+                    'sku': sku
+                })
+                print(f"DEBUG: ✓ SKU '{sku}' detectado en línea {i}: '{line.strip()}'")
+        print(f"\nDEBUG: Encontrados {len(item_matches)} SKUs en total")
+        print(f"{'='*70}\n")
+        # Procesar cada item
+        for idx, item_data in enumerate(item_matches):
+            i = item_data['line_index']
+            sku = item_data['sku']
+            # Determinar rango hasta el siguiente SKU
+            if idx + 1 < len(item_matches):
+                search_end = item_matches[idx + 1]['line_index']
+            else:
+                search_end = min(i + 25, end_idx)
+            item_lines = self.lines[i+1:search_end]
+            print(f"\n{'='*70}")
+            print(f"DEBUG: Procesando SKU #{idx+1}: {sku} (líneas {i+1} a {search_end-1})")
+            print(f"{'='*70}")
+            for j, line in enumerate(item_lines, start=1):
+                print(f"  [{j:2d}] '{line.strip()}'")
+            # Extraer según el patrón de abajo hacia arriba
+            description_parts = []
+            unit_price = 0.0
+            line_total = 0.0
+            quantity_packages = 0.0
+            quantity_per_package = ""
+            tax_code = ""
+            # Iterar desde el final hacia arriba
+            num_lines = len(item_lines)
+            # Última línea: cantidad por unidad de empaque (ej: "100 ct", "12x355 ml")
+            if num_lines >= 1:
+                last_line = item_lines[-1].strip()
+                # Patrones más flexibles para unidades
+                unit_match = re.search(r'(\d+)\s*(ct|pk|ea|case|box)', last_line, re.IGNORECASE)
+                if not unit_match:
+                    unit_match = re.search(r'(\d+)\s*x\s*(\d+)\s*(m1|ml)', last_line, re.IGNORECASE)
+                if unit_match:
+                    quantity_per_package = unit_match.group(0)
+                    print(f"\nDEBUG: ✓ Cantidad por paquete: '{quantity_per_package}'")
+                else:
+                    print(f"\nDEBUG: ⚠ No se encontró cantidad por paquete en: '{last_line}'")
+            # Antepenúltima línea: cantidad de unidades compradas
+            if num_lines >= 2:
+                qty_line = item_lines[-2].strip()
+                qty_match = re.match(r'^(\d+[,\.]?\d*)\s*$', qty_line)
+                if qty_match:
+                    qty_str = qty_match.group(1).replace(',', '.')
+                    try:
+                        quantity_packages = float(qty_str)
+                        print(f"DEBUG: ✓ Cantidad de paquetes: {quantity_packages}")
+                    except ValueError:
+                        print(f"DEBUG: ⚠ No se pudo parsear cantidad: '{qty_str}'")
+                else:
+                    print(f"DEBUG: ⚠ No se encontró cantidad en: '{qty_line}'")
+            # Líneas anteriores: precios (total y unitario, con posible H)
+            # Buscar las líneas con $ en los últimos renglones antes de la cantidad
+            price_lines = []
+            search_limit = max(0, num_lines - 6)  # Buscar en las últimas 6 líneas
+            for k in range(search_limit, max(0, num_lines - 2)):
+                if k < len(item_lines):
+                    line = item_lines[k].strip()
+                    # Buscar líneas con precios ($)
+                    if re.search(r'\$', line):
+                        price_lines.append({'index': k, 'line': line})
+                        print(f"DEBUG: Línea con precio [{k}]: '{line}'")
+            print(f"DEBUG: Total líneas con precios: {len(price_lines)}")
+            # Extraer precios de las líneas encontradas
+            all_prices = []
+            for price_info in price_lines:
+                line = price_info['line']
+                # Extraer todos los precios de la línea
+                price_matches = re.findall(r'\$\s*([\d,\s\.]+)\s*(H)?', line, re.IGNORECASE)
+                for pm in price_matches:
+                    price_val = self._clean_amount(pm[0])
+                    has_h = bool(pm[1])
+                    if price_val > 0:
+                        all_prices.append({
+                            'value': price_val,
+                            'has_h': has_h,
+                            'line_idx': price_info['index']
+                        })
+                        if has_h:
+                            tax_code = "H"
+            # Asignar precios: tomar los dos últimos valores únicos
+            if len(all_prices) >= 2:
+                # Ordenar por valor
+                unique_prices = []
+                seen_values = set()
+                for p in all_prices:
+                    if p['value'] not in seen_values:
+                        unique_prices.append(p)
+                        seen_values.add(p['value'])
+                if len(unique_prices) >= 2:
+                    unique_prices.sort(key=lambda x: x['value'])
+                    unit_price = unique_prices[0]['value']
+                    line_total = unique_prices[-1]['value']
+                    print(f"DEBUG: ✓ Unitario: ${unit_price}, Total: ${line_total}")
+                elif len(unique_prices) == 1:
+                    unit_price = unique_prices[0]['value']
+                    line_total = unit_price
+                    print(f"DEBUG: ✓ Precio único: ${unit_price}")
+            elif len(all_prices) == 1:
+                unit_price = all_prices[0]['value']
+                line_total = unit_price
+                if all_prices[0]['has_h']:
+                    tax_code = "H"
+                print(f"DEBUG: ✓ Precio único: ${unit_price}")
+            # Buscar H en líneas cercanas si no se encontró
+            if not tax_code:
+                for k in range(max(0, num_lines - 6), num_lines):
+                    if k < len(item_lines):
+                        if re.search(r'\bH\b', item_lines[k]):
+                            tax_code = "H"
+                            print(f"DEBUG: ✓ H encontrado en línea {k}")
+                            break
+            # Descripción: todas las líneas antes de los precios
+            desc_end = price_lines[0]['index'] if price_lines else max(0, num_lines - 4)
+            for k in range(0, desc_end):
+                if k < len(item_lines):
+                    line = item_lines[k].strip()
+                    # Excluir líneas con solo precios, números, o símbolos
+                    if line and not re.match(r'^[\$\d,\.\s]+$', line) and not re.match(r'^[,\.\s]+$', line):
+                        desc_clean = re.sub(r'[^\w\s\-\.,/\'"#%&()x]', ' ', line)
+                        desc_clean = ' '.join(desc_clean.split())
+                        if desc_clean and len(desc_clean) > 2:
+                            description_parts.append(desc_clean)
+            description = ' '.join(description_parts) if description_parts else ""
+            print(f"\nDEBUG: Resumen extraído:")
+            print(f"  Descripción: '{description}'")
+            print(f"  Unitario: ${unit_price}")
+            print(f"  Total: ${line_total}")
+            print(f"  Cantidad: {quantity_packages}")
+            print(f"  Tax: {tax_code}")
+            # Validaciones y cálculos
+            if not description:
+                print(f"DEBUG: ✗ Item {sku} - SIN DESCRIPCIÓN, omitido\n")
+                continue
+            if quantity_packages == 0:
+                quantity_packages = 1.0
+                print(f"DEBUG: Cantidad por defecto: 1.0")
+            if line_total == 0 and unit_price > 0:
+                line_total = quantity_packages * unit_price
+                print(f"DEBUG: Total calculado: ${line_total}")
+            if unit_price == 0 and line_total > 0 and quantity_packages > 0:
+                unit_price = line_total / quantity_packages
+                print(f"DEBUG: Unitario calculado: ${unit_price}")
+            # Agregar item
+            if description and (unit_price > 0 or line_total > 0):
+                items.append(InvoiceItem(
+                    sku=sku,
+                    description=description.strip(),
+                    quantity=quantity_packages,
+                    unit_price=unit_price,
+                    amount=line_total,
+                    tax_code=tax_code
+                ))
+                print(f"\nDEBUG: ✓✓✓ ITEM #{len(items)} AGREGADO EXITOSAMENTE")
+                print(f"       SKU: {sku}")
+                print(f"       Desc: {description[:60]}...")
+                print(f"       Qty: {quantity_packages}")
+                print(f"       Unit: ${unit_price}")
+                print(f"       Total: ${line_total}")
+                print(f"       Tax: {tax_code}\n")
+            else:
+                print(f"\nDEBUG: ✗✗✗ Item {sku} - DATOS INCOMPLETOS, omitido\n")
+        print(f"\n{'='*70}")
+        print(f"DEBUG: RESUMEN FINAL - {len(items)} items extraídos de {len(item_matches)} SKUs detectados")
+        print(f"{'='*70}\n")
+        return items
+class DefaultAzureExtractor(BasePatternExtractor):
+    """Extractor que parsea el texto formateado de Azure Document Intelligence"""
+    def extract_invoice(self) -> Invoice:
+        """
+        Extrae datos desde el formato de texto generado por Azure.
+        """
+        # Extraer información básica
+        issuer = self.extract_text([
+            r'Proveedor:\s*(.+)',
+            r'Supplier:\s*(.+)',
+            r'Vendor:\s*(.+)'
+        ]) or "Proveedor Desconocido"
+        date = self.extract_text([
+            r'Fecha:\s*(.+)',
+            r'Date:\s*(.+)'
+        ]) or ""
+        transaction_id = self.extract_text([
+            r'Invoice ID:\s*(.+)',
+            r'Invoice No\.?:\s*(.+)',
+            r'Factura N°?:\s*(.+)'
+        ]) or ""
+        customer_name = self.extract_text([
+            r'Cliente:\s*(.+)',
+            r'Customer:\s*(.+)'
+        ]) or ""
+        address = self.extract_text([
+            r'Dirección:\s*(.+)',
+            r'Address:\s*(.+)'
+        ]) or ""
+        gst_hst = self.extract_text([
+            r'GST/HST:\s*(.+)',
+            r'Tax ID:\s*(.+)'
+        ]) or ""
+        # Extraer items
+        items = self._extract_azure_items()
+        # Extraer totales de manera más robusta
+        subtotal, total_tax, total = self._extract_totals()
+        # Calcular confidence
+        confidence = 90.0 if len(items) > 0 else 70.0
+        print(f"\nDEBUG: Extracción Azure completada:")
+        print(f"  Proveedor: {issuer}")
+        print(f"  Fecha: {date}")
+        print(f"  Transaction ID: {transaction_id}")
+        print(f"  Items: {len(items)}")
+        print(f"  Subtotal: ${subtotal}")
+        print(f"  Tax: ${total_tax}")
+        print(f"  Total: ${total}\n")
+        return Invoice(
+            vendor="Default",
+            issuer=issuer,
+            date=date,
+            transaction_id=transaction_id,
+            customer_name=customer_name,
+            issuer_address=address,
+            gst_hst_number=gst_hst,
+            invoice_number=transaction_id,
+            items=items,
+            subtotal=subtotal,
+            hst=total_tax,
+            total=total,
+            raw_text=self.raw_text,
+            confidence=confidence
+        )
+    def _extract_totals(self) -> tuple:
+        """Extrae subtotal, impuestos y total de manera robusta"""
+        # Primero buscar total de la factura (el más importante)
+        total = self._find_total()
+        # Luego buscar subtotal
+        subtotal = self._find_subtotal()
+        # Finalmente buscar impuestos
+        total_tax = self._find_tax()
+        # Validaciones cruzadas
+        if total > 0 and subtotal == 0:
+            # Si tenemos total pero no subtotal, estimar
+            if total_tax > 0:
+                subtotal = total - total_tax
+            else:
+                subtotal = total
+        return subtotal, total_tax, total
+    def _find_total(self) -> float:
+        """Encuentra el total de la factura"""
+        patterns = [
+            r'Total de la factura:\s*\$?\s*([\d,\.]+)',
+            r'Total:\s*\$?\s*([\d,\.]+)',
+            r'Invoice Total:\s*\$?\s*([\d,\.]+)',
+            r'Amount Due:\s*\$?\s*([\d,\.]+)',
+            r'Grand Total:\s*\$?\s*([\d,\.]+)',
+            r'TOTAL:\s*\$?\s*([\d,\.]+)'
+        ]
+        for pattern in patterns:
+            amount = self._extract_single_amount(pattern)
+            if amount > 0:
+                print(f"DEBUG: Total encontrado con patrón '{pattern}': ${amount}")
+                return amount
+        # Si no encontramos con patrones, buscar numéricamente el monto más grande cerca de "Total"
+        total_matches = list(re.finditer(r'Total[^\d]*\$?\s*([\d,\.]+)', self.raw_text, re.IGNORECASE))
+        if total_matches:
+            amounts = []
+            for match in total_matches:
+                try:
+                    amount_str = match.group(1).replace('$', '').replace(',', '').strip()
+                    amount = float(amount_str)
+                    amounts.append(amount)
+                except ValueError:
+                    continue
+            if amounts:
+                max_amount = max(amounts)
+                print(f"DEBUG: Total inferido como máximo encontrado: ${max_amount}")
+                return max_amount
+        return 0.0
+    def _find_subtotal(self) -> float:
+        """Encuentra el subtotal"""
+        patterns = [
+            r'Subtotal:\s*\$?\s*([\d,\.]+)',
+            r'Sub Total:\s*\$?\s*([\d,\.]+)',
+            r'SUB-TOTAL:\s*\$?\s*([\d,\.]+)'
+        ]
+        for pattern in patterns:
+            amount = self._extract_single_amount(pattern)
+            if amount > 0:
+                return amount
+        return 0.0
+    def _find_tax(self) -> float:
+        """Encuentra los impuestos"""
+        patterns = [
+            r'Total impuestos:\s*\$?\s*([\d,\.]+)',
+            r'Total taxes?:\s*\$?\s*([\d,\.]+)',
+            r'Tax:\s*\$?\s*([\d,\.]+)',
+            r'HST:\s*\$?\s*([\d,\.]+)',
+            r'GST:\s*\$?\s*([\d,\.]+)',
+            r'Impuesto:\s*\$?\s*([\d,\.]+)'
+        ]
+        for pattern in patterns:
+            amount = self._extract_single_amount(pattern)
+            if amount > 0:
+                return amount
+        return 0.0
+    def _extract_single_amount(self, pattern: str) -> float:
+        """Extrae un solo monto usando un patrón"""
+        match = re.search(pattern, self.raw_text, re.IGNORECASE)
+        if match:
+            try:
+                amount_str = match.group(1).replace('$', '').replace(',', '').strip()
+                return float(amount_str)
+            except ValueError:
+                pass
+        return 0.0
+    def _extract_azure_items(self) -> List[InvoiceItem]:
+        """
+        Extrae items usando un enfoque más directo y robusto
+        """
+        items = []
+        # Estrategia principal: buscar todas las ocurrencias de "--- Ítem #"
+        item_starts = list(re.finditer(r'---\s*Ítem\s*#\d+\s*---', self.raw_text))
+        if not item_starts:
+            # Intentar con formato alternativo
+            item_starts = list(re.finditer(r'---\s*Item\s*#\d+\s*---', self.raw_text))
+        print(f"DEBUG: Encontrados {len(item_starts)} inicios de items")
+        for i, start_match in enumerate(item_starts):
+            start_pos = start_match.end()  # Comenzar después del separador
+            # Encontrar el final de este item (siguiente item o sección TOTALES)
+            if i < len(item_starts) - 1:
+                end_pos = item_starts[i + 1].start()
+            else:
+                # Para el último item, buscar el inicio de TOTALES
+                totales_match = re.search(r'TOTALES|===|Subtotal:|Total:', self.raw_text[start_pos:])
+                if totales_match:
+                    end_pos = start_pos + totales_match.start()
+                else:
+                    end_pos = start_pos + 1000  # Límite por seguridad
+            section = self.raw_text[start_pos:end_pos].strip()
+            item = self._parse_item_section(section, i + 1)
+            if item and item.amount > 0:  # Solo incluir items con total > 0
+                items.append(item)
+        # Si no encontramos items con separadores, usar método alternativo
+        if not items:
+            items = self._fallback_item_extraction()
+        print(f"DEBUG: Total de items extraídos: {len(items)}")
+        return items
+    def _parse_item_section(self, section: str, item_number: int) -> Optional[InvoiceItem]:
+        """Parsea una sección de item individual"""
+        print(f"\nDEBUG: Procesando Item #{item_number}")
+        # Extraer SKU
+        sku = self._extract_field(section, [
+            r'Código:\s*([^\n]+)',
+            r'Code:\s*([^\n]+)',
+            r'SKU:\s*([^\n]+)'
+        ])
+        # Extraer descripción (manejar multilínea)
+        description = self._extract_multiline_field(section, [
+            r'Descripción:\s*(.+?)(?=\n\s*(?:Cantidad|Precio|Impuesto|Total|Código|Code|$))',
+            r'Description:\s*(.+?)(?=\n\s*(?:Quantity|Price|Tax|Total|Code|$))'
+        ])
+        # Si no hay código pero la descripción empieza con patrón de SKU, extraerlo
+        if not sku and description:
+            first_line = description.split('\n')[0].strip()
+            if self._is_potential_sku(first_line):
+                sku = first_line
+                # Remover el SKU de la descripción
+                lines = description.split('\n')
+                if len(lines) > 1:
+                    description = '\n'.join(lines[1:]).strip()
+                else:
+                    description = ""
+        # Validar que tengamos al menos código O descripción
+        if not sku and not description:
+            print(f"DEBUG: ✗ Item #{item_number} omitido - sin código ni descripción")
+            return None
+        # Extraer valores numéricos
+        quantity = self._extract_numeric_value(section, [
+            r'Cantidad:\s*([\d,\.]+)',
+            r'Quantity:\s*([\d,\.]+)'
+        ], default=1.0)
+        unit_price = self._extract_numeric_value(section, [
+            r'Precio unitario:\s*\$?\s*([\d,\.]+)',
+            r'Unit Price:\s*\$?\s*([\d,\.]+)',
+            r'Price:\s*\$?\s*([\d,\.]+)'
+        ])
+        amount = self._extract_numeric_value(section, [
+            r'Total por ítem:\s*\$?\s*([\d,\.]+)',
+            r'Item Total:\s*\$?\s*([\d,\.]+)',
+            r'Total:\s*\$?\s*([\d,\.]+)'
+        ])
+        # Determinar tax code
+        tax_code = ""
+        tax_amount = self._extract_numeric_value(section, [
+            r'Impuesto\s*\(?H\)?:\s*\$?\s*([\d,\.]+)',
+            r'Tax\s*\(?H\)?:\s*\$?\s*([\d,\.]+)'
+        ])
+        if tax_amount > 0:
+            tax_code = "H"
+        # Calcular valores faltantes
+        if amount == 0 and unit_price > 0 and quantity > 0:
+            amount = quantity * unit_price
+            print(f"DEBUG: Total calculado: {quantity} × ${unit_price} = ${amount}")
+        if unit_price == 0 and amount > 0 and quantity > 0:
+            unit_price = amount / quantity
+            print(f"DEBUG: Precio unitario calculado: ${amount} ÷ {quantity} = ${unit_price}")
+        # Si aún no tenemos amount, usar unit_price como último recurso
+        if amount == 0 and unit_price > 0:
+            amount = unit_price
+            quantity = 1.0
+            print(f"DEBUG: Usando precio unitario como total: ${amount}")
+        # Para el caso BEER STORE: si el amount tiene "T" al final, limpiarlo
+        if amount == 0:
+            # Buscar patrones alternativos de total
+            amount_match = re.search(r'Total por ítem:\s*([\d,\.]+)\s*T', section, re.IGNORECASE)
+            if amount_match:
+                try:
+                    amount = float(amount_match.group(1).replace(',', ''))
+                    print(f"DEBUG: Total extraído con 'T': ${amount}")
+                except ValueError:
+                    pass
+        # Validación final: solo incluir si tenemos amount > 0
+        if amount == 0:
+            print(f"DEBUG: ✗ Item #{item_number} omitido - amount = 0")
+            return None
+        item = InvoiceItem(
+            sku=sku or "",
+            description=description or "",
+            quantity=quantity,
+            unit_price=unit_price,
+            amount=amount,
+            tax_code=tax_code,
+            category=""
+        )
+        print(f"DEBUG: ✓ Item #{item_number} extraído:")
+        print(f"       SKU: '{sku or 'N/A'}'")
+        print(f"       Descripción: '{description[:50] if description else 'N/A'}...'")
+        print(f"       Cantidad: {quantity}")
+        print(f"       Precio unitario: ${unit_price:.2f}")
+        print(f"       Total: ${amount:.2f}")
+        print(f"       Tax code: '{tax_code}'")
+        return item
+    def _fallback_item_extraction(self) -> List[InvoiceItem]:
+        """Método de respaldo para extraer items cuando falla el método principal"""
+        print("DEBUG: Usando método de respaldo para extracción de items")
+        items = []
+        # Buscar por patrones de "Código:" seguidos de otros campos
+        code_pattern = r'Código:\s*([^\n]+)'
+        code_matches = list(re.finditer(code_pattern, self.raw_text))
+        for i, code_match in enumerate(code_matches):
+            start_pos = code_match.start()
+            # Encontrar el final de este item
+            if i < len(code_matches) - 1:
+                end_pos = code_matches[i + 1].start()
+            else:
+                end_pos = start_pos + 500
+            section = self.raw_text[start_pos:end_pos]
+            item = self._parse_item_section(section, i + 1)
+            if item and item.amount > 0:
+                items.append(item)
+        # Si aún no tenemos items, buscar por "Total por ítem"
+        if not items:
+            total_pattern = r'Total por ítem:\s*\$?\s*([\d,\.]+)'
+            total_matches = list(re.finditer(total_pattern, self.raw_text))
+            for i, total_match in enumerate(total_matches):
+                # Buscar sección alrededor de este total
+                start_pos = max(0, total_match.start() - 200)
+                end_pos = total_match.end() + 100
+                section = self.raw_text[start_pos:end_pos]
+                item = self._parse_item_section(section, i + 1)
+                if item and item.amount > 0:
+                    items.append(item)
+        print(f"DEBUG: Método de respaldo encontró {len(items)} items")
+        return items
+    def _extract_field(self, text: str, patterns: List[str]) -> str:
+        """Extrae un campo de texto usando múltiples patrones"""
+        for pattern in patterns:
+            match = re.search(pattern, text, re.IGNORECASE)
+            if match:
+                value = match.group(1).strip()
+                # Limpiar confianza si existe
+                value = re.sub(r'\s*\(Confianza:.*?\)', '', value).strip()
+                return value
+        return ""
+    def _extract_multiline_field(self, text: str, patterns: List[str]) -> str:
+        """Extrae un campo multilínea"""
+        for pattern in patterns:
+            match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
+            if match:
+                value = match.group(1).strip()
+                # Limpiar confianza
+                value = re.sub(r'\s*\(Confianza:.*?\)', '', value).strip()
+                return value
+        return ""
+    def _extract_numeric_value(self, text: str, patterns: List[str], default: float = 0.0) -> float:
+        """Extrae un valor numérico"""
+        for pattern in patterns:
+            match = re.search(pattern, text, re.IGNORECASE)
+            if match:
+                value_str = match.group(1).replace('$', '').replace(',', '').strip()
+                try:
+                    return float(value_str)
+                except ValueError:
+                    continue
+        return default
+    def _is_potential_sku(self, text: str) -> bool:
+        """
+        Determina si un texto parece ser un código SKU.
+        """
+        text = text.strip()
+        if len(text) > 20 or len(text) < 2:
+            return False
+        # No debe tener espacios (a menos que sea muy corto)
+        if ' ' in text and len(text) > 10:
+            return False
+        # Patrón 1: Solo números (3-15 dígitos)
+        if text.replace('-', '').isdigit() and 3 <= len(text.replace('-', '')) <= 15:
+            return True
+        # Patrón 2: Mezcla de letras y números (como "TOC774", "OIL093")
+        if re.match(r'^[A-Z]{2,5}\d{2,4}$', text):
+            return True
+        # Patrón 3: Principalmente números
+        digit_ratio = sum(c.isdigit() for c in text) / len(text)
+        if digit_ratio >= 0.6:
+            return True
+        return False
+class CostcoPatternExtractor(BasePatternExtractor):
+    """
+    Pattern extractor for Costco Business Centre invoices.
+    Version (v6): handles both the COMPACT and the DETAILED item layouts, in
+    which every field of an item sits on its own text line.
+    """
+    def __init__(self, raw_text: str, text_blocks: List[Dict] = None, ocr_config: Dict = None):
+        """Forward all arguments unchanged to the base extractor."""
+        super().__init__(raw_text, text_blocks, ocr_config)
+    def extract_invoice(self) -> Invoice:
+        """Build the Invoice from header regexes plus the parsed line items.
+
+        Issuer and customer name are hard-coded; confidence is 95.0 when more
+        than 20 items were parsed and 85.0 otherwise.
+        """
+        # Header metadata.
+        issuer = "Costco Wholesale Business Centre"
+        gst_hst = self.extract_text([r'GST/HST\s*\[([0-9\s]+RT\s+[0-9]+)\]'])
+        date = self.extract_date([r'Order Date[:\s]*(\d{1,2}/\d{1,2}/\d{4})', r'(\d{1,2}/\d{1,2}/\d{4})'])
+        transaction_id = self.extract_text([r'Order Number[:\s]*(\d{10})', r'(\d{10})']) or ""
+        customer_name = "FAMILIA FINE FOODS"
+        address = self.extract_text([r'(\d+\s+NORTH\s+SERVICE\s+RD)']) or "3 NORTH SERVICE RD ST. CATHARINES, ON"
+        membership = self.extract_text([r'Membership\s*\.?\s*Number[:\s]*(\d+)'])
+        items = self._extract_costco_items_ultra()
+        subtotal = self.extract_amount([r'Subtotal\s*\(\d+\s*Items\)\s*\$\s*([\d,]+\.?\d*)', r'Subtotal[^\$]*\$\s*([\d,]+\.?\d*)',]) or 0.0
+        hst = self.extract_amount([r'HST\s*\(H\)\s*\$\s*([\d,]+\.?\d*)']) or 0.0
+        total = self.extract_amount([r'Invoice Total\s*\$\s*([\d,]+\.?\d*)', r'Order Total\s*\$\s*([\d,]+\.?\d*)',]) or 0.0
+        confidence = 85.0
+        if len(items) > 20: confidence = 95.0
+        # NOTE(review): invoice_number prefers the membership number over the
+        # order number - confirm this is intended.
+        return Invoice(
+            vendor="Costco", issuer=issuer, date=date, transaction_id=transaction_id, customer_name=customer_name,
+            issuer_address=address, gst_hst_number=gst_hst, invoice_number=membership or transaction_id,
+            items=items, subtotal=subtotal, hst=hst, total=total, raw_text=self.raw_text, confidence=confidence
+        )
+    def _extract_costco_items_ultra(self) -> List[InvoiceItem]:
+        """
+        Parse the line items (v6), telling compact items from detailed ones.
+
+        Every line of ``self.lines`` shaped like "Item <sku> $ <unit price>"
+        anchors one item. The previous line is taken as the description and
+        the following lines are read by fixed offsets that depend on the
+        detected layout. Cancelled items, items with an unrecognised status
+        and items without a readable invoiced total are skipped. Exact
+        duplicates are removed from the result.
+        """
+        items = []
+        item_matches = []
+        # Flexible item regex (V5): accepts "Item" or "ltem" (lower-case L in
+        # place of the capital I) and an optional "(X)" suffix after the price.
+        item_pattern = r'^\s*(?:I|l)tem\s+(\d+)\s+\$\s*([\d,]+\.?\d*)\s*(?:\([A-Z]\))?\s*$'
+        for i, line in enumerate(self.lines):
+            match = re.search(item_pattern, line)
+            if match:
+                item_matches.append({
+                    'line_index': i,
+                    'sku': match.group(1),
+                    'unit_price': float(match.group(2).replace(',', ''))
+                })
+        print(f"DEBUG: Encontrados {len(item_matches)} SKUs con regex flexible")
+        for item_data in item_matches:
+            i = item_data['line_index']
+            sku = item_data['sku']
+            unit_price = item_data['unit_price']
+            description = ""
+            quantity = 0.0
+            line_total = 0.0
+            tax_code = ""
+            status = ""
+            is_compact = False # Layout flag; NOTE(review): assigned but only read by the commented-out print below
+            try:
+                # 1. Description: the line right before the item line (i-1).
+                if i == 0: continue
+                description = self.lines[i-1]
+                # Skip anchors whose preceding line is page chrome or a totals line.
+                if 'http' in description or 'Orders & Purchases' in description or 'Invoice Total' in description:
+                    continue
+                # 2. Decide the layout: compact (3 lines follow) or detailed (5 lines follow).
+                # The shipped quantity (i+2) of the detailed layout is used as the
+                # discriminator, because line (i+1) may be either Qty Ordered or Status.
+                if len(self.lines) > i + 2:
+                    qty_shipped_match = re.match(r'^(\d+(?:\.\d+)?)$', self.lines[i+2])
+                else:
+                    qty_shipped_match = None
+                if qty_shipped_match:
+                    # Detailed layout (5 lines follow): QtyO, QtyS, Status, TotalO, TotalS
+                    is_compact = False
+                    quantity = float(qty_shipped_match.group(1))
+                    # 3. Status (i+3)
+                    if len(self.lines) <= i + 3: continue
+                    status = self.lines[i+3]
+                    # Offsets of the totals.
+                    # NOTE(review): invoice_total_index is assigned but never read.
+                    total_index = i + 4
+                    invoice_total_index = i + 5
+                else:
+                    # Compact layout (3 lines follow): Status, TotalO, TotalS
+                    is_compact = True
+                    quantity = 1.0 # Assume 1 when there are no quantity lines
+                    # 3. Status (i+1)
+                    if len(self.lines) <= i + 1: continue
+                    status = self.lines[i+1]
+                    # Offsets of the totals.
+                    total_index = i + 2
+                    invoice_total_index = i + 3
+                # Status handling: cancelled items are compared case-insensitively,
+                # accepted statuses must match exactly.
+                if status.lower() == 'cancelled':
+                    print(f"DEBUG: Item {sku} cancelado")
+                    continue
+                if status not in ['Delivered', 'Shipped', 'Pending']:
+                     print(f"DEBUG: Item {sku} - estado no válido '{status}'")
+                     continue
+                # 4. Tax code and totals, using the offset chosen above.
+                current_index = total_index
+                # A standalone tax-code line such as "(H)" shifts the totals down by one.
+                if len(self.lines) > current_index:
+                    tax_match = re.match(r'^\((H|G|P|Q)\)$', self.lines[current_index])
+                    if tax_match:
+                        tax_code = tax_match.group(1)
+                        current_index += 1 # Skip the tax-code line
+                # The invoiced total is read from the line after the ordered total.
+                final_total_index = current_index + 1
+                if len(self.lines) <= final_total_index: continue
+                total_invoice_match = re.match(r'^\$\s*([\d,]+\.?\d*)$', self.lines[final_total_index])
+                if total_invoice_match:
+                    line_total = float(total_invoice_match.group(1).replace(',', ''))
+                else:
+                    print(f"DEBUG: Item {sku} - no se encontró el total de factura en '{self.lines[final_total_index]}'")
+                    continue
+                # 5. Add the item.
+                if description and status:
+                    items.append(InvoiceItem(
+                        sku=sku, description=description, quantity=quantity, unit_price=unit_price,
+                        amount=line_total, tax_code=tax_code
+                    ))
+                    # print(f"DEBUG: Item {sku} ({'Compacto' if is_compact else 'Detallado'}): {description[:30]}... qty={quantity}")
+            except IndexError:
+                print(f"DEBUG: Item {sku} - Error de índice procesando item")
+                continue
+            except Exception as e:
+                # Best effort: a failure on one item must not abort the others.
+                print(f"DEBUG: Item {sku} - Excepción: {e}")
+                continue
+        # Remove duplicates: items equal in SKU, quantity, amount and description.
+        final_items = []
+        seen_keys = set()
+        for item in items:
+            item_key = (item.sku, item.quantity, item.amount, item.description)
+            if item_key not in seen_keys:
+                final_items.append(item)
+                seen_keys.add(item_key)
+        print(f"DEBUG: Total items finales: {len(final_items)}")
+        return final_items
+class Costco2PatternExtractor(BasePatternExtractor):
+    """Extractor ultra-optimizado para Costco Business Centre"""
+    def extract_invoice(self) -> Invoice:
+        issuer = "Costco Wholesale Business Centre"
+        gst_hst = self.extract_text([
+            r'GST/HST\s*\[([0-9\s]+RT\s+[0-9]+)\]',
+        ])
+        date = self.extract_date([
+            r'Order Date[:\s]*(\d{1,2}/\d{1,2}/\d{4})',
+            r'(\d{1,2}/\d{1,2}/\d{4})',
+        ])
+        transaction_id = self.extract_text([
+            r'Order Number[:\s]*(\d{10})',
+            r'(\d{10})',
+        ]) or ""
+        customer_name = "FAMILIA FINE FOODS"
+        address = self.extract_text([
+            r'(\d+\s+NORTH\s+SERVICE\s+RD)',
+        ]) or "3 NORTH SERVICE RD ST. CATHARINES, ON"
+        membership = self.extract_text([
+            r'Membership Number[:\s]*(\d+)',
+        ])
+        items = self._extract_costco_items_ultra()
+        subtotal = self.extract_amount([
+            r'Subtotal\s*\(\d+\s*Items\)\s*\$\s*([\d,]+\.?\d*)',
+            r'Subtotal[^\$]*\$\s*([\d,]+\.?\d*)',
+        ]) or 0.0
+        hst = self.extract_amount([
+            r'HST\s*\(H\)\s*\$\s*([\d,]+\.?\d*)',
+        ]) or 0.0
+        total = self.extract_amount([
+            r'Invoice Total\s*\$\s*([\d,]+\.?\d*)',
+            r'Order Total\s*\$\s*([\d,]+\.?\d*)',
+        ]) or 0.0
+        return Invoice(
+            vendor="Costco",
+            issuer=issuer,
+            date=date,
+            transaction_id=transaction_id,
+            customer_name=customer_name,
+            issuer_address=address,
+            gst_hst_number=gst_hst,
+            invoice_number=membership or transaction_id,
+            items=items,
+            subtotal=subtotal,
+            hst=hst,
+            total=total,
+            raw_text=self.raw_text,
+            confidence=95.0 if len(items) > 30 else 85.0
+        )
+    def _extract_costco_items_ultra(self) -> List[InvoiceItem]:
+        """Extractor ultra-robusto para items de Costco"""
+        items = []
+        item_matches = []
+        # Encontrar todos los SKUs
+        for i, line in enumerate(self.lines):
+            match = re.search(r'Item\s+(\d+)\s+\$\s*([\d,]+\.?\d*)', line)
+            if match:
+                item_matches.append({
+                    'line_index': i,
+                    'sku': match.group(1),
+                    'unit_price': float(match.group(2).replace(',', ''))
+                })
+        print(f"DEBUG: Encontrados {len(item_matches)} SKUs")
+        # Extraer cada item
+        for item_data in item_matches:
+            i = item_data['line_index']
+            sku = item_data['sku']
+            unit_price = item_data['unit_price']
+            description = ""
+            quantity = 0.0
+            line_total = 0.0
+            tax_code = ""
+            status = ""
+            search_start = max(0, i - 5)
+            search_lines = self.lines[search_start:i]
+            # Buscar patrón completo
+            for prev_line in reversed(search_lines):
+                pattern = r'^(.+?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(Delivered|Cancelled|Shipped)\s*(\(H\))?\s*\$\s*([\d,]+\.?\d*)\s+\$\s*([\d,]+\.?\d*)$'
+                match = re.search(pattern, prev_line)
+                if match:
+                    description = match.group(1).strip()
+                    quantity = float(match.group(3))
+                    status = match.group(4)
+                    tax_code = match.group(5).strip('()') if match.group(5) else ""
+                    line_total = float(match.group(7).replace(',', ''))
+                    break
+            # Buscar descripción si no se encontró
+            if not description:
+                for prev_line in reversed(search_lines):
+                    if re.match(r'^[A-Z][A-Za-z\s,\.%-]+', prev_line) and len(prev_line) > 10:
+                        desc_match = re.match(r'^([A-Za-z\s,\.%-]+?)(?:\s+\d|\s+$|$)', prev_line)
+                        if desc_match:
+                            potential_desc = desc_match.group(1).strip()
+                            if potential_desc and not re.match(r'^(Item|Order|Status|Qty)', potential_desc):
+                                description = potential_desc
+                                break
+            # Buscar cantidades
+            if not quantity:
+                combined = ' '.join(search_lines)
+                qty_pattern = r'(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(Delivered|Cancelled|Shipped)\s*(\(H\))?\s*\$\s*([\d,]+\.?\d*)\s+\$\s*([\d,]+\.?\d*)'
+                qty_match = re.search(qty_pattern, combined)
+                if qty_match:
+                    quantity = float(qty_match.group(2))
+                    status = qty_match.group(3)
+                    tax_code = qty_match.group(4).strip('()') if qty_match.group(4) else ""
+                    line_total = float(qty_match.group(6).replace(',', ''))
+            # Búsqueda simple
+            if description and not quantity:
+                for prev_line in search_lines:
+                    simple = re.search(r'(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+Delivered', prev_line)
+                    if simple:
+                        quantity = float(simple.group(2))
+                        status = "Delivered"
+                        totals = re.findall(r'\$\s*([\d,]+\.?\d*)', prev_line)
+                        if len(totals) >= 2:
+                            line_total = float(totals[-1].replace(',', ''))
+                        break
+            # Agregar item
+            if description and status:
+                if status.lower() == 'cancelled':
+                    print(f"DEBUG: Item {sku} cancelado")
+                    continue
+                if line_total == 0 and quantity > 0:
+                    line_total = quantity * unit_price
+                if quantity == 0 and line_total > 0:
+                    quantity = line_total / unit_price if unit_price > 0 else 1.0
+                items.append(InvoiceItem(
+                    sku=sku,
+                    description=description,
+                    quantity=quantity if quantity > 0 else 1.0,
+                    unit_price=unit_price,
+                    amount=line_total if line_total > 0 else unit_price,
+                    tax_code=tax_code
+                ))
+                print(f"DEBUG: Item {sku}: {description[:30]}... qty={quantity}")
+            else:
+                print(f"DEBUG: Item {sku} - datos incompletos")
+        return items
+# ==== FACTORY ====
+class ExtractorFactory:
+    """Factory para crear extractores"""
+    EXTRACTORS = {
+        "A1 Cash and Carry_Fisico": A1PatternExtractor,
+        "Costco_Formato1": CostcoPatternExtractor,
+        "Costco_Formato2": Costco2PatternExtractor,
+        "Default": DefaultAzureExtractor,
+    }
+    @classmethod
+    def create_extractor(cls, vendor: str, raw_text: str, text_blocks: List[Dict] = None):
+        """Crea el extractor apropiado"""
+        extractor_class = cls.EXTRACTORS.get(vendor)
+        # Obtener configuración OCR para el vendor
+        ocr_config = {}
+        for vendor_enum, config in VENDOR_OCR_CONFIG.items():
+            if vendor_enum.value == vendor:
+                ocr_config = config
+                break
+        if extractor_class:
+            return extractor_class(raw_text, text_blocks, ocr_config)
+        return A1PatternExtractor(raw_text, text_blocks, ocr_config)
+    @classmethod
+    def get_supported_vendors(cls) -> List[str]:
+        """Retorna vendors soportados"""
+        return list(cls.EXTRACTORS.keys())
+# ==== GESTOR DE ESQUEMAS DE PROVEEDORES ====
+class VendorSchemaManager:
+    """Maneja los esquemas de diferentes proveedores."""
+    # Definición de la lista de proveedores disponibles como atributo de clase
+    vendor_list: List[Vendor] = [Vendor.A1, Vendor.COSTCO, Vendor.COSTCO2, Vendor.DEFAULT]
+    def __init__(self):
+        # No necesitamos esquemas JSON ya que usamos extractores de patrones
+        pass
+    def get_ocr_config(self, vendor: Vendor) -> Dict:
+        """Obtiene la configuración OCR para un proveedor específico."""
+        return VENDOR_OCR_CONFIG.get(vendor, {"engine": "easyocr", "mode": "block"})
+    def get_vendor_list(self) -> List[Dict]:
+        """Obtiene la lista de proveedores para el frontend."""
+        return [
+            {"id": v.value, "name": v.value, "description": f"Facturas de {v.value}"}
+            for v in self.vendor_list
+        ]