Spaces:

sgonzalezu
/

ocr_service

Running

File size: 13,186 Bytes

0a6b0fb

# azure_ocr_processor.py
# Procesador OCR usando Azure Document Intelligence

import json
from io import BytesIO
from typing import Dict, List, Optional, Tuple

import numpy as np

# Optional-dependency guard: the Azure SDK import is wrapped so that this module
# can still be imported when the package is not installed. AZURE_AVAILABLE
# records the outcome; AzureOCRProcessor.__init__ checks it and refuses to
# construct an instance when the SDK is missing.
try:
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.documentintelligence import DocumentIntelligenceClient
    AZURE_AVAILABLE = True
except ImportError:
    AZURE_AVAILABLE = False
    # Warn once at import time; the hard failure is deferred to construction.
    print("ADVERTENCIA: azure-ai-documentintelligence no está disponible.")


class AzureOCRProcessor:
    """OCR processor backed by Azure Document Intelligence.

    The processor sends an image to an Azure Document Intelligence model
    (``prebuilt-invoice`` unless configured otherwise) and converts the
    structured invoice result into a single human-readable text block, so it
    can be consumed through the same ``text_blocks`` interface as the other
    OCR engines.
    """

    # Endpoint used when neither the argument nor the environment provides one.
    DEFAULT_ENDPOINT = "https://invoicerecog.cognitiveservices.azure.com/"
    # Size budget (MiB) applied to the image before upload.
    # NOTE(review): 4.5 is the value the original code used; confirm it against
    # the request-size limit of the Azure pricing tier actually deployed.
    MAX_PAYLOAD_MB = 4.5
    # Longest side (pixels) allowed for images whose raw size exceeds the budget.
    MAX_DIMENSION = 2000
    # JPEG qualities tried, in order, when the PNG payload exceeds the budget.
    JPEG_QUALITIES = (85, 75, 65, 55)

    def __init__(self, endpoint: Optional[str] = None, key: Optional[str] = None):
        """Create the Azure client.

        Credential priority is: explicit argument, then environment variable.
        Only the endpoint has a built-in default; the API key is a secret and
        must never be embedded in source code, so it has to be supplied via
        ``key`` or ``AZURE_DOCUMENT_INTELLIGENCE_KEY``.

        Args:
            endpoint: Service endpoint URL. Falls back to
                ``AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT`` and then to
                ``DEFAULT_ENDPOINT``.
            key: API key. Falls back to ``AZURE_DOCUMENT_INTELLIGENCE_KEY``.

        Raises:
            RuntimeError: If the Azure SDK could not be imported.
            ValueError: If the endpoint or the key cannot be resolved.
        """
        if not AZURE_AVAILABLE:
            raise RuntimeError("Azure Document Intelligence no está disponible")

        import os

        self.endpoint = endpoint or os.environ.get(
            "AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT",
            self.DEFAULT_ENDPOINT,
        )
        # No hard-coded fallback here: a key committed to the repository is a
        # leaked credential and also made the validation below unreachable.
        self.key = key or os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_KEY")

        if not self.endpoint or not self.key:
            raise ValueError(
                "Se requieren credenciales de Azure. "
                "Define las variables de entorno AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT "
                "y AZURE_DOCUMENT_INTELLIGENCE_KEY, o pásalas como parámetros."
            )

        print("INFO: Inicializando Azure Document Intelligence")
        print(f"INFO: Endpoint: {self.endpoint}")

        self.client = DocumentIntelligenceClient(
            endpoint=self.endpoint,
            credential=AzureKeyCredential(self.key)
        )

    def process(self, image: np.ndarray, ocr_config: Dict) -> List[Dict]:
        """Analyze *image* with Azure and return it as a one-element block list.

        Args:
            image: Image as a NumPy array in a layout ``cv2.imencode`` accepts.
            ocr_config: Engine options; only ``"model"`` is read (default
                ``"prebuilt-invoice"``). ``None`` is treated as empty.

        Returns:
            A list with a single dict shaped like the other OCR engines'
            text blocks. The geometry is zeroed and ``confidence`` is a fixed
            placeholder (95.0), not a value taken from the Azure response.
            ``is_azure_structured`` marks the text as pre-formatted.

        Raises:
            RuntimeError: If the image cannot be encoded.
            Exception: Any error raised by the Azure call or while formatting
                its result is logged and re-raised unchanged.
        """
        model = (ocr_config or {}).get("model", "prebuilt-invoice")

        print(f"INFO: Procesando con Azure Document Intelligence, modelo: {model}")

        # Downscale oversized images, then pick an encoding that fits the budget.
        image_to_process = self._compress_image_for_azure(image)
        image_bytes, content_type = self._encode_image_for_upload(image_to_process)

        print(f"INFO: Imagen codificada: {len(image_bytes)} bytes")

        try:
            print("INFO: Enviando imagen a Azure Document Intelligence...")

            poller = self.client.begin_analyze_document(
                model,
                body=BytesIO(image_bytes),
                content_type=content_type
            )

            print("INFO: Esperando respuesta de Azure...")
            result = poller.result()

            document_count = len(result.documents) if result.documents else 0
            print(f"INFO: Análisis completado. Documentos encontrados: {document_count}")

            formatted_text = self._format_azure_result_as_text(result)

        except Exception as e:
            # Boundary handler: log with traceback for diagnosis, then let the
            # caller decide how to react.
            print(f"ERROR en Azure Document Intelligence: {e}")
            import traceback
            traceback.print_exc()
            raise

        return [{
            'text': formatted_text,
            'x': 0,
            'y': 0,
            'width': 0,
            'height': 0,
            'confidence': 95.0,
            'engine': 'azure',
            'is_azure_structured': True
        }]

    def _compress_image_for_azure(self, image: np.ndarray) -> np.ndarray:
        """Downscale *image* when its raw size exceeds the upload budget.

        Images whose in-memory size is within ``MAX_PAYLOAD_MB`` are returned
        untouched (same object). Larger images are resized, keeping the aspect
        ratio, so that the longest side is at most ``MAX_DIMENSION`` pixels.

        Lossy compression is intentionally NOT done here: a JPEG round trip
        that is decoded back to an array and later re-encoded as PNG adds
        artifacts without shrinking the upload. The encoded payload size is
        enforced in ``_encode_image_for_upload`` instead.

        Args:
            image: Source image array.

        Returns:
            The original array, or a resized copy.
        """
        height, width = image.shape[:2]
        original_size_mb = image.nbytes / (1024 * 1024)
        print(f"INFO: Compresión Azure - Imagen original: {width}x{height}, {original_size_mb:.2f}MB")

        if original_size_mb <= self.MAX_PAYLOAD_MB:
            print("INFO: Compresión Azure - Imagen ya está dentro del límite, no se requiere compresión")
            return image

        print("INFO: Compresión Azure - Aplicando compresión...")

        max_dimension = self.MAX_DIMENSION
        if width <= max_dimension and height <= max_dimension:
            # Nothing to resize; the encoder step decides between PNG and JPEG.
            return image

        # cv2 is imported only on the path that needs it.
        import cv2

        if width > height:
            new_width = max_dimension
            # max(1, ...) guards against a zero dimension on extreme aspect ratios.
            new_height = max(1, int((max_dimension / width) * height))
        else:
            new_height = max_dimension
            new_width = max(1, int((max_dimension / height) * width))

        print(f"INFO: Compresión Azure - Redimensionando a {new_width}x{new_height}")
        resized = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
        resized_size_mb = resized.nbytes / (1024 * 1024)
        print(f"INFO: Compresión Azure - Después de redimensionar: {resized_size_mb:.2f}MB")
        return resized

    def _encode_image_for_upload(self, image: np.ndarray) -> Tuple[bytes, str]:
        """Encode *image* for upload, preferring lossless PNG.

        PNG is used whenever the encoded payload fits in ``MAX_PAYLOAD_MB``.
        Otherwise JPEG is tried at each quality in ``JPEG_QUALITIES`` and the
        first result that fits is uploaded as JPEG. If nothing fits, the PNG
        payload is returned as a best effort.

        Args:
            image: Image array to encode.

        Returns:
            ``(payload_bytes, content_type)`` where ``content_type`` is
            ``"image/png"`` or ``"image/jpeg"``.

        Raises:
            RuntimeError: If the image cannot be PNG-encoded.
        """
        import cv2

        limit_bytes = self.MAX_PAYLOAD_MB * 1024 * 1024

        success, png_encoded = cv2.imencode('.png', image)
        if not success:
            raise RuntimeError("No se pudo codificar la imagen")

        png_bytes = png_encoded.tobytes()
        if len(png_bytes) <= limit_bytes:
            return png_bytes, "image/png"

        png_size_mb = len(png_bytes) / (1024 * 1024)
        print(f"INFO: Compresión Azure - PNG de {png_size_mb:.2f}MB excede el límite, probando JPEG")

        for quality in self.JPEG_QUALITIES:
            success, jpeg_encoded = cv2.imencode(
                '.jpg', image, [cv2.IMWRITE_JPEG_QUALITY, quality]
            )
            if not success:
                continue

            jpeg_bytes = jpeg_encoded.tobytes()
            jpeg_size_mb = len(jpeg_bytes) / (1024 * 1024)
            print(f"INFO: Compresión Azure - Calidad {quality}: {jpeg_size_mb:.2f}MB")

            if len(jpeg_bytes) <= limit_bytes:
                print(f"INFO: Compresión Azure - Calidad {quality} aceptada")
                return jpeg_bytes, "image/jpeg"

        print("INFO: Compresión Azure - Usando imagen redimensionada sin compresión JPEG adicional")
        return png_bytes, "image/png"

    @staticmethod
    def _field_content(container, name):
        """Return the non-empty ``content`` of field *name*, or ``None``.

        *container* is a mapping of field name to field object (or ``None``).
        Missing fields and empty contents both yield ``None``.
        """
        field = container.get(name) if container else None
        return getattr(field, "content", None) or None

    @staticmethod
    def _is_positive_amount(text) -> bool:
        """Return True when *text* parses as an amount greater than zero.

        ``$`` signs and thousands separators (``,``) are stripped before
        parsing. Text that cannot be parsed is treated as not positive.
        """
        try:
            return float(str(text).replace('$', '').replace(',', '').strip()) > 0
        except ValueError:
            return False

    def _format_azure_result_as_text(self, result) -> str:
        """Render the first analyzed document as clean, line-oriented text.

        Only fields with non-empty content are printed. The per-item tax line
        is included only when its amount parses to a value greater than zero.
        Missing ``fields``, a missing item array or an item without a value
        object are tolerated and simply produce fewer lines.

        Args:
            result: Azure analyze result exposing ``documents``; each document
                exposes ``fields`` (mapping of name to field object).

        Returns:
            The formatted text, or an error message string when the result
            contains no documents.
        """
        if not result.documents:
            return "ERROR: No se encontraron documentos en la factura"

        # Only the first document is rendered.
        fields = result.documents[0].fields or {}
        separator = "=" * 60
        output_lines = ["-------- Análisis de Azure Document Intelligence --------", ""]

        def add_fields(container, labelled_names):
            # Append "Label: content" for every field that has content.
            for name, label in labelled_names:
                content = self._field_content(container, name)
                if content:
                    output_lines.append(f"{label}: {content}")

        # Vendor information
        add_fields(fields, (
            ("VendorName", "Proveedor"),
            ("VendorAddress", "Dirección"),
            ("VendorTaxId", "GST/HST"),
        ))
        output_lines.append("")

        # Invoice information
        add_fields(fields, (
            ("InvoiceId", "Invoice ID"),
            ("InvoiceDate", "Fecha"),
            ("CustomerName", "Cliente"),
        ))

        output_lines += ["", separator, "ÍTEMS DE LA FACTURA", separator, ""]

        # value_array may exist but hold None, so hasattr() alone is not enough.
        items = getattr(fields.get("Items"), "value_array", None) or []
        total_items = len(items)

        if items:
            print(f"INFO: Procesando {total_items} items...")

            for item_number, item in enumerate(items, start=1):
                item_obj = getattr(item, "value_object", None) or {}

                output_lines.append(f"--- Ítem #{item_number} ---")

                add_fields(item_obj, (
                    ("ProductCode", "Código"),
                    ("Description", "Descripción"),
                    ("Quantity", "Cantidad"),
                    ("UnitPrice", "Precio unitario"),
                ))

                # Per-item tax is shown only when it is greater than zero.
                tax_content = self._field_content(item_obj, "Tax")
                if tax_content and self._is_positive_amount(tax_content):
                    output_lines.append(f"Impuesto (H): {tax_content}")

                add_fields(item_obj, (("Amount", "Total por ítem"),))

                output_lines.append("")
        else:
            output_lines.append("No se encontraron items en la factura")

        # Totals
        output_lines += [separator, "TOTALES", separator, ""]
        add_fields(fields, (
            ("SubTotal", "Subtotal"),
            ("TotalTax", "Total impuestos"),
            ("InvoiceTotal", "Total de la factura"),
        ))

        output_lines += [
            "",
            separator,
            f"Total de items extraídos: {total_items}",
            separator,
        ]

        formatted_text = "\n".join(output_lines)

        # Debug preview, truncated so long invoices do not flood the log.
        preview = formatted_text[:800] + "..." if len(formatted_text) > 800 else formatted_text
        print(f"\n{separator}")
        print("TEXTO FORMATEADO GENERADO:")
        print(separator)
        print(preview)
        print(f"{separator}\n")

        return formatted_text