Spaces:

sgonzalezu
/

ocr_service

Running

ocr_service / azure_ocr_processor.py

Sebastian Gonzalez

Deploy OCR Service via Script

0a6b0fb 24 days ago

13.2 kB

	# azure_ocr_processor.py
	# Procesador OCR usando Azure Document Intelligence

	import json
	import numpy as np
	from io import BytesIO
	from typing import Dict, List

	# Optional-dependency guard: the Azure SDK is imported lazily so that this
	# module can still be imported when the package is not installed.
	# AZURE_AVAILABLE records whether both imports succeeded; a warning is
	# printed when they did not.
	try:
	    from azure.core.credentials import AzureKeyCredential
	    from azure.ai.documentintelligence import DocumentIntelligenceClient
	    AZURE_AVAILABLE = True
	except ImportError:
	    AZURE_AVAILABLE = False
	    print("ADVERTENCIA: azure-ai-documentintelligence no está disponible.")


	class AzureOCRProcessor:
	    """OCR processor backed by Azure Document Intelligence.

	    The processor sends an image to an Azure Document Intelligence model
	    (``prebuilt-invoice`` unless configured otherwise) and flattens the first
	    analysed document into a single human-readable text block.

	    Credentials are taken from the constructor arguments or from the
	    ``AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT`` / ``AZURE_DOCUMENT_INTELLIGENCE_KEY``
	    environment variables. The API key has no built-in fallback: secrets must
	    never be committed to source control.
	    """

	    # Environment variables consulted when the constructor arguments are empty.
	    ENDPOINT_ENV_VAR = "AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"
	    KEY_ENV_VAR = "AZURE_DOCUMENT_INTELLIGENCE_KEY"
	    # Endpoint used when neither the argument nor the environment variable is set.
	    DEFAULT_ENDPOINT = "https://invoicerecog.cognitiveservices.azure.com/"

	    # In-memory array size (MiB) above which an image is shrunk before upload.
	    # NOTE(review): presumably chosen to stay under a service request-size
	    # limit -- confirm. The comparison uses the raw ndarray size, not the size
	    # of the PNG that is eventually uploaded.
	    MAX_IMAGE_MB = 4.5
	    # Longest side, in pixels, allowed after resizing.
	    MAX_DIMENSION = 2000
	    # JPEG qualities tried, in order, during the lossy round-trip.
	    JPEG_QUALITIES = (85, 75, 65, 55)

	    def __init__(self, endpoint: str = None, key: str = None):
	        """Create the Azure client.

	        Args:
	            endpoint: Service endpoint URL. Falls back to the environment
	                variable, then to ``DEFAULT_ENDPOINT``.
	            key: API key. Falls back to the environment variable only.

	        Raises:
	            RuntimeError: If the Azure SDK could not be imported.
	            ValueError: If no endpoint or no key could be resolved.
	        """
	        if not AZURE_AVAILABLE:
	            raise RuntimeError("Azure Document Intelligence no está disponible")

	        import os

	        # Priority: explicit arguments > environment variables > defaults.
	        self.endpoint, self.key = self._resolve_credentials(endpoint, key, os.environ)

	        print("INFO: Inicializando Azure Document Intelligence")
	        print(f"INFO: Endpoint: {self.endpoint}")

	        self.client = DocumentIntelligenceClient(
	            endpoint=self.endpoint,
	            credential=AzureKeyCredential(self.key),
	        )

	    @classmethod
	    def _resolve_credentials(cls, endpoint, key, environ):
	        """Return ``(endpoint, key)`` resolved from arguments and ``environ``.

	        ``environ`` is any mapping with a ``get`` method (normally
	        ``os.environ``). Raises ``ValueError`` when either value is missing.
	        """
	        resolved_endpoint = endpoint or environ.get(cls.ENDPOINT_ENV_VAR, cls.DEFAULT_ENDPOINT)
	        # Deliberately no hard-coded fallback for the key.
	        resolved_key = key or environ.get(cls.KEY_ENV_VAR)

	        if not resolved_endpoint or not resolved_key:
	            raise ValueError(
	                "Se requieren credenciales de Azure. "
	                "Define las variables de entorno AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT "
	                "y AZURE_DOCUMENT_INTELLIGENCE_KEY, o pásalas como parámetros."
	            )
	        return resolved_endpoint, resolved_key

	    def process(self, image: np.ndarray, ocr_config: Dict) -> List[Dict]:
	        """Analyse ``image`` with Azure and return it as a single text block.

	        Args:
	            image: Image as a numpy array accepted by ``cv2.imencode``.
	            ocr_config: Options mapping; only ``"model"`` is read (default
	                ``"prebuilt-invoice"``). ``None`` is treated as empty.

	        Returns:
	            A one-element list whose dict mimics the text-block format of the
	            other OCR engines (``x``/``y``/``width``/``height`` are zero) and is
	            flagged with ``is_azure_structured``.

	        Raises:
	            RuntimeError: If the image cannot be PNG-encoded.
	            Exception: Any error raised by the Azure call is logged and
	                re-raised unchanged.
	        """
	        model = (ocr_config or {}).get("model", "prebuilt-invoice")

	        print(f"INFO: Procesando con Azure Document Intelligence, modelo: {model}")

	        # Shrink oversized images first; small images pass through untouched.
	        image_to_process = self._compress_image_for_azure(image)

	        # Serialise the numpy array as PNG bytes for upload.
	        import cv2
	        success, encoded_image = cv2.imencode('.png', image_to_process)
	        if not success:
	            raise RuntimeError("No se pudo codificar la imagen")

	        image_bytes = encoded_image.tobytes()

	        print(f"INFO: Imagen codificada: {len(image_bytes)} bytes")

	        try:
	            print("INFO: Enviando imagen a Azure Document Intelligence...")

	            poller = self.client.begin_analyze_document(
	                model,
	                body=BytesIO(image_bytes),
	                content_type="image/png"
	            )

	            print("INFO: Esperando respuesta de Azure...")
	            result = poller.result()

	            print(f"INFO: Análisis completado. Documentos encontrados: {len(result.documents) if result.documents else 0}")

	            formatted_text = self._format_azure_result_as_text(result)

	            # 'confidence' is a fixed placeholder, not a value read from the
	            # Azure response.
	            return [{
	                'text': formatted_text,
	                'x': 0,
	                'y': 0,
	                'width': 0,
	                'height': 0,
	                'confidence': 95.0,
	                'engine': 'azure',
	                'is_azure_structured': True
	            }]

	        except Exception as e:
	            # Log for diagnostics, then let the caller decide how to recover.
	            print(f"ERROR en Azure Document Intelligence: {e}")
	            import traceback
	            traceback.print_exc()
	            raise

	    def _compress_image_for_azure(self, image: np.ndarray) -> np.ndarray:
	        """Return ``image`` or a reduced version of it for upload.

	        Images whose raw array size is at most ``MAX_IMAGE_MB`` are returned
	        unchanged (same object). Larger images are first resized so the longest
	        side is ``MAX_DIMENSION``; if that is still too large (or no resize was
	        needed) the image is round-tripped through JPEG at decreasing quality
	        and the decoded array is returned. If no quality qualifies, the resized
	        (or original) image is returned.
	        """
	        limit_mb = self.MAX_IMAGE_MB
	        bytes_per_mb = 1024 * 1024

	        height, width = image.shape[:2]
	        original_size_mb = image.nbytes / bytes_per_mb
	        print(f"INFO: Compresión Azure - Imagen original: {width}x{height}, {original_size_mb:.2f}MB")

	        if original_size_mb <= limit_mb:
	            print("INFO: Compresión Azure - Imagen ya está dentro del límite, no se requiere compresión")
	            return image

	        # Imported only when compression is actually needed.
	        import cv2

	        print("INFO: Compresión Azure - Aplicando compresión...")

	        max_dimension = self.MAX_DIMENSION
	        compressed_image = image
	        if width > max_dimension or height > max_dimension:
	            # Scale the longest side to max_dimension, keeping the aspect ratio.
	            # max(1, ...) keeps extreme aspect ratios from producing a 0-pixel
	            # side, which cv2.resize rejects.
	            if width > height:
	                new_width = max_dimension
	                new_height = max(1, int((max_dimension / width) * height))
	            else:
	                new_height = max_dimension
	                new_width = max(1, int((max_dimension / height) * width))

	            print(f"INFO: Compresión Azure - Redimensionando a {new_width}x{new_height}")
	            compressed_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
	            compressed_size_mb = compressed_image.nbytes / bytes_per_mb
	            print(f"INFO: Compresión Azure - Después de redimensionar: {compressed_size_mb:.2f}MB")

	            if compressed_size_mb <= limit_mb:
	                return compressed_image

	        # Still large: try a lossy JPEG round-trip at decreasing quality.
	        for quality in self.JPEG_QUALITIES:
	            success, jpeg_encoded = cv2.imencode(
	                '.jpg', compressed_image, [cv2.IMWRITE_JPEG_QUALITY, quality]
	            )
	            if not success:
	                continue

	            jpeg_size_mb = len(jpeg_encoded.tobytes()) / bytes_per_mb
	            print(f"INFO: Compresión Azure - Calidad {quality}: {jpeg_size_mb:.2f}MB")

	            if jpeg_size_mb <= limit_mb:
	                print(f"INFO: Compresión Azure - Calidad {quality} aceptada")
	                # Decode back to a numpy array so the caller's PNG encoding
	                # step keeps working on the same data type.
	                decoded_image = cv2.imdecode(jpeg_encoded, cv2.IMREAD_COLOR)
	                if decoded_image is not None:
	                    final_size_mb = decoded_image.nbytes / bytes_per_mb
	                    print(f"INFO: Compresión Azure - Imagen final: {final_size_mb:.2f}MB")
	                    return decoded_image

	        print("INFO: Compresión Azure - Usando imagen redimensionada sin compresión JPEG adicional")
	        return compressed_image

	    @staticmethod
	    def _field_content(field):
	        """Return ``field.content`` or ``None`` when the field or content is absent."""
	        if not field:
	            return None
	        return getattr(field, "content", None)

	    @classmethod
	    def _append_fields(cls, lines, fields, labelled_keys):
	        """Append ``"<label>: <content>"`` for each key in ``fields`` that has content."""
	        for label, key in labelled_keys:
	            content = cls._field_content(fields.get(key))
	            if content:
	                lines.append(f"{label}: {content}")

	    @staticmethod
	    def _is_positive_amount(text) -> bool:
	        """Return True when ``text`` parses (after dropping ``$`` and ``,``) to a number > 0.

	        Unparseable or non-string input yields False, so such tax values are
	        omitted from the output rather than raising.
	        """
	        try:
	            return float(text.replace('$', '').replace(',', '').strip()) > 0
	        except (ValueError, AttributeError):
	            return False

	    def _format_item(self, number, item):
	        """Return the output lines for one invoice item (ends with a blank line)."""
	        # value_object may be missing or None on a malformed item.
	        item_obj = getattr(item, "value_object", None) or {}

	        lines = [f"--- Ítem #{number} ---"]
	        self._append_fields(lines, item_obj, (
	            ("Código", "ProductCode"),
	            ("Descripción", "Description"),
	            ("Cantidad", "Quantity"),
	            ("Precio unitario", "UnitPrice"),
	        ))

	        # Per-item tax is listed only when it is greater than zero.
	        tax_content = self._field_content(item_obj.get("Tax"))
	        if tax_content and self._is_positive_amount(tax_content):
	            lines.append(f"Impuesto (H): {tax_content}")

	        self._append_fields(lines, item_obj, (("Total por ítem", "Amount"),))
	        lines.append("")
	        return lines

	    def _format_azure_result_as_text(self, result) -> str:
	        """Render the first analysed document as clean, sectioned text.

	        Only ``result.documents[0]`` is used. Fields without content are
	        skipped. A preview of the text (first 800 characters) is printed.

	        Returns:
	            The formatted text, or an ``"ERROR: ..."`` string when the result
	            contains no documents.
	        """
	        if not result.documents:
	            return "ERROR: No se encontraron documentos en la factura"

	        # fields may be None when nothing was extracted.
	        fields = result.documents[0].fields or {}
	        rule = "=" * 60

	        output_lines = ["-------- Análisis de Azure Document Intelligence --------", ""]

	        # Vendor information
	        self._append_fields(output_lines, fields, (
	            ("Proveedor", "VendorName"),
	            ("Dirección", "VendorAddress"),
	            ("GST/HST", "VendorTaxId"),
	        ))
	        output_lines.append("")

	        # Invoice information
	        self._append_fields(output_lines, fields, (
	            ("Invoice ID", "InvoiceId"),
	            ("Fecha", "InvoiceDate"),
	            ("Cliente", "CustomerName"),
	        ))
	        output_lines.append("")
	        output_lines.extend([rule, "ÍTEMS DE LA FACTURA", rule, ""])

	        # value_array may be absent or None; treat both as "no items".
	        items = getattr(fields.get("Items"), "value_array", None) or []
	        total_items = len(items)

	        if items:
	            print(f"INFO: Procesando {total_items} items...")
	            for number, item in enumerate(items, start=1):
	                output_lines.extend(self._format_item(number, item))
	        else:
	            output_lines.append("No se encontraron items en la factura")

	        # Totals
	        output_lines.extend([rule, "TOTALES", rule, ""])
	        self._append_fields(output_lines, fields, (
	            ("Subtotal", "SubTotal"),
	            ("Total impuestos", "TotalTax"),
	            ("Total de la factura", "InvoiceTotal"),
	        ))
	        output_lines.extend(["", rule, f"Total de items extraídos: {total_items}", rule])

	        formatted_text = "\n".join(output_lines)

	        # Console preview, truncated to keep logs readable.
	        preview = formatted_text[:800] + "..." if len(formatted_text) > 800 else formatted_text
	        print(f"\n{rule}")
	        print("TEXTO FORMATEADO GENERADO:")
	        print(rule)
	        print(preview)
	        print(f"{rule}\n")

	        return formatted_text