Spaces:
Running
Running
| # dollar_correction.py | |
| # Proceso independiente para corrección de confusión $ vs 8 | |
| import re | |
| from typing import Dict, List | |
class DollarSignCorrectionProcessor:
    """Standalone OCR post-processing step that repairs "$" misread as "8".

    The processor only looks at the text of each block, so it can be run on
    the output of any provider.

    The heuristics are intentionally lossy: an amount that genuinely starts
    with the digit 8 and carries no currency sign (for example "85.50") is
    indistinguishable from a misread "$5.50" and is rewritten as "$ 5.50".
    """

    # Inclusive value range a number must fall in to be treated as a price.
    _MIN_PRICE = 0.01
    _MAX_PRICE = 10000

    # Guard for a free-standing "8": it is NOT a misread "$" when it sits
    # inside a larger number ("1,850.00", "0.875") or when the amount already
    # carries a currency sign ("$85.50", "$ 85.50").
    _AMOUNT_GUARD = r'(?<!\d[.,])(?<!\$)(?<!\$\s)'

    # Pattern 1: "8" followed by a number, e.g. "8 12.99" or "812.99".
    _EIGHT_BEFORE_NUMBER = re.compile(
        r'\b' + _AMOUNT_GUARD + r'8\s*(\d+\.?\d*)\b'
    )
    # Pattern 2: "8" at the start of a line followed by a money label,
    # e.g. "8 Total". MULTILINE so every line of a block is covered.
    _EIGHT_BEFORE_LABEL = re.compile(
        r'^8\s+(Total|Subtotal|HST|Tax|Amount|Price)',
        re.IGNORECASE | re.MULTILINE,
    )
    # Pattern 3: money label followed by "8" and a number,
    # e.g. "Total 8 123.45".
    _LABEL_BEFORE_EIGHT = re.compile(
        r'(Total|Subtotal|HST|Tax|Amount|Price|Cost)\s+8\s*(\d+\.?\d*)',
        re.IGNORECASE,
    )
    # Pattern 4 (aggressive mode only): any "88" pair.
    _DOUBLE_EIGHT = re.compile(r'88')
    # Pattern 5: whitespace, "8", whitespace, then a two-decimal number.
    _SPACED_EIGHT_BEFORE_DECIMAL = re.compile(r'(?<!\$)\s8\s+(\d+\.\d{2})\b')
    # Pattern 6: "8" glued between a letter and a number, e.g. "Price8123.45".
    _LETTER_EIGHT_NUMBER = re.compile(r'([a-zA-Z])8(\d+\.?\d*)')
    # Pattern 7: "8", whitespace, then a two-decimal number, e.g. "8 1.99".
    _EIGHT_SPACE_DECIMAL = re.compile(
        r'\b' + _AMOUNT_GUARD + r'8\s+(\d+\.\d{2})\b'
    )
    # Pattern 8: line starting with "8" glued to a two-decimal number.
    _LINE_START_EIGHT_DECIMAL = re.compile(r'^8(\d+\.\d{2})\b', re.MULTILINE)

    def __init__(self, config: Dict = None):
        """Store the processor configuration.

        Args:
            config: Optional settings dictionary. Recognised keys:
                - "aggressive" (bool, default False): also rewrite every
                  "88" pair as "$$".
                - "context_aware" (bool, default True): rewrite an "8" at
                  the start of a line when a money label follows it.
                - "min_confidence" (float, default 0.7): stored on the
                  instance; no method of this class reads it.
        """
        self.config = config or {}
        self.aggressive = self.config.get("aggressive", False)
        self.context_aware = self.config.get("context_aware", True)
        self.min_confidence = self.config.get("min_confidence", 0.7)

    def process(self, text_blocks: List[Dict]) -> List[Dict]:
        """Return the blocks with "$"/"8" confusions corrected.

        Input blocks are never mutated. A block whose text changes is
        replaced by a shallow copy carrying the new ``'text'`` plus
        ``'was_corrected'`` (True) and ``'original_text'``. Every other block
        is passed through as the same object, including blocks that have no
        string ``'text'`` entry.

        Args:
            text_blocks: OCR blocks, each a dict with a ``'text'`` string.
                ``None`` is treated as an empty list.

        Returns:
            A new list with one entry per input block, in the same order.
        """
        blocks = text_blocks or []
        corrected_blocks = []
        corrections_made = 0

        for block in blocks:
            original_text = block.get('text') if isinstance(block, dict) else None
            if not isinstance(original_text, str):
                # Nothing to correct; keep the block rather than crash.
                corrected_blocks.append(block)
                continue

            corrected_text = self._correct_text(original_text, block)
            if corrected_text == original_text:
                corrected_blocks.append(block)
                continue

            corrections_made += 1
            print(f"DEBUG: Corrección $ vs 8: '{original_text}' -> '{corrected_text}'")
            corrected_block = block.copy()
            corrected_block['text'] = corrected_text
            corrected_block['was_corrected'] = True
            corrected_block['original_text'] = original_text
            corrected_blocks.append(corrected_block)

        print(f"INFO: Correcciones $ vs 8 aplicadas: {corrections_made} de {len(blocks)} bloques")
        return corrected_blocks

    def _correct_text(self, text: str, block: Dict) -> str:
        """Apply the correction patterns to ``text``, in a fixed order.

        The order matters: each pattern runs on the output of the previous
        one, so later patterns only see the "8"s earlier ones left alone.

        Args:
            text: Text to correct.
            block: The block the text came from. Currently unused; kept so
                callers can keep passing the block metadata.

        Returns:
            The corrected text (equal to ``text`` when nothing matched).
        """
        # Pattern 1: "8 12.99" -> "$ 12.99", "812.99" -> "$ 12.99".
        corrected = self._EIGHT_BEFORE_NUMBER.sub(self._spaced_dollar, text)

        # Pattern 2: "8 Total" -> "$ Total" at the start of any line.
        if self.context_aware:
            corrected = self._EIGHT_BEFORE_LABEL.sub(r'$ \1', corrected)

        # Pattern 3: "Total 8 5" -> "Total $ 5". The label is the evidence,
        # so no price-likeness check is applied here.
        corrected = self._LABEL_BEFORE_EIGHT.sub(r'\1 $ \2', corrected)

        # Pattern 4: "88" -> "$$" (rare, hence opt-in).
        if self.aggressive:
            corrected = self._DOUBLE_EIGHT.sub('$$', corrected)

        # Pattern 5: "Item 8 12345.00" -> "Item $ 12345.00". Catches amounts
        # outside the price range that pattern 1 rejected.
        corrected = self._SPACED_EIGHT_BEFORE_DECIMAL.sub(r' $ \1', corrected)

        # Pattern 6: "Price8123.45" -> "Price$123.45".
        corrected = self._LETTER_EIGHT_NUMBER.sub(self._glued_dollar, corrected)

        # Pattern 7: "8 1.99" -> "$ 1.99".
        corrected = self._EIGHT_SPACE_DECIMAL.sub(r'$ \1', corrected)

        # Pattern 8: "812345.00" at the start of a line -> "$12345.00".
        corrected = self._LINE_START_EIGHT_DECIMAL.sub(r'$\1', corrected)

        return corrected

    def _spaced_dollar(self, match: 're.Match') -> str:
        """Rewrite a pattern-1 match as "$ <number>" if the number looks like a price."""
        number = match.group(1)
        if self._is_likely_price(number):
            return f"$ {number}"
        return match.group(0)

    def _glued_dollar(self, match: 're.Match') -> str:
        """Rewrite a pattern-6 match as "<letter>$<number>" if the number looks like a price."""
        letter, number = match.group(1), match.group(2)
        if self._is_likely_price(number):
            return f"{letter}${number}"
        return match.group(0)

    def _is_likely_price(self, number_str: str) -> bool:
        """Tell whether a numeric string plausibly represents a price.

        Args:
            number_str: The number as text, e.g. "12.99".

        Returns:
            True when the value lies between 0.01 and 10000 (inclusive) and
            either contains a decimal point or is at least 10. False
            otherwise, including when the string is not a number.
        """
        try:
            value = float(number_str)
        except (TypeError, ValueError):
            return False

        # Written as a positive range test so that NaN is rejected too.
        if not self._MIN_PRICE <= value <= self._MAX_PRICE:
            return False

        _, dot, decimals = number_str.partition('.')
        # Exactly two decimals is the strongest price signal.
        if dot and len(decimals) == 2:
            return True
        # A bare single-digit integer is more likely a quantity than a price.
        if value < 10 and not dot:
            return False
        return True