File size: 5,944 Bytes

dcc24f8

"""
FinEE Normalizer - Data normalization utilities.

Handles normalization of:
- Amounts (₹2,500.00 → 2500.0)
- Dates (various formats → DD-MM-YYYY)
- Account numbers (masking, formatting)
- Reference numbers (non-alphanumeric characters stripped)
- UPI VPAs, merchant names and transaction types (debit/credit)
"""

import math
import re
from datetime import datetime, date
from typing import Optional, Union

from dateutil import parser as date_parser


def normalize_amount(amount_str: Union[str, float, int, None]) -> Optional[float]:
    """
    Normalize an amount to a finite float.

    Handles:
    - Currency symbols (Rs., ₹, INR) as a leading prefix
    - Commas (2,500.00)
    - Spaces (Rs. 2 500)

    Numeric inputs (int/float) are converted directly. Values that are not
    real amounts -- NaN, +/-infinity, or integers too large for a float --
    yield None instead of leaking into downstream arithmetic.

    Args:
        amount_str: Amount in various formats

    Returns:
        Finite float amount, or None if the input is None, of an
        unsupported type, unparseable, or non-finite
    """
    if isinstance(amount_str, (int, float)):
        candidate = amount_str
    elif isinstance(amount_str, str):
        # Drop one leading currency marker (case-insensitive) and the
        # whitespace that follows it
        candidate = re.sub(
            r'^(?:Rs\.?|INR|₹)\s*', '', amount_str.strip(), flags=re.IGNORECASE
        )
        # Remove thousands separators and spaces used for digit grouping
        candidate = candidate.replace(',', '').replace(' ', '')
    else:
        # Covers None and any unsupported type
        return None

    try:
        value = float(candidate)
    except (ValueError, OverflowError):
        # ValueError: not a number; OverflowError: int too large for a float
        return None

    # float() happily parses "nan", "inf" and "infinity"; none is a valid amount
    return value if math.isfinite(value) else None


def normalize_date(date_str: Optional[str], output_format: str = '%d-%m-%Y') -> Optional[str]:
    """
    Normalize date string to standard format.

    Handles:
    - DD-MM-YY, DD-MM-YYYY
    - DD/MM/YY, DD/MM/YYYY
    - DD.MM.YY, DD.MM.YYYY
    - DD Mon YYYY (28 Dec 2025), DD Month YYYY
    - YYYY-MM-DD (ISO format)

    Two-digit years are expanded with a pivot of 50: 00-49 become
    2000-2049 and 50-99 become 1950-1999. Inputs matching none of the
    explicit formats are handed to dateutil with day-first interpretation.

    Args:
        date_str: Date in various formats
        output_format: strftime format of the result (default: DD-MM-YYYY)

    Returns:
        Normalized date string or None if parsing fails
    """
    if not date_str:
        return None

    # Clean input; whitespace-only input carries no date
    date_str = date_str.strip()
    if not date_str:
        return None

    # Common Indian date formats to try
    formats = [
        '%d-%m-%Y',      # 28-12-2025
        '%d-%m-%y',      # 28-12-25
        '%d/%m/%Y',      # 28/12/2025
        '%d/%m/%y',      # 28/12/25
        '%d %b %Y',      # 28 Dec 2025
        '%d %b %y',      # 28 Dec 25
        '%d %B %Y',      # 28 December 2025
        '%d %B %y',      # 28 December 25
        '%Y-%m-%d',      # 2025-12-28 (ISO)
        '%d.%m.%Y',      # 28.12.2025
        '%d.%m.%y',      # 28.12.25
    ]

    # Try each format
    for fmt in formats:
        try:
            parsed = datetime.strptime(date_str, fmt)

            # strptime has already expanded %y with its own pivot
            # (00-68 -> 20xx, 69-99 -> 19xx), so parsed.year is never < 100
            # for those formats. Re-derive the century from the last two
            # digits to apply the intended pivot of 50. A literal year
            # below 100 (e.g. "0025") is treated the same way.
            if '%y' in fmt or parsed.year < 100:
                two_digit = parsed.year % 100
                century = 2000 if two_digit < 50 else 1900
                parsed = parsed.replace(year=century + two_digit)

            return parsed.strftime(output_format)
        except ValueError:
            continue

    # Fallback to dateutil parser
    try:
        parsed = date_parser.parse(date_str, dayfirst=True)
        return parsed.strftime(output_format)
    except (ValueError, TypeError, OverflowError):
        # dateutil raises OverflowError for out-of-range numeric components
        return None


def normalize_account(account_str: Optional[str], mask: bool = False) -> Optional[str]:
    """
    Reduce an account number to its digits, optionally masked.

    Args:
        account_str: Account number string
        mask: If True, replace all but the last 4 digits with '*'

    Returns:
        Digit-only account number (masked when requested and longer than
        4 digits), or None if the input is empty or contains no digits
    """
    # Falsy input is treated as "no digits" so both cases share one exit
    only_digits = re.sub(r'\D', '', str(account_str)) if account_str else ''

    if not only_digits:
        return None

    if not mask or len(only_digits) <= 4:
        return only_digits

    # Keep the last four digits visible and left-pad with asterisks
    return only_digits[-4:].rjust(len(only_digits), '*')


def normalize_reference(ref_str: Optional[str]) -> Optional[str]:
    """
    Strip a transaction reference down to its letters and digits.

    Args:
        ref_str: Reference number string

    Returns:
        Reference with every non-alphanumeric (ASCII) character removed,
        or None if the input is empty or nothing remains
    """
    if ref_str:
        # Keep only ASCII letters and digits
        alnum_only = re.sub(r'[^A-Za-z0-9]', '', str(ref_str))
        if alnum_only:
            return alnum_only

    return None


def normalize_vpa(vpa_str: Optional[str]) -> Optional[str]:
    """
    Normalize a UPI VPA (virtual payment address).

    Args:
        vpa_str: VPA string

    Returns:
        Trimmed, lowercased VPA, or None if the input is empty or does
        not contain an '@'
    """
    if vpa_str:
        candidate = vpa_str.strip().lower()
        # The only format check: a VPA must contain an '@'
        if '@' in candidate:
            return candidate

    return None


def normalize_merchant(merchant_str: Optional[str]) -> Optional[str]:
    """
    Normalize merchant name.

    Trims surrounding whitespace and removes leading transaction phrases
    ('payment to', 'paid to', 'transfer to', 'upi-'), matched
    case-insensitively. Stacked prefixes are removed regardless of order
    ("UPI-Paid to X" -> "X"). Word prefixes are removed only when they end
    at a word boundary, so a name such as "Payment Tower" is left intact.
    The casing of the remaining name is preserved.

    Args:
        merchant_str: Merchant name string

    Returns:
        Cleaned merchant name, or None if the input is empty or nothing
        remains after cleaning
    """
    if not merchant_str:
        return None

    cleaned = merchant_str.strip()

    # Remove common prefixes
    prefixes = ['payment to', 'paid to', 'transfer to', 'upi-']

    # Repeat until no prefix matches; each removal shortens the string,
    # so the loop always terminates
    removed = True
    while removed:
        removed = False
        lowered = cleaned.lower()
        for prefix in prefixes:
            if not lowered.startswith(prefix):
                continue
            remainder = cleaned[len(prefix):]
            # A prefix ending in a letter must not be cut out of a longer
            # word ("payment to" + "wer"); 'upi-' carries its own delimiter
            if prefix[-1].isalnum() and remainder[:1].isalnum():
                continue
            cleaned = remainder.strip()
            removed = True
            break

    return cleaned if cleaned else None


def normalize_type(type_str: Optional[str]) -> Optional[str]:
    """
    Map a free-form transaction type onto 'debit' or 'credit'.

    The input is lowercased and searched for keywords by substring.
    Debit keywords are checked before credit keywords, so text containing
    both kinds resolves to 'debit'.

    Args:
        type_str: Type string (debit/credit variants)

    Returns:
        'debit' or 'credit', or None if the input is empty or no keyword
        is found
    """
    if not type_str:
        return None

    text = str(type_str).lower().strip()

    # Order matters: the first group with a matching keyword wins
    keyword_groups = (
        ('debit', ['debit', 'debited', 'withdrawn', 'sent', 'paid', 'spent', 'purchase']),
        ('credit', ['credit', 'credited', 'received', 'refund', 'cashback', 'reversed']),
    )

    for label, keywords in keyword_groups:
        if any(keyword in text for keyword in keywords):
            return label

    return None