GSoumyajit2005's picture
feat: Update Dockerfile and requirements for PDF processing, add new dependencies, and refactor API structure
faa3050
# src/utils.py
import hashlib
from typing import Dict, Any
from decimal import Decimal
from datetime import date
def generate_semantic_hash(invoice_data: Dict[str, Any]) -> str:
    """Return a SHA-256 fingerprint of an invoice's identifying fields.

    The fingerprint is derived from a composite key made of four fields, in
    this order: ``vendor``, ``date``, ``total_amount``, ``receipt_number``.
    Each field is normalized to text, the parts are joined with ``|`` and the
    resulting string is hashed.

    Normalization rules:
        * ``None`` or a missing key becomes the empty string.
        * ``date``, ``Decimal``, ``int`` and ``float`` values are rendered
          with ``str(value)`` as-is.
        * Any other value is converted with ``str``, lower-cased and stripped
          of surrounding whitespace.

    Note:
        Numeric values are compared by their text form, so ``Decimal("10.0")``
        and ``Decimal("10.00")`` yield different fingerprints. The ``|``
        separator is not escaped inside field values.

    Args:
        invoice_data: Mapping holding the invoice fields. It is only read,
            never modified.

    Returns:
        The hexadecimal SHA-256 digest of the composite key string.
    """
    # Fields that determine uniqueness; their order is part of the hash.
    keys_to_hash = ('vendor', 'date', 'total_amount', 'receipt_number')

    normalized_values = []
    for key in keys_to_hash:
        # A field that is absent is treated exactly like an explicit None
        # instead of aborting the whole fingerprint with a KeyError.
        value = invoice_data.get(key)

        if value is None:
            norm_val = ""
        elif isinstance(value, (date, Decimal, int, float)):
            # Structured values already have a canonical text form.
            norm_val = str(value)
        else:
            # Free text: ignore case and surrounding whitespace.
            norm_val = str(value).lower().strip()

        normalized_values.append(norm_val)

    composite_string = "|".join(normalized_values)
    return hashlib.sha256(composite_string.encode()).hexdigest()