Spaces:

satvaSolutions
/

Transaction_Reconciliation

Sleeping

App Files Files Community

RajanMalaviya committed on May 26, 2025

Commit

7334b9a

verified ·

1 Parent(s): da26974

Update app.py

Browse files

Files changed (1) hide show

app.py +273 -289

app.py CHANGED Viewed

@@ -1,305 +1,289 @@
-from fastapi import FastAPI, File, UploadFile, HTTPException
-import pytesseract
-import cv2
-import os
-from PIL import Image
-import json
-import unicodedata
-from pdf2image import convert_from_bytes
-from pypdf import PdfReader
-import numpy as np
-from typing import List
-import io
-import logging
-import time
-import asyncio
-import psutil
-import cachetools
-import hashlib
-from huggingface_hub import InferenceClient
# ASGI application object that the route decorators below attach to.
app = FastAPI()

# Logging: timestamped INFO-level records, one module-level logger.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# pytesseract shells out to this binary.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# The Hugging Face token comes from the environment; an unset or empty value
# aborts the import.
# NOTE(review): HTTPException is raised outside of any request here, so it
# surfaces as a plain import-time failure rather than an HTTP 500 - confirm
# that this is intended.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    logger.error("HF_TOKEN environment variable not set")
    raise HTTPException(status_code=500, detail="HF_TOKEN environment variable not set")

# Client used for every LLM call in this module.
client = InferenceClient(token=hf_token)
logger.info("Hugging Face Inference Client initialized")

# In-memory caches: at most 100 entries each, entries expire after 3600 s.
raw_text_cache = cachetools.TTLCache(ttl=3600, maxsize=100)
structured_data_cache = cachetools.TTLCache(ttl=3600, maxsize=100)
def log_memory_usage():
    """Return the resident memory of the current process as a log-ready string.

    Nothing is logged here despite the name; callers embed the returned text
    (for example ``"Memory usage: 123.45 MB"``) in their own log messages.
    """
    rss_bytes = psutil.Process().memory_info().rss
    rss_megabytes = rss_bytes / 1024 / 1024
    return f"Memory usage: {rss_megabytes:.2f} MB"
def get_file_hash(file_bytes):
    """Return the hexadecimal MD5 digest of *file_bytes*.

    The digest is only a cache key, not a security measure, so the hash is
    requested with ``usedforsecurity=False``; this keeps the call working on
    builds where MD5 is otherwise disabled (e.g. FIPS mode). The digest value
    itself is unchanged.
    """
    return hashlib.md5(file_bytes, usedforsecurity=False).hexdigest()
def get_text_hash(raw_text):
    """Return the hexadecimal MD5 digest of *raw_text* encoded as UTF-8.

    The digest is only a cache key, not a security measure, so the hash is
    requested with ``usedforsecurity=False``; this keeps the call working on
    builds where MD5 is otherwise disabled (e.g. FIPS mode). The digest value
    itself is unchanged.
    """
    return hashlib.md5(raw_text.encode('utf-8'), usedforsecurity=False).hexdigest()
async def process_image(img_bytes, filename, idx):
    """Run OCR on one encoded image (JPG/JPEG/PNG) and return its text.

    Args:
        img_bytes: Raw bytes of the image file.
        filename: Name of the source file; used only in log messages.
        idx: Index of the image within the file; used only in log messages.

    Returns:
        The recognised text followed by a newline, or ``""`` when decoding or
        OCR fails (the failure is logged, not raised).
    """
    start_time = time.time()
    logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
    try:
        # Close the decoder as soon as the pixels have been copied out.
        with Image.open(io.BytesIO(img_bytes)) as source:
            # Force three channels first: RGBA, palette and greyscale images
            # would otherwise make the RGB->BGR conversion below fail and the
            # file would silently yield no text.
            img = source.convert("RGB").resize((600, 400))  # Smaller for speed
        img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
        img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
        custom_config = r'--oem 1 --psm 6 -l eng'  # English only for speed
        # Tesseract is a blocking call; run it in a worker thread so the
        # event loop can keep serving other requests.
        page_text = await asyncio.to_thread(
            pytesseract.image_to_string, img_pil, config=custom_config
        )
        logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return page_text + "\n"
    except Exception as e:
        # Best effort: one unreadable image must not abort the whole request.
        logger.error(f"OCR failed for {filename} image {idx}: {str(e)}, {log_memory_usage()}")
        return ""
async def process_pdf_page(img, page_idx):
    """Run OCR on one rendered PDF page and return its text.

    Args:
        img: The page as a PIL image.
        page_idx: Index of the page; used only in log messages.

    Returns:
        The recognised text followed by a newline, or ``""`` when OCR fails
        (the failure is logged, not raised).
    """
    start_time = time.time()
    logger.info(f"Starting OCR for PDF page {page_idx}, {log_memory_usage()}")
    try:
        # Force three channels so the RGB->BGR conversion below cannot fail
        # on a page rendered in another mode.
        img = img.convert("RGB").resize((600, 400))  # Smaller for speed
        img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
        img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
        custom_config = r'--oem 1 --psm 6 -l eng'  # English only for speed
        # Tesseract is a blocking call; run it in a worker thread so the
        # event loop can keep serving other requests.
        page_text = await asyncio.to_thread(
            pytesseract.image_to_string, img_pil, config=custom_config
        )
        logger.info(f"Completed OCR for PDF page {page_idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return page_text + "\n"
    except Exception as e:
        # Best effort: one unreadable page must not abort the whole document.
        logger.error(f"OCR failed for PDF page {page_idx}: {str(e)}, {log_memory_usage()}")
        return ""
async def process_with_llm(filename: str, raw_text: str):
    """Extract structured invoice fields from *raw_text* with a hosted LLM.

    Results are cached under the MD5 of the full, untruncated text. The
    configured models are tried in order. A rate-limited call (HTTP 429) is
    retried once after a short back-off; any other failure moves straight on
    to the next model.

    Args:
        filename: Name of the source file; used only in log messages.
        raw_text: Text extracted from the document. Only the first 2000
            characters are sent to the model.

    Returns:
        The JSON object parsed from the model output, or ``{"error": ...}``
        when every model fails.
    """
    start_time = time.time()
    logger.info(f"Starting LLM API processing for {filename}, {log_memory_usage()}")
    # Check structured data cache. A single .get() avoids the window in which
    # a TTL entry could expire between a membership test and the read.
    text_hash = get_text_hash(raw_text)
    cached = structured_data_cache.get(text_hash)
    if cached is not None:
        logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
        return cached
    # Truncate text for API
    if len(raw_text) > 2000:
        raw_text = raw_text[:2000]
        logger.info(f"Truncated raw text for {filename} to 2000 characters, {log_memory_usage()}")
    # Define models to try
    models = [
        {"model": "google/gemma-2-9b-it", "provider": "hyperbolic"},
        {"model": "mistral/Mixtral-8x22B-Instruct-v0.1", "provider": "auto"}
    ]
    for model_info in models:
        model = model_info["model"]
        provider = model_info["provider"]
        logger.info(f"Attempting LLM API call with model {model} and provider {provider}")
        for attempt in range(2):  # Retry once
            try:
                prompt = f"""
                Extract key invoice fields as JSON from the raw text. Support English. Detect currency (e.g., USD, INR). Output only valid JSON, with no additional text, comments, or markdown.
                Raw text: {raw_text}
                Output JSON:
                {{
                    "currency": "",
                    "Name_Client": "",
                    "Products": [],
                    "Subtotal": "",
                    "Tax": "",
                    "total": "",
                    "invoice date": "",
                    "invoice number": ""
                }}
                """
                # Call Hugging Face Inference API (blocking client, so it runs
                # in a worker thread).
                response = await asyncio.to_thread(client.chat_completion,
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=256,
                    temperature=0.7,
                    provider=provider
                )
                llm_output = response.choices[0].message.content.strip()
                # Take the outermost {...} span so that a preamble or a
                # markdown fence around the JSON does not discard an
                # otherwise usable answer.
                json_start = llm_output.find("{")
                json_end = llm_output.rfind("}") + 1
                if json_start == -1 or json_end <= json_start:
                    raise ValueError("API output is not valid JSON")
                structured_data = json.loads(llm_output[json_start:json_end])
                structured_data_cache[text_hash] = structured_data
                logger.info(f"LLM API processing for {filename} with {model}, attempt {attempt+1}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
                return structured_data
            except Exception as e:
                # Not every exception carries a response, and the attribute
                # may be None; read the status defensively so this handler
                # cannot raise itself.
                status_code = getattr(getattr(e, "response", None), "status_code", None)
                if status_code == 429:  # Rate limit
                    logger.warning(f"Rate limit hit for {filename} with {model}, attempt {attempt+1}: {str(e)}, {log_memory_usage()}")
                    if attempt == 1:
                        break
                    await asyncio.sleep(2 ** attempt)  # Exponential backoff
                else:
                    logger.warning(f"LLM API processing failed for {filename} with {model}, attempt {attempt+1}: {str(e)}, {log_memory_usage()}")
                    break
    # If all models fail
    error_msg = "All LLM API models failed. Check model availability, authentication, or rate limits."
    logger.error(f"{error_msg} for {filename}, {log_memory_usage()}")
    return {"error": error_msg}
# File extensions accepted by the /ocr endpoint (compared in lower case).
_VALID_EXTENSIONS = {'.pdf', '.jpg', '.jpeg', '.png'}


def _failure_entry(filename, message):
    """Build the per-file result entry reported when a file cannot be processed."""
    return {
        "filename": filename,
        "structured_data": {"error": message},
        "error": message
    }


@app.post("/ocr")
async def extract_and_structure(files: List[UploadFile] = File(...)):
    """Extract text from each uploaded file and structure it with the LLM.

    PDFs are first read for embedded text and fall back to page-by-page OCR
    when none is found; images go straight to OCR. Extracted text is
    normalised (NFKC), cached by file hash and passed to ``process_with_llm``.

    Args:
        files: Uploaded PDF/JPG/JPEG/PNG files.

    Returns:
        A dict with ``success``, a summary ``message`` and one ``data`` entry
        per file (``filename``, ``structured_data``, ``error``). ``success``
        is False only when every file failed.
    """
    output_json = {
        "success": True,
        "message": "",
        "data": []
    }
    success_count = 0
    fail_count = 0
    logger.info(f"Starting processing for {len(files)} files, {log_memory_usage()}")
    for file in files:
        total_start_time = time.time()
        logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")
        # Validate file format. An upload without a filename is treated as
        # having no extension and is rejected below.
        file_ext = os.path.splitext((file.filename or "").lower())[1]
        if file_ext not in _VALID_EXTENSIONS:
            fail_count += 1
            output_json["data"].append(
                _failure_entry(file.filename, f"Unsupported file format: {file_ext}")
            )
            logger.error(f"Unsupported file format for {file.filename}: {file_ext}")
            continue
        # Read file into memory
        try:
            file_start_time = time.time()
            file_bytes = await file.read()
            file_stream = io.BytesIO(file_bytes)
            file_hash = get_file_hash(file_bytes)
            logger.info(f"Read file {file.filename}, took {time.time() - file_start_time:.2f} seconds, size: {len(file_bytes)/1024:.2f} KB, {log_memory_usage()}")
        except Exception as e:
            fail_count += 1
            output_json["data"].append(
                _failure_entry(file.filename, f"Failed to read file: {str(e)}")
            )
            logger.error(f"Failed to read file {file.filename}: {str(e)}, {log_memory_usage()}")
            continue
        # Check raw text cache
        raw_text = ""
        if file_hash in raw_text_cache:
            raw_text = raw_text_cache[file_hash]
            logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
        else:
            if file_ext == '.pdf':
                # Try extracting embedded text
                try:
                    extract_start_time = time.time()
                    reader = PdfReader(file_stream)
                    for page in reader.pages:
                        text = page.extract_text()
                        if text:
                            raw_text += text + "\n"
                    logger.info(f"Embedded text extraction for {file.filename}, took {time.time() - extract_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                except Exception as e:
                    # Not fatal: the OCR fallback below still gets a chance.
                    logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                # If no embedded text, perform OCR
                if not raw_text.strip():
                    try:
                        convert_start_time = time.time()
                        images = convert_from_bytes(file_bytes, poppler_path="/usr/local/bin", dpi=100)
                        logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
                        ocr_start_time = time.time()
                        page_texts = []
                        for i, img in enumerate(images):
                            page_text = await process_pdf_page(img, i)
                            page_texts.append(page_text)
                        raw_text = "".join(page_texts)
                        logger.info(f"Total OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                    except Exception as e:
                        fail_count += 1
                        output_json["data"].append(
                            _failure_entry(file.filename, f"OCR failed: {str(e)}")
                        )
                        logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                        continue
            else:  # JPG/JPEG/PNG
                try:
                    ocr_start_time = time.time()
                    raw_text = await process_image(file_bytes, file.filename, 0)
                    logger.info(f"Image OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                except Exception as e:
                    fail_count += 1
                    output_json["data"].append(
                        _failure_entry(file.filename, f"Image OCR failed: {str(e)}")
                    )
                    logger.error(f"Image OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                    continue
            # Normalize text
            try:
                normalize_start_time = time.time()
                raw_text = unicodedata.normalize('NFKC', raw_text)
                raw_text = raw_text.encode().decode('utf-8')
                raw_text_cache[file_hash] = raw_text
                logger.info(f"Text normalization for {file.filename}, took {time.time() - normalize_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
            except Exception as e:
                # The handler must name the file through `file.filename`; a
                # bare `filename` does not exist in this scope and would turn
                # a recoverable warning into a NameError.
                logger.warning(f"Text normalization failed for {file.filename}: {str(e)}, {log_memory_usage()}")
        # Process with LLM API
        # NOTE(review): process_with_llm reports failure by returning
        # {"error": ...}; such a result is still counted as a success here.
        structured_data = await process_with_llm(file.filename, raw_text)
        success_count += 1
        output_json["data"].append({
            "filename": file.filename,
            "structured_data": structured_data,
            "error": ""
        })
        logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")
    output_json["message"] = f"Processed {len(files)} files. {success_count} succeeded, {fail_count} failed."
    if fail_count > 0 and success_count == 0:
        output_json["success"] = False
    logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
    return output_json

import re
from datetime import datetime, date
from difflib import SequenceMatcher
from typing import List, Optional, Dict, Any

import uvicorn
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel, Field
# Application object that the route decorators below attach to; the keyword
# arguments are the API's title, description and version metadata.
app = FastAPI(
    version="1.0.0",
    title="Transaction Reconciliation API",
    description="Reconcile bank and credit card transactions using fuzzy matching",
)
+# Pydantic Models
class Transaction(BaseModel):
    """One bank or credit card transaction submitted for reconciliation."""

    # Identifier echoed back in the reconciliation output.
    id: str
    # Kept as text; the reconciliation service parses it as YYYY-MM-DD.
    date: str
    # Only the magnitude is compared when matching; the sign is ignored.
    amount: float
    # Free text, compared with fuzzy matching.
    description: str
    # Compared case-insensitively for compatibility between the two sides.
    type: str
    # Optional; an exact match on both sides raises the match score.
    reference_number: Optional[str] = None
class ReconciliationInput(BaseModel):
    """Request body: the two transaction lists to reconcile with each other."""

    # Transactions taken from the bank side.
    bank_transactions: List[Transaction]
    # Transactions taken from the credit card side.
    credit_card_transactions: List[Transaction]
class MatchedTransaction(BaseModel):
    """A bank transaction paired with a credit card transaction."""

    # `id` of the bank transaction in the pair.
    bank_id: str
    # `id` of the credit card transaction in the pair.
    credit_card_id: str
    # Required; validated to lie within [0, 1].
    match_score: float = Field(..., ge=0, le=1)
    # Human-readable list of the criteria that contributed to the match.
    match_reason: str
class UnmatchedTransaction(Transaction):
    """A transaction for which no counterpart was found.

    It carries exactly the fields of ``Transaction`` (id, date, amount,
    description, type, reference_number); inheriting them keeps the two
    models from drifting apart while retaining a distinct output type.
    """
class ReconciliationOutput(BaseModel):
    """Response body: the matched pairs plus the leftovers from each side."""

    # One entry per bank/credit-card pair that was matched.
    matched_transactions: List[MatchedTransaction]
    # Bank transactions for which no match was found.
    unmatched_bank_transactions: List[UnmatchedTransaction]
    # Credit card transactions for which no match was found.
    unmatched_credit_card_transactions: List[UnmatchedTransaction]
class ReconciliationService:
    """Pair bank transactions with credit card transactions by weighted score.

    A candidate pair must have compatible transaction types. Its score is the
    sum of four weighted parts: amount (0.4), reference number (0.3),
    description similarity (0.2) and date proximity (0.1). The best-scoring
    candidate is reported as a match when it reaches ``MIN_MATCH_SCORE``.
    Each credit card transaction is matched at most once.

    Args:
        description_threshold: Similarity (0..1) from which descriptions are
            reported as matching in the match reason.
        amount_tolerance: Largest absolute difference between the two amount
            magnitudes that still counts as an exact amount match.
        max_date_diff_days: Largest date distance, in days, that still earns
            a date score. ``0`` means the dates must be identical.
    """

    # Lowest total score at which a candidate pair is accepted as a match.
    MIN_MATCH_SCORE = 0.6

    # (bank type, credit card type) combinations regarded as the same kind of
    # movement. Identical types always match as well.
    _TYPE_MAPPINGS = frozenset({
        ('debit', 'payment'),
        ('credit', 'receipt'),
        ('withdrawal', 'payment'),
        ('deposit', 'receipt'),
    })

    # Patterns that, when found in BOTH cleaned descriptions, lift the
    # similarity to at least 0.8. Compiled once rather than on every call.
    # NOTE(review): the captured invoice / trip numbers are not compared, so
    # two different invoice numbers still count as the same pattern.
    _COMMON_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r'uber',
        r'amazon|amzn',
        r'invoice\s*#?\s*(\d+)',
        r'payment.*invoice',
        r'trip\s*id\s*(\d+)',
    ))

    _NON_WORD_CHARS = re.compile(r'[^\w\s]')
    _WHITESPACE_RUN = re.compile(r'\s+')

    def __init__(self,
                 description_threshold: float = 0.7,
                 amount_tolerance: float = 0.01,
                 max_date_diff_days: int = 7):
        self.description_threshold = description_threshold
        self.amount_tolerance = amount_tolerance
        self.max_date_diff_days = max_date_diff_days

    def fuzzy_match_description(self, desc1: str, desc2: str) -> float:
        """Return a similarity in [0, 1] between two transaction descriptions.

        Both texts are lower-cased and cleaned, then compared with
        ``SequenceMatcher``. When both contain the same well-known pattern
        the result is raised to at least 0.8.
        """
        clean_desc1 = self._clean_description(desc1.lower())
        clean_desc2 = self._clean_description(desc2.lower())
        similarity = SequenceMatcher(None, clean_desc1, clean_desc2).ratio()
        if self._check_common_patterns(clean_desc1, clean_desc2):
            similarity = max(similarity, 0.8)
        return similarity

    def _clean_description(self, description: str) -> str:
        """Replace punctuation with spaces and collapse runs of whitespace."""
        without_punctuation = self._NON_WORD_CHARS.sub(' ', description)
        return self._WHITESPACE_RUN.sub(' ', without_punctuation).strip()

    def _check_common_patterns(self, desc1: str, desc2: str) -> bool:
        """Return True when one known pattern occurs in both descriptions."""
        return any(
            pattern.search(desc1) and pattern.search(desc2)
            for pattern in self._COMMON_PATTERNS
        )

    def calculate_date_difference(self, date1: str, date2: str) -> float:
        """Return the absolute distance in days between two YYYY-MM-DD dates.

        Returns ``float('inf')`` when either value cannot be parsed, so that
        an unparseable date never satisfies a "within N days" comparison.
        """
        try:
            d1 = datetime.strptime(date1, "%Y-%m-%d").date()
            d2 = datetime.strptime(date2, "%Y-%m-%d").date()
        except ValueError:
            return float('inf')
        return abs((d1 - d2).days)

    def amounts_match(self, amount1: float, amount2: float) -> bool:
        """Return True when the magnitudes differ by at most the tolerance."""
        return abs(abs(amount1) - abs(amount2)) <= self.amount_tolerance

    def types_match(self, bank_type: str, cc_type: str) -> bool:
        """Return True when the two types are equal or a known compatible pair."""
        bank_kind = bank_type.lower()
        cc_kind = cc_type.lower()
        return bank_kind == cc_kind or (bank_kind, cc_kind) in self._TYPE_MAPPINGS

    def calculate_match_score(self, bank_txn: "Transaction", cc_txn: "Transaction") -> tuple[float, str]:
        """Return ``(score, reason)`` for one bank / credit card pair.

        ``score`` is the sum of the weighted parts described on the class;
        ``reason`` lists the criteria that contributed, or ``"partial match"``
        when none did.
        """
        scores = []
        reasons = []

        # Amount matching (weight: 0.4)
        if self.amounts_match(bank_txn.amount, cc_txn.amount):
            scores.append(0.4)
            reasons.append("amounts match")
        else:
            amount_diff = abs(abs(bank_txn.amount) - abs(cc_txn.amount))
            larger = max(abs(bank_txn.amount), abs(cc_txn.amount))
            # `larger` can only be zero here with a negative tolerance; guard
            # the division anyway rather than crash the whole request.
            amount_score = max(0, 0.4 * (1 - amount_diff / larger)) if larger else 0
            scores.append(amount_score)
            if amount_score > 0.2:
                reasons.append("amounts close")

        # Reference number matching (weight: 0.3); both must be present.
        if (bank_txn.reference_number and cc_txn.reference_number and
                bank_txn.reference_number == cc_txn.reference_number):
            scores.append(0.3)
            reasons.append("reference numbers match")
        else:
            scores.append(0)

        # Description matching (weight: 0.2)
        desc_score = self.fuzzy_match_description(bank_txn.description, cc_txn.description)
        scores.append(0.2 * desc_score)
        if desc_score >= self.description_threshold:
            reasons.append("descriptions match")

        # Date matching (weight: 0.1)
        date_diff = self.calculate_date_difference(bank_txn.date, cc_txn.date)
        if date_diff <= self.max_date_diff_days:
            if self.max_date_diff_days > 0:
                date_score = 0.1 * (1 - date_diff / self.max_date_diff_days)
            else:
                # A zero-day window admits only identical dates: give the
                # full weight instead of dividing by zero.
                date_score = 0.1
            scores.append(date_score)
            reasons.append("dates match" if date_diff <= 1 else "dates close")
        else:
            scores.append(0)

        total_score = sum(scores)
        reason = ", ".join(reasons) if reasons else "partial match"
        return total_score, reason

    def _match_pairs(self, bank_transactions: list, credit_card_transactions: list) -> list:
        """Return ``(bank_index, cc_index, score, reason)`` for every match.

        Bank transactions are handled in input order; each one takes its
        best-scoring type-compatible credit card transaction that is still
        free. Ties keep the earliest candidate.
        """
        pairs = []
        taken_cc = set()
        for bank_index, bank_txn in enumerate(bank_transactions):
            best_index = None
            best_score = 0
            best_reason = ""
            for cc_index, cc_txn in enumerate(credit_card_transactions):
                # A credit card transaction already paired with an earlier
                # bank transaction must not be reported a second time.
                if cc_index in taken_cc:
                    continue
                if not self.types_match(bank_txn.type, cc_txn.type):
                    continue
                score, reason = self.calculate_match_score(bank_txn, cc_txn)
                if score >= self.MIN_MATCH_SCORE and score > best_score:
                    best_index = cc_index
                    best_score = score
                    best_reason = reason
            if best_index is not None:
                taken_cc.add(best_index)
                pairs.append((bank_index, best_index, best_score, best_reason))
        return pairs

    @staticmethod
    def _as_unmatched(txn: "Transaction") -> "UnmatchedTransaction":
        """Copy a transaction into the output model for unmatched entries."""
        return UnmatchedTransaction(
            id=txn.id,
            date=txn.date,
            amount=txn.amount,
            description=txn.description,
            type=txn.type,
            reference_number=txn.reference_number
        )

    def reconcile(self, input_data: "ReconciliationInput") -> "ReconciliationOutput":
        """Match the two transaction lists and report matches and leftovers.

        Unmatched transactions are returned in their original input order.
        """
        bank_txns = list(input_data.bank_transactions)
        cc_txns = list(input_data.credit_card_transactions)
        pairs = self._match_pairs(bank_txns, cc_txns)

        matched_transactions = [
            MatchedTransaction(
                bank_id=bank_txns[bank_index].id,
                credit_card_id=cc_txns[cc_index].id,
                match_score=round(score, 2),
                match_reason=reason
            )
            for bank_index, cc_index, score, reason in pairs
        ]

        # Track matches by position, not by value, so two transactions with
        # identical field values are still accounted for separately.
        matched_bank = {bank_index for bank_index, _, _, _ in pairs}
        matched_cc = {cc_index for _, cc_index, _, _ in pairs}

        return ReconciliationOutput(
            matched_transactions=matched_transactions,
            unmatched_bank_transactions=[
                self._as_unmatched(txn)
                for index, txn in enumerate(bank_txns) if index not in matched_bank
            ],
            unmatched_credit_card_transactions=[
                self._as_unmatched(txn)
                for index, txn in enumerate(cc_txns) if index not in matched_cc
            ]
        )
# Module-level service instance built with the default matching parameters.
reconciliation_service = ReconciliationService()
@app.get("/")
async def root():
    """Health check endpoint"""
    # Static payload: a status message plus the API version string.
    payload = {}
    payload["message"] = "Transaction Reconciliation API is running"
    payload["status"] = "healthy"
    payload["version"] = "1.0.0"
    return payload
@app.post("/reconcile", response_model=ReconciliationOutput)
async def reconcile_transactions(input_data: ReconciliationInput):
    """
    Reconcile bank and credit card transactions
    This endpoint matches transactions based on:
    - Amount similarity (within tolerance)
    - Date proximity (within 7 days)
    - Description fuzzy matching (70% threshold)
    - Transaction type compatibility
    - Reference number exact matching
    """
    # Delegates to the module-level service; any failure inside it is
    # reported to the client as an HTTP 500 carrying the exception text.
    try:
        return reconciliation_service.reconcile(input_data)
    except Exception as exc:
        failure_detail = f"Reconciliation failed: {str(exc)}"
        raise HTTPException(status_code=500, detail=failure_detail)
@app.get("/health")
async def health_check():
    """Health check for deployment"""
    # Static payload; no dependency is probed.
    service_name = "Transaction Reconciliation API"
    return {"status": "ok", "service": service_name}
@app.post("/reconcile/custom", response_model=ReconciliationOutput)
async def reconcile_with_custom_params(
    input_data: ReconciliationInput,
    description_threshold: float = Query(0.7, ge=0, le=1, description="Fuzzy match threshold for descriptions"),
    amount_tolerance: float = Query(0.01, ge=0, description="Maximum allowed difference in amounts"),
    max_date_diff_days: int = Query(7, ge=0, description="Maximum allowed date difference in days")
):
    """
    Reconcile transactions with custom matching parameters
    """
    # The three tuning values are query parameters, so they are declared with
    # FastAPI's Query (default, bounds, description) rather than pydantic's
    # Field, which is meant for model attributes. A fresh service is built
    # per request so the values never leak into the shared default instance.
    try:
        custom_service = ReconciliationService(
            description_threshold=description_threshold,
            amount_tolerance=amount_tolerance,
            max_date_diff_days=max_date_diff_days
        )
        result = custom_service.reconcile(input_data)
        return result
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Reconciliation failed: {str(e)}")
if __name__ == "__main__":
    # Run as a script: serve the app on all interfaces, port 8000.
    uvicorn.run(app, port=8000, host="0.0.0.0")