Aryan Jain committed on
Commit
4e71548
·
0 Parent(s):

bank scrubber streamlit application

Browse files
.dockerignore ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Git
2
+ .git
3
+ .gitignore
4
+ .gitattributes
5
+
6
+ # Python
7
+ __pycache__
8
+ *.pyc
9
+ *.pyo
10
+ *.pyd
11
+ .Python
12
+ env
13
+ pip-log.txt
14
+ pip-delete-this-directory.txt
15
+ .tox
16
+ .coverage
17
+ .coverage.*
18
+ .cache
19
+ nosetests.xml
20
+ coverage.xml
21
+ *.cover
22
+ *.log
23
+ .mypy_cache
24
+ .pytest_cache
25
+ .hypothesis
26
+
27
+ # Virtual environments
28
+ venv/
29
+ env/
30
+ ENV/
31
+ env.bak/
32
+ venv.bak/
33
+ .venv/
34
+ .venv.bak/
35
+
36
+ # IDE
37
+ .vscode/
38
+ .idea/
39
+ *.swp
40
+ *.swo
41
+ *~
42
+
43
+ # OS
44
+ .DS_Store
45
+ .DS_Store?
46
+ ._*
47
+ .Spotlight-V100
48
+ .Trashes
49
+ ehthumbs.db
50
+ Thumbs.db
51
+
52
+ # Project specific
53
+ temp.pdf
54
+ *.pdf
55
+ .env
56
+ .env.local
57
+ .env.*.local
58
+ requirements.txt
59
+
60
+ # Documentation
61
+ README.md
62
+ *.md
63
+ docs/
64
+
65
+ # Testing
66
+ test_structure.py
67
+ setup_env.py
68
+ startup.py
69
+ docker-startup.py
70
+ check-build-context.py
71
+ build-docker.sh
72
+ tests/
73
+ test_*.py
74
+
75
+ # Original files
76
+ poc.py
77
+
78
+ # Docker
79
+ Dockerfile
80
+ .dockerignore
81
+ docker-compose*.yml
82
+ DOCKER_DEPLOYMENT.md
83
+
84
+ # Large files and directories
85
+ *.tar
86
+ *.tar.gz
87
+ *.zip
88
+ *.rar
89
+ *.7z
90
+ *.model
91
+ *.pkl
92
+ *.pickle
93
+ *.h5
94
+ *.hdf5
95
+ *.ckpt
96
+ *.pth
97
+ *.pt
98
+ *.bin
99
+ *.safetensors
100
+
101
+ # Model files and caches
102
+ .cache/
103
+ models/
104
+ checkpoints/
105
+ weights/
106
+ *.weights
107
+ *.cfg
108
+
109
+ # Logs and temporary files
110
+ logs/
111
+ *.log
112
+ tmp/
113
+ temp/
114
+ .tmp/
115
+
116
+ # Node modules (if any)
117
+ node_modules/
118
+
119
+ # Large data files
120
+ data/
121
+ datasets/
122
+ *.csv
123
+ *.json
124
+ *.xml
125
+ *.xlsx
126
+ *.xls
127
+
128
+ # Backup files
129
+ *.bak
130
+ *.backup
131
+ *.old
132
+
133
+ # Jupyter notebooks
134
+ *.ipynb
135
+ .ipynb_checkpoints/
136
+
137
+ # Large images
138
+ *.jpg
139
+ *.jpeg
140
+ *.png
141
+ *.gif
142
+ *.bmp
143
+ *.tiff
144
+ *.tif
145
+ images/
146
+ img/
147
+
148
+ # Audio/Video files
149
+ *.mp3
150
+ *.mp4
151
+ *.avi
152
+ *.mov
153
+ *.wav
154
+ *.flac
155
+
156
+ # Archives
157
+ *.tar
158
+ *.tar.gz
159
+ *.tar.bz2
160
+ *.zip
161
+ *.rar
162
+ *.7z
163
+
164
+ # System files
165
+ Thumbs.db
166
+ ehthumbs.db
167
+ Desktop.ini
168
+ $RECYCLE.BIN/
.env.example ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bank Statement Analyzer Configuration
2
+ # Copy this file to .env and update with your actual values
3
+
4
+ # API Keys
5
+ GROQ_API_KEY=your_groq_api_key_here
6
+ GROQ_BASE_URL=https://api.groq.com/openai/v1
7
+
8
+ HUGGINGFACE_API_KEY=your_huggingface_api_key_here
9
+ HUGGINGFACE_PROVIDER=novita
10
+
11
+ # Model Configuration
12
+ LLM_MODEL=llama-3.1-8b-instant
13
+
14
+ # OCR and Processing Settings
15
+ Y_THRESHOLD=3.0
16
+ GAP_THRESHOLD=10
17
+ GAP_THRESHOLD_RATIO=0.1
18
+
19
+ # File Processing Settings
20
+ TEMP_FILE_NAME=temp.pdf
21
+ DPI=300
22
+
23
+ # spaCy Model Settings
24
+ SPACY_MODEL_NAME=en_core_web_sm
25
+
26
+ # Device Settings
27
+ FORCE_CPU=false
.gitignore ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ pip-wheel-metadata/
20
+ share/python-wheels/
21
+ *.egg-info/
22
+ .installed.cfg
23
+ *.egg
24
+ PIPFILE.lock
25
+
26
+ # Virtual Environment
27
+ venv/
28
+ ENV/
29
+ env/
30
+ .venv/
31
+
32
+ # IDE
33
+ .vscode/
34
+ .idea/
35
+ *.swp
36
+ *.swo
37
+ *~
38
+ .project
39
+ .pydevproject
40
+
41
+ # Environment variables
42
+ .env
43
+ .env.local
44
+ .env.*.local
45
+
46
+ # Logs
47
+ logs/
48
+ *.log
49
+
50
+ # Debug
51
+ debug/
52
+ *.debug
53
+
54
+ # Cache
55
+ .cache/
56
+ *.cache
57
+ __pycache__/
58
+ .pytest_cache/
59
+ .mypy_cache/
60
+ .dmypy.json
61
+ dmypy.json
62
+
63
+ # Database
64
+ *.db
65
+ *.sqlite
66
+ *.sqlite3
67
+
68
+ # Output files
69
+ output/
70
+ results/
71
+ exports/
72
+ *.xlsx
73
+ *.csv
74
+ *.json
75
+
76
+ # Temporary files
77
+ tmp/
78
+ temp/
79
+ *.tmp
80
+ *.temp
81
+
82
+ # OS files
83
+ .DS_Store
84
+ Thumbs.db
85
+ ehthumbs.db
86
+
87
+ # Test coverage
88
+ htmlcov/
89
+ .tox/
90
+ .nox/
91
+ .coverage
92
+ .coverage.*
93
+ *.cover
94
+ *.py,cover
95
+ .hypothesis/
96
+
97
+ # Jupyter Notebook
98
+ .ipynb_checkpoints
99
+
100
+ # Redis
101
+ dump.rdb
102
+
103
+ # Secrets
104
+ secrets/
105
+ *.key
106
+ *.pem
107
+ *.crt
108
+
109
+ # Model files
110
+ models/*.pkl
111
+ models/*.h5
112
+ models/*.pt
113
+
114
+ # Large files
115
+ *.pdf
116
+ *.zip
117
+ *.tar.gz
118
+ *.rar
119
+
120
+ # Except test PDFs
121
+ !tests/fixtures/*.pdf
122
+
123
+ check-build-context.py
124
+ test_structure.py
125
+ startup.py
126
+ setup_env.py
127
+ poc.py
128
+ docker-startup.py
DOCKER_DEPLOYMENT.md ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Docker Deployment Guide
2
+
3
+ This guide explains how to deploy the Bank Statement Analyzer using Docker with Poetry dependency management.
4
+
5
+ ## Prerequisites
6
+
7
+ - Docker installed on your system
8
+ - Docker Compose (usually comes with Docker Desktop)
9
+ - API keys for Groq and HuggingFace
10
+
11
+ ## Quick Start
12
+
13
+ ### 1. Set up Environment Variables
14
+
15
+ Create a `.env` file in the project root:
16
+
17
+ ```bash
18
+ # Copy the example file
19
+ cp .env.example .env
20
+
21
+ # Edit with your actual API keys
22
+ nano .env
23
+ ```
24
+
25
+ Make sure to set:
26
+ - `GROQ_API_KEY=your_actual_groq_api_key`
27
+ - `HUGGINGFACE_API_KEY=your_actual_huggingface_api_key`
28
+
29
+ ### 2. Build and Run with Docker Compose
30
+
31
+ ```bash
32
+ # Build and start the application
33
+ docker-compose up --build
34
+
35
+ # Or run in detached mode
36
+ docker-compose up -d --build
37
+ ```
38
+
39
+ ### 3. Access the Application
40
+
41
+ Open your browser and go to: `http://localhost:8501`
42
+
43
+ ## Manual Docker Build
44
+
45
+ If you prefer to build manually:
46
+
47
+ ```bash
48
+ # Build the image
49
+ docker build -t bank-statement-analyzer .
50
+
51
+ # Run the container
52
+ docker run -p 8501:8501 \
53
+ --env-file .env \
54
+ -v $(pwd)/temp:/app/temp \
55
+ bank-statement-analyzer
56
+ ```
57
+
58
+ ## Docker Configuration
59
+
60
+ ### Dockerfile Features
61
+
62
+ - **Base Image**: Python 3.12 slim for smaller size
63
+ - **Dependency Management**: Poetry for reliable dependency resolution
64
+ - **System Dependencies**: Includes OCR and graphics libraries
65
+ - **PyTorch**: Pre-installed with CPU support (can be changed to CUDA)
66
+ - **spaCy Models**: Pre-downloaded for faster startup
67
+ - **Optimized Layers**: Efficient caching for faster rebuilds
68
+
69
+ ### Poetry Configuration
70
+
71
+ The project uses Poetry for dependency management:
72
+
73
+ ```toml
74
+ # pyproject.toml
75
+ [tool.poetry]
76
+ name = "bank-statement-analyzer"
77
+ version = "1.0.0"
78
+ description = "A comprehensive, async, class-based bank statement analyzer"
79
+
80
+ [tool.poetry.dependencies]
81
+ python = "^3.12"
82
+ streamlit = "^1.28.0"
83
+ pandas = "^2.0.0"
84
+ # ... other dependencies
85
+ ```
86
+
87
+ ### Environment Variables
88
+
89
+ The following environment variables can be set in your `.env` file:
90
+
91
+ | Variable | Description | Default |
92
+ |----------|-------------|---------|
93
+ | `GROQ_API_KEY` | Your Groq API key | Required |
94
+ | `HUGGINGFACE_API_KEY` | Your HuggingFace API key | Required |
95
+ | `LLM_MODEL` | Groq model to use | `llama-3.1-8b-instant` |
96
+ | `SPACY_MODEL_NAME` | spaCy model for NER | `en_core_web_sm` |
97
+ | `FORCE_CPU` | Force CPU usage | `false` |
98
+ | `DPI` | PDF processing DPI | `300` |
99
+ | `Y_THRESHOLD` | Text extraction threshold | `3.0` |
100
+
101
+ ### Volumes
102
+
103
+ - `./temp:/app/temp`: Shared temp directory for file processing
104
+ - `./.env:/app/.env:ro`: Read-only access to environment file
105
+
106
+ ## Production Deployment
107
+
108
+ ### Using Docker Compose (Recommended)
109
+
110
+ ```yaml
111
+ # docker-compose.prod.yml
112
+ version: '3.8'
113
+
114
+ services:
115
+ bank-statement-analyzer:
116
+ build: .
117
+ ports:
118
+ - "8501:8501"
119
+ environment:
120
+ - PYTHONUNBUFFERED=1
121
+ - POETRY_VENV_IN_PROJECT=1
122
+ - POETRY_NO_INTERACTION=1
123
+ env_file:
124
+ - .env
125
+ volumes:
126
+ - ./temp:/app/temp
127
+ restart: unless-stopped
128
+ healthcheck:
129
+ test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
130
+ interval: 30s
131
+ timeout: 10s
132
+ retries: 3
133
+ start_period: 40s
134
+ deploy:
135
+ resources:
136
+ limits:
137
+ memory: 4G
138
+ reservations:
139
+ memory: 2G
140
+ ```
141
+
142
+ ### Using Docker Swarm
143
+
144
+ ```bash
145
+ # Initialize swarm (if not already done)
146
+ docker swarm init
147
+
148
+ # Deploy the stack
149
+ docker stack deploy -c docker-compose.yml bank-analyzer
150
+ ```
151
+
152
+ ### Using Kubernetes
153
+
154
+ Create a deployment YAML:
155
+
156
+ ```yaml
157
+ apiVersion: apps/v1
158
+ kind: Deployment
159
+ metadata:
160
+ name: bank-statement-analyzer
161
+ spec:
162
+ replicas: 1
163
+ selector:
164
+ matchLabels:
165
+ app: bank-statement-analyzer
166
+ template:
167
+ metadata:
168
+ labels:
169
+ app: bank-statement-analyzer
170
+ spec:
171
+ containers:
172
+ - name: bank-statement-analyzer
173
+ image: bank-statement-analyzer:latest
174
+ ports:
175
+ - containerPort: 8501
176
+ env:
177
+ - name: GROQ_API_KEY
178
+ valueFrom:
179
+ secretKeyRef:
180
+ name: api-secrets
181
+ key: groq-api-key
182
+ - name: HUGGINGFACE_API_KEY
183
+ valueFrom:
184
+ secretKeyRef:
185
+ name: api-secrets
186
+ key: huggingface-api-key
187
+ - name: POETRY_VENV_IN_PROJECT
188
+ value: "1"
189
+ - name: POETRY_NO_INTERACTION
190
+ value: "1"
191
+ resources:
192
+ limits:
193
+ memory: "4Gi"
194
+ cpu: "2"
195
+ requests:
196
+ memory: "2Gi"
197
+ cpu: "1"
198
+ ---
199
+ apiVersion: v1
200
+ kind: Service
201
+ metadata:
202
+ name: bank-statement-analyzer-service
203
+ spec:
204
+ selector:
205
+ app: bank-statement-analyzer
206
+ ports:
207
+ - port: 80
208
+ targetPort: 8501
209
+ type: LoadBalancer
210
+ ```
211
+
212
+ ## Performance Optimization
213
+
214
+ ### GPU Support
215
+
216
+ To enable GPU support, modify the Dockerfile:
217
+
218
+ ```dockerfile
219
+ # Install PyTorch with CUDA support
220
+ RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
221
+ ```
222
+
223
+ And run with GPU access:
224
+
225
+ ```bash
226
+ docker run --gpus all -p 8501:8501 --env-file .env bank-statement-analyzer
227
+ ```
228
+
229
+ ### Memory Optimization
230
+
231
+ - Set `FORCE_CPU=true` in `.env` if GPU is not needed
232
+ - Use smaller spaCy model: `SPACY_MODEL_NAME=en_core_web_sm`
233
+ - Adjust memory limits in docker-compose.yml
234
+
235
+ ### Build Optimization
236
+
237
+ - Use `.dockerignore` to exclude unnecessary files
238
+ - Leverage Docker layer caching
239
+ - Use multi-stage builds for production
240
+ - Poetry lock file ensures reproducible builds
241
+
242
+ ## Development with Poetry
243
+
244
+ ### Local Development
245
+
246
+ ```bash
247
+ # Install Poetry (if not already installed)
248
+ curl -sSL https://install.python-poetry.org | python3 -
249
+
250
+ # Install dependencies
251
+ poetry install
252
+
253
+ # Activate virtual environment
254
+ poetry shell
255
+
256
+ # Run the application
257
+ poetry run streamlit run main.py
258
+ ```
259
+
260
+ ### Adding Dependencies
261
+
262
+ ```bash
263
+ # Add a new dependency
264
+ poetry add package-name
265
+
266
+ # Add a development dependency
267
+ poetry add --group dev package-name
268
+
269
+ # Update dependencies
270
+ poetry update
271
+ ```
272
+
273
+ ### Poetry Scripts
274
+
275
+ The project includes convenient Poetry scripts:
276
+
277
+ ```bash
278
+ # Start the application
279
+ poetry run start
280
+
281
+ # Run startup script
282
+ poetry run startup
283
+
284
+ # Run tests
285
+ poetry run test
286
+
287
+ # Setup environment
288
+ poetry run setup
289
+ ```
290
+
291
+ ## Troubleshooting
292
+
293
+ ### Common Issues
294
+
295
+ 1. **Port Already in Use**
296
+ ```bash
297
+ # Check what's using port 8501
298
+ lsof -i :8501
299
+
300
+ # Use different port
301
+ docker run -p 8502:8501 bank-statement-analyzer
302
+ ```
303
+
304
+ 2. **Permission Issues**
305
+ ```bash
306
+ # Fix temp directory permissions
307
+ sudo chown -R 1000:1000 ./temp
308
+ ```
309
+
310
+ 3. **Memory Issues**
311
+ ```bash
312
+ # Increase Docker memory limit
313
+ # In Docker Desktop: Settings > Resources > Memory
314
+ ```
315
+
316
+ 4. **API Key Issues**
317
+ ```bash
318
+ # Check environment variables
319
+ docker exec -it <container_id> env | grep API
320
+ ```
321
+
322
+ 5. **Poetry Issues**
323
+ ```bash
324
+ # Clear Poetry cache
325
+ poetry cache clear --all pypi
326
+
327
+ # Reinstall dependencies
328
+ poetry install --sync
329
+ ```
330
+
331
+ ### Logs
332
+
333
+ ```bash
334
+ # View container logs
335
+ docker-compose logs -f
336
+
337
+ # View specific service logs
338
+ docker-compose logs -f bank-statement-analyzer
339
+ ```
340
+
341
+ ### Health Check
342
+
343
+ The application includes a health check endpoint:
344
+
345
+ ```bash
346
+ # Test health endpoint
347
+ curl http://localhost:8501/_stcore/health
348
+ ```
349
+
350
+ ## Security Considerations
351
+
352
+ 1. **API Keys**: Never commit `.env` files to version control
353
+ 2. **Network**: Use internal networks for production
354
+ 3. **Volumes**: Limit volume access to necessary directories
355
+ 4. **User**: Run container as non-root user
356
+ 5. **Updates**: Regularly update base images and dependencies
357
+ 6. **Dependencies**: Poetry lock file ensures reproducible builds
358
+
359
+ ## Monitoring
360
+
361
+ ### Basic Monitoring
362
+
363
+ ```bash
364
+ # Check container status
365
+ docker ps
366
+
367
+ # Monitor resource usage
368
+ docker stats
369
+
370
+ # Check logs
371
+ docker-compose logs -f
372
+ ```
373
+
374
+ ### Advanced Monitoring
375
+
376
+ Consider using:
377
+ - Prometheus + Grafana for metrics
378
+ - ELK stack for log aggregation
379
+ - Docker Swarm or Kubernetes for orchestration
380
+
381
+ ## Backup and Recovery
382
+
383
+ ### Data Backup
384
+
385
+ ```bash
386
+ # Backup temp directory
387
+ tar -czf temp_backup.tar.gz ./temp
388
+
389
+ # Backup environment configuration
390
+ cp .env .env.backup
391
+
392
+ # Backup Poetry lock file
393
+ cp poetry.lock poetry.lock.backup
394
+ ```
395
+
396
+ ### Container Backup
397
+
398
+ ```bash
399
+ # Save container image
400
+ docker save bank-statement-analyzer > bank-analyzer.tar
401
+
402
+ # Load container image
403
+ docker load < bank-analyzer.tar
404
+ ```
405
+
406
+ ## Scaling
407
+
408
+ ### Horizontal Scaling
409
+
410
+ ```yaml
411
+ # docker-compose.scale.yml
412
+ version: '3.8'
413
+
414
+ services:
415
+ bank-statement-analyzer:
416
+ build: .
417
+ ports:
418
+ - "8501:8501"
419
+ deploy:
420
+ replicas: 3
421
+ environment:
422
+ - PYTHONUNBUFFERED=1
423
+ - POETRY_VENV_IN_PROJECT=1
424
+ ```
425
+
426
+ ### Load Balancing
427
+
428
+ Use a reverse proxy like Nginx:
429
+
430
+ ```nginx
431
+ upstream streamlit {
432
+ server bank-statement-analyzer:8501;
433
+ }
434
+
435
+ server {
436
+ listen 80;
437
+ location / {
438
+ proxy_pass http://streamlit;
439
+ proxy_set_header Host $host;
440
+ proxy_set_header X-Real-IP $remote_addr;
441
+ }
442
+ }
443
+ ```
Dockerfile ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use official Python 3.12 slim image
2
+ FROM python:3.12-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONUNBUFFERED=1
6
+ ENV PYTHONDONTWRITEBYTECODE=1
7
+ ENV POETRY_VERSION=1.8.2
8
+ ENV POETRY_HOME="/opt/poetry"
9
+ ENV POETRY_VENV_IN_PROJECT=1
10
+ ENV POETRY_NO_INTERACTION=1
11
+
12
+ # Set work directory
13
+ WORKDIR /app
14
+
15
+ # Install system dependencies in a single layer
16
+ RUN apt-get update && apt-get install -y \
17
+ curl \
18
+ build-essential \
19
+ tesseract-ocr \
20
+ libtesseract-dev \
21
+ poppler-utils \
22
+ libgl1-mesa-glx \
23
+ libglib2.0-0 \
24
+ libsm6 \
25
+ libxext6 \
26
+ libxrender-dev \
27
+ libgomp1 \
28
+ && apt-get clean \
29
+ && rm -rf /var/lib/apt/lists/*
30
+
31
+ # Install Poetry
32
+ RUN curl -sSL https://install.python-poetry.org | python3 - \
33
+ && export PATH="/opt/poetry/bin:$PATH" \
34
+ && poetry --version
35
+
36
+ # Add Poetry to PATH
37
+ ENV PATH="/opt/poetry/bin:$PATH"
38
+
39
+ # Copy only Poetry configuration files first (for better caching)
40
+ COPY pyproject.toml poetry.lock* /app/
41
+
42
+ # Configure Poetry and install dependencies
43
+ RUN poetry config virtualenvs.create false \
44
+ && poetry lock --no-update \
45
+ && poetry install --no-interaction --no-ansi --only main
46
+
47
+ # Install PyTorch with CPU support (adjust based on your needs)
48
+ RUN pip3 install torch torchvision torchaudio
49
+
50
+ # Install spaCy models
51
+ RUN python -m spacy download en_core_web_sm
52
+
53
+ # Create temp directory for file processing
54
+ RUN mkdir -p /app/temp && chmod 777 /app/temp
55
+
56
+ # Copy the source code (this layer will be rebuilt when code changes)
57
+ COPY src/ /app/src/
58
+ COPY main.py /app/
59
+
60
+ # Expose the port Streamlit will run on
61
+ EXPOSE 8501
62
+
63
+ # Set environment variables for Streamlit
64
+ ENV STREAMLIT_SERVER_PORT=8501
65
+ ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
66
+ ENV STREAMLIT_SERVER_HEADLESS=true
67
+ ENV STREAMLIT_SERVER_ENABLE_CORS=false
68
+ ENV STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false
69
+
70
+ # Run the Streamlit application
71
+ CMD ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"]
Dockerfile.alternative ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use official Python 3.12 slim image
2
+ FROM python:3.12-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONUNBUFFERED=1
6
+ ENV PYTHONDONTWRITEBYTECODE=1
7
+ ENV POETRY_VERSION=1.8.2
8
+ ENV POETRY_NO_INTERACTION=1
9
+
10
+ # Set work directory
11
+ WORKDIR /app
12
+
13
+ # Install system dependencies in a single layer
14
+ RUN apt-get update && apt-get install -y \
15
+ curl \
16
+ build-essential \
17
+ tesseract-ocr \
18
+ libtesseract-dev \
19
+ poppler-utils \
20
+ libgl1-mesa-glx \
21
+ libglib2.0-0 \
22
+ libsm6 \
23
+ libxext6 \
24
+ libxrender-dev \
25
+ libgomp1 \
26
+ && apt-get clean \
27
+ && rm -rf /var/lib/apt/lists/*
28
+
29
+ # Install Poetry using pip (alternative method)
30
+ RUN pip install poetry==$POETRY_VERSION
31
+
32
+ # Copy only Poetry configuration files first (for better caching)
33
+ COPY pyproject.toml poetry.lock* /app/
34
+
35
+ # Configure Poetry and install dependencies
36
+ RUN poetry config virtualenvs.create false \
37
+ && poetry install --no-interaction --no-ansi --only main
38
+
39
+ # Install PyTorch with CPU support (adjust based on your needs)
40
+ RUN pip3 install torch torchvision torchaudio
41
+
42
+ # Install spaCy models
43
+ RUN python -m spacy download en_core_web_sm
44
+
45
+ # Create temp directory for file processing
46
+ RUN mkdir -p /app/temp && chmod 777 /app/temp
47
+
48
+ # Copy the source code (this layer will be rebuilt when code changes)
49
+ COPY src/ /app/src/
50
+ COPY main.py /app/
51
+
52
+ # Expose the port Streamlit will run on
53
+ EXPOSE 8501
54
+
55
+ # Set environment variables for Streamlit
56
+ ENV STREAMLIT_SERVER_PORT=8501
57
+ ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
58
+ ENV STREAMLIT_SERVER_HEADLESS=true
59
+ ENV STREAMLIT_SERVER_ENABLE_CORS=false
60
+ ENV STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false
61
+
62
+ # Run the Streamlit application
63
+ CMD ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"]
Dockerfile.fallback ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use official Python 3.12 slim image
2
+ FROM python:3.12-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONUNBUFFERED=1
6
+ ENV PYTHONDONTWRITEBYTECODE=1
7
+
8
+ # Set work directory
9
+ WORKDIR /app
10
+
11
+ # Install system dependencies in a single layer
12
+ RUN apt-get update && apt-get install -y \
13
+ curl \
14
+ build-essential \
15
+ tesseract-ocr \
16
+ libtesseract-dev \
17
+ poppler-utils \
18
+ libgl1-mesa-glx \
19
+ libglib2.0-0 \
20
+ libsm6 \
21
+ libxext6 \
22
+ libxrender-dev \
23
+ libgomp1 \
24
+ && apt-get clean \
25
+ && rm -rf /var/lib/apt/lists/*
26
+
27
+ # Copy requirements file (if it exists)
28
+ COPY requirements.txt* /app/
29
+
30
+ # Install Python dependencies using pip
31
+ RUN pip install --no-cache-dir --upgrade pip
32
+
33
+ # Install dependencies from requirements.txt if it exists, otherwise install manually
34
+ RUN if [ -f "requirements.txt" ]; then \
35
+ pip install --no-cache-dir -r requirements.txt; \
36
+ else \
37
+ pip install --no-cache-dir \
38
+ streamlit>=1.28.0 \
39
+ pandas>=2.0.0 \
40
+ numpy>=1.24.0 \
41
+ PyMuPDF>=1.23.0 \
42
+ PyPDF2>=3.0.0 \
43
+ doctr>=2.4.0 \
44
+ pdf2image>=1.16.0 \
45
+ spacy>=3.7.0 \
46
+ torch>=2.0.0 \
47
+ fuzzywuzzy>=0.18.0 \
48
+ python-Levenshtein>=0.21.0 \
49
+ openai>=1.0.0 \
50
+ huggingface-hub>=0.19.0 \
51
+ pydantic>=2.0.0 \
52
+ pydantic-settings>=2.0.0 \
53
+ python-dateutil>=2.8.0 \
54
+ python-dotenv>=1.0.0; \
55
+ fi
56
+
57
+ # Install PyTorch with CPU support
58
+ RUN pip3 install torch torchvision torchaudio
59
+
60
+ # Install spaCy models
61
+ RUN python -m spacy download en_core_web_sm
62
+
63
+ # Create temp directory for file processing
64
+ RUN mkdir -p /app/temp && chmod 777 /app/temp
65
+
66
+ # Copy the source code
67
+ COPY src/ /app/src/
68
+ COPY main.py /app/
69
+
70
+ # Expose the port Streamlit will run on
71
+ EXPOSE 8501
72
+
73
+ # Set environment variables for Streamlit
74
+ ENV STREAMLIT_SERVER_PORT=8501
75
+ ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
76
+ ENV STREAMLIT_SERVER_HEADLESS=true
77
+ ENV STREAMLIT_SERVER_ENABLE_CORS=false
78
+ ENV STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false
79
+
80
+ # Run the Streamlit application
81
+ CMD ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bank Statement Analyzer
2
+
3
+ A comprehensive, async, class-based bank statement analyzer that extracts account information and transaction tables from PDF bank statements.
4
+
5
+ ## Features
6
+
7
+ - **Async Processing**: All operations are asynchronous for better performance
8
+ - **Class-Based Architecture**: Well-organized, maintainable code structure
9
+ - **Model Pre-loading**: Models are loaded once at startup for faster processing
10
+ - **Environment Configuration**: Flexible configuration via .env files
11
+ - **Multiple PDF Support**: Handles both digital and scanned PDFs
12
+ - **OCR Integration**: Uses doctr for scanned PDF processing
13
+ - **LLM Integration**: Uses Groq API for intelligent data extraction
14
+ - **Table Extraction**: Extracts and processes transaction tables
15
+ - **Account Information**: Extracts account numbers, balances, and bank names
16
+ - **Streamlit Interface**: User-friendly web interface
17
+
18
+ ## Project Structure
19
+
20
+ ```
21
+ bank-scrubber/
22
+ ├── src/
23
+ │ ├── config/
24
+ │ │ └── config.py # Configuration settings and API keys
25
+ │ ├── models/
26
+ │ │ ├── __init__.py
27
+ │ │ └── account_models.py # Pydantic models for data validation
28
+ │ ├── utils/
29
+ │ │ ├── __init__.py
30
+ │ │ ├── api_clients.py # Async API clients for Groq and HuggingFace
31
+ │ │ └── model_manager.py # Singleton model manager for pre-loading
32
+ │ ├── ocr/
33
+ │ │ ├── __init__.py
34
+ │ │ ├── pdf_processor.py # PDF processing and OCR setup
35
+ │ │ └── text_extractor.py # Text extraction with bounding boxes
36
+ │ ├── extractor/
37
+ │ │ ├── __init__.py
38
+ │ │ ├── table_extractor.py # Transaction table extraction and processing
39
+ │ │ ├── account_extractor.py # Account number and bank name extraction
40
+ │ │ └── balance_extractor.py # Balance information extraction
41
+ │ ├── services/
42
+ │ │ ├── __init__.py
43
+ │ │ └── bank_statement_service.py # Main service orchestrating all operations
44
+ │ └── __init__.py
45
+ ├── main.py # Streamlit application entry point
46
+ ├── startup.py # Model pre-loading script
47
+ ├── setup_env.py # Environment setup helper
48
+ ├── .env.example # Environment variables template
49
+ ├── test_structure.py # Structure testing script
50
+ ├── poc.py # Original monolithic file (preserved)
51
+ ├── requirements.txt # Python dependencies
52
+ └── README.md # This file
53
+ ```
54
+
55
+ ## Installation
56
+
57
+ 1. Clone the repository:
58
+ ```bash
59
+ git clone <repository-url>
60
+ cd bank-scrubber
61
+ ```
62
+
63
+ 2. Create a virtual environment:
64
+ ```bash
65
+ python -m venv venv
66
+ source venv/bin/activate # On Windows: venv\Scripts\activate
67
+ ```
68
+
69
+ 3. Install dependencies:
70
+ ```bash
71
+ pip install -r requirements.txt
72
+ ```
73
+
74
+ 4. Install spaCy models:
75
+ ```bash
76
+ python -m spacy download en_core_web_sm
77
+ # Optional: python -m spacy download en_core_web_trf
78
+ ```
79
+
80
+ ## Configuration
81
+
82
+ ### Quick Setup
83
+
84
+ Use the setup script to create your environment file:
85
+
86
+ ```bash
87
+ python setup_env.py
88
+ ```
89
+
90
+ This will:
91
+ - Create a `.env` file from the template
92
+ - Guide you through the setup process
93
+ - Show current configuration status
94
+
95
+ ### Manual Setup
96
+
97
+ 1. Copy the environment template:
98
+ ```bash
99
+ cp .env.example .env
100
+ ```
101
+
102
+ 2. Edit the `.env` file with your API keys and settings:
103
+
104
+ ```env
105
+ # API Keys
106
+ GROQ_API_KEY=your_actual_groq_api_key_here
107
+ HUGGINGFACE_API_KEY=your_actual_huggingface_api_key_here
108
+
109
+ # Model Configuration
110
+ LLM_MODEL=llama-3.1-8b-instant
111
+ SPACY_MODEL_NAME=en_core_web_sm
112
+
113
+ # Device Settings
114
+ FORCE_CPU=false
115
+
116
+ # Processing Settings
117
+ DPI=300
118
+ Y_THRESHOLD=3.0
119
+ ```
120
+
121
+ ### Configuration Options
122
+
123
+ | Variable | Description | Default |
124
+ |----------|-------------|---------|
125
+ | `GROQ_API_KEY` | Your Groq API key | Required |
126
+ | `HUGGINGFACE_API_KEY` | Your HuggingFace API key | Required |
127
+ | `LLM_MODEL` | Groq model to use | `llama-3.1-8b-instant` |
128
+ | `SPACY_MODEL_NAME` | spaCy model for NER | `en_core_web_sm` |
129
+ | `FORCE_CPU` | Force CPU usage | `false` |
130
+ | `DPI` | PDF processing DPI | `300` |
131
+ | `Y_THRESHOLD` | Text extraction threshold | `3.0` |
132
+ | `GAP_THRESHOLD` | Table gap threshold | `10` |
133
+ | `TEMP_FILE_NAME` | Temporary file name | `temp.pdf` |
134
+
135
+ ## Usage
136
+
137
+ ### Quick Start
138
+
139
+ 1. Set up environment:
140
+ ```bash
141
+ python setup_env.py
142
+ ```
143
+
144
+ 2. Pre-load models:
145
+ ```bash
146
+ python startup.py
147
+ ```
148
+
149
+ 3. Run the application:
150
+ ```bash
151
+ streamlit run main.py
152
+ ```
153
+
154
+ ### Advanced Usage
155
+
156
+ #### Model Pre-loading (Recommended)
157
+
158
+ For optimal performance, pre-load models before running the application:
159
+
160
+ ```bash
161
+ # Pre-load all models
162
+ python startup.py
163
+
164
+ # Then run the main application
165
+ streamlit run main.py
166
+ ```
167
+
168
+ #### Direct Application Run
169
+
170
+ You can also run the application directly, which will load models on first use:
171
+
172
+ ```bash
173
+ streamlit run main.py
174
+ ```
175
+
176
+ #### Using the Service Programmatically
177
+
178
+ ```python
179
+ import asyncio
180
+ from src.services import BankStatementService
181
+
182
+ async def process_statement(file_path):
183
+ async with BankStatementService() as service:
184
+ with open(file_path, 'rb') as f:
185
+ result = await service.process_bank_statement(f)
186
+ return result
187
+
188
+ # Usage
189
+ result = asyncio.run(process_statement('path/to/statement.pdf'))
190
+ print(result.account_summary)
191
+ print(result.transaction_tables)
192
+ ```
193
+
194
+ ## Architecture Overview
195
+
196
+ ### Configuration Management
197
+ - **Environment Variables**: All settings configurable via `.env` file
198
+ - **Pydantic Settings**: Type-safe configuration with validation
199
+ - **Fallback Values**: Sensible defaults for all settings
200
+ - **API Key Management**: Secure handling of API credentials
201
+
202
+ ### Model Management
203
+ - **ModelManager**: Singleton class that pre-loads and manages all ML models
204
+ - **Pre-loading**: Models are loaded once at startup and reused across the application
205
+ - **Device Optimization**: Automatic GPU detection and utilization
206
+ - **Configurable Models**: spaCy model selection via environment variables
207
+
208
+ ### Services Layer
209
+ - **BankStatementService**: Main orchestrator that coordinates all processing steps
210
+
211
+ ### OCR Layer
212
+ - **PDFProcessor**: Handles PDF file operations and uses pre-loaded OCR models
213
+ - **TextExtractor**: Extracts text with bounding boxes from both digital and scanned PDFs
214
+
215
+ ### Extractor Layer
216
+ - **TableExtractor**: Processes transaction tables with pattern matching and data cleaning
217
+ - **AccountExtractor**: Extracts account numbers and bank names using regex and NER
218
+ - **BalanceExtractor**: Extracts balance information using keyword matching
219
+
220
+ ### Utils Layer
221
+ - **GroqClient**: Async client for Groq LLM API
222
+ - **HuggingFaceClient**: Async client for HuggingFace Inference API
223
+ - **ModelManager**: Centralized model management and pre-loading
224
+
225
+ ### Models Layer
226
+ - **BankStatementData**: Main data model for processed results
227
+ - **AccountSummary**: Model for account information
228
+ - **AccountDetails**: Model for individual account details
229
+
230
+ ## Key Features
231
+
232
+ ### Environment Configuration
233
+ All settings are configurable via environment variables:
234
+
235
+ ```python
236
+ from src.config.config import settings
237
+
238
+ print(f"Using model: {settings.llm_model}")
239
+ print(f"Device: {'CPU' if settings.force_cpu else 'Auto'}")
240
+ ```
241
+
242
+ ### Model Pre-loading
243
+ Models are loaded once at startup and reused throughout the application:
244
+
245
+ ```python
246
+ from src.utils import model_manager
247
+
248
+ # Check model status
249
+ status = model_manager.get_model_status()
250
+ print(f"Models loaded: {status['models_loaded']}")
251
+ ```
252
+
253
+ ### Async Processing
254
+ All operations are asynchronous, allowing for better performance and resource utilization:
255
+
256
+ ```python
257
+ async with BankStatementService() as service:
258
+ result = await service.process_bank_statement(uploaded_file)
259
+ ```
260
+
261
+ ### Class-Based Design
262
+ Each component is a class with async context manager support:
263
+
264
+ ```python
265
+ class MyService:
266
+ async def __aenter__(self):
267
+ return self
268
+
269
+ async def __aexit__(self, exc_type, exc_value, traceback):
270
+ pass
271
+ ```
272
+
273
+ ### Error Handling
274
+ Comprehensive error handling throughout the pipeline with graceful fallbacks.
275
+
276
+ ## Performance Optimization
277
+
278
+ ### Model Pre-loading Benefits
279
+ - **Faster Processing**: Models are loaded once at startup, not on each file upload
280
+ - **Memory Efficiency**: Single instance of each model shared across the application
281
+ - **GPU Optimization**: Automatic GPU detection and utilization
282
+ - **Reduced Latency**: No model loading delays during file processing
283
+
284
+ ### Configuration Benefits
285
+ - **Flexible Settings**: Easy to adjust parameters without code changes
286
+ - **Environment-Specific**: Different settings for development/production
287
+ - **Secure**: API keys kept separate from code
288
+ - **Version Control Safe**: `.env` files can be excluded from git
289
+
290
+ ### Startup Process
291
+ 1. **Configuration Loading**: Loads settings from `.env` file
292
+ 2. **Model Detection**: Automatically detects available models (spaCy, doctr)
293
+ 3. **Device Selection**: Chooses optimal device (GPU/CPU) based on config
294
+ 4. **Pre-loading**: Loads all models into memory
295
+ 5. **Status Reporting**: Provides detailed loading status
296
+
297
+ ## Testing
298
+
299
+ Run the structure test to verify everything works:
300
+
301
+ ```bash
302
+ python test_structure.py
303
+ ```
304
+
305
+ This will test:
306
+ - All module imports
307
+ - Model manager functionality
308
+ - Service initialization
309
+ - Configuration access
310
+
311
+ ## Troubleshooting
312
+
313
+ ### Common Issues
314
+
315
+ 1. **API Keys Not Set**
316
+ ```bash
317
+ python setup_env.py
318
+ # Edit .env file with your actual API keys
319
+ ```
320
+
321
+ 2. **spaCy Model Not Found**
322
+ ```bash
323
+ python -m spacy download en_core_web_sm
324
+ ```
325
+
326
+ 3. **GPU Not Detected**
327
+ - Set `FORCE_CPU=true` in `.env` file
328
+ - Or install CUDA-compatible PyTorch
329
+
330
+ 4. **Configuration Issues**
331
+ ```bash
332
+ python setup_env.py
333
+ # Check current configuration
334
+ ```
335
+
336
+ ## Dependencies
337
+
338
+ - **Streamlit**: Web interface
339
+ - **PyMuPDF**: PDF processing
340
+ - **python-doctr (docTR)**: OCR for scanned PDFs
341
+ - **spaCy**: Natural language processing
342
+ - **torch**: Deep learning framework
343
+ - **pandas**: Data manipulation
344
+ - **openai**: Groq API client
345
+ - **huggingface-hub**: HuggingFace API client
346
+ - **pydantic**: Data validation
347
+ - **fuzzywuzzy**: Fuzzy string matching
348
+ - **python-dotenv**: Environment variable loading
349
+
350
+ ## Contributing
351
+
352
+ 1. Fork the repository
353
+ 2. Create a feature branch
354
+ 3. Make your changes
355
+ 4. Add tests if applicable
356
+ 5. Submit a pull request
357
+
358
+ ## License
359
+
360
+ This project is licensed under the MIT License.
build-docker.sh ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#
# Docker build script for the Bank Statement Analyzer.
#
# Tries each known Dockerfile in preference order (main -> alternative ->
# fallback) and stops at the first successful build. Exits non-zero when
# required project files are missing or every available Dockerfile fails.

IMAGE_NAME="bank-statement-analyzer"

echo "🐳 Building Bank Statement Analyzer Docker Image"
echo "=================================================="

# Must be run from the project root (where the Dockerfile lives).
if [ ! -f "Dockerfile" ]; then
    echo "❌ Error: Dockerfile not found in current directory"
    echo "   Please run this script from the project root directory"
    exit 1
fi

# Surface an oversized build context before invoking docker at all.
echo "πŸ” Checking build context size..."
python3 check-build-context.py

echo ""
echo "πŸ“¦ Building Docker image..."

# The Poetry-based Dockerfiles need a valid pyproject.toml.
if [ ! -f "pyproject.toml" ]; then
    echo "❌ Error: pyproject.toml not found!"
    echo "   Please ensure you have a valid pyproject.toml file"
    exit 1
fi

# print_success LABEL -- usage hints shown after a successful build.
print_success() {
    echo ""
    echo "βœ… Docker image built successfully${1}!"
    echo ""
    echo "πŸš€ To run the application:"
    echo "   docker run -p 8501:8501 --env-file .env ${IMAGE_NAME}"
    echo ""
    echo "   Or use docker-compose:"
    echo "   docker-compose up"
}

# Candidate Dockerfiles, in preference order. Missing candidates are
# skipped (instead of aborting) so the fallback still gets a chance.
for dockerfile in Dockerfile Dockerfile.alternative Dockerfile.fallback; do
    if [ ! -f "${dockerfile}" ]; then
        continue
    fi

    echo "πŸ”„ Attempting build with ${dockerfile}..."
    if docker build -f "${dockerfile}" -t "${IMAGE_NAME}" .; then
        if [ "${dockerfile}" = "Dockerfile" ]; then
            print_success ""
        elif [ "${dockerfile}" = "Dockerfile.alternative" ]; then
            print_success " with alternative method"
        else
            print_success " with fallback method"
        fi
        exit 0
    fi

    echo ""
    echo "⚠️  Build with ${dockerfile} failed. Trying next method..."
done

echo ""
echo "❌ All Dockerfile methods failed!"
echo ""
echo "πŸ’‘ Troubleshooting tips:"
echo "   - Check if Poetry is properly configured"
echo "   - Ensure pyproject.toml and poetry.lock are valid"
echo "   - Try running 'poetry install' locally first"
echo "   - Check if large files are being included in build context"
echo "   - Ensure .dockerignore is properly configured"
echo "   - Try running 'python3 check-build-context.py' to identify issues"
echo ""
echo "πŸ”§ Manual troubleshooting:"
echo "   docker build -t ${IMAGE_NAME} . 2>&1 | tee build.log"
exit 1
docker-compose.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Compose definition for the Bank Statement Analyzer Streamlit app.
# NOTE(review): the top-level `version` key is obsolete under Compose V2
# and ignored with a warning by recent releases — confirm before removing.
version: '3.8'

services:
  bank-statement-analyzer:
    build: .
    ports:
      - "8501:8501"  # Streamlit UI
    environment:
      - PYTHONUNBUFFERED=1  # flush logs immediately
      - STREAMLIT_SERVER_PORT=8501
      - STREAMLIT_SERVER_ADDRESS=0.0.0.0  # listen on all interfaces inside the container
      - STREAMLIT_SERVER_HEADLESS=true  # no browser auto-open
      - STREAMLIT_SERVER_ENABLE_CORS=false
      - STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false
      - POETRY_VENV_IN_PROJECT=1
      - POETRY_NO_INTERACTION=1
    volumes:
      # presumably scratch space for the app's temp.pdf output — confirm
      - ./temp:/app/temp
      # API keys mounted read-only so the image stays credential-free
      - ./.env:/app/.env:ro
    restart: unless-stopped
    healthcheck:
      # Streamlit's built-in health endpoint
      test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s  # allow time for model pre-loading on first boot
main.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import streamlit as st
3
+ import pandas as pd
4
+ from src.services import BankStatementService
5
+ from src.utils import model_manager
6
+
7
+
8
async def preload_models():
    """Pre-load all ML models once at application startup.

    Loading is idempotent: ``ensure_models_loaded`` returns immediately on
    subsequent reruns of the Streamlit script.

    Returns:
        dict: the model manager's status report (``models_loaded``, device,
        etc.) so callers may inspect it; the current UI ignores it.
    """
    await model_manager.ensure_models_loaded()
    # Previously this status was fetched and discarded inside a no-op
    # if/else; return it instead so loading problems are observable.
    return model_manager.get_model_status()
25
+
26
+
27
async def main():
    """Streamlit entry point: upload a bank statement PDF and render results."""
    st.set_page_config(page_title="Bank Statement Analyzer", layout="wide")
    st.title("πŸ“„ Bank Statement Analyzer")

    # Load models up front so the first upload is not slowed down.
    await preload_models()

    uploaded_file = st.file_uploader("Upload Bank Statement PDF", type=["pdf"])

    if not uploaded_file:
        st.warning("πŸ“€ Please upload a PDF file to begin.")
        return

    st.info("πŸ“₯ Processing uploaded file...")

    with st.spinner("Extracting data..."):
        async with BankStatementService() as service:
            result = await service.process_bank_statement(uploaded_file)

    if not result:
        st.error("⚠️ Unable to parse the statement correctly.")
        return

    # Account summary rendered as a two-column table.
    summary_frame = pd.DataFrame(result.account_summary.items(), columns=["Account Summary", "Data"])
    st.dataframe(summary_frame, use_container_width=True, hide_index=True)

    # One table per extracted transaction section, skipping empty ones.
    st.subheader("πŸ“Š Extracted Tables")
    for table_name, table_df in result.transaction_tables.items():
        if table_df.empty:
            continue
        st.markdown(f"### {table_name.capitalize()} Table")
        st.dataframe(table_df, use_container_width=True, hide_index=True)
60
+
61
+
62
if __name__ == "__main__":
    # Drive the async entry point to completion on a fresh event loop
    # (asyncio.run creates and closes one per script pass).
    asyncio.run(main())
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "bank-scrubber"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["Your Name <you@example.com>"]
6
+ readme = "README.md"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = ">=3.12,<3.14"
10
+ pypdf2 = "^3.0.1"
11
+ pymupdf = "^1.26.1"
12
+ pdf2image = "^1.17.0"
13
+ python-doctr = "^0.12.0"
14
+ numpy = "^2.3.1"
15
+ pandas = "^2.3.0"
16
+ streamlit = "^1.46.1"
17
+ openai = "^1.93.0"
18
+ fuzzywuzzy = "^0.18.0"
19
+ huggingface-hub = "^0.33.1"
20
+ pydantic = "^2.11.7"
21
+ python-dateutil = "^2.9.0.post0"
22
+ python-dotenv = "^1.1.1"
23
+ python-levenshtein = "^0.27.1"
24
+ pydantic-settings = "^2.10.1"
25
+ # NOTE(review): PyPI "doctr" (1.x) is an unrelated docs-deployment tool; the OCR
+ # library is "python-doctr", already listed above β€” confirm this pin or remove it.
+ doctr = "^1.9.0"
26
+ spacy = "^3.8.7"
27
+
28
+
29
+ [build-system]
30
+ requires = ["poetry-core"]
31
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.28.0
2
+ pandas>=2.0.0
3
+ numpy>=1.24.0
4
+ PyMuPDF>=1.23.0
5
+ PyPDF2>=3.0.0
6
+ python-doctr>=0.12.0  # docTR OCR library; bare "doctr" on PyPI is an unrelated package with no 2.4.0 release
7
+ pdf2image>=1.16.0
8
+ spacy>=3.7.0
9
+ torch>=2.0.0
10
+ fuzzywuzzy>=0.18.0
11
+ python-Levenshtein>=0.21.0
12
+ openai>=1.0.0
13
+ huggingface-hub>=0.19.0
14
+ pydantic>=2.0.0
15
+ pydantic-settings>=2.0.0
16
+ python-dateutil>=2.8.0
17
+ python-dotenv>=1.0.0
src/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .services import BankStatementService
2
+ from .models import BankStatementData, AccountSummary, AccountDetails
3
+
4
+ __all__ = ["BankStatementService", "BankStatementData", "AccountSummary", "AccountDetails"]
src/config/config.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ # from pydantic import BaseSettings
3
+ from pydantic_settings import BaseSettings
4
+ import os
5
+ from dotenv import load_dotenv
6
+
7
+ # Load environment variables from .env file
8
+ load_dotenv()
9
+
10
+
11
class Settings(BaseSettings):
    """Application configuration, sourced from the environment / ``.env`` file.

    pydantic-settings resolves each field from the environment variable of the
    same (case-insensitive) name, falling back to the default declared here, so
    explicit ``os.getenv()`` calls are unnecessary. The previous implementation
    read the environment at class-definition time, which froze values at import
    and let required ``str`` fields silently default to ``None``.
    """

    # API credentials — Optional so importing the app without keys does not
    # produce a None-typed-as-str field; clients must check for None before use.
    groq_api_key: Optional[str] = None
    groq_base_url: str = "https://api.groq.com/openai/v1"

    huggingface_api_key: Optional[str] = None
    huggingface_provider: str = "novita"

    # LLM model configuration
    llm_model: str = "llama-3.1-8b-instant"

    # OCR and processing thresholds (pydantic coerces env strings to the
    # annotated numeric types)
    y_threshold: float = 3.0
    gap_threshold: int = 10
    gap_threshold_ratio: float = 0.1

    # File processing settings
    temp_file_name: str = "temp.pdf"
    dpi: int = 300

    # spaCy pipeline to load
    spacy_model_name: str = "en_core_web_sm"

    # Force CPU inference even when a GPU is available (env FORCE_CPU=true)
    force_cpu: bool = False

    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"
        case_sensitive = False
43
+
44
+
45
+ # Global settings instance
46
+ settings = Settings()
src/extractor/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .table_extractor import TableExtractor
2
+ from .account_extractor import AccountExtractor
3
+ from .balance_extractor import BalanceExtractor
4
+
5
+ __all__ = ["TableExtractor", "AccountExtractor", "BalanceExtractor"]
src/extractor/account_extractor.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import re
3
+ import math
4
+ from typing import List, Dict, Any, Optional
5
+ from fuzzywuzzy import fuzz, process
6
+ import spacy
7
+ from src.models.account_models import LineData
8
+ from src.utils import model_manager
9
+
10
+
11
class AccountExtractor:
    """Async extractor for account numbers and bank names.

    spaCy NER is obtained from the shared ``model_manager``; regex and fuzzy
    matching handle the rest. CPU-bound work runs on the default executor so
    the event loop stays responsive.
    """

    def __init__(self):
        # Trigger spaCy loading via the centralized model manager so the
        # first extraction call does not pay the load cost.
        self._ensure_models_loaded()

    def _ensure_models_loaded(self):
        """Ensure the spaCy model is loaded via the model manager."""
        if not model_manager.models_loaded:
            print("πŸ”„ Models not loaded, initializing model manager...")
            # Accessing the property triggers loading if not already done.
            _ = model_manager.spacy_model

    @property
    def nlp(self):
        """The spaCy pipeline owned by the model manager (may be None)."""
        return model_manager.spacy_model

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        pass

    def euclidean_distance(self, b1: List[float], b2: List[float]) -> float:
        """Euclidean distance between the centers of two [x0, y0, x1, y1] boxes."""
        x1 = (b1[0] + b1[2]) / 2
        y1 = (b1[1] + b1[3]) / 2
        x2 = (b2[0] + b2[2]) / 2
        y2 = (b2[1] + b2[3]) / 2
        return math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

    def combine_bboxes(self, bboxes: List[List[float]]) -> List[float]:
        """Merge multiple bounding boxes into one box covering them all."""
        x_min = min(b[0] for b in bboxes)
        y_min = min(b[1] for b in bboxes)
        x_max = max(b[2] for b in bboxes)
        y_max = max(b[3] for b in bboxes)
        return [x_min, y_min, x_max, y_max]

    async def extract_account_number_regex_distance(self, lines: List[Dict]) -> Optional[Dict]:
        """Find an account number near the word 'account' in OCR line data.

        Each line mentioning 'account' is squashed (space/dash/underscore/comma
        separators removed) and scanned for a run of 6+ digits or X characters.
        The contributing words' boxes are merged and the distance from the
        'account' label is reported so callers can rank candidates.

        Returns:
            ``{"account_number", "bbox", "distance"}`` for the first hit,
            or None when no line yields a reconstructable match.
        """
        def _extract_account():
            for line in lines:
                words = line.get("words", [])

                # Squash the line so a number split across words still matches
                # one regex run; words containing '/' are kept space-separated.
                cleaned_line = ""
                for w in words:
                    if "/" not in w["word"]:
                        cleaned_line += re.sub(r"[\s\-\_\,\/]", "", w["word"])
                    else:
                        cleaned_line += " " + w["word"]

                # Only consider lines that actually mention 'account'.
                account_word = next((w for w in words if "account" in w["word"].lower()), None)
                if account_word is None:
                    continue

                # Scan only the text from the 'account' mention onward.
                cleaned_line = cleaned_line[cleaned_line.lower().find("account"):].strip()

                match = re.search(r"[0-9Xx]{6,}", cleaned_line)
                if not match:
                    continue

                matched_text = match.group(0)

                # Reconstruct which raw words make up the matched number so
                # their bounding boxes can be merged.
                joined_account = ""
                matched_bboxes = []
                for w in words:
                    clean_w = re.sub(r"[\s\-\_\,\/]", "", w["word"])
                    if not clean_w:
                        continue
                    if matched_text.startswith(joined_account + clean_w):
                        joined_account += clean_w
                        matched_bboxes.append(w["bbox"])
                        if joined_account == matched_text:
                            break

                if joined_account != matched_text or not matched_bboxes:
                    continue  # could not map the regex hit back onto words

                combined_bbox = self.combine_bboxes(matched_bboxes)
                distance = self.euclidean_distance(account_word["bbox"], combined_bbox)

                return {
                    "account_number": matched_text,
                    "bbox": combined_bbox,
                    "distance": distance
                }

            return None

        # get_running_loop() replaces the deprecated get_event_loop() call.
        return await asyncio.get_running_loop().run_in_executor(None, _extract_account)

    def match_keyword_bbox(self, keyword: str, words: List[Dict]) -> Optional[List[float]]:
        """Locate *keyword* as a consecutive word sequence; return its merged bbox."""
        keyword_tokens = keyword.lower().split()
        text_tokens = [w["word"].lower() for w in words]

        for i in range(len(text_tokens) - len(keyword_tokens) + 1):
            if text_tokens[i:i + len(keyword_tokens)] == keyword_tokens:
                matched_bboxes = [words[i + j]["bbox"] for j in range(len(keyword_tokens))]
                return self.combine_bboxes(matched_bboxes)
        return None

    async def extract_bank_name(self, text: str) -> str:
        """Extract a bank name from *text* via spaCy ORG entities.

        Returns the first ORG entity, or "Not Found" when the model is
        unavailable or no ORG entity is recognized.
        """
        def _extract_bank():
            if not self.nlp:
                return "Not Found"

            doc = self.nlp(text)
            # Debug prints of every entity were removed here.
            candidates = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
            return candidates[0] if candidates else "Not Found"

        return await asyncio.get_running_loop().run_in_executor(None, _extract_bank)

    async def extract_bank_name_using_fuzzy(self, text: str) -> str:
        """Match *text* against a known bank list with fuzzy partial-ratio scoring."""
        def _extract_fuzzy():
            bank_names = [
                "Bank Of America", "South State Bank", "Midstates Bank",
                "Synovus", "Shore United Bank", "Frost",
                "Bethpage Federal Credit Union"
            ]
            best_match = process.extractOne(text, bank_names, scorer=fuzz.partial_ratio)
            return best_match[0] if best_match else "Unknown"

        return await asyncio.get_running_loop().run_in_executor(None, _extract_fuzzy)
src/extractor/balance_extractor.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import re
3
+ from typing import List, Dict, Any, Optional, Tuple
4
+ from src.extractor.account_extractor import AccountExtractor
5
+
6
+
7
class BalanceExtractor:
    """Async extractor for beginning and ending statement balances."""

    # Labels that precede the opening balance on a statement.
    PREVIOUS_KEYWORDS = [
        "previous balance", "starting balance", "beginning balance",
        "balance last statement", "balance previous statement", "last statement",
        "beginning statement", "previous statement", "starting"
    ]

    # Labels that precede the closing balance on a statement.
    ENDING_KEYWORDS = [
        "ending balance", "current balance", "balance this statement",
        "balance ending statement", "this statement", "ending statement", "ending"
    ]

    def __init__(self):
        # Signed amounts with a mandatory decimal part; allows both Western
        # (1,234.56) and Indian (1,23,456.78) digit grouping, trailing '-'.
        self.amount_pattern = re.compile(r'-?(?:\d{1,3}(?:,\d{2}){1,}(?:,\d{3})?|\d{1,3}(?:,\d{3})+|\d+)?\.\d{1,2}-?')
        self.account_extractor = AccountExtractor()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        pass

    def _amount_near_keyword(self, keyword: str, line_obj: Dict, following_lines: List[Dict]) -> Optional[str]:
        """Find the amount labelled by *keyword* on this line or just below it.

        First looks for an amount after the keyword on the same line; for
        columnar layouts, falls back to words on *following_lines* that are
        horizontally aligned with the keyword's bounding box.

        Returns the amount with thousands separators stripped, or None.
        """
        line = line_obj['line']
        line_lower = line.lower()
        if keyword not in line_lower:
            return None

        # Same-line match: amount directly after the keyword text.
        after_keyword = line[line_lower.find(keyword) + len(keyword):]
        match = self.amount_pattern.search(after_keyword)
        if match:
            return match.group().replace(",", "")

        # Column-style layout: the amount sits on one of the next lines,
        # aligned under the keyword's bounding box (small x tolerance).
        keyword_bbox = self.account_extractor.match_keyword_bbox(keyword, line_obj["words"])
        if not keyword_bbox:
            return None
        x_min, _, x_max, _ = keyword_bbox
        for next_line in following_lines:
            final_amt = ""
            for w in next_line.get("words", []):
                wx_min, _, wx_max, _ = w["bbox"]
                if wx_min >= x_min - 0.1 and wx_max <= x_max + 0.1:
                    final_amt += w["word"]
            match = self.amount_pattern.search(final_amt)
            if match:
                return match.group().replace(",", "")
        return None

    def _find_balance(self, keywords: List[str], line_obj: Dict, following_lines: List[Dict]) -> Optional[str]:
        """Return the first amount found for any keyword in *keywords*, or None."""
        for keyword in keywords:
            amount = self._amount_near_keyword(keyword, line_obj, following_lines)
            if amount:
                return amount
        return None

    async def extract_balances(self, object_line: List[Dict]) -> Tuple[Optional[str], Optional[str]]:
        """Extract ``(beginning_balance, ending_balance)`` from OCR line data.

        Scans each line for balance keywords; amounts come from the same line
        or, for columnar layouts, from the next two lines aligned under the
        keyword. Either element is None when never found. The previous
        implementation duplicated this whole search for each balance type;
        both now share ``_find_balance``.
        """
        def _extract_balances():
            beginning_balance = None
            ending_balance = None

            for idx, line_obj in enumerate(object_line):
                # Up to two lines below the current one, for columnar layouts.
                following = object_line[idx + 1:idx + 3]

                if not beginning_balance:
                    beginning_balance = self._find_balance(self.PREVIOUS_KEYWORDS, line_obj, following)

                if not ending_balance:
                    ending_balance = self._find_balance(self.ENDING_KEYWORDS, line_obj, following)

                if beginning_balance and ending_balance:
                    break  # nothing left to find

            return beginning_balance, ending_balance

        # get_running_loop() replaces the deprecated get_event_loop() call.
        return await asyncio.get_running_loop().run_in_executor(None, _extract_balances)
src/extractor/table_extractor.py ADDED
@@ -0,0 +1,760 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import re
3
+ import pandas as pd
4
+ from typing import List, Dict, Any, Optional, Tuple
5
+ from src.config.config import settings
6
+
7
+
8
+ class TableExtractor:
9
+ """Async table extractor for processing transaction tables."""
10
+
11
    def __init__(self):
        # Common statement date formats: d/m/y, y/m/d, m/y, y/m and bare d/m
        # pairs, with '-' or '/' separators, bounded by word boundaries.
        self.date_pattern = re.compile(
            r"\b(?:"
            r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"
            r"|\d{2,4}[-/]\d{1,2}[-/]\d{1,2}"
            r"|\d{1,2}[-/]\d{2,4}"
            r"|\d{2,4}[-/]\d{1,2}"
            r"|\d{1,2}[-/]\d{1,2}"
            r")\b"
        )
        # Optionally signed amounts with a mandatory decimal part; allows
        # Western (1,234.56) and Indian (1,23,456.78) digit grouping, and a
        # trailing '-' used by some statements to mark debits.
        self.amount_pattern = re.compile(r'-?(?:\d{1,3}(?:,\d{2}){1,}(?:,\d{3})?|\d{1,3}(?:,\d{3})+|\d+)?\.\d{1,2}-?')
22
+
23
    async def __aenter__(self):
        """Enter the async context; no resources to acquire."""
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Exit the async context; nothing to release."""
        pass
28
+
29
+ def match_by_pattern(self, text: str, pattern) -> bool:
30
+ """Check if text matches a pattern."""
31
+ if pattern == self.amount_pattern and "-" not in text and len(text) > 6 and "," not in text:
32
+ return False
33
+ if pattern == self.amount_pattern and "-" in text and len(text) > 7 and "," not in text:
34
+ return False
35
+ return bool(pattern.fullmatch(text))
36
+
37
    def extract_by_pattern(self, text: str, pattern) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Search *text* for *pattern* and split around the first match.

        Returns ``(value, before, after)`` with the surrounding text stripped,
        or ``(None, None, None)`` when there is no match or the matched amount
        is rejected by the same over-long/no-separator heuristic used in
        ``match_by_pattern``.
        """
        match = pattern.search(text)
        if match:
            before = text[:match.start()].strip()
            value = match.group()
            after = text[match.end():].strip()
            # Heuristic: a long digit run without thousands separators is
            # probably not a monetary amount (e.g. a reference number).
            if pattern == self.amount_pattern and "-" not in value and len(value) > 6 and "," not in value:
                return None, None, None
            if pattern == self.amount_pattern and "-" in value and len(value) > 7 and "," not in value:
                return None, None, None
            return value, before, after
        return None, None, None
50
+
51
+ def repair_row_with_date_and_amount(self, header: List[str], row: List[str]) -> List[str]:
52
+ """Repair row data by extracting dates and amounts."""
53
+ result = row[:]
54
+ n = len(header)
55
+
56
+ for i, col in enumerate(header):
57
+ val = result[i].strip()
58
+
59
+ if col.lower() == "date":
60
+ date, left, right = self.extract_by_pattern(val, self.date_pattern)
61
+ if date:
62
+ result[i] = date
63
+ if left and i > 0 and header[i-1] != "date":
64
+ result[i-1] = (result[i-1] + " " + left).strip()
65
+ if right and i < n - 1 and header[i+1] != "date":
66
+ result[i+1] = (right + " " + result[i+1]).strip()
67
+ continue
68
+
69
+ # Check previous column's last word
70
+ if i > 0 and header[i-1] != "date":
71
+ left_val = result[i-1].strip()
72
+ tokens = left_val.split()
73
+ if tokens:
74
+ last_word = tokens[-1]
75
+ date_check, _, _ = self.extract_by_pattern(last_word, self.date_pattern)
76
+ if date_check:
77
+ result[i] = date_check + " " + result[i]
78
+ tokens.pop() # remove matched date
79
+ result[i-1] = " ".join(tokens)
80
+ again_date, again_left, again_right = self.extract_by_pattern(result[i], self.date_pattern)
81
+ if again_date:
82
+ result[i] = again_date
83
+ if again_left:
84
+ result[i-1] = (result[i-1] + " " + again_left).strip()
85
+ if again_right:
86
+ result[i+1] = (again_right + " " + result[i+1]).strip()
87
+ continue
88
+
89
+ # Check next column's first word
90
+ if i < n - 1 and header[i+1] != "date":
91
+ right_val = result[i+1].strip()
92
+ tokens = right_val.split()
93
+ if tokens:
94
+ first_word = tokens[0]
95
+ date_check, _, _ = self.extract_by_pattern(first_word, self.date_pattern)
96
+ if date_check:
97
+ result[i] = result[i] + " " + date_check
98
+ tokens.pop(0)
99
+ result[i+1] = " ".join(tokens)
100
+ again_date, again_left, again_right = self.extract_by_pattern(result[i], self.date_pattern)
101
+ if again_date:
102
+ result[i] = again_date
103
+ if again_left:
104
+ result[i-1] = (result[i-1] + " " + again_left).strip()
105
+ if again_right:
106
+ result[i+1] = (again_right + " " + result[i+1]).strip()
107
+ continue
108
+
109
+ # Check if the entire value is a date
110
+ if not self.match_by_pattern(result[i].strip(), self.date_pattern):
111
+ result[i] = ""
112
+ # check left
113
+ if i > 0 and header[i-1] != "date":
114
+ result[i-1] = (result[i-1] + " " + val).strip()
115
+ elif i < n - 1 and header[i+1] != "date":
116
+ result[i+1] = (val + " " + result[i+1]).strip()
117
+
118
+ elif col.lower() in ["amount", "balance", "credits", "debits"]:
119
+ amt, left, right = self.extract_by_pattern(val, self.amount_pattern)
120
+ if amt:
121
+ result[i] = amt
122
+ if left and i > 0:
123
+ result[i-1] = (result[i-1] + " " + left).strip()
124
+ if right and i < n - 1:
125
+ result[i+1] = (right + " " + result[i+1]).strip()
126
+ continue
127
+
128
+ # Check previous column's last word
129
+ if i > 0 and (header[i-1] not in ["amount", "balance", "credits", "debits"]):
130
+ left_val = result[i-1].strip()
131
+ tokens = left_val.split()
132
+ if tokens:
133
+ last_word = tokens[-1]
134
+ amt_check, _, _ = self.extract_by_pattern(last_word, self.amount_pattern)
135
+ if amt_check:
136
+ result[i] = amt_check + " " + result[i]
137
+ tokens.pop()
138
+ result[i-1] = " ".join(tokens)
139
+ again_amt, again_left, again_right = self.extract_by_pattern(result[i], self.amount_pattern)
140
+ if again_amt:
141
+ result[i] = again_amt
142
+ if again_left:
143
+ result[i-1] = (result[i-1] + " " + again_left).strip()
144
+ if again_right:
145
+ result[i+1] = (again_right + " " + result[i+1]).strip()
146
+ continue
147
+
148
+ # Check next column's first word
149
+ if i < n - 1 and (header[i+1] not in ["amount", "balance", "credits", "debits"]):
150
+ right_val = result[i+1].strip()
151
+ tokens = right_val.split()
152
+ if tokens:
153
+ first_word = tokens[0]
154
+ amt_check, _, _ = self.extract_by_pattern(first_word, self.amount_pattern)
155
+ if amt_check:
156
+ result[i] = result[i] + " " + amt_check
157
+ tokens.pop(0)
158
+ result[i+1] = " ".join(tokens)
159
+ again_amt, again_left, again_right = self.extract_by_pattern(result[i], self.amount_pattern)
160
+ if again_amt:
161
+ result[i] = again_amt
162
+ if again_left:
163
+ result[i-1] = (result[i-1] + " " + again_left).strip()
164
+ if again_right:
165
+ result[i+1] = (again_right + " " + result[i+1]).strip()
166
+ continue
167
+
168
+ # Check if the entire value is an amount
169
+ if not self.match_by_pattern(result[i].strip(), self.amount_pattern):
170
+ result[i] = ""
171
+ # check left
172
+ if i > 0 and (header[i-1] not in ["amount", "balance", "credits", "debits"]):
173
+ result[i-1] = (result[i-1] + " " + val).strip()
174
+ elif i < n - 1 and (header[i+1] not in ["amount", "balance", "credits", "debits"]):
175
+ result[i+1] = (val + " " + result[i+1]).strip()
176
+
177
+ return result
178
+
179
+ def extract_amount_or_return(self, line: str) -> str:
180
+ """Extract amount from line or return original line."""
181
+ matches = self.amount_pattern.findall(line)
182
+ if matches:
183
+ match = self.amount_pattern.search(line)
184
+ return match.group(0) if match else line
185
+ return line
186
+
187
+ def extract_date_or_return(self, line: str) -> str:
188
+ """Extract date from line or return original line."""
189
+ matches = self.date_pattern.findall(line)
190
+ if matches:
191
+ match = self.date_pattern.search(line)
192
+ return match.group(0) if match else line
193
+ return line
194
+
195
+ def is_date_word(self, word: str) -> bool:
196
+ """Check if word is a date."""
197
+ try:
198
+ return bool(self.date_pattern.fullmatch(word))
199
+ except ValueError:
200
+ return False
201
+
202
+ def detect_headers(self, line_data: Dict, gap_threshold_ratio: float = 0.1) -> List[str]:
203
+ """Detect headers from line data."""
204
+ if "description" not in line_data["line"]:
205
+ gap_threshold_ratio = 0.2
206
+ if "." in line_data["line"]:
207
+ gap_threshold_ratio = 0.1
208
+
209
+ word_data = sorted(line_data["words"], key=lambda w: w["bbox"][0])
210
+ line = line_data["line"]
211
+
212
+ if len(word_data) < 2:
213
+ return [line.strip()] # Treat whole line as one header if only 1 word
214
+
215
+ # Compute horizontal gaps between words
216
+ gaps = []
217
+ for i in range(len(word_data) - 1):
218
+ x1 = word_data[i]["bbox"][2] # end x of current word
219
+ x2 = word_data[i + 1]["bbox"][0] # start x of next word
220
+ gaps.append(x2 - x1)
221
+
222
+ avg_gap = sum(gaps) / len(gaps)
223
+ threshold = avg_gap * gap_threshold_ratio
224
+
225
+ # Split words into groups based on large gaps (assumed column breaks)
226
+ headers = []
227
+ current_header = [word_data[0]["word"]]
228
+ for i in range(1, len(word_data)):
229
+ gap = gaps[i - 1]
230
+ if gap > threshold:
231
+ headers.append(" ".join(current_header))
232
+ current_header = []
233
+ current_header.append(word_data[i]["word"])
234
+
235
+ if current_header:
236
+ headers.append(" ".join(current_header))
237
+
238
+ # Process special cases
239
+ for i in range(len(headers)):
240
+ if "date" in headers[i].lower() and "description" in headers[i].lower():
241
+ header_checker = headers[i].split(" ")
242
+ date_index = header_checker.index("date")
243
+ description_index = header_checker.index("description")
244
+ if date_index < description_index:
245
+ headers[i] = "date"
246
+ headers.insert(i + 1, "description")
247
+ else:
248
+ headers[i] = "description"
249
+ headers.insert(i + 1, "date")
250
+
251
+ # Handle check/draft numbers
252
+ if "check" in headers or "draft" in headers:
253
+ resulted_headers = []
254
+ i = 0
255
+
256
+ while i < len(headers):
257
+ if (
258
+ i + 1 < len(headers)
259
+ and headers[i] == "check"
260
+ and (headers[i + 1] == "no" or headers[i + 1] == "number")
261
+ ):
262
+ resulted_headers.append(headers[i] + " " + headers[i + 1])
263
+ i += 2
264
+ elif (
265
+ i + 1 < len(headers)
266
+ and headers[i] == "draft"
267
+ and (headers[i + 1] == "no" or headers[i + 1] == "number")
268
+ ):
269
+ resulted_headers.append(headers[i] + " " + headers[i + 1])
270
+ i += 2
271
+ else:
272
+ resulted_headers.append(headers[i])
273
+ i += 1
274
+
275
+ resulted_headers = list(map(lambda x: re.sub(r'[^\w\s]', '', x).strip(), resulted_headers))
276
+
277
+ # Normalize header names
278
+ for i in range(len(resulted_headers)):
279
+ if any(keyword in resulted_headers[i].lower() for keyword in ["date", "day", "month", "year"]):
280
+ resulted_headers[i] = "date"
281
+ if any(keyword in resulted_headers[i].lower() for keyword in ["amount", "total", "sum", "price", "value", "cost", "amt"]):
282
+ resulted_headers[i] = "amount"
283
+ if any(keyword in resulted_headers[i].lower() for keyword in ["balance", "final", "closing", "current", "available", "running", "remaining", "left", "bal", "remain"]):
284
+ resulted_headers[i] = "balance"
285
+ if any(keyword in resulted_headers[i].lower() for keyword in ["credit", "deposit", "cr"]):
286
+ resulted_headers[i] = "credits"
287
+ if any(keyword in resulted_headers[i].lower() for keyword in ["debit", "withdrawal", "dr"]):
288
+ resulted_headers[i] = "debits"
289
+
290
+ return resulted_headers
291
+
292
+ # Normalize header names
293
+ headers = list(map(lambda x: re.sub(r'[^\w\s]', '', x).strip(), headers))
294
+ for i in range(len(headers)):
295
+ if any(keyword in headers[i].lower() for keyword in ["date", "day", "month", "year"]):
296
+ headers[i] = "date"
297
+ if any(keyword in headers[i].lower() for keyword in ["amount", "total", "sum", "price", "value", "cost", "amt"]):
298
+ headers[i] = "amount"
299
+ if any(keyword in headers[i].lower() for keyword in ["balance", "final", "closing", "current", "available", "running", "remaining", "left", "bal", "remain"]):
300
+ headers[i] = "balance"
301
+ if any(keyword in headers[i].lower() for keyword in ["credit", "deposit"]):
302
+ headers[i] = "credits"
303
+ if any(keyword in headers[i].lower() for keyword in ["debit", "withdrawal"]):
304
+ headers[i] = "debits"
305
+
306
+ return headers
307
+
308
    def detect_row_data(self, headers: List[str], header_data: List[Dict], row_data: List[Dict], gap_threshold: int = 10) -> List[str]:
        """Assign a row's OCR words to columns using header x-positions.

        Header word boxes are merged into column x-ranges; row words are
        split into segments at horizontal gaps, and each segment is assigned
        to the column whose range contains (or is nearest to) the segment
        center.  The assembled row is then passed through
        ``repair_row_with_date_and_amount`` to fix misplaced dates/amounts.

        Args:
            headers: Normalized column names for the current table.
            header_data: Word dicts ("word", "bbox") of the header line.
            row_data: Word dicts of the data line being parsed.
            gap_threshold: Pixel gap treated as a column break (tightened
                to 5 when the table has no "description" column).

        Returns:
            One string value per header, stripped.

        NOTE(review): empty ``header_data`` or ``row_data`` would raise
        IndexError on the [0] accesses below -- confirm callers always pass
        non-empty lists.  Likewise, if more column ranges survive merging
        than there are headers, ``row_values[idx]`` can go out of range.
        """
        if "description" not in headers:
            gap_threshold = 5

        def flatten_bbox(bbox):
            # Accept both [[x0, y0], [x1, y1]] and flat [x0, y0, x1, y1].
            if isinstance(bbox[0], list):
                return [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]]
            return bbox

        # Step 1: Get all x0, x1 for header words.
        header_ranges = []
        for word in header_data:
            x0, _, x1, _ = flatten_bbox(word["bbox"])
            header_ranges.append((x0, x1))

        # Step 2: Sort by x0.
        header_ranges.sort(key=lambda x: x[0])

        # Step 3: Merge only close headers (preserve wide gaps) so multi-word
        # headers collapse into a single column range.
        merged_ranges = []
        temp_x0, temp_x1 = header_ranges[0]
        for x0, x1 in header_ranges[1:]:
            gap = x0 - temp_x1
            if gap < gap_threshold:
                temp_x1 = max(temp_x1, x1)
            else:
                merged_ranges.append((temp_x0, temp_x1))
                temp_x0, temp_x1 = x0, x1
        merged_ranges.append((temp_x0, temp_x1))

        # Step 4: Segment row_data wherever the horizontal gap between
        # consecutive words exceeds the threshold.
        row_data_sorted = sorted(row_data, key=lambda w: flatten_bbox(w["bbox"])[0])
        segments = []
        current_segment = [row_data_sorted[0]]
        for i in range(1, len(row_data_sorted)):
            prev_x1 = flatten_bbox(row_data_sorted[i - 1]["bbox"])[2]
            curr_x0 = flatten_bbox(row_data_sorted[i]["bbox"])[0]
            if curr_x0 - prev_x1 > gap_threshold:
                segments.append(current_segment)
                current_segment = [row_data_sorted[i]]
            else:
                current_segment.append(row_data_sorted[i])
        if current_segment:
            segments.append(current_segment)

        # Step 5: Assign each segment to the column whose x-range contains
        # the segment's center point.
        row_values = [""] * len(headers)
        for segment in segments:
            seg_x0 = flatten_bbox(segment[0]["bbox"])[0]
            seg_x1 = flatten_bbox(segment[-1]["bbox"])[2]
            seg_center = (seg_x0 + seg_x1) / 2
            seg_text = " ".join([w["word"] for w in segment])

            assigned = False
            for idx, (hx0, hx1) in enumerate(merged_ranges):
                if hx0 <= seg_center <= hx1:
                    row_values[idx] += seg_text + " "
                    assigned = True
                    break

            if not assigned:
                # Center fell outside every range: assign to the column whose
                # midpoint is nearest.
                nearest_idx = min(
                    range(len(merged_ranges)),
                    key=lambda idx: abs(
                        (merged_ranges[idx][0] + merged_ranges[idx][1]) / 2 - seg_center
                    ),
                )
                row_values[nearest_idx] += seg_text + " "

        # Let the repair pass pull stray dates/amounts into their columns.
        final_row = self.repair_row_with_date_and_amount(headers, row_values)
        return [val.strip() for val in final_row]
381
+
382
+ def check_table_tags(self, line: str, headers: List[str]) -> str:
383
+ """Check and return table tag based on line content and headers."""
384
+ available_tags = ["transaction", "deposit", "withdrawal", "checks", "daily balance", "drafts", "service fee", "interest"]
385
+ tag = ""
386
+
387
+ if "deposit" in line.lower() or "credit" in line.lower():
388
+ tag = "deposit"
389
+ elif "withdrawal" in line.lower() or "debit" in line.lower():
390
+ tag = "withdrawal"
391
+ elif "checks" in line.lower():
392
+ tag = "checks"
393
+ elif "drafts" in line.lower():
394
+ tag = "drafts"
395
+ elif "service fee" in line.lower() or "fee" in line.lower():
396
+ tag = "service fee"
397
+ elif "daily balance" in line.lower() or "balance" in line.lower():
398
+ tag = "daily balance"
399
+ elif "interest" in line.lower():
400
+ tag = "interest"
401
+ elif "transaction" in line.lower() or "transfer" in line.lower():
402
+ tag = "transaction"
403
+
404
+ if "credits" in headers or "debits" in headers:
405
+ tag = "transaction"
406
+
407
+ for h in headers:
408
+ if "check" in h.lower():
409
+ tag = "checks"
410
+ break
411
+
412
+ for h in headers:
413
+ if "draft" in h.lower():
414
+ tag = "drafts"
415
+ break
416
+
417
+ return tag
418
+
419
    async def process_transaction_tables_with_bbox(self, extracted_text_list: List[List[Dict]]) -> Tuple[List[pd.DataFrame], List[str]]:
        """Scan OCR line blocks for transaction tables and build DataFrames.

        A table opens at a header line containing "date" together with
        "description", "amount", or "balance".  Subsequent lines are parsed
        into columns via ``detect_row_data`` until a row fails date/amount
        validation, at which point the table is closed.  The line directly
        above each header is classified with ``check_table_tags`` to tag
        the table (deposit, withdrawal, checks, ...).

        Args:
            extracted_text_list: Blocks (e.g. pages) of line dicts, each
                with "line" text and "words" (word + bbox) entries.

        Returns:
            Tuple of (list of DataFrames, parallel list of table tags).

        NOTE(review): when a header is the very first line of a block, no
        tag is appended (the ``line_idx - 1 >= 0`` guard skips tagging), so
        tags and tables can drift out of sync -- confirm intended.
        """
        def _process_tables():
            all_tables = []
            table_tags = []

            for block in extracted_text_list:
                # Per-block parsing state.
                headers = []
                table_started = False
                current_table = []
                current_row = {}
                header_words = []

                for line_idx, line_bbox in enumerate(block):
                    line = line_bbox["line"]
                    line = line.strip()

                    # Case 1: table opens with a date + description header.
                    if not table_started and ("date" in line and "description" in line):
                        headers = self.detect_headers(line_bbox)
                        header_words = line_bbox["words"]
                        date_flag = False
                        description_flag = False
                        for header in headers:
                            if "date" in header.lower():
                                date_flag = True
                            if "description" in header.lower():
                                description_flag = True
                        if date_flag and description_flag:
                            table_started = True
                            current_row = {header: [] for header in headers}
                        else:
                            continue

                        # Tag the table from the line above the header; fall
                        # back to the previous tag, then to "transaction".
                        if line_idx - 1 >= 0:
                            prev_line = block[line_idx - 1]["line"]
                            tag = self.check_table_tags(prev_line, headers)
                            if tag:
                                table_tags.append(tag)
                            elif len(table_tags) > 0:
                                table_tags.append(table_tags[-1])
                            else:
                                table_tags.append("transaction")
                        continue

                    # Case 2: table opens with a date + amount/balance header.
                    elif (not table_started and ("date" in line and "amount" in line)) or (
                        not table_started and ("date" in line and "balance" in line)
                    ):
                        headers = self.detect_headers(line_bbox)
                        header_words = line_bbox["words"]
                        date_flag = False
                        amount_flag = False
                        balance_flag = False
                        for header in headers:
                            if "date" in header.lower():
                                date_flag = True
                            if "amount" in header.lower():
                                amount_flag = True
                            if "balance" in header.lower():
                                balance_flag = True
                        if date_flag and (amount_flag or balance_flag):
                            table_started = True
                            current_row = {header: [] for header in headers}
                        else:
                            continue

                        if line_idx - 1 >= 0:
                            prev_line = block[line_idx - 1]["line"]
                            tag = self.check_table_tags(prev_line, headers)
                            if tag:
                                table_tags.append(tag)
                            elif len(table_tags) > 0:
                                table_tags.append(table_tags[-1])
                            else:
                                table_tags.append("transaction")
                        continue

                    # Case 3: a new date + description header while a table is
                    # open -- flush accumulated rows and start a new table.
                    if table_started and ("date" in line and "description" in line):
                        max_len = max(len(v) for v in current_row.values())
                        for i in range(max_len):
                            row_map = {}
                            for key in current_row:
                                row_map[key] = (
                                    current_row[key][i] if i < len(current_row[key]) else ""
                                )
                            current_table.append(row_map)

                        df = pd.DataFrame(current_table)
                        all_tables.append(df)
                        current_table = []
                        headers = self.detect_headers(line_bbox)
                        header_words = line_bbox["words"]
                        date_flag = False
                        description_flag = False
                        for header in headers:
                            if "date" in header.lower():
                                date_flag = True
                            if "description" in header.lower():
                                description_flag = True
                        if date_flag and description_flag:
                            current_row = {header: [] for header in headers}
                        else:
                            continue

                        if line_idx - 1 >= 0:
                            prev_line = block[line_idx - 1]["line"]
                            tag = self.check_table_tags(prev_line, headers)
                            if tag:
                                table_tags.append(tag)
                            elif len(table_tags) > 0:
                                table_tags.append(table_tags[-1])
                            else:
                                table_tags.append("transaction")
                        continue

                    # Case 4: a new date + amount/balance header while a table
                    # is open -- flush and restart likewise.
                    elif (table_started and ("date" in line and "amount" in line)) or (
                        table_started and ("date" in line and "balance" in line)
                    ):
                        max_len = max(len(v) for v in current_row.values())
                        for i in range(max_len):
                            row_map = {}
                            for key in current_row:
                                row_map[key] = (
                                    current_row[key][i] if i < len(current_row[key]) else ""
                                )
                            current_table.append(row_map)

                        df = pd.DataFrame(current_table)
                        all_tables.append(df)
                        current_table = []
                        headers = self.detect_headers(line_bbox)
                        header_words = line_bbox["words"]
                        date_flag = False
                        amount_flag = False
                        balance_flag = False
                        for header in headers:
                            if "date" in header.lower():
                                date_flag = True
                            if "amount" in header.lower():
                                amount_flag = True
                            if "balance" in header.lower():
                                balance_flag = True
                        if date_flag and (amount_flag or balance_flag):
                            current_row = {header: [] for header in headers}
                        else:
                            continue

                        if line_idx - 1 >= 0:
                            prev_line = block[line_idx - 1]["line"]
                            tag = self.check_table_tags(prev_line, headers)
                            if tag:
                                table_tags.append(tag)
                            elif len(table_tags) > 0:
                                table_tags.append(table_tags[-1])
                            else:
                                table_tags.append("transaction")
                        continue

                    # Case 5: a data line inside an open table.
                    if table_started:
                        parts = self.detect_row_data(headers, header_words, line_bbox["words"])
                        for key, value in zip(headers, parts):
                            current_row[key].append(value)
                        max_len = max(len(v) for v in current_row.values())

                        # Normalize value cells to bare amounts/dates.
                        for i in range(max_len):
                            if (
                                "amount" in headers
                                and current_row["amount"]
                                and i < len(current_row["amount"])
                                and current_row["amount"][i]
                            ):
                                amount = self.extract_amount_or_return(current_row["amount"][i])
                                current_row["amount"][i] = amount
                            if (
                                "balance" in headers
                                and current_row["balance"]
                                and i < len(current_row["balance"])
                                and current_row["balance"][i]
                            ):
                                amount = self.extract_amount_or_return(current_row["balance"][i])
                                current_row["balance"][i] = amount
                            if (
                                "credits" in headers
                                and current_row["credits"]
                                and i < len(current_row["credits"])
                                and current_row["credits"][i]
                            ):
                                amount = self.extract_amount_or_return(current_row["credits"][i])
                                current_row["credits"][i] = amount
                            if (
                                "debits" in headers
                                and current_row["debits"]
                                and i < len(current_row["debits"])
                                and current_row["debits"][i]
                            ):
                                amount = self.extract_amount_or_return(current_row["debits"][i])
                                current_row["debits"][i] = amount
                            if (
                                "date" in headers
                                and current_row["date"]
                                and i < len(current_row["date"])
                                and current_row["date"][i]
                            ):
                                current_row["date"][i] = self.extract_date_or_return(
                                    current_row["date"][i]
                                )

                        # Sanity check: if the first cell of any key column is
                        # not a valid date/amount, assume the table has ended.
                        # Note "and" binds tighter than "or": the date clause
                        # is one term among the or'ed value clauses.
                        # NOTE(review): current_row["amount"][0] (and peers)
                        # are read without a length guard; an empty list here
                        # raises IndexError -- confirm detect_row_data always
                        # yields one part per header.
                        if (
                            "date" in headers
                            and current_row["date"]
                            and current_row["date"][0]
                            and not self.is_date_word(current_row["date"][0])
                            or (
                                "amount" in headers
                                and current_row["amount"][0]
                                and not self.amount_pattern.match(current_row["amount"][0])
                            )
                            or (
                                "balance" in headers
                                and current_row["balance"][0]
                                and not self.amount_pattern.match(current_row["balance"][0])
                            )
                            or (
                                "credits" in headers
                                and current_row["credits"][0]
                                and not self.amount_pattern.match(current_row["credits"][0])
                            )
                            or (
                                "debits" in headers
                                and current_row["debits"][0]
                                and not self.amount_pattern.match(current_row["debits"][0])
                            )
                        ):
                            # Close the table; drop its tag if no rows were
                            # actually collected for it.
                            if not current_table and len(table_tags) > 0 and table_tags[-1]:
                                table_tags.pop()
                            all_tables.append(pd.DataFrame(current_table))
                            current_table = []
                            current_row = {}
                            header_words = []
                            headers = []
                            table_started = False
                        else:
                            # Commit the parsed line(s) and reset the row
                            # accumulator for the next physical line.
                            for i in range(max_len):
                                row_map = {}
                                for key in current_row:
                                    row_map[key] = (
                                        current_row[key][i] if i < len(current_row[key]) else ""
                                    )
                                current_table.append(row_map)
                            current_row = {header: [] for header in headers}

                # Reset table state between blocks.
                table_started = False

                if current_table:
                    df = pd.DataFrame(current_table)
                    all_tables.append(df)

            return all_tables, table_tags

        return await asyncio.get_event_loop().run_in_executor(None, _process_tables)
678
+
679
+ async def process_tables(self, table: pd.DataFrame) -> pd.DataFrame:
680
+ """Process the extracted table to clean and format it."""
681
+ def _process_table():
682
+ keywords = ["continue", "continued", "page", "next page", "total", "subtotal"]
683
+ table_copy = table.copy()
684
+ is_balance_column = "balance" in table_copy.columns
685
+ is_amount_column = "amount" in table_copy.columns
686
+ is_credits_column = "credits" in table_copy.columns
687
+ is_debits_column = "debits" in table_copy.columns
688
+
689
+ for idx, row in table_copy.iterrows():
690
+ if is_balance_column:
691
+ if row["balance"] and not row["date"]:
692
+ table_copy.loc[idx] = [""] * len(table_copy.columns)
693
+ continue
694
+ if is_amount_column:
695
+ if row["amount"] and not row["date"]:
696
+ table_copy.loc[idx] = [""] * len(table_copy.columns)
697
+ continue
698
+ if is_credits_column:
699
+ if row["credits"] and not row["date"]:
700
+ table_copy.loc[idx] = [""] * len(table_copy.columns)
701
+ continue
702
+ if is_debits_column:
703
+ if row["debits"] and not row["date"]:
704
+ table_copy.loc[idx] = [""] * len(table_copy.columns)
705
+ continue
706
+ for cell in row:
707
+ if any(keyword in cell.lower() for keyword in keywords):
708
+ table_copy.loc[idx] = [""] * len(table_copy.columns)
709
+ break
710
+
711
+ df = table_copy.copy()
712
+ df = df.fillna("") # Fill NaNs with empty string for easier processing
713
+
714
+ # Step 1: Identify key columns (case-insensitive match)
715
+ lower_cols = [col.lower() for col in df.columns]
716
+ date_col = next((col for col in df.columns if re.search(r'date', col, re.IGNORECASE)), None)
717
+ value_cols = [col for col in df.columns if re.search(r'amount|balance|credits|debits', col, re.IGNORECASE)]
718
+
719
+ if not date_col or not value_cols:
720
+ return df
721
+
722
+ def is_anchor(row):
723
+ return bool(row[date_col].strip()) and any(row[col].strip() for col in value_cols)
724
+
725
+ # Step 2: Loop over rows and identify anchor indices
726
+ anchor_indices = [i for i, row in df.iterrows() if is_anchor(row)]
727
+
728
+ for anchor_idx in anchor_indices:
729
+ # Merge upward
730
+ i = anchor_idx - 1
731
+ while i >= 0:
732
+ if is_anchor(df.iloc[i]) or df.iloc[i].isnull().all() or all(df.iloc[i] == ""):
733
+ break
734
+ for col in df.columns:
735
+ if col != date_col and col not in value_cols:
736
+ df.at[anchor_idx, col] = (str(df.at[i, col]).strip() + " " + str(df.at[anchor_idx, col]).strip()).strip()
737
+ df.iloc[i] = "" # Blank the merged row
738
+ i -= 1
739
+
740
+ # Merge downward
741
+ i = anchor_idx + 1
742
+ while i < len(df):
743
+ if is_anchor(df.iloc[i]) or df.iloc[i].isnull().all() or all(df.iloc[i] == ""):
744
+ break
745
+ for col in df.columns:
746
+ if col != date_col and col not in value_cols:
747
+ df.at[anchor_idx, col] = (str(df.at[anchor_idx, col]).strip() + " " + str(df.at[i, col]).strip()).strip()
748
+ df.iloc[i] = "" # Blank the merged row
749
+ i += 1
750
+
751
+ df_copy = df.copy()
752
+ col = "balance" if "balance" in df_copy.columns else "amount"
753
+
754
+ for idx, row in df_copy.iterrows():
755
+ if not row[col] and not row[date_col]:
756
+ df_copy.loc[idx] = [""] * len(df_copy.columns)
757
+ df_copy = df_copy[~df_copy.apply(lambda row: all(cell == "" for cell in row), axis=1)].reset_index(drop=True)
758
+ return df_copy
759
+
760
+ return await asyncio.get_event_loop().run_in_executor(None, _process_table)
src/models/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .account_models import AccountSummary, AccountDetails, BankStatementData
2
+
3
+ __all__ = ["AccountSummary", "AccountDetails", "BankStatementData"]
src/models/account_models.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any, Optional
2
+ from pydantic import BaseModel, Field
3
+ from datetime import date
4
+
5
+
6
class AccountDetails(BaseModel):
    """Model for individual account details.

    Dates are kept as YYYY-MM-DD strings (not ``datetime.date``) so values
    can be passed straight through from LLM output without extra parsing.
    """
    account_name: str = Field(..., description="Name of the account")
    account_number: str = Field(..., description="Account number")
    starting_balance: float = Field(..., description="Starting balance of the account")
    ending_balance: float = Field(..., description="Ending balance of the account")
    statement_start_date: str = Field(..., description="Statement start date in YYYY-MM-DD format")
    statement_end_date: str = Field(..., description="Statement end date in YYYY-MM-DD format")


class AccountSummary(BaseModel):
    """Model for bank account summary extracted from LLM."""
    bank_name: str = Field(..., description="Name of the bank")
    account_holder: str = Field(..., description="Name of the account holder")
    accounts: List[AccountDetails] = Field(..., description="List of account details")


class BankStatementData(BaseModel):
    """Model for processed bank statement data."""
    # NOTE(review): values are typed str here -- confirm numeric summary
    # fields are stringified before this model is constructed.
    account_summary: Dict[str, str] = Field(..., description="Account summary information")
    transaction_tables: Dict[str, Any] = Field(..., description="Extracted transaction tables")


class WordData(BaseModel):
    """Model for a single OCR word with its bounding box."""
    word: str = Field(..., description="Extracted word text")
    bbox: List[float] = Field(..., description="Bounding box coordinates [x0, y0, x1, y1]")


class LineData(BaseModel):
    """Model for a line of text with its constituent words."""
    line: str = Field(..., description="Complete line text")
    # NOTE(review): described as [x, y] while WordData uses a 4-value box --
    # confirm the intended shape of a line-level bbox.
    bbox: List[float] = Field(..., description="Line bounding box [x, y]")
    words: List[WordData] = Field(..., description="List of words in the line")


class ExtractedTextData(BaseModel):
    """Model for extracted text data from a PDF."""
    pages: List[List[LineData]] = Field(..., description="List of pages, each containing lines")
src/ocr/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .pdf_processor import PDFProcessor
2
+ from .text_extractor import TextExtractor
3
+
4
+ __all__ = ["PDFProcessor", "TextExtractor"]
src/ocr/pdf_processor.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import fitz
3
+ import os
4
+ from typing import List, Dict, Any, Optional
5
+ import numpy as np
6
+ from pdf2image import convert_from_path
7
+ from doctr.models import ocr_predictor
8
+ from doctr.io import DocumentFile
9
+ import torch
10
+ from src.config.config import settings
11
+ from src.models.account_models import LineData, WordData
12
+ from src.utils import model_manager
13
+
14
+
15
class PDFProcessor:
    """Async PDF processor for handling both digital and scanned PDFs.

    Heavy models (doctr OCR) are owned by the shared ``model_manager``;
    this class only ensures they are loaded and exposes them as properties.
    """

    def __init__(self):
        # Use the centralized model manager so models are loaded once per
        # process, not once per PDFProcessor instance.
        self._ensure_models_loaded()

    def _ensure_models_loaded(self):
        """Ensure models are loaded via the model manager."""
        if not model_manager.models_loaded:
            print("πŸ”„ Models not loaded, initializing model manager...")
            # Accessing the property triggers lazy model loading.
            _ = model_manager.doctr_model

    @property
    def doctr_model(self):
        """Get the loaded doctr model from the model manager."""
        return model_manager.doctr_model

    @property
    def device(self):
        """Get the device being used from the model manager."""
        return model_manager.device

    async def __aenter__(self):
        """Enter the async context; no resources are acquired."""
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Exit the async context; nothing to release."""
        pass

    async def is_pdf_scanned(self, pdf_path: str) -> bool:
        """Check if PDF is scanned (i.e. has no extractable text layer).

        Returns True when no page yields non-whitespace text.
        """
        def _check_scanned():
            doc = fitz.open(pdf_path)
            # BUGFIX: close the document on every exit path; the original
            # leaked the fitz document handle on the early return.
            try:
                for page in doc:
                    text = page.get_text()
                    if text.strip():
                        return False
                return True
            finally:
                doc.close()

        return await asyncio.get_event_loop().run_in_executor(None, _check_scanned)

    async def save_uploaded_file(self, uploaded_file) -> str:
        """Save an uploaded file-like object to the configured temp path."""
        def _save_file():
            with open(settings.temp_file_name, "wb") as f:
                f.write(uploaded_file.read())
            return settings.temp_file_name

        return await asyncio.get_event_loop().run_in_executor(None, _save_file)

    async def extract_text_from_digital_pdf(self, pdf_path: str) -> List[List[str]]:
        """Extract text lines per page from a digital (non-scanned) PDF."""
        from PyPDF2 import PdfReader

        def _extract_text():
            reader = PdfReader(pdf_path)
            extracted_data = []

            for page in reader.pages:
                ptext = page.extract_text()
                if ptext:
                    data = []
                    for line in ptext.splitlines():
                        # Collapse lines that repeat the same content across
                        # wide gaps (two-up statement layouts).
                        cleaned_line = self._split_on_repeated_pattern(line.strip())
                        if cleaned_line:
                            data.append(cleaned_line[0])
                    extracted_data.append(data)

            return extracted_data

        return await asyncio.get_event_loop().run_in_executor(None, _extract_text)

    def _split_on_repeated_pattern(self, line: str, min_space: int = 10) -> List[str]:
        """Collapse a line whose content repeats across wide space gaps.

        Args:
            line: Raw text line.
            min_space: Minimum run of spaces considered a column gap.

        Returns:
            A one-element list: the first chunk when the line is a repeat
            (e.g. ``"foo bar        foo bar"`` -> ``["foo bar"]``),
            otherwise the original stripped line.
        """
        import re
        from difflib import SequenceMatcher

        original_line = line.strip()

        # Find all spans of `min_space` or more consecutive spaces.
        space_spans = [
            (m.start(), len(m.group()))
            for m in re.finditer(r" {%d,}" % min_space, original_line)
        ]

        if not space_spans:
            return [original_line]

        # Count how often each gap size occurs.
        gaps = [span[1] for span in space_spans]
        gap_counts = {}
        for g in gaps:
            gap_counts[g] = gap_counts.get(g, 0) + 1

        # Sort gaps by size x count (more dominant gaps first).
        sorted_gaps = sorted(gap_counts.items(), key=lambda x: x[1] * x[0], reverse=True)

        # No significant gaps, return original.
        if not sorted_gaps:
            return [original_line]

        dominant_gap = sorted_gaps[0][0]

        # Split on the dominant gap size.  CLEANUP: the original used
        # rf" {{%d,}}" % n -- an f-prefixed raw string combined with
        # %-formatting; r" {%d,}" % n produces the identical pattern.
        chunks = re.split(r" {%d,}" % dominant_gap, original_line)

        # Treat as repeated when a later chunk fuzzily matches the first.
        base = chunks[0].strip()
        repeated = False
        for chunk in chunks[1:]:
            chunk = chunk.strip()
            if chunk and SequenceMatcher(None, base, chunk).ratio() > 0.8:
                repeated = True
                break

        return [base] if repeated else [original_line]
src/ocr/text_extractor.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import fitz
3
+ import re
4
+ import numpy as np
5
+ from typing import List, Dict, Any, Optional
6
+ from pdf2image import convert_from_path
7
+ from src.config.config import settings
8
+ from src.models.account_models import LineData, WordData
9
+ from doctr.io import DocumentFile
10
+
11
+
12
class TextExtractor:
    """Async text extractor that returns per-page lines with bounding boxes.

    Two extraction paths are supported:
      * digital PDFs via PyMuPDF (``fitz``) word extraction, and
      * scanned PDFs via the injected doctr OCR model.

    Both paths produce the same structure: one entry per page, where each
    entry is a list of line dicts of the form
    ``{"line": str, "bbox": [x_start, y_start], "words": [{"word", "bbox"}]}``.
    """

    def __init__(self, doctr_model):
        # The OCR model is injected so the heavyweight weights are loaded
        # once (by the model manager) and shared across extractors.
        self.doctr_model = doctr_model

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # No owned resources; context-manager support is for API symmetry.
        pass

    def normalize_bbox(self, bbox, width: float, height: float) -> List[float]:
        """Normalize an absolute (x0, y0, x1, y1) box to the [0, 1] range."""
        x0, y0, x1, y1 = bbox
        return [
            round(x0 / width, 6),
            round(y0 / height, 6),
            round(x1 / width, 6),
            round(y1 / height, 6),
        ]

    def remove_consecutive_items(self, line: List[str]) -> List[str]:
        """Collapse runs of identical adjacent items (e.g. OCR double-reads)."""
        if not line:
            return line
        result = [line[0]]
        for item in line[1:]:
            if item != result[-1]:
                result.append(item)
        return result

    def remove_consecutive_words(self, word_data: List[Dict]) -> List[Dict]:
        """Collapse adjacent word dicts whose ``word`` text repeats."""
        if not word_data:
            return word_data
        result = [word_data[0]]
        for i in range(1, len(word_data)):
            if word_data[i]["word"] != result[-1]["word"]:
                result.append(word_data[i])
        return result

    def _flush_line(self, current_line, current_word_data, lines) -> None:
        """Finalize one visual line: order left-to-right, dedupe, record origin.

        ``current_line`` holds (x0, y0, word) tuples and ``current_word_data``
        the matching word dicts. Appends the assembled line dict to ``lines``.
        Factored out of the four duplicated flush sites in the original.
        """
        current_line.sort()
        clean_line = self.remove_consecutive_items([entry[2] for entry in current_line])
        clean_word_data = self.remove_consecutive_words(
            sorted(current_word_data, key=lambda w: w["bbox"][0])
        )
        if clean_line:
            lines.append({
                "line": " ".join(clean_line),
                "bbox": [
                    min(entry[0] for entry in current_line),
                    min(entry[1] for entry in current_line),
                ],
                "words": clean_word_data,
            })

    async def extract_lines_with_bbox(self, pdf_path: str, y_threshold: float = 3.0) -> List[List[LineData]]:
        """Extract lines with bounding boxes from a digital (text-layer) PDF.

        Words whose y-origins differ by less than ``y_threshold`` points are
        grouped into the same visual line. The blocking PyMuPDF work runs in
        a thread executor so it does not stall the event loop.
        """
        def _extract_lines():
            doc = fitz.open(pdf_path)
            page_lines_with_bbox = []

            for page in doc:
                # Each tuple: (x0, y0, x1, y1, word, block_no, line_no, word_no)
                words = page.get_text("words")
                # Reading order: top-to-bottom (rounded y), then left-to-right.
                words.sort(key=lambda w: (round(w[1], 1), w[0]))

                lines = []
                current_line = []
                current_word_data = []
                current_y = None

                for w in words:
                    x0, y0, x1, y1, word = w[:5]
                    # Drop separators and punctuation-only tokens.
                    if word == "|" or not word or word == "." or word == "#" or re.sub(r'[^\w\s]', '', word) == "":
                        continue
                    word = word.lower()
                    word_data = {"word": word.strip(), "bbox": (x0, y0, x1, y1)}

                    if current_y is None or abs(y0 - current_y) < y_threshold:
                        current_line.append((x0, y0, word))
                        current_word_data.append(word_data)
                        current_y = y0
                    else:
                        # y jumped past the threshold: flush and start a new line.
                        self._flush_line(current_line, current_word_data, lines)
                        current_line = [(x0, y0, word)]
                        current_word_data = [word_data]
                        current_y = y0

                # Flush the trailing line of the page.
                if current_line:
                    self._flush_line(current_line, current_word_data, lines)

                page_lines_with_bbox.append(lines)

            return page_lines_with_bbox

        return await asyncio.get_event_loop().run_in_executor(None, _extract_lines)

    async def extract_lines_with_bbox_from_scanned_pdf(self, pdf_path: str, y_threshold: float = 5.0, first_page: bool = False) -> List[List[LineData]]:
        """Extract lines with bounding boxes from a scanned PDF via OCR.

        When ``first_page`` is True, only page 1 is rasterized and OCR'd
        (used for account-summary extraction); otherwise every page is
        processed. OCR runs in a thread executor because it is compute-bound.
        """
        def _extract_from_scanned():
            if first_page:
                pages = convert_from_path(pdf_path, dpi=settings.dpi, first_page=1, last_page=1)
                first_page_img = pages[0].convert("RGB")
                # BUG FIX: keep the page image inside a list. The original
                # assigned doc = np.array(img), so doc[0] below was a single
                # pixel ROW of shape (W, 3) and img_width came out as 3 (the
                # channel count). As a list, doc[0] is the full (H, W, 3) page.
                doc = [np.array(first_page_img)]
                result = self.doctr_model(doc)
            else:
                doc = DocumentFile.from_pdf(pdf_path)
                result = self.doctr_model(doc)

            page_lines_with_bbox = []

            for page in result.pages:
                # doctr geometries are relative [0, 1]; scale back to pixels.
                # NOTE(review): dimensions of page 1 are used for every page —
                # assumes all pages share the first page's size; confirm.
                img_width, img_height = doc[0].shape[1], doc[0].shape[0]
                words = []

                for block in page.blocks:
                    for line in block.lines:
                        for word in line.words:
                            x0, y0 = word.geometry[0]
                            x1, y1 = word.geometry[1]
                            text = word.value.strip().lower()
                            # Strip markdown-ish noise characters OCR emits.
                            text = re.sub(r'[#*]', ' ', text)
                            text = text.strip()

                            # Drop separators and punctuation-only tokens.
                            if text == "|" or not text or text == "." or text == "#" or re.sub(r'[^\w\s]', '', text) == "":
                                continue
                            words.append({
                                "word": text,
                                "bbox": [x0 * img_width, y0 * img_height, x1 * img_width, y1 * img_height],
                            })

                # Reading order: top-to-bottom (rounded y), then left-to-right.
                words.sort(key=lambda w: (round(w["bbox"][1], 3), w["bbox"][0]))

                lines = []
                current_line = []
                current_word_data = []
                current_y = None

                for w in words:
                    y0 = w["bbox"][1]
                    if current_y is None or abs(y0 - current_y) < y_threshold:
                        current_line.append((w["bbox"][0], y0, w["word"]))
                        current_word_data.append(w)
                        current_y = y0
                    else:
                        # y jumped past the threshold: flush and start a new line.
                        self._flush_line(current_line, current_word_data, lines)
                        current_line = [(w["bbox"][0], y0, w["word"])]
                        current_word_data = [w]
                        current_y = y0

                # Flush the trailing line of the page.
                if current_line:
                    self._flush_line(current_line, current_word_data, lines)

                page_lines_with_bbox.append(lines)

            return page_lines_with_bbox

        return await asyncio.get_event_loop().run_in_executor(None, _extract_from_scanned)
src/services/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .bank_statement_service import BankStatementService
2
+
3
+ __all__ = ["BankStatementService"]
src/services/bank_statement_service.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import pandas as pd
4
+ from typing import List, Dict, Any, Optional, Tuple
5
+ from src.ocr import PDFProcessor, TextExtractor
6
+ from src.extractor import TableExtractor, AccountExtractor, BalanceExtractor
7
+ from src.utils import GroqClient
8
+ from src.models.account_models import BankStatementData
9
+ from src.config.config import settings
10
+
11
+
12
class BankStatementService:
    """Orchestrates end-to-end bank statement processing.

    Pipeline: save the upload -> detect scanned vs. digital -> extract lines
    (OCR or text layer) -> detect and clean transaction tables -> ask the LLM
    for account metadata from page 1 -> assemble a BankStatementData result.
    """

    def __init__(self):
        # Pipeline components; PDFProcessor also owns the shared doctr model.
        self.pdf_processor = PDFProcessor()
        self.table_extractor = TableExtractor()
        self.account_extractor = AccountExtractor()
        self.balance_extractor = BalanceExtractor()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # No owned resources; context-manager support is for API symmetry.
        pass

    async def process_bank_statement(self, uploaded_file) -> BankStatementData:
        """Process an uploaded bank statement and return structured data.

        Args:
            uploaded_file: File-like upload (e.g. a Streamlit UploadedFile).

        Returns:
            BankStatementData holding an account-summary dict and a mapping
            of table tag -> concatenated pandas DataFrame of transactions.

        Raises:
            json.JSONDecodeError: if the LLM reply is not valid JSON.
            KeyError / IndexError: if the LLM reply lacks expected fields.
        """
        pdf_path = await self.pdf_processor.save_uploaded_file(uploaded_file)
        pdf_scanned = await self.pdf_processor.is_pdf_scanned(pdf_path)

        # Choose the extraction path based on whether a text layer exists.
        text_extractor = TextExtractor(self.pdf_processor.doctr_model)
        if pdf_scanned:
            print(f"{pdf_path} is likely a scanned PDF.")
            extracted_text_list = await text_extractor.extract_lines_with_bbox_from_scanned_pdf(pdf_path)
        else:
            print(f"{pdf_path} is not a scanned PDF. Extracting text...")
            extracted_text_list = await text_extractor.extract_lines_with_bbox(pdf_path)

        # Locate raw transaction tables, then clean each one.
        pre_processed_tables, table_tags = await self.table_extractor.process_transaction_tables_with_bbox(extracted_text_list)
        processed_tables = []
        for table in pre_processed_tables:
            processed_tables.append(await self.table_extractor.process_tables(table))

        # Group the cleaned tables by tag, then merge same-tag tables into a
        # single DataFrame each.
        final_table_dic = {}
        for tag, processed_table in zip(table_tags, processed_tables):
            final_table_dic.setdefault(tag, []).append(processed_table)
        for tag, tables in final_table_dic.items():
            final_table_dic[tag] = pd.concat(tables, ignore_index=True)

        # Account metadata lives on page 1. Scanned PDFs were already OCR'd
        # in full; digital PDFs get a one-page OCR pass here — presumably
        # because the text layer can miss header artwork (bank logo/name);
        # TODO confirm this intent.
        if pdf_scanned:
            first_page = extracted_text_list
        else:
            first_page = await text_extractor.extract_lines_with_bbox_from_scanned_pdf(pdf_path, first_page=True)
        first_page_object = first_page[0]

        # BUG FIX: join lines with newlines. The original concatenated the
        # raw line strings back-to-back, fusing the last word of one line
        # with the first word of the next and degrading LLM extraction.
        starting_text = "\n".join(line_data["line"] for line_data in first_page_object)

        # Ask the LLM for structured account details (contract: strict JSON).
        async with GroqClient() as groq_client:
            bank_summary = await groq_client.extract_account_details(starting_text)
        bank_summary = json.loads(bank_summary)

        # Summarize the last account entry in the LLM's list.
        account = bank_summary["accounts"][-1]
        account_summary = {
            "Bank Name": bank_summary["bank_name"].upper(),
            "Account Number": account["account_number"],
            "Starting Balance": str(account["starting_balance"]),
            "Ending Balance": str(account["ending_balance"]),
            "Statement Start Date": account["statement_start_date"],
            "Statement End Date": account["statement_end_date"]
        }

        return BankStatementData(
            account_summary=account_summary,
            transaction_tables=final_table_dic
        )
src/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .api_clients import GroqClient, HuggingFaceClient
2
+ from .model_manager import ModelManager, model_manager
3
+
4
+ __all__ = ["GroqClient", "HuggingFaceClient", "ModelManager", "model_manager"]
src/utils/api_clients.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ from typing import Dict, Any, Optional
4
+ from openai import AsyncOpenAI
5
+ from huggingface_hub import AsyncInferenceClient
6
+ from src.config.config import settings
7
+
8
+
9
class GroqClient:
    """Async wrapper around Groq's OpenAI-compatible chat completion API."""

    def __init__(self):
        # Groq exposes an OpenAI-compatible endpoint, so the stock
        # AsyncOpenAI client is simply pointed at the Groq base URL.
        self.client = AsyncOpenAI(
            base_url=settings.groq_base_url,
            api_key=settings.groq_api_key,
        )

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # Release the client's underlying HTTP connection pool.
        await self.client.close()

    async def extract_account_details(self, text: str) -> str:
        """Ask the LLM to parse account metadata out of ``text``.

        Returns the raw model reply, which the system prompt instructs to
        be a strict JSON document (caller is responsible for parsing it).
        """
        system_prompt = """
        You are a financial document parser that extracts structured data from bank statements.

        Your task is to extract the following fields and return only valid JSON:

        - Starting balance can also be referred with "Balance last statement" or "Balance previous statement" in pdfs.
        - Ending balance can also be referred with "Balance this statement" in pdfs.

        {
        "bank_name": "string",
        "account_holder": "string",
        "accounts": [{
        "account_name": "string",
        "account_number": "string",
        "starting_balance": float,
        "ending_balance": float,
        "statement_start_date": "YYYY-MM-DD",
        "statement_end_date": "YYYY-MM-DD"
        }]
        }

        Guidelines:
        - Return strictly valid JSON (no markdown, comments, or extra explanation).
        - `starting_balance` and `ending_balance` must be `float` (no currency symbol).
        - Dates must follow the format `"YYYY-MM-DD"`.
        - Do not respond with anything other than the JSON object.
        - If multiple account are there then include all the account list in a list.
        """

        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ]
        completion = await self.client.chat.completions.create(
            model=settings.llm_model,
            messages=conversation,
        )
        return completion.choices[0].message.content
64
+
65
+
66
class HuggingFaceClient:
    """Async client for the HuggingFace Inference API.

    NOTE(review): ``extract_account_details`` is currently a stub that
    ignores its input and returns an empty summary; the prompt template is
    kept for the eventual real implementation.
    """

    # Prompt template for the future real implementation (mirrors the one
    # used by GroqClient). Hoisted to a class attribute so it is no longer
    # rebuilt — and discarded unused — on every call.
    _SYSTEM_PROMPT = """
    You are a financial document parser that extracts structured data from bank statements.

    Your task is to extract the following fields and return only valid JSON:

    - Starting balance can also be referred with "Balance last statement" or "Balance previous statement" in pdfs.
    - Ending balance can also be referred with "Balance this statement" in pdfs.

    {
    "bank_name": "string",
    "account_holder": "string",
    "accounts": [{
    "account_name": "string",
    "account_number": "string",
    "starting_balance": float,
    "ending_balance": float,
    "statement_start_date": "YYYY-MM-DD",
    "statement_end_date": "YYYY-MM-DD"
    }]
    }

    Guidelines:
    - Return strictly valid JSON (no markdown, comments, or extra explanation).
    - `starting_balance` and `ending_balance` must be `float` (no currency symbol).
    - Dates must follow the format `"YYYY-MM-DD"`.
    - Do not respond with anything other than the JSON object.
    - If multiple account are there then include all the account list in a list.
    """

    def __init__(self):
        self.client = AsyncInferenceClient(
            provider=settings.huggingface_provider,
            api_key=settings.huggingface_api_key,
        )

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        pass

    async def extract_account_details(self, text: str) -> str:
        """Extract account details using a HuggingFace model.

        Placeholder implementation: ``text`` is ignored and a fixed empty
        summary is returned. A real implementation should send
        ``_SYSTEM_PROMPT`` plus ``text`` to ``self.client``.
        """
        return '{"bank_name": "Unknown", "account_holder": "Unknown", "accounts": []}'
src/utils/model_manager.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import torch
3
+ from typing import Optional
4
+ from doctr.models import ocr_predictor
5
+ import spacy
6
+ from src.config.config import settings
7
+
8
+
9
class ModelManager:
    """Singleton model manager for pre-loading all models at startup.

    All state is class-level and shared: the first ``ModelManager()`` call
    synchronously loads the doctr OCR model and a spaCy NER pipeline;
    subsequent constructions are cheap no-ops guarded by ``_models_loaded``.
    """

    # Class-level (shared) singleton state.
    _instance = None        # the one ModelManager instance
    _doctr_model = None     # loaded doctr OCR predictor
    _spacy_model = None     # loaded spaCy pipeline; may remain None (see _load_models)
    _device = None          # torch.device actually selected
    _models_loaded = False  # guards against repeated loading

    def __new__(cls):
        # Classic singleton: always hand back the same instance.
        if cls._instance is None:
            cls._instance = super(ModelManager, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        # __init__ runs on every ModelManager() call; the flag ensures the
        # expensive synchronous load happens only once per process.
        if not self._models_loaded:
            self._load_models()

    def _load_models(self):
        """Load all models synchronously (device pick, doctr OCR, spaCy NER)."""
        print("πŸš€ Starting model pre-loading...")

        # Set device based on config; CUDA is used only when available and
        # not overridden by settings.force_cpu.
        if settings.force_cpu:
            self._device = torch.device("cpu")
            print("πŸ“± Using CPU (forced by config)")
        else:
            self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            print(f"πŸ“± Using device: {self._device}")

        # Load doctr model and move both sub-models (text detection and
        # text recognition) onto the chosen device.
        print("πŸ”„ Loading doctr OCR model...")
        self._doctr_model = ocr_predictor(pretrained=True)
        self._doctr_model.det_predictor.model = self._doctr_model.det_predictor.model.to(self._device)
        self._doctr_model.reco_predictor.model = self._doctr_model.reco_predictor.model.to(self._device)
        print("βœ… Doctr model loaded successfully!")

        # Load the configured spaCy model; on failure, try fallbacks.
        print(f"πŸ”„ Loading spaCy NER model: {settings.spacy_model_name}...")
        try:
            self._spacy_model = spacy.load(settings.spacy_model_name)
            print(f"βœ… spaCy model ({settings.spacy_model_name}) loaded successfully!")
        except OSError:
            # spacy.load raises OSError when the model package is missing.
            print(f"⚠️ spaCy model '{settings.spacy_model_name}' not found.")
            # Try fallback models
            fallback_models = ["en_core_web_sm", "en_core_web_trf"]
            for fallback_model in fallback_models:
                if fallback_model != settings.spacy_model_name:
                    try:
                        print(f"πŸ”„ Trying fallback model: {fallback_model}")
                        self._spacy_model = spacy.load(fallback_model)
                        print(f"βœ… spaCy model ({fallback_model}) loaded successfully!")
                        break
                    except OSError:
                        continue

            if self._spacy_model is None:
                print("⚠️ No spaCy model found. Please install with: python -m spacy download en_core_web_sm")

        # NOTE(review): the flag is set even when no spaCy model could be
        # loaded; callers needing NER should check get_model_status().
        self._models_loaded = True
        print("πŸŽ‰ All models loaded successfully!")

    @property
    def doctr_model(self):
        """Get the loaded doctr OCR model."""
        return self._doctr_model

    @property
    def spacy_model(self):
        """Get the loaded spaCy model (None if no pipeline was found)."""
        return self._spacy_model

    @property
    def device(self):
        """Get the torch device being used."""
        return self._device

    @property
    def models_loaded(self):
        """Check if models are loaded."""
        return self._models_loaded

    async def ensure_models_loaded(self):
        """Ensure models are loaded (async wrapper around _load_models).

        Off-loads the blocking load to the default thread executor so the
        event loop is not stalled. Always returns True.
        """
        if not self._models_loaded:
            await asyncio.get_event_loop().run_in_executor(None, self._load_models)
        return True

    def get_model_status(self):
        """Get a JSON-serializable status snapshot of all models."""
        return {
            "doctr_model": self._doctr_model is not None,
            "spacy_model": self._spacy_model is not None,
            "device": str(self._device),
            "models_loaded": self._models_loaded,
            "spacy_model_name": settings.spacy_model_name,
            "force_cpu": settings.force_cpu
        }
107
+
108
+
109
# Global model manager instance. NOTE: importing this module constructs
# ModelManager(), which synchronously loads the OCR and NER models as an
# import-time side effect.
model_manager = ModelManager()