Spaces:
Runtime error
Runtime error
Aryan Jain commited on
Commit Β·
4e71548
0
Parent(s):
bank scrubber streamlit application
Browse files- .dockerignore +168 -0
- .env.example +27 -0
- .gitignore +128 -0
- DOCKER_DEPLOYMENT.md +443 -0
- Dockerfile +71 -0
- Dockerfile.alternative +63 -0
- Dockerfile.fallback +81 -0
- README.md +360 -0
- build-docker.sh +115 -0
- docker-compose.yml +26 -0
- main.py +64 -0
- poetry.lock +0 -0
- pyproject.toml +31 -0
- requirements.txt +17 -0
- src/__init__.py +4 -0
- src/config/config.py +46 -0
- src/extractor/__init__.py +5 -0
- src/extractor/account_extractor.py +152 -0
- src/extractor/balance_extractor.py +102 -0
- src/extractor/table_extractor.py +760 -0
- src/models/__init__.py +3 -0
- src/models/account_models.py +44 -0
- src/ocr/__init__.py +4 -0
- src/ocr/pdf_processor.py +131 -0
- src/ocr/text_extractor.py +214 -0
- src/services/__init__.py +3 -0
- src/services/bank_statement_service.py +100 -0
- src/utils/__init__.py +4 -0
- src/utils/api_clients.py +116 -0
- src/utils/model_manager.py +110 -0
.dockerignore
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Git
|
| 2 |
+
.git
|
| 3 |
+
.gitignore
|
| 4 |
+
.gitattributes
|
| 5 |
+
|
| 6 |
+
# Python
|
| 7 |
+
__pycache__
|
| 8 |
+
*.pyc
|
| 9 |
+
*.pyo
|
| 10 |
+
*.pyd
|
| 11 |
+
.Python
|
| 12 |
+
env
|
| 13 |
+
pip-log.txt
|
| 14 |
+
pip-delete-this-directory.txt
|
| 15 |
+
.tox
|
| 16 |
+
.coverage
|
| 17 |
+
.coverage.*
|
| 18 |
+
.cache
|
| 19 |
+
nosetests.xml
|
| 20 |
+
coverage.xml
|
| 21 |
+
*.cover
|
| 22 |
+
*.log
|
| 23 |
+
.mypy_cache
|
| 24 |
+
.pytest_cache
|
| 25 |
+
.hypothesis
|
| 26 |
+
|
| 27 |
+
# Virtual environments
|
| 28 |
+
venv/
|
| 29 |
+
env/
|
| 30 |
+
ENV/
|
| 31 |
+
env.bak/
|
| 32 |
+
venv.bak/
|
| 33 |
+
.venv/
|
| 34 |
+
.venv.bak/
|
| 35 |
+
|
| 36 |
+
# IDE
|
| 37 |
+
.vscode/
|
| 38 |
+
.idea/
|
| 39 |
+
*.swp
|
| 40 |
+
*.swo
|
| 41 |
+
*~
|
| 42 |
+
|
| 43 |
+
# OS
|
| 44 |
+
.DS_Store
|
| 45 |
+
.DS_Store?
|
| 46 |
+
._*
|
| 47 |
+
.Spotlight-V100
|
| 48 |
+
.Trashes
|
| 49 |
+
ehthumbs.db
|
| 50 |
+
Thumbs.db
|
| 51 |
+
|
| 52 |
+
# Project specific
|
| 53 |
+
temp.pdf
|
| 54 |
+
*.pdf
|
| 55 |
+
.env
|
| 56 |
+
.env.local
|
| 57 |
+
.env.*.local
|
| 58 |
+
requirements.txt
|
| 59 |
+
|
| 60 |
+
# Documentation
|
| 61 |
+
README.md
|
| 62 |
+
*.md
|
| 63 |
+
docs/
|
| 64 |
+
|
| 65 |
+
# Testing
|
| 66 |
+
test_structure.py
|
| 67 |
+
setup_env.py
|
| 68 |
+
startup.py
|
| 69 |
+
docker-startup.py
|
| 70 |
+
check-build-context.py
|
| 71 |
+
build-docker.sh
|
| 72 |
+
tests/
|
| 73 |
+
test_*.py
|
| 74 |
+
|
| 75 |
+
# Original files
|
| 76 |
+
poc.py
|
| 77 |
+
|
| 78 |
+
# Docker
|
| 79 |
+
Dockerfile
|
| 80 |
+
.dockerignore
|
| 81 |
+
docker-compose*.yml
|
| 82 |
+
DOCKER_DEPLOYMENT.md
|
| 83 |
+
|
| 84 |
+
# Large files and directories
|
| 85 |
+
*.tar
|
| 86 |
+
*.tar.gz
|
| 87 |
+
*.zip
|
| 88 |
+
*.rar
|
| 89 |
+
*.7z
|
| 90 |
+
*.model
|
| 91 |
+
*.pkl
|
| 92 |
+
*.pickle
|
| 93 |
+
*.h5
|
| 94 |
+
*.hdf5
|
| 95 |
+
*.ckpt
|
| 96 |
+
*.pth
|
| 97 |
+
*.pt
|
| 98 |
+
*.bin
|
| 99 |
+
*.safetensors
|
| 100 |
+
|
| 101 |
+
# Model files and caches
|
| 102 |
+
.cache/
|
| 103 |
+
models/
|
| 104 |
+
checkpoints/
|
| 105 |
+
weights/
|
| 106 |
+
*.weights
|
| 107 |
+
*.cfg
|
| 108 |
+
|
| 109 |
+
# Logs and temporary files
|
| 110 |
+
logs/
|
| 111 |
+
*.log
|
| 112 |
+
tmp/
|
| 113 |
+
temp/
|
| 114 |
+
.tmp/
|
| 115 |
+
|
| 116 |
+
# Node modules (if any)
|
| 117 |
+
node_modules/
|
| 118 |
+
|
| 119 |
+
# Large data files
|
| 120 |
+
data/
|
| 121 |
+
datasets/
|
| 122 |
+
*.csv
|
| 123 |
+
*.json
|
| 124 |
+
*.xml
|
| 125 |
+
*.xlsx
|
| 126 |
+
*.xls
|
| 127 |
+
|
| 128 |
+
# Backup files
|
| 129 |
+
*.bak
|
| 130 |
+
*.backup
|
| 131 |
+
*.old
|
| 132 |
+
|
| 133 |
+
# Jupyter notebooks
|
| 134 |
+
*.ipynb
|
| 135 |
+
.ipynb_checkpoints/
|
| 136 |
+
|
| 137 |
+
# Large images
|
| 138 |
+
*.jpg
|
| 139 |
+
*.jpeg
|
| 140 |
+
*.png
|
| 141 |
+
*.gif
|
| 142 |
+
*.bmp
|
| 143 |
+
*.tiff
|
| 144 |
+
*.tif
|
| 145 |
+
images/
|
| 146 |
+
img/
|
| 147 |
+
|
| 148 |
+
# Audio/Video files
|
| 149 |
+
*.mp3
|
| 150 |
+
*.mp4
|
| 151 |
+
*.avi
|
| 152 |
+
*.mov
|
| 153 |
+
*.wav
|
| 154 |
+
*.flac
|
| 155 |
+
|
| 156 |
+
# Archives
|
| 157 |
+
*.tar
|
| 158 |
+
*.tar.gz
|
| 159 |
+
*.tar.bz2
|
| 160 |
+
*.zip
|
| 161 |
+
*.rar
|
| 162 |
+
*.7z
|
| 163 |
+
|
| 164 |
+
# System files
|
| 165 |
+
Thumbs.db
|
| 166 |
+
ehthumbs.db
|
| 167 |
+
Desktop.ini
|
| 168 |
+
$RECYCLE.BIN/
|
.env.example
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Bank Statement Analyzer Configuration
|
| 2 |
+
# Copy this file to .env and update with your actual values
|
| 3 |
+
|
| 4 |
+
# API Keys
|
| 5 |
+
GROQ_API_KEY=your_groq_api_key_here
|
| 6 |
+
GROQ_BASE_URL=https://api.groq.com/openai/v1
|
| 7 |
+
|
| 8 |
+
HUGGINGFACE_API_KEY=your_huggingface_api_key_here
|
| 9 |
+
HUGGINGFACE_PROVIDER=novita
|
| 10 |
+
|
| 11 |
+
# Model Configuration
|
| 12 |
+
LLM_MODEL=llama-3.1-8b-instant
|
| 13 |
+
|
| 14 |
+
# OCR and Processing Settings
|
| 15 |
+
Y_THRESHOLD=3.0
|
| 16 |
+
GAP_THRESHOLD=10
|
| 17 |
+
GAP_THRESHOLD_RATIO=0.1
|
| 18 |
+
|
| 19 |
+
# File Processing Settings
|
| 20 |
+
TEMP_FILE_NAME=temp.pdf
|
| 21 |
+
DPI=300
|
| 22 |
+
|
| 23 |
+
# spaCy Model Settings
|
| 24 |
+
SPACY_MODEL_NAME=en_core_web_sm
|
| 25 |
+
|
| 26 |
+
# Device Settings
|
| 27 |
+
FORCE_CPU=false
|
.gitignore
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
pip-wheel-metadata/
|
| 20 |
+
share/python-wheels/
|
| 21 |
+
*.egg-info/
|
| 22 |
+
.installed.cfg
|
| 23 |
+
*.egg
|
| 24 |
+
PIPFILE.lock
|
| 25 |
+
|
| 26 |
+
# Virtual Environment
|
| 27 |
+
venv/
|
| 28 |
+
ENV/
|
| 29 |
+
env/
|
| 30 |
+
.venv/
|
| 31 |
+
|
| 32 |
+
# IDE
|
| 33 |
+
.vscode/
|
| 34 |
+
.idea/
|
| 35 |
+
*.swp
|
| 36 |
+
*.swo
|
| 37 |
+
*~
|
| 38 |
+
.project
|
| 39 |
+
.pydevproject
|
| 40 |
+
|
| 41 |
+
# Environment variables
|
| 42 |
+
.env
|
| 43 |
+
.env.local
|
| 44 |
+
.env.*.local
|
| 45 |
+
|
| 46 |
+
# Logs
|
| 47 |
+
logs/
|
| 48 |
+
*.log
|
| 49 |
+
|
| 50 |
+
# Debug
|
| 51 |
+
debug/
|
| 52 |
+
*.debug
|
| 53 |
+
|
| 54 |
+
# Cache
|
| 55 |
+
.cache/
|
| 56 |
+
*.cache
|
| 57 |
+
__pycache__/
|
| 58 |
+
.pytest_cache/
|
| 59 |
+
.mypy_cache/
|
| 60 |
+
.dmypy.json
|
| 61 |
+
dmypy.json
|
| 62 |
+
|
| 63 |
+
# Database
|
| 64 |
+
*.db
|
| 65 |
+
*.sqlite
|
| 66 |
+
*.sqlite3
|
| 67 |
+
|
| 68 |
+
# Output files
|
| 69 |
+
output/
|
| 70 |
+
results/
|
| 71 |
+
exports/
|
| 72 |
+
*.xlsx
|
| 73 |
+
*.csv
|
| 74 |
+
*.json
|
| 75 |
+
|
| 76 |
+
# Temporary files
|
| 77 |
+
tmp/
|
| 78 |
+
temp/
|
| 79 |
+
*.tmp
|
| 80 |
+
*.temp
|
| 81 |
+
|
| 82 |
+
# OS files
|
| 83 |
+
.DS_Store
|
| 84 |
+
Thumbs.db
|
| 85 |
+
ehthumbs.db
|
| 86 |
+
|
| 87 |
+
# Test coverage
|
| 88 |
+
htmlcov/
|
| 89 |
+
.tox/
|
| 90 |
+
.nox/
|
| 91 |
+
.coverage
|
| 92 |
+
.coverage.*
|
| 93 |
+
*.cover
|
| 94 |
+
*.py,cover
|
| 95 |
+
.hypothesis/
|
| 96 |
+
|
| 97 |
+
# Jupyter Notebook
|
| 98 |
+
.ipynb_checkpoints
|
| 99 |
+
|
| 100 |
+
# Redis
|
| 101 |
+
dump.rdb
|
| 102 |
+
|
| 103 |
+
# Secrets
|
| 104 |
+
secrets/
|
| 105 |
+
*.key
|
| 106 |
+
*.pem
|
| 107 |
+
*.crt
|
| 108 |
+
|
| 109 |
+
# Model files
|
| 110 |
+
models/*.pkl
|
| 111 |
+
models/*.h5
|
| 112 |
+
models/*.pt
|
| 113 |
+
|
| 114 |
+
# Large files
|
| 115 |
+
*.pdf
|
| 116 |
+
*.zip
|
| 117 |
+
*.tar.gz
|
| 118 |
+
*.rar
|
| 119 |
+
|
| 120 |
+
# Except test PDFs
|
| 121 |
+
!tests/fixtures/*.pdf
|
| 122 |
+
|
| 123 |
+
check-build-context.py
|
| 124 |
+
test_structure.py
|
| 125 |
+
startup.py
|
| 126 |
+
setup_env.py
|
| 127 |
+
poc.py
|
| 128 |
+
docker-startup.py
|
DOCKER_DEPLOYMENT.md
ADDED
|
@@ -0,0 +1,443 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Docker Deployment Guide
|
| 2 |
+
|
| 3 |
+
This guide explains how to deploy the Bank Statement Analyzer using Docker with Poetry dependency management.
|
| 4 |
+
|
| 5 |
+
## Prerequisites
|
| 6 |
+
|
| 7 |
+
- Docker installed on your system
|
| 8 |
+
- Docker Compose (usually comes with Docker Desktop)
|
| 9 |
+
- API keys for Groq and HuggingFace
|
| 10 |
+
|
| 11 |
+
## Quick Start
|
| 12 |
+
|
| 13 |
+
### 1. Set up Environment Variables
|
| 14 |
+
|
| 15 |
+
Create a `.env` file in the project root:
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
# Copy the example file
|
| 19 |
+
cp env.example .env
|
| 20 |
+
|
| 21 |
+
# Edit with your actual API keys
|
| 22 |
+
nano .env
|
| 23 |
+
```
|
| 24 |
+
|
| 25 |
+
Make sure to set:
|
| 26 |
+
- `GROQ_API_KEY=your_actual_groq_api_key`
|
| 27 |
+
- `HUGGINGFACE_API_KEY=your_actual_huggingface_api_key`
|
| 28 |
+
|
| 29 |
+
### 2. Build and Run with Docker Compose
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
# Build and start the application
|
| 33 |
+
docker-compose up --build
|
| 34 |
+
|
| 35 |
+
# Or run in detached mode
|
| 36 |
+
docker-compose up -d --build
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
### 3. Access the Application
|
| 40 |
+
|
| 41 |
+
Open your browser and go to: `http://localhost:8501`
|
| 42 |
+
|
| 43 |
+
## Manual Docker Build
|
| 44 |
+
|
| 45 |
+
If you prefer to build manually:
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
# Build the image
|
| 49 |
+
docker build -t bank-statement-analyzer .
|
| 50 |
+
|
| 51 |
+
# Run the container
|
| 52 |
+
docker run -p 8501:8501 \
|
| 53 |
+
--env-file .env \
|
| 54 |
+
-v $(pwd)/temp:/app/temp \
|
| 55 |
+
bank-statement-analyzer
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
## Docker Configuration
|
| 59 |
+
|
| 60 |
+
### Dockerfile Features
|
| 61 |
+
|
| 62 |
+
- **Base Image**: Python 3.12 slim for smaller size
|
| 63 |
+
- **Dependency Management**: Poetry for reliable dependency resolution
|
| 64 |
+
- **System Dependencies**: Includes OCR and graphics libraries
|
| 65 |
+
- **PyTorch**: Pre-installed with CPU support (can be changed to CUDA)
|
| 66 |
+
- **spaCy Models**: Pre-downloaded for faster startup
|
| 67 |
+
- **Optimized Layers**: Efficient caching for faster rebuilds
|
| 68 |
+
|
| 69 |
+
### Poetry Configuration
|
| 70 |
+
|
| 71 |
+
The project uses Poetry for dependency management:
|
| 72 |
+
|
| 73 |
+
```toml
|
| 74 |
+
# pyproject.toml
|
| 75 |
+
[tool.poetry]
|
| 76 |
+
name = "bank-statement-analyzer"
|
| 77 |
+
version = "1.0.0"
|
| 78 |
+
description = "A comprehensive, async, class-based bank statement analyzer"
|
| 79 |
+
|
| 80 |
+
[tool.poetry.dependencies]
|
| 81 |
+
python = "^3.12"
|
| 82 |
+
streamlit = "^1.28.0"
|
| 83 |
+
pandas = "^2.0.0"
|
| 84 |
+
# ... other dependencies
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
### Environment Variables
|
| 88 |
+
|
| 89 |
+
The following environment variables can be set in your `.env` file:
|
| 90 |
+
|
| 91 |
+
| Variable | Description | Default |
|
| 92 |
+
|----------|-------------|---------|
|
| 93 |
+
| `GROQ_API_KEY` | Your Groq API key | Required |
|
| 94 |
+
| `HUGGINGFACE_API_KEY` | Your HuggingFace API key | Required |
|
| 95 |
+
| `LLM_MODEL` | Groq model to use | `llama-3.1-8b-instant` |
|
| 96 |
+
| `SPACY_MODEL_NAME` | spaCy model for NER | `en_core_web_sm` |
|
| 97 |
+
| `FORCE_CPU` | Force CPU usage | `false` |
|
| 98 |
+
| `DPI` | PDF processing DPI | `300` |
|
| 99 |
+
| `Y_THRESHOLD` | Text extraction threshold | `3.0` |
|
| 100 |
+
|
| 101 |
+
### Volumes
|
| 102 |
+
|
| 103 |
+
- `./temp:/app/temp`: Shared temp directory for file processing
|
| 104 |
+
- `./.env:/app/.env:ro`: Read-only access to environment file
|
| 105 |
+
|
| 106 |
+
## Production Deployment
|
| 107 |
+
|
| 108 |
+
### Using Docker Compose (Recommended)
|
| 109 |
+
|
| 110 |
+
```yaml
|
| 111 |
+
# docker-compose.prod.yml
|
| 112 |
+
version: '3.8'
|
| 113 |
+
|
| 114 |
+
services:
|
| 115 |
+
bank-statement-analyzer:
|
| 116 |
+
build: .
|
| 117 |
+
ports:
|
| 118 |
+
- "8501:8501"
|
| 119 |
+
environment:
|
| 120 |
+
- PYTHONUNBUFFERED=1
|
| 121 |
+
- POETRY_VENV_IN_PROJECT=1
|
| 122 |
+
- POETRY_NO_INTERACTION=1
|
| 123 |
+
env_file:
|
| 124 |
+
- .env
|
| 125 |
+
volumes:
|
| 126 |
+
- ./temp:/app/temp
|
| 127 |
+
restart: unless-stopped
|
| 128 |
+
healthcheck:
|
| 129 |
+
test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
|
| 130 |
+
interval: 30s
|
| 131 |
+
timeout: 10s
|
| 132 |
+
retries: 3
|
| 133 |
+
start_period: 40s
|
| 134 |
+
deploy:
|
| 135 |
+
resources:
|
| 136 |
+
limits:
|
| 137 |
+
memory: 4G
|
| 138 |
+
reservations:
|
| 139 |
+
memory: 2G
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
### Using Docker Swarm
|
| 143 |
+
|
| 144 |
+
```bash
|
| 145 |
+
# Initialize swarm (if not already done)
|
| 146 |
+
docker swarm init
|
| 147 |
+
|
| 148 |
+
# Deploy the stack
|
| 149 |
+
docker stack deploy -c docker-compose.yml bank-analyzer
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
### Using Kubernetes
|
| 153 |
+
|
| 154 |
+
Create a deployment YAML:
|
| 155 |
+
|
| 156 |
+
```yaml
|
| 157 |
+
apiVersion: apps/v1
|
| 158 |
+
kind: Deployment
|
| 159 |
+
metadata:
|
| 160 |
+
name: bank-statement-analyzer
|
| 161 |
+
spec:
|
| 162 |
+
replicas: 1
|
| 163 |
+
selector:
|
| 164 |
+
matchLabels:
|
| 165 |
+
app: bank-statement-analyzer
|
| 166 |
+
template:
|
| 167 |
+
metadata:
|
| 168 |
+
labels:
|
| 169 |
+
app: bank-statement-analyzer
|
| 170 |
+
spec:
|
| 171 |
+
containers:
|
| 172 |
+
- name: bank-statement-analyzer
|
| 173 |
+
image: bank-statement-analyzer:latest
|
| 174 |
+
ports:
|
| 175 |
+
- containerPort: 8501
|
| 176 |
+
env:
|
| 177 |
+
- name: GROQ_API_KEY
|
| 178 |
+
valueFrom:
|
| 179 |
+
secretKeyRef:
|
| 180 |
+
name: api-secrets
|
| 181 |
+
key: groq-api-key
|
| 182 |
+
- name: HUGGINGFACE_API_KEY
|
| 183 |
+
valueFrom:
|
| 184 |
+
secretKeyRef:
|
| 185 |
+
name: api-secrets
|
| 186 |
+
key: huggingface-api-key
|
| 187 |
+
- name: POETRY_VENV_IN_PROJECT
|
| 188 |
+
value: "1"
|
| 189 |
+
- name: POETRY_NO_INTERACTION
|
| 190 |
+
value: "1"
|
| 191 |
+
resources:
|
| 192 |
+
limits:
|
| 193 |
+
memory: "4Gi"
|
| 194 |
+
cpu: "2"
|
| 195 |
+
requests:
|
| 196 |
+
memory: "2Gi"
|
| 197 |
+
cpu: "1"
|
| 198 |
+
---
|
| 199 |
+
apiVersion: v1
|
| 200 |
+
kind: Service
|
| 201 |
+
metadata:
|
| 202 |
+
name: bank-statement-analyzer-service
|
| 203 |
+
spec:
|
| 204 |
+
selector:
|
| 205 |
+
app: bank-statement-analyzer
|
| 206 |
+
ports:
|
| 207 |
+
- port: 80
|
| 208 |
+
targetPort: 8501
|
| 209 |
+
type: LoadBalancer
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
## Performance Optimization
|
| 213 |
+
|
| 214 |
+
### GPU Support
|
| 215 |
+
|
| 216 |
+
To enable GPU support, modify the Dockerfile:
|
| 217 |
+
|
| 218 |
+
```dockerfile
|
| 219 |
+
# Install PyTorch with CUDA support
|
| 220 |
+
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
And run with GPU access:
|
| 224 |
+
|
| 225 |
+
```bash
|
| 226 |
+
docker run --gpus all -p 8501:8501 --env-file .env bank-statement-analyzer
|
| 227 |
+
```
|
| 228 |
+
|
| 229 |
+
### Memory Optimization
|
| 230 |
+
|
| 231 |
+
- Set `FORCE_CPU=true` in `.env` if GPU is not needed
|
| 232 |
+
- Use smaller spaCy model: `SPACY_MODEL_NAME=en_core_web_sm`
|
| 233 |
+
- Adjust memory limits in docker-compose.yml
|
| 234 |
+
|
| 235 |
+
### Build Optimization
|
| 236 |
+
|
| 237 |
+
- Use `.dockerignore` to exclude unnecessary files
|
| 238 |
+
- Leverage Docker layer caching
|
| 239 |
+
- Use multi-stage builds for production
|
| 240 |
+
- Poetry lock file ensures reproducible builds
|
| 241 |
+
|
| 242 |
+
## Development with Poetry
|
| 243 |
+
|
| 244 |
+
### Local Development
|
| 245 |
+
|
| 246 |
+
```bash
|
| 247 |
+
# Install Poetry (if not already installed)
|
| 248 |
+
curl -sSL https://install.python-poetry.org | python3 -
|
| 249 |
+
|
| 250 |
+
# Install dependencies
|
| 251 |
+
poetry install
|
| 252 |
+
|
| 253 |
+
# Activate virtual environment
|
| 254 |
+
poetry shell
|
| 255 |
+
|
| 256 |
+
# Run the application
|
| 257 |
+
poetry run streamlit run main.py
|
| 258 |
+
```
|
| 259 |
+
|
| 260 |
+
### Adding Dependencies
|
| 261 |
+
|
| 262 |
+
```bash
|
| 263 |
+
# Add a new dependency
|
| 264 |
+
poetry add package-name
|
| 265 |
+
|
| 266 |
+
# Add a development dependency
|
| 267 |
+
poetry add --group dev package-name
|
| 268 |
+
|
| 269 |
+
# Update dependencies
|
| 270 |
+
poetry update
|
| 271 |
+
```
|
| 272 |
+
|
| 273 |
+
### Poetry Scripts
|
| 274 |
+
|
| 275 |
+
The project includes convenient Poetry scripts:
|
| 276 |
+
|
| 277 |
+
```bash
|
| 278 |
+
# Start the application
|
| 279 |
+
poetry run start
|
| 280 |
+
|
| 281 |
+
# Run startup script
|
| 282 |
+
poetry run startup
|
| 283 |
+
|
| 284 |
+
# Run tests
|
| 285 |
+
poetry run test
|
| 286 |
+
|
| 287 |
+
# Setup environment
|
| 288 |
+
poetry run setup
|
| 289 |
+
```
|
| 290 |
+
|
| 291 |
+
## Troubleshooting
|
| 292 |
+
|
| 293 |
+
### Common Issues
|
| 294 |
+
|
| 295 |
+
1. **Port Already in Use**
|
| 296 |
+
```bash
|
| 297 |
+
# Check what's using port 8501
|
| 298 |
+
lsof -i :8501
|
| 299 |
+
|
| 300 |
+
# Use different port
|
| 301 |
+
docker run -p 8502:8501 bank-statement-analyzer
|
| 302 |
+
```
|
| 303 |
+
|
| 304 |
+
2. **Permission Issues**
|
| 305 |
+
```bash
|
| 306 |
+
# Fix temp directory permissions
|
| 307 |
+
sudo chown -R 1000:1000 ./temp
|
| 308 |
+
```
|
| 309 |
+
|
| 310 |
+
3. **Memory Issues**
|
| 311 |
+
```bash
|
| 312 |
+
# Increase Docker memory limit
|
| 313 |
+
# In Docker Desktop: Settings > Resources > Memory
|
| 314 |
+
```
|
| 315 |
+
|
| 316 |
+
4. **API Key Issues**
|
| 317 |
+
```bash
|
| 318 |
+
# Check environment variables
|
| 319 |
+
docker exec -it <container_id> env | grep API
|
| 320 |
+
```
|
| 321 |
+
|
| 322 |
+
5. **Poetry Issues**
|
| 323 |
+
```bash
|
| 324 |
+
# Clear Poetry cache
|
| 325 |
+
poetry cache clear . --all
|
| 326 |
+
|
| 327 |
+
# Reinstall dependencies
|
| 328 |
+
poetry install --sync
|
| 329 |
+
```
|
| 330 |
+
|
| 331 |
+
### Logs
|
| 332 |
+
|
| 333 |
+
```bash
|
| 334 |
+
# View container logs
|
| 335 |
+
docker-compose logs -f
|
| 336 |
+
|
| 337 |
+
# View specific service logs
|
| 338 |
+
docker-compose logs -f bank-statement-analyzer
|
| 339 |
+
```
|
| 340 |
+
|
| 341 |
+
### Health Check
|
| 342 |
+
|
| 343 |
+
The application includes a health check endpoint:
|
| 344 |
+
|
| 345 |
+
```bash
|
| 346 |
+
# Test health endpoint
|
| 347 |
+
curl http://localhost:8501/_stcore/health
|
| 348 |
+
```
|
| 349 |
+
|
| 350 |
+
## Security Considerations
|
| 351 |
+
|
| 352 |
+
1. **API Keys**: Never commit `.env` files to version control
|
| 353 |
+
2. **Network**: Use internal networks for production
|
| 354 |
+
3. **Volumes**: Limit volume access to necessary directories
|
| 355 |
+
4. **User**: Run container as non-root user
|
| 356 |
+
5. **Updates**: Regularly update base images and dependencies
|
| 357 |
+
6. **Dependencies**: Poetry lock file ensures reproducible builds
|
| 358 |
+
|
| 359 |
+
## Monitoring
|
| 360 |
+
|
| 361 |
+
### Basic Monitoring
|
| 362 |
+
|
| 363 |
+
```bash
|
| 364 |
+
# Check container status
|
| 365 |
+
docker ps
|
| 366 |
+
|
| 367 |
+
# Monitor resource usage
|
| 368 |
+
docker stats
|
| 369 |
+
|
| 370 |
+
# Check logs
|
| 371 |
+
docker-compose logs -f
|
| 372 |
+
```
|
| 373 |
+
|
| 374 |
+
### Advanced Monitoring
|
| 375 |
+
|
| 376 |
+
Consider using:
|
| 377 |
+
- Prometheus + Grafana for metrics
|
| 378 |
+
- ELK stack for log aggregation
|
| 379 |
+
- Docker Swarm or Kubernetes for orchestration
|
| 380 |
+
|
| 381 |
+
## Backup and Recovery
|
| 382 |
+
|
| 383 |
+
### Data Backup
|
| 384 |
+
|
| 385 |
+
```bash
|
| 386 |
+
# Backup temp directory
|
| 387 |
+
tar -czf temp_backup.tar.gz ./temp
|
| 388 |
+
|
| 389 |
+
# Backup environment configuration
|
| 390 |
+
cp .env .env.backup
|
| 391 |
+
|
| 392 |
+
# Backup Poetry lock file
|
| 393 |
+
cp poetry.lock poetry.lock.backup
|
| 394 |
+
```
|
| 395 |
+
|
| 396 |
+
### Container Backup
|
| 397 |
+
|
| 398 |
+
```bash
|
| 399 |
+
# Save container image
|
| 400 |
+
docker save bank-statement-analyzer > bank-analyzer.tar
|
| 401 |
+
|
| 402 |
+
# Load container image
|
| 403 |
+
docker load < bank-analyzer.tar
|
| 404 |
+
```
|
| 405 |
+
|
| 406 |
+
## Scaling
|
| 407 |
+
|
| 408 |
+
### Horizontal Scaling
|
| 409 |
+
|
| 410 |
+
```yaml
|
| 411 |
+
# docker-compose.scale.yml
|
| 412 |
+
version: '3.8'
|
| 413 |
+
|
| 414 |
+
services:
|
| 415 |
+
bank-statement-analyzer:
|
| 416 |
+
build: .
|
| 417 |
+
ports:
|
| 418 |
+
- "8501:8501"
|
| 419 |
+
deploy:
|
| 420 |
+
replicas: 3
|
| 421 |
+
environment:
|
| 422 |
+
- PYTHONUNBUFFERED=1
|
| 423 |
+
- POETRY_VENV_IN_PROJECT=1
|
| 424 |
+
```
|
| 425 |
+
|
| 426 |
+
### Load Balancing
|
| 427 |
+
|
| 428 |
+
Use a reverse proxy like Nginx:
|
| 429 |
+
|
| 430 |
+
```nginx
|
| 431 |
+
upstream streamlit {
|
| 432 |
+
server bank-statement-analyzer:8501;
|
| 433 |
+
}
|
| 434 |
+
|
| 435 |
+
server {
|
| 436 |
+
listen 80;
|
| 437 |
+
location / {
|
| 438 |
+
proxy_pass http://streamlit;
|
| 439 |
+
proxy_set_header Host $host;
|
| 440 |
+
proxy_set_header X-Real-IP $remote_addr;
|
| 441 |
+
}
|
| 442 |
+
}
|
| 443 |
+
```
|
Dockerfile
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use official Python 3.12 slim image
|
| 2 |
+
FROM python:3.12-slim
|
| 3 |
+
|
| 4 |
+
# Set environment variables
|
| 5 |
+
ENV PYTHONUNBUFFERED=1
|
| 6 |
+
ENV PYTHONDONTWRITEBYTECODE=1
|
| 7 |
+
ENV POETRY_VERSION=1.8.2
|
| 8 |
+
ENV POETRY_HOME="/opt/poetry"
|
| 9 |
+
ENV POETRY_VENV_IN_PROJECT=1
|
| 10 |
+
ENV POETRY_NO_INTERACTION=1
|
| 11 |
+
|
| 12 |
+
# Set work directory
|
| 13 |
+
WORKDIR /app
|
| 14 |
+
|
| 15 |
+
# Install system dependencies in a single layer
|
| 16 |
+
RUN apt-get update && apt-get install -y \
|
| 17 |
+
curl \
|
| 18 |
+
build-essential \
|
| 19 |
+
tesseract-ocr \
|
| 20 |
+
libtesseract-dev \
|
| 21 |
+
poppler-utils \
|
| 22 |
+
libgl1-mesa-glx \
|
| 23 |
+
libglib2.0-0 \
|
| 24 |
+
libsm6 \
|
| 25 |
+
libxext6 \
|
| 26 |
+
libxrender-dev \
|
| 27 |
+
libgomp1 \
|
| 28 |
+
&& apt-get clean \
|
| 29 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 30 |
+
|
| 31 |
+
# Install Poetry
|
| 32 |
+
RUN curl -sSL https://install.python-poetry.org | python3 - \
|
| 33 |
+
&& export PATH="/opt/poetry/bin:$PATH" \
|
| 34 |
+
&& poetry --version
|
| 35 |
+
|
| 36 |
+
# Add Poetry to PATH
|
| 37 |
+
ENV PATH="/opt/poetry/bin:$PATH"
|
| 38 |
+
|
| 39 |
+
# Copy only Poetry configuration files first (for better caching)
|
| 40 |
+
COPY pyproject.toml poetry.lock* /app/
|
| 41 |
+
|
| 42 |
+
# Configure Poetry and install dependencies
|
| 43 |
+
RUN poetry config virtualenvs.create false \
|
| 44 |
+
&& poetry lock --no-update \
|
| 45 |
+
&& poetry install --no-interaction --no-ansi --only main
|
| 46 |
+
|
| 47 |
+
# Install PyTorch with CPU support (adjust based on your needs)
|
| 48 |
+
RUN pip3 install torch torchvision torchaudio
|
| 49 |
+
|
| 50 |
+
# Install spaCy models
|
| 51 |
+
RUN python -m spacy download en_core_web_sm
|
| 52 |
+
|
| 53 |
+
# Create temp directory for file processing
|
| 54 |
+
RUN mkdir -p /app/temp && chmod 777 /app/temp
|
| 55 |
+
|
| 56 |
+
# Copy the source code (this layer will be rebuilt when code changes)
|
| 57 |
+
COPY src/ /app/src/
|
| 58 |
+
COPY main.py /app/
|
| 59 |
+
|
| 60 |
+
# Expose the port Streamlit will run on
|
| 61 |
+
EXPOSE 8501
|
| 62 |
+
|
| 63 |
+
# Set environment variables for Streamlit
|
| 64 |
+
ENV STREAMLIT_SERVER_PORT=8501
|
| 65 |
+
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
|
| 66 |
+
ENV STREAMLIT_SERVER_HEADLESS=true
|
| 67 |
+
ENV STREAMLIT_SERVER_ENABLE_CORS=false
|
| 68 |
+
ENV STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false
|
| 69 |
+
|
| 70 |
+
# Run the Streamlit application
|
| 71 |
+
CMD ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
Dockerfile.alternative
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use official Python 3.12 slim image
|
| 2 |
+
FROM python:3.12-slim
|
| 3 |
+
|
| 4 |
+
# Set environment variables
|
| 5 |
+
ENV PYTHONUNBUFFERED=1
|
| 6 |
+
ENV PYTHONDONTWRITEBYTECODE=1
|
| 7 |
+
ENV POETRY_VERSION=1.8.2
|
| 8 |
+
ENV POETRY_NO_INTERACTION=1
|
| 9 |
+
|
| 10 |
+
# Set work directory
|
| 11 |
+
WORKDIR /app
|
| 12 |
+
|
| 13 |
+
# Install system dependencies in a single layer
|
| 14 |
+
RUN apt-get update && apt-get install -y \
|
| 15 |
+
curl \
|
| 16 |
+
build-essential \
|
| 17 |
+
tesseract-ocr \
|
| 18 |
+
libtesseract-dev \
|
| 19 |
+
poppler-utils \
|
| 20 |
+
libgl1-mesa-glx \
|
| 21 |
+
libglib2.0-0 \
|
| 22 |
+
libsm6 \
|
| 23 |
+
libxext6 \
|
| 24 |
+
libxrender-dev \
|
| 25 |
+
libgomp1 \
|
| 26 |
+
&& apt-get clean \
|
| 27 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 28 |
+
|
| 29 |
+
# Install Poetry using pip (alternative method)
|
| 30 |
+
RUN pip install poetry==$POETRY_VERSION
|
| 31 |
+
|
| 32 |
+
# Copy only Poetry configuration files first (for better caching)
|
| 33 |
+
COPY pyproject.toml poetry.lock* /app/
|
| 34 |
+
|
| 35 |
+
# Configure Poetry and install dependencies
|
| 36 |
+
RUN poetry config virtualenvs.create false \
|
| 37 |
+
&& poetry install --no-interaction --no-ansi --only main
|
| 38 |
+
|
| 39 |
+
# Install PyTorch with CPU support (adjust based on your needs)
|
| 40 |
+
RUN pip3 install torch torchvision torchaudio
|
| 41 |
+
|
| 42 |
+
# Install spaCy models
|
| 43 |
+
RUN python -m spacy download en_core_web_sm
|
| 44 |
+
|
| 45 |
+
# Create temp directory for file processing
|
| 46 |
+
RUN mkdir -p /app/temp && chmod 777 /app/temp
|
| 47 |
+
|
| 48 |
+
# Copy the source code (this layer will be rebuilt when code changes)
|
| 49 |
+
COPY src/ /app/src/
|
| 50 |
+
COPY main.py /app/
|
| 51 |
+
|
| 52 |
+
# Expose the port Streamlit will run on
|
| 53 |
+
EXPOSE 8501
|
| 54 |
+
|
| 55 |
+
# Set environment variables for Streamlit
|
| 56 |
+
ENV STREAMLIT_SERVER_PORT=8501
|
| 57 |
+
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
|
| 58 |
+
ENV STREAMLIT_SERVER_HEADLESS=true
|
| 59 |
+
ENV STREAMLIT_SERVER_ENABLE_CORS=false
|
| 60 |
+
ENV STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false
|
| 61 |
+
|
| 62 |
+
# Run the Streamlit application
|
| 63 |
+
CMD ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
Dockerfile.fallback
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Fallback image: plain pip install instead of Poetry.
# Use official Python 3.12 slim image (Debian bookworm based).
FROM python:3.12-slim

# Do not buffer stdout/stderr; do not write .pyc files.
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1

# Set work directory
WORKDIR /app

# System dependencies for OCR (tesseract), PDF rendering (poppler) and
# OpenCV-style imaging libraries, installed in a single layer.
# NOTE: libgl1-mesa-glx no longer exists on Debian bookworm; libgl1 is the
# correct replacement. --no-install-recommends keeps the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    build-essential \
    tesseract-ocr \
    libtesseract-dev \
    poppler-utils \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements file (trailing * lets the COPY succeed even if absent)
COPY requirements.txt* /app/

# Upgrade pip first
RUN pip install --no-cache-dir --upgrade pip

# Install dependencies from requirements.txt if it exists, otherwise install
# a hard-coded list that mirrors it. Version specifiers are quoted: an
# unquoted ">=" would be interpreted by the shell as output redirection.
RUN if [ -f "requirements.txt" ]; then \
        pip install --no-cache-dir -r requirements.txt; \
    else \
        pip install --no-cache-dir \
        "streamlit>=1.28.0" \
        "pandas>=2.0.0" \
        "numpy>=1.24.0" \
        "PyMuPDF>=1.23.0" \
        "PyPDF2>=3.0.0" \
        "python-doctr>=0.12.0" \
        "pdf2image>=1.16.0" \
        "spacy>=3.7.0" \
        "torch>=2.0.0" \
        "fuzzywuzzy>=0.18.0" \
        "python-Levenshtein>=0.21.0" \
        "openai>=1.0.0" \
        "huggingface-hub>=0.19.0" \
        "pydantic>=2.0.0" \
        "pydantic-settings>=2.0.0" \
        "python-dateutil>=2.8.0" \
        "python-dotenv>=1.0.0"; \
    fi

# PyTorch plus torchvision/torchaudio (the latter two are not in requirements.txt)
RUN pip install --no-cache-dir torch torchvision torchaudio

# spaCy English model used by the NER-based extractors
RUN python -m spacy download en_core_web_sm

# World-writable scratch directory for uploaded PDFs
RUN mkdir -p /app/temp && chmod 777 /app/temp

# Copy application code last so code changes do not invalidate the
# dependency layers above
COPY src/ /app/src/
COPY main.py /app/

# Expose the port Streamlit will run on
EXPOSE 8501

# Streamlit configuration for headless, containerised operation
ENV STREAMLIT_SERVER_PORT=8501
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
ENV STREAMLIT_SERVER_HEADLESS=true
ENV STREAMLIT_SERVER_ENABLE_CORS=false
ENV STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false

# Run the Streamlit application
CMD ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
README.md
ADDED
|
@@ -0,0 +1,360 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Bank Statement Analyzer
|
| 2 |
+
|
| 3 |
+
A comprehensive, async, class-based bank statement analyzer that extracts account information and transaction tables from PDF bank statements.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **Async Processing**: All operations are asynchronous for better performance
|
| 8 |
+
- **Class-Based Architecture**: Well-organized, maintainable code structure
|
| 9 |
+
- **Model Pre-loading**: Models are loaded once at startup for faster processing
|
| 10 |
+
- **Environment Configuration**: Flexible configuration via .env files
|
| 11 |
+
- **Multiple PDF Support**: Handles both digital and scanned PDFs
|
| 12 |
+
- **OCR Integration**: Uses doctr for scanned PDF processing
|
| 13 |
+
- **LLM Integration**: Uses Groq API for intelligent data extraction
|
| 14 |
+
- **Table Extraction**: Extracts and processes transaction tables
|
| 15 |
+
- **Account Information**: Extracts account numbers, balances, and bank names
|
| 16 |
+
- **Streamlit Interface**: User-friendly web interface
|
| 17 |
+
|
| 18 |
+
## Project Structure
|
| 19 |
+
|
| 20 |
+
```
|
| 21 |
+
bank-scrubber/
|
| 22 |
+
βββ src/
|
| 23 |
+
β βββ config/
|
| 24 |
+
β β βββ config.py # Configuration settings and API keys
|
| 25 |
+
β βββ models/
|
| 26 |
+
β β βββ __init__.py
|
| 27 |
+
β β βββ account_models.py # Pydantic models for data validation
|
| 28 |
+
β βββ utils/
|
| 29 |
+
β β βββ __init__.py
|
| 30 |
+
β β βββ api_clients.py # Async API clients for Groq and HuggingFace
|
| 31 |
+
β β βββ model_manager.py # Singleton model manager for pre-loading
|
| 32 |
+
β βββ ocr/
|
| 33 |
+
β β βββ __init__.py
|
| 34 |
+
β β βββ pdf_processor.py # PDF processing and OCR setup
|
| 35 |
+
β β βββ text_extractor.py # Text extraction with bounding boxes
|
| 36 |
+
β βββ extractor/
|
| 37 |
+
β β βββ __init__.py
|
| 38 |
+
β β βββ table_extractor.py # Transaction table extraction and processing
|
| 39 |
+
β β βββ account_extractor.py # Account number and bank name extraction
|
| 40 |
+
β β βββ balance_extractor.py # Balance information extraction
|
| 41 |
+
β βββ services/
|
| 42 |
+
β β βββ __init__.py
|
| 43 |
+
β β βββ bank_statement_service.py # Main service orchestrating all operations
|
| 44 |
+
β βββ __init__.py
|
| 45 |
+
βββ main.py # Streamlit application entry point
|
| 46 |
+
βββ startup.py # Model pre-loading script
|
| 47 |
+
βββ setup_env.py # Environment setup helper
|
| 48 |
+
βββ .env.example             # Environment variables template
|
| 49 |
+
βββ test_structure.py # Structure testing script
|
| 50 |
+
βββ poc.py # Original monolithic file (preserved)
|
| 51 |
+
βββ requirements.txt # Python dependencies
|
| 52 |
+
βββ README.md # This file
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
## Installation
|
| 56 |
+
|
| 57 |
+
1. Clone the repository:
|
| 58 |
+
```bash
|
| 59 |
+
git clone <repository-url>
|
| 60 |
+
cd bank-scrubber
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
2. Create a virtual environment:
|
| 64 |
+
```bash
|
| 65 |
+
python -m venv venv
|
| 66 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
3. Install dependencies:
|
| 70 |
+
```bash
|
| 71 |
+
pip install -r requirements.txt
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
4. Install spaCy models:
|
| 75 |
+
```bash
|
| 76 |
+
python -m spacy download en_core_web_sm
|
| 77 |
+
# Optional: python -m spacy download en_core_web_trf
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
## Configuration
|
| 81 |
+
|
| 82 |
+
### Quick Setup
|
| 83 |
+
|
| 84 |
+
Use the setup script to create your environment file:
|
| 85 |
+
|
| 86 |
+
```bash
|
| 87 |
+
python setup_env.py
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
This will:
|
| 91 |
+
- Create a `.env` file from the template
|
| 92 |
+
- Guide you through the setup process
|
| 93 |
+
- Show current configuration status
|
| 94 |
+
|
| 95 |
+
### Manual Setup
|
| 96 |
+
|
| 97 |
+
1. Copy the environment template:
|
| 98 |
+
```bash
|
| 99 |
+
cp .env.example .env
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
2. Edit the `.env` file with your API keys and settings:
|
| 103 |
+
|
| 104 |
+
```env
|
| 105 |
+
# API Keys
|
| 106 |
+
GROQ_API_KEY=your_actual_groq_api_key_here
|
| 107 |
+
HUGGINGFACE_API_KEY=your_actual_huggingface_api_key_here
|
| 108 |
+
|
| 109 |
+
# Model Configuration
|
| 110 |
+
LLM_MODEL=llama-3.1-8b-instant
|
| 111 |
+
SPACY_MODEL_NAME=en_core_web_sm
|
| 112 |
+
|
| 113 |
+
# Device Settings
|
| 114 |
+
FORCE_CPU=false
|
| 115 |
+
|
| 116 |
+
# Processing Settings
|
| 117 |
+
DPI=300
|
| 118 |
+
Y_THRESHOLD=3.0
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
### Configuration Options
|
| 122 |
+
|
| 123 |
+
| Variable | Description | Default |
|
| 124 |
+
|----------|-------------|---------|
|
| 125 |
+
| `GROQ_API_KEY` | Your Groq API key | Required |
|
| 126 |
+
| `HUGGINGFACE_API_KEY` | Your HuggingFace API key | Required |
|
| 127 |
+
| `LLM_MODEL` | Groq model to use | `llama-3.1-8b-instant` |
|
| 128 |
+
| `SPACY_MODEL_NAME` | spaCy model for NER | `en_core_web_sm` |
|
| 129 |
+
| `FORCE_CPU` | Force CPU usage | `false` |
|
| 130 |
+
| `DPI` | PDF processing DPI | `300` |
|
| 131 |
+
| `Y_THRESHOLD` | Text extraction threshold | `3.0` |
|
| 132 |
+
| `GAP_THRESHOLD` | Table gap threshold | `10` |
|
| 133 |
+
| `TEMP_FILE_NAME` | Temporary file name | `temp.pdf` |
|
| 134 |
+
|
| 135 |
+
## Usage
|
| 136 |
+
|
| 137 |
+
### Quick Start
|
| 138 |
+
|
| 139 |
+
1. Set up environment:
|
| 140 |
+
```bash
|
| 141 |
+
python setup_env.py
|
| 142 |
+
```
|
| 143 |
+
|
| 144 |
+
2. Pre-load models:
|
| 145 |
+
```bash
|
| 146 |
+
python startup.py
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
3. Run the application:
|
| 150 |
+
```bash
|
| 151 |
+
streamlit run main.py
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
### Advanced Usage
|
| 155 |
+
|
| 156 |
+
#### Model Pre-loading (Recommended)
|
| 157 |
+
|
| 158 |
+
For optimal performance, pre-load models before running the application:
|
| 159 |
+
|
| 160 |
+
```bash
|
| 161 |
+
# Pre-load all models
|
| 162 |
+
python startup.py
|
| 163 |
+
|
| 164 |
+
# Then run the main application
|
| 165 |
+
streamlit run main.py
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
#### Direct Application Run
|
| 169 |
+
|
| 170 |
+
You can also run the application directly, which will load models on first use:
|
| 171 |
+
|
| 172 |
+
```bash
|
| 173 |
+
streamlit run main.py
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
#### Using the Service Programmatically
|
| 177 |
+
|
| 178 |
+
```python
|
| 179 |
+
import asyncio
|
| 180 |
+
from src.services import BankStatementService
|
| 181 |
+
|
| 182 |
+
async def process_statement(file_path):
|
| 183 |
+
async with BankStatementService() as service:
|
| 184 |
+
with open(file_path, 'rb') as f:
|
| 185 |
+
result = await service.process_bank_statement(f)
|
| 186 |
+
return result
|
| 187 |
+
|
| 188 |
+
# Usage
|
| 189 |
+
result = asyncio.run(process_statement('path/to/statement.pdf'))
|
| 190 |
+
print(result.account_summary)
|
| 191 |
+
print(result.transaction_tables)
|
| 192 |
+
```
|
| 193 |
+
|
| 194 |
+
## Architecture Overview
|
| 195 |
+
|
| 196 |
+
### Configuration Management
|
| 197 |
+
- **Environment Variables**: All settings configurable via `.env` file
|
| 198 |
+
- **Pydantic Settings**: Type-safe configuration with validation
|
| 199 |
+
- **Fallback Values**: Sensible defaults for all settings
|
| 200 |
+
- **API Key Management**: Secure handling of API credentials
|
| 201 |
+
|
| 202 |
+
### Model Management
|
| 203 |
+
- **ModelManager**: Singleton class that pre-loads and manages all ML models
|
| 204 |
+
- **Pre-loading**: Models are loaded once at startup and reused across the application
|
| 205 |
+
- **Device Optimization**: Automatic GPU detection and utilization
|
| 206 |
+
- **Configurable Models**: spaCy model selection via environment variables
|
| 207 |
+
|
| 208 |
+
### Services Layer
|
| 209 |
+
- **BankStatementService**: Main orchestrator that coordinates all processing steps
|
| 210 |
+
|
| 211 |
+
### OCR Layer
|
| 212 |
+
- **PDFProcessor**: Handles PDF file operations and uses pre-loaded OCR models
|
| 213 |
+
- **TextExtractor**: Extracts text with bounding boxes from both digital and scanned PDFs
|
| 214 |
+
|
| 215 |
+
### Extractor Layer
|
| 216 |
+
- **TableExtractor**: Processes transaction tables with pattern matching and data cleaning
|
| 217 |
+
- **AccountExtractor**: Extracts account numbers and bank names using regex and NER
|
| 218 |
+
- **BalanceExtractor**: Extracts balance information using keyword matching
|
| 219 |
+
|
| 220 |
+
### Utils Layer
|
| 221 |
+
- **GroqClient**: Async client for Groq LLM API
|
| 222 |
+
- **HuggingFaceClient**: Async client for HuggingFace Inference API
|
| 223 |
+
- **ModelManager**: Centralized model management and pre-loading
|
| 224 |
+
|
| 225 |
+
### Models Layer
|
| 226 |
+
- **BankStatementData**: Main data model for processed results
|
| 227 |
+
- **AccountSummary**: Model for account information
|
| 228 |
+
- **AccountDetails**: Model for individual account details
|
| 229 |
+
|
| 230 |
+
## Key Features
|
| 231 |
+
|
| 232 |
+
### Environment Configuration
|
| 233 |
+
All settings are configurable via environment variables:
|
| 234 |
+
|
| 235 |
+
```python
|
| 236 |
+
from src.config.config import settings
|
| 237 |
+
|
| 238 |
+
print(f"Using model: {settings.llm_model}")
|
| 239 |
+
print(f"Device: {'CPU' if settings.force_cpu else 'Auto'}")
|
| 240 |
+
```
|
| 241 |
+
|
| 242 |
+
### Model Pre-loading
|
| 243 |
+
Models are loaded once at startup and reused throughout the application:
|
| 244 |
+
|
| 245 |
+
```python
|
| 246 |
+
from src.utils import model_manager
|
| 247 |
+
|
| 248 |
+
# Check model status
|
| 249 |
+
status = model_manager.get_model_status()
|
| 250 |
+
print(f"Models loaded: {status['models_loaded']}")
|
| 251 |
+
```
|
| 252 |
+
|
| 253 |
+
### Async Processing
|
| 254 |
+
All operations are asynchronous, allowing for better performance and resource utilization:
|
| 255 |
+
|
| 256 |
+
```python
|
| 257 |
+
async with BankStatementService() as service:
|
| 258 |
+
result = await service.process_bank_statement(uploaded_file)
|
| 259 |
+
```
|
| 260 |
+
|
| 261 |
+
### Class-Based Design
|
| 262 |
+
Each component is a class with async context manager support:
|
| 263 |
+
|
| 264 |
+
```python
|
| 265 |
+
class MyService:
|
| 266 |
+
async def __aenter__(self):
|
| 267 |
+
return self
|
| 268 |
+
|
| 269 |
+
async def __aexit__(self, exc_type, exc_value, traceback):
|
| 270 |
+
pass
|
| 271 |
+
```
|
| 272 |
+
|
| 273 |
+
### Error Handling
|
| 274 |
+
Comprehensive error handling throughout the pipeline with graceful fallbacks.
|
| 275 |
+
|
| 276 |
+
## Performance Optimization
|
| 277 |
+
|
| 278 |
+
### Model Pre-loading Benefits
|
| 279 |
+
- **Faster Processing**: Models are loaded once at startup, not on each file upload
|
| 280 |
+
- **Memory Efficiency**: Single instance of each model shared across the application
|
| 281 |
+
- **GPU Optimization**: Automatic GPU detection and utilization
|
| 282 |
+
- **Reduced Latency**: No model loading delays during file processing
|
| 283 |
+
|
| 284 |
+
### Configuration Benefits
|
| 285 |
+
- **Flexible Settings**: Easy to adjust parameters without code changes
|
| 286 |
+
- **Environment-Specific**: Different settings for development/production
|
| 287 |
+
- **Secure**: API keys kept separate from code
|
| 288 |
+
- **Version Control Safe**: `.env` files can be excluded from git
|
| 289 |
+
|
| 290 |
+
### Startup Process
|
| 291 |
+
1. **Configuration Loading**: Loads settings from `.env` file
|
| 292 |
+
2. **Model Detection**: Automatically detects available models (spaCy, doctr)
|
| 293 |
+
3. **Device Selection**: Chooses optimal device (GPU/CPU) based on config
|
| 294 |
+
4. **Pre-loading**: Loads all models into memory
|
| 295 |
+
5. **Status Reporting**: Provides detailed loading status
|
| 296 |
+
|
| 297 |
+
## Testing
|
| 298 |
+
|
| 299 |
+
Run the structure test to verify everything works:
|
| 300 |
+
|
| 301 |
+
```bash
|
| 302 |
+
python test_structure.py
|
| 303 |
+
```
|
| 304 |
+
|
| 305 |
+
This will test:
|
| 306 |
+
- All module imports
|
| 307 |
+
- Model manager functionality
|
| 308 |
+
- Service initialization
|
| 309 |
+
- Configuration access
|
| 310 |
+
|
| 311 |
+
## Troubleshooting
|
| 312 |
+
|
| 313 |
+
### Common Issues
|
| 314 |
+
|
| 315 |
+
1. **API Keys Not Set**
|
| 316 |
+
```bash
|
| 317 |
+
python setup_env.py
|
| 318 |
+
# Edit .env file with your actual API keys
|
| 319 |
+
```
|
| 320 |
+
|
| 321 |
+
2. **spaCy Model Not Found**
|
| 322 |
+
```bash
|
| 323 |
+
python -m spacy download en_core_web_sm
|
| 324 |
+
```
|
| 325 |
+
|
| 326 |
+
3. **GPU Not Detected**
|
| 327 |
+
- Set `FORCE_CPU=true` in `.env` file
|
| 328 |
+
- Or install CUDA-compatible PyTorch
|
| 329 |
+
|
| 330 |
+
4. **Configuration Issues**
|
| 331 |
+
```bash
|
| 332 |
+
python setup_env.py
|
| 333 |
+
# Check current configuration
|
| 334 |
+
```
|
| 335 |
+
|
| 336 |
+
## Dependencies
|
| 337 |
+
|
| 338 |
+
- **Streamlit**: Web interface
|
| 339 |
+
- **PyMuPDF**: PDF processing
|
| 340 |
+
- **doctr**: OCR for scanned PDFs
|
| 341 |
+
- **spaCy**: Natural language processing
|
| 342 |
+
- **torch**: Deep learning framework
|
| 343 |
+
- **pandas**: Data manipulation
|
| 344 |
+
- **openai**: Groq API client
|
| 345 |
+
- **huggingface-hub**: HuggingFace API client
|
| 346 |
+
- **pydantic**: Data validation
|
| 347 |
+
- **fuzzywuzzy**: Fuzzy string matching
|
| 348 |
+
- **python-dotenv**: Environment variable loading
|
| 349 |
+
|
| 350 |
+
## Contributing
|
| 351 |
+
|
| 352 |
+
1. Fork the repository
|
| 353 |
+
2. Create a feature branch
|
| 354 |
+
3. Make your changes
|
| 355 |
+
4. Add tests if applicable
|
| 356 |
+
5. Submit a pull request
|
| 357 |
+
|
| 358 |
+
## License
|
| 359 |
+
|
| 360 |
+
This project is licensed under the MIT License.
|
build-docker.sh
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# Docker build script for Bank Statement Analyzer.
# Build strategy: try the main (Poetry-based) Dockerfile first, then
# Dockerfile.alternative, then the pip-based Dockerfile.fallback; on any
# success, print instructions for running the image.
echo "π³ Building Bank Statement Analyzer Docker Image"
echo "=================================================="

# Check if we're in the right directory (Dockerfile must be present)
if [ ! -f "Dockerfile" ]; then
    echo "β Error: Dockerfile not found in current directory"
    echo "   Please run this script from the project root directory"
    exit 1
fi

# Check build context size (helper script; assumed to live in the repo root)
echo "π Checking build context size..."
python3 check-build-context.py

echo ""
echo "π¦ Building Docker image..."

# Check if pyproject.toml exists (needed by the Poetry-based Dockerfiles)
if [ ! -f "pyproject.toml" ]; then
    echo "β Error: pyproject.toml not found!"
    echo "   Please ensure you have a valid pyproject.toml file"
    exit 1
fi

# Try building with the main Dockerfile first
echo "π Attempting build with main Dockerfile..."
docker build -t bank-statement-analyzer .

# $? holds the exit status of the docker build immediately above
if [ $? -eq 0 ]; then
    echo ""
    echo "β Docker image built successfully!"
    echo ""
    echo "π To run the application:"
    echo "   docker run -p 8501:8501 --env-file .env bank-statement-analyzer"
    echo ""
    echo "   Or use docker-compose:"
    echo "   docker-compose up"
else
    echo ""
    echo "β οΈ Main Dockerfile build failed. Trying alternative method..."

    # Check if alternative Dockerfile exists
    if [ -f "Dockerfile.alternative" ]; then
        echo "π Attempting build with alternative Dockerfile..."
        docker build -f Dockerfile.alternative -t bank-statement-analyzer .

        if [ $? -eq 0 ]; then
            echo ""
            echo "β Docker image built successfully with alternative method!"
            echo ""
            echo "π To run the application:"
            echo "   docker run -p 8501:8501 --env-file .env bank-statement-analyzer"
            echo ""
            echo "   Or use docker-compose:"
            echo "   docker-compose up"
        else
            echo ""
            echo "β οΈ Alternative Dockerfile also failed. Trying fallback method..."

            # Check if fallback Dockerfile exists
            if [ -f "Dockerfile.fallback" ]; then
                echo "π Attempting build with fallback Dockerfile (pip-based)..."
                docker build -f Dockerfile.fallback -t bank-statement-analyzer .

                if [ $? -eq 0 ]; then
                    echo ""
                    echo "β Docker image built successfully with fallback method!"
                    echo ""
                    echo "π To run the application:"
                    echo "   docker run -p 8501:8501 --env-file .env bank-statement-analyzer"
                    echo ""
                    echo "   Or use docker-compose:"
                    echo "   docker-compose up"
                else
                    # All three Dockerfiles failed: print diagnostics and bail out
                    echo ""
                    echo "β All Dockerfile methods failed!"
                    echo ""
                    echo "π‘ Troubleshooting tips:"
                    echo "   - Check if Poetry is properly configured"
                    echo "   - Ensure pyproject.toml and poetry.lock are valid"
                    echo "   - Try running 'poetry install' locally first"
                    echo "   - Check Docker logs for specific error messages"
                    echo "   - Verify system dependencies are available"
                    echo ""
                    echo "π§ Manual troubleshooting:"
                    echo "   docker build -t bank-statement-analyzer . 2>&1 | tee build.log"
                    exit 1
                fi
            else
                # Fallback Dockerfile missing entirely
                echo ""
                echo "β Fallback Dockerfile not found!"
                echo ""
                echo "π‘ Troubleshooting tips:"
                echo "   - Check if large files are being included in build context"
                echo "   - Ensure .dockerignore is properly configured"
                echo "   - Try running 'python3 check-build-context.py' to identify issues"
                echo "   - Check Poetry installation and configuration"
                exit 1
            fi
        fi
    else
        # Alternative Dockerfile missing entirely
        echo ""
        echo "β Alternative Dockerfile not found!"
        echo ""
        echo "π‘ Troubleshooting tips:"
        echo "   - Check if large files are being included in build context"
        echo "   - Ensure .dockerignore is properly configured"
        echo "   - Try running 'python3 check-build-context.py' to identify issues"
        echo "   - Check Poetry installation and configuration"
        exit 1
    fi
fi
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Compose definition for the Bank Statement Analyzer Streamlit app.
# NOTE(review): the top-level `version` key is obsolete with Compose v2
# (it is ignored and warned about); kept here only for older docker-compose
# releases — confirm before removing.
version: '3.8'

services:
  bank-statement-analyzer:
    build: .
    ports:
      - "8501:8501"   # Streamlit UI
    environment:
      - PYTHONUNBUFFERED=1
      # Headless Streamlit configuration for container use.
      - STREAMLIT_SERVER_PORT=8501
      - STREAMLIT_SERVER_ADDRESS=0.0.0.0
      - STREAMLIT_SERVER_HEADLESS=true
      - STREAMLIT_SERVER_ENABLE_CORS=false
      - STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false
      # Poetry settings — only relevant when the Poetry-based Dockerfile is used.
      - POETRY_VENV_IN_PROJECT=1
      - POETRY_NO_INTERACTION=1
    volumes:
      - ./temp:/app/temp        # scratch space for uploaded PDFs
      - ./.env:/app/.env:ro     # API keys and settings, mounted read-only
    restart: unless-stopped
    healthcheck:
      # Streamlit's built-in health endpoint.
      test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
|
main.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from src.services import BankStatementService
|
| 5 |
+
from src.utils import model_manager
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
async def preload_models():
    """Pre-load all ML models once at application startup.

    Loading is delegated to the shared ``model_manager`` singleton so every
    component reuses the same model instances. The happy path stays quiet
    (no UI noise on success); a load failure is surfaced to the user so they
    know processing may fail before they upload a file.
    """
    await model_manager.ensure_models_loaded()

    status = model_manager.get_model_status()
    if not status["models_loaded"]:
        # Previously this failure was silently swallowed (both branches were
        # `pass`); report it so the user is not left guessing.
        st.error("❌ Failed to load some models")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
async def main():
    """Streamlit entry point: upload a bank-statement PDF and show results."""
    st.set_page_config(page_title="Bank Statement Analyzer", layout="wide")
    st.title("π Bank Statement Analyzer")

    # Load models up front so the first upload is not slowed down.
    await preload_models()

    pdf_file = st.file_uploader("Upload Bank Statement PDF", type=["pdf"])

    # Guard clause: nothing to do until a file is provided.
    if not pdf_file:
        st.warning("π€ Please upload a PDF file to begin.")
        return

    st.info("π₯ Processing uploaded file...")

    with st.spinner("Extracting data..."):
        async with BankStatementService() as service:
            result = await service.process_bank_statement(pdf_file)

        if not result:
            st.error("β οΈ Unable to parse the statement correctly.")
            return

        # --- Account Summary --- rendered as a two-column key/value table.
        summary_df = pd.DataFrame(
            result.account_summary.items(), columns=["Account Summary", "Data"]
        )
        st.dataframe(summary_df, use_container_width=True, hide_index=True)

        # --- Tables Section --- one dataframe per non-empty extracted table.
        st.subheader("π Extracted Tables")
        for table_name, table_df in result.transaction_tables.items():
            if table_df.empty:
                continue
            st.markdown(f"### {table_name.capitalize()} Table")
            st.dataframe(table_df, use_container_width=True, hide_index=True)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
if __name__ == "__main__":
    # Streamlit executes this script with __name__ == "__main__", so this
    # guard drives the whole app: run the async entry point to completion
    # on a fresh event loop.
    asyncio.run(main())
|
poetry.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tool.poetry]
name = "bank-scrubber"
version = "0.1.0"
description = ""
authors = ["Your Name <you@example.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.12,<3.14"
pypdf2 = "^3.0.1"
pymupdf = "^1.26.1"
pdf2image = "^1.17.0"
# OCR engine: published on PyPI as "python-doctr", imported as "doctr".
python-doctr = "^0.12.0"
numpy = "^2.3.1"
pandas = "^2.3.0"
streamlit = "^1.46.1"
openai = "^1.93.0"
fuzzywuzzy = "^0.18.0"
huggingface-hub = "^0.33.1"
pydantic = "^2.11.7"
python-dateutil = "^2.9.0.post0"
python-dotenv = "^1.1.1"
python-levenshtein = "^0.27.1"
pydantic-settings = "^2.10.1"
# NOTE(review): "doctr" is a *different* PyPI project from "python-doctr"
# above (a docs-deployment tool, not the OCR library) — this looks like a
# duplicate added by mistake; confirm and keep only one of the two.
doctr = "^1.9.0"
spacy = "^3.8.7"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
|
requirements.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.28.0
pandas>=2.0.0
numpy>=1.24.0
PyMuPDF>=1.23.0
PyPDF2>=3.0.0
# The OCR library is published on PyPI as "python-doctr" (imported as
# "doctr"); the distribution named "doctr" is an unrelated project. Version
# floor matches pyproject.toml (python-doctr = "^0.12.0").
python-doctr>=0.12.0
pdf2image>=1.16.0
spacy>=3.7.0
torch>=2.0.0
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.21.0
openai>=1.0.0
huggingface-hub>=0.19.0
pydantic>=2.0.0
pydantic-settings>=2.0.0
python-dateutil>=2.8.0
python-dotenv>=1.0.0
|
src/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .services import BankStatementService
|
| 2 |
+
from .models import BankStatementData, AccountSummary, AccountDetails
|
| 3 |
+
|
| 4 |
+
__all__ = ["BankStatementService", "BankStatementData", "AccountSummary", "AccountDetails"]
|
src/config/config.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional
# from pydantic import BaseSettings
from pydantic_settings import BaseSettings
import os
from dotenv import load_dotenv

# Load environment variables from .env file so BaseSettings can resolve them.
load_dotenv()


class Settings(BaseSettings):
    """Configuration settings for the application.

    Values are resolved by pydantic-settings from (in priority order)
    process environment variables, the ``.env`` file, and the defaults
    declared below.  Field names map case-insensitively to env vars
    (``groq_api_key`` <- ``GROQ_API_KEY``).
    """

    # API Keys.  Declared Optional with a None default so that a missing
    # environment variable does NOT crash the app at import time with a
    # pydantic ValidationError (os.getenv(...) returns None, which is not a
    # valid `str`).  Callers must check for None before use.
    groq_api_key: Optional[str] = None
    groq_base_url: str = "https://api.groq.com/openai/v1"

    huggingface_api_key: Optional[str] = None
    huggingface_provider: str = "novita"

    # Model configurations
    llm_model: str = "llama-3.1-8b-instant"

    # OCR and processing settings
    y_threshold: float = 3.0
    gap_threshold: int = 10
    gap_threshold_ratio: float = 0.1

    # File processing settings
    temp_file_name: str = "temp.pdf"
    dpi: int = 300

    # spaCy model settings
    spacy_model_name: str = "en_core_web_sm"

    # Device settings (pydantic parses "true"/"false"/"1"/"0" for bools)
    force_cpu: bool = False

    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"
        case_sensitive = False


# Global settings instance
settings = Settings()
|
src/extractor/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .table_extractor import TableExtractor
|
| 2 |
+
from .account_extractor import AccountExtractor
|
| 3 |
+
from .balance_extractor import BalanceExtractor
|
| 4 |
+
|
| 5 |
+
__all__ = ["TableExtractor", "AccountExtractor", "BalanceExtractor"]
|
src/extractor/account_extractor.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import re
|
| 3 |
+
import math
|
| 4 |
+
from typing import List, Dict, Any, Optional
|
| 5 |
+
from fuzzywuzzy import fuzz, process
|
| 6 |
+
import spacy
|
| 7 |
+
from src.models.account_models import LineData
|
| 8 |
+
from src.utils import model_manager
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AccountExtractor:
    """Async account extractor for extracting account numbers and bank names.

    Bounding boxes are ``[x_min, y_min, x_max, y_max]`` lists.  The
    geometry helpers (`euclidean_distance`, `combine_bboxes`,
    `match_keyword_bbox`) are pure; the ``extract_*`` coroutines off-load
    CPU-bound work to the default executor so the event loop stays free.
    """

    def __init__(self):
        # Use the centralized model manager so the spaCy model is loaded
        # once per process, not once per extractor instance.
        self._ensure_models_loaded()

    def _ensure_models_loaded(self):
        """Ensure the spaCy model is loaded via the model manager."""
        if not model_manager.models_loaded:
            print("π Models not loaded, initializing model manager...")
            # Accessing the property triggers lazy model loading.
            _ = model_manager.spacy_model

    @property
    def nlp(self):
        """Return the loaded spaCy model from the model manager."""
        return model_manager.spacy_model

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        pass

    def euclidean_distance(self, b1: List[float], b2: List[float]) -> float:
        """Return the Euclidean distance between the centers of two bboxes."""
        x1 = (b1[0] + b1[2]) / 2
        y1 = (b1[1] + b1[3]) / 2
        x2 = (b2[0] + b2[2]) / 2
        y2 = (b2[1] + b2[3]) / 2
        return math.sqrt((x1 - x2)**2 + (y1 - y2)**2)

    def combine_bboxes(self, bboxes: List[List[float]]) -> List[float]:
        """Merge multiple bboxes into one that covers all of them."""
        x_min = min(b[0] for b in bboxes)
        y_min = min(b[1] for b in bboxes)
        x_max = max(b[2] for b in bboxes)
        y_max = max(b[3] for b in bboxes)
        return [x_min, y_min, x_max, y_max]

    async def extract_account_number_regex_distance(self, lines: List[Dict]) -> Optional[Dict]:
        """Extract an account number using a regex + bbox-distance approach.

        Returns ``{"account_number", "bbox", "distance"}`` where distance is
        measured from the word containing "account" to the number's combined
        bbox, or None if no line yields a match.
        """
        def _extract_account():
            for line in lines:
                words = line.get("words", [])

                # Build cleaned line text: strip space/-/_/","/ separators so
                # account numbers split across several words still match; words
                # containing "/" are kept verbatim (likely dates, not numbers).
                cleaned_line = ""
                for w in words:
                    if "/" not in w["word"]:
                        cleaned_line += re.sub(r"[\s\-\_\,\/]", "", w["word"])
                    else:
                        cleaned_line += " " + w["word"]

                # Look for 'account' in the raw word list (not cleaned).
                account_word = next((w for w in words if "account" in w["word"].lower()), None)
                if not account_word:
                    continue

                cleaned_line = cleaned_line[cleaned_line.lower().find("account"):].strip()

                # Account number: at least six digits or X/x mask characters.
                match = re.search(r"[0-9Xx]{6,}", cleaned_line)
                if not match:
                    continue

                matched_text = match.group(0)

                # Reconstruct the matched text word-by-word so we can recover
                # the bounding boxes that cover it.
                joined_account = ""
                matched_bboxes = []
                for w in words:
                    clean_w = re.sub(r"[\s\-\_\,\/]", "", w["word"])
                    if not clean_w:
                        continue
                    if matched_text.startswith(joined_account + clean_w):
                        joined_account += clean_w
                        matched_bboxes.append(w["bbox"])
                        if joined_account == matched_text:
                            break

                if joined_account != matched_text or not matched_bboxes:
                    continue  # failed to reconstruct properly

                # Distance from the "account" keyword bbox to the number bbox.
                combined_bbox = self.combine_bboxes(matched_bboxes)
                distance = self.euclidean_distance(account_word["bbox"], combined_bbox)

                return {
                    "account_number": matched_text,
                    "bbox": combined_bbox,
                    "distance": distance
                }

            return None

        # get_running_loop() replaces the deprecated get_event_loop() pattern
        # inside coroutines.
        return await asyncio.get_running_loop().run_in_executor(None, _extract_account)

    def match_keyword_bbox(self, keyword: str, words: List[Dict]) -> Optional[List[float]]:
        """Match *keyword* tokens against consecutive words; return combined bbox."""
        keyword_tokens = keyword.lower().split()
        text_tokens = [w["word"].lower() for w in words]

        for i in range(len(text_tokens) - len(keyword_tokens) + 1):
            if text_tokens[i:i+len(keyword_tokens)] == keyword_tokens:
                matched_bboxes = [words[i+j]["bbox"] for j in range(len(keyword_tokens))]
                return self.combine_bboxes(matched_bboxes)
        return None

    async def extract_bank_name(self, text: str) -> str:
        """Extract a bank name via spaCy NER: the first ORG entity, else "Not Found"."""
        def _extract_bank():
            if not self.nlp:
                return "Not Found"

            # Debug prints of every entity were removed; they ran on every call.
            doc = self.nlp(text)
            candidates = [ent.text for ent in doc.ents if ent.label_ == "ORG"]

            return candidates[0] if candidates else "Not Found"

        return await asyncio.get_running_loop().run_in_executor(None, _extract_bank)

    async def extract_bank_name_using_fuzzy(self, text: str) -> str:
        """Extract a bank name by fuzzy matching against a known-bank list."""
        def _extract_fuzzy():
            bank_names = [
                "Bank Of America", "South State Bank", "Midstates Bank",
                "Synovus", "Shore United Bank", "Frost",
                "Bethpage Federal Credit Union"
            ]
            best_match = process.extractOne(text, bank_names, scorer=fuzz.partial_ratio)
            return best_match[0] if best_match else "Unknown"

        return await asyncio.get_running_loop().run_in_executor(None, _extract_fuzzy)
|
src/extractor/balance_extractor.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import re
|
| 3 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 4 |
+
from src.extractor.account_extractor import AccountExtractor
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class BalanceExtractor:
    """Async balance extractor for beginning/ending statement balances.

    The previous implementation duplicated ~40 lines of keyword/bbox search
    logic for the two balances; it is now factored into
    ``_find_balance_on_line``.
    """

    def __init__(self):
        # Monetary amounts: optional sign, optional comma-grouped integer
        # part, mandatory decimal point with 1-2 digits, optional trailing sign.
        self.amount_pattern = re.compile(r'-?(?:\d{1,3}(?:,\d{2}){1,}(?:,\d{3})?|\d{1,3}(?:,\d{3})+|\d+)?\.\d{1,2}-?')
        self.account_extractor = AccountExtractor()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        pass

    def _find_balance_on_line(self, keywords: List[str], object_line: List[Dict], idx: int) -> Optional[str]:
        """Return the first amount tied to any of *keywords* at line *idx*.

        Two strategies, mirroring how statements lay balances out:
        1. keyword and amount on the same line -> take the first amount to
           the right of the keyword text;
        2. keyword as a column header (matched word-by-word against the
           line's words) -> read the amount vertically below the keyword's
           bbox in the next two lines.
        Commas are stripped from the returned amount; None if nothing found.
        """
        line_obj = object_line[idx]
        line = line_obj['line']
        line_lower = line.lower()

        for keyword in keywords:
            if keyword in line_lower:
                # Strategy 1: amount to the right of the keyword.
                start_index = line_lower.find(keyword) + len(keyword)
                after_keyword = line[start_index:]
                match = self.amount_pattern.search(after_keyword)
                if match:
                    return match.group().replace(",", "")
            else:
                # Strategy 2: the keyword may still match token-by-token
                # (e.g. differing whitespace); look below its bbox range.
                keyword_bbox = self.account_extractor.match_keyword_bbox(keyword, line_obj["words"])
                if keyword_bbox:
                    x_min, _, x_max, _ = keyword_bbox
                    for next_line in object_line[idx+1:idx+3]:
                        final_amt = ""
                        for w in next_line.get("words", []):
                            wx_min, _, wx_max, _ = w["bbox"]
                            # 0.1 tolerance for OCR jitter on box edges.
                            if wx_min >= x_min-0.1 and wx_max <= x_max+0.1:
                                final_amt += w["word"]
                        match = self.amount_pattern.search(final_amt)
                        if match:
                            return match.group().replace(",", "")
        return None

    async def extract_balances(self, object_line: List[Dict]) -> Tuple[Optional[str], Optional[str]]:
        """Extract (beginning_balance, ending_balance) from OCR line data."""
        def _extract_balances():
            previous_keywords = [
                "previous balance", "starting balance", "beginning balance",
                "balance last statement", "balance previous statement", "last statement",
                "beginning statement", "previous statement", "starting"
            ]

            ending_keywords = [
                "ending balance", "current balance", "balance this statement",
                "balance ending statement", "this statement", "ending statement", "ending"
            ]

            beginning_balance = None
            ending_balance = None

            for idx in range(len(object_line)):
                if not beginning_balance:
                    beginning_balance = self._find_balance_on_line(previous_keywords, object_line, idx)
                if not ending_balance:
                    ending_balance = self._find_balance_on_line(ending_keywords, object_line, idx)
                if beginning_balance and ending_balance:
                    break

            return beginning_balance, ending_balance

        # get_running_loop() replaces the deprecated get_event_loop() pattern.
        return await asyncio.get_running_loop().run_in_executor(None, _extract_balances)
|
src/extractor/table_extractor.py
ADDED
|
@@ -0,0 +1,760 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import re
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 5 |
+
from src.config.config import settings
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class TableExtractor:
|
| 9 |
+
"""Async table extractor for processing transaction tables."""
|
| 10 |
+
|
| 11 |
+
def __init__(self):
|
| 12 |
+
self.date_pattern = re.compile(
|
| 13 |
+
r"\b(?:"
|
| 14 |
+
r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"
|
| 15 |
+
r"|\d{2,4}[-/]\d{1,2}[-/]\d{1,2}"
|
| 16 |
+
r"|\d{1,2}[-/]\d{2,4}"
|
| 17 |
+
r"|\d{2,4}[-/]\d{1,2}"
|
| 18 |
+
r"|\d{1,2}[-/]\d{1,2}"
|
| 19 |
+
r")\b"
|
| 20 |
+
)
|
| 21 |
+
self.amount_pattern = re.compile(r'-?(?:\d{1,3}(?:,\d{2}){1,}(?:,\d{3})?|\d{1,3}(?:,\d{3})+|\d+)?\.\d{1,2}-?')
|
| 22 |
+
|
| 23 |
+
async def __aenter__(self):
    """Enter the async context manager; no resources to acquire."""
    return self

async def __aexit__(self, exc_type, exc_value, traceback):
    """Exit the async context manager; nothing to release, exceptions propagate."""
    pass
|
| 28 |
+
|
| 29 |
+
def match_by_pattern(self, text: str, pattern) -> bool:
    """Return True if *text* fully matches *pattern*.

    For the amount pattern, long digit runs WITHOUT comma separators are
    rejected as likely non-amounts (e.g. account/check numbers leaking
    into an amount column).
    """
    # NOTE(review): the 6/7-character cutoffs appear empirically tuned to
    # typical statement amounts — confirm against real data before changing.
    if pattern == self.amount_pattern and "-" not in text and len(text) > 6 and "," not in text:
        return False
    # A signed value gets one extra character of allowance for the sign.
    if pattern == self.amount_pattern and "-" in text and len(text) > 7 and "," not in text:
        return False
    return bool(pattern.fullmatch(text))
|
| 36 |
+
|
| 37 |
+
def extract_by_pattern(self, text: str, pattern) -> Tuple[Optional[str], Optional[str], Optional[str]]:
|
| 38 |
+
"""Extract value by pattern and return value, before, after."""
|
| 39 |
+
match = pattern.search(text)
|
| 40 |
+
if match:
|
| 41 |
+
before = text[:match.start()].strip()
|
| 42 |
+
value = match.group()
|
| 43 |
+
after = text[match.end():].strip()
|
| 44 |
+
if pattern == self.amount_pattern and "-" not in value and len(value) > 6 and "," not in value:
|
| 45 |
+
return None, None, None
|
| 46 |
+
if pattern == self.amount_pattern and "-" in value and len(value) > 7 and "," not in value:
|
| 47 |
+
return None, None, None
|
| 48 |
+
return value, before, after
|
| 49 |
+
return None, None, None
|
| 50 |
+
|
| 51 |
+
def repair_row_with_date_and_amount(self, header: List[str], row: List[str]) -> List[str]:
|
| 52 |
+
"""Repair row data by extracting dates and amounts."""
|
| 53 |
+
result = row[:]
|
| 54 |
+
n = len(header)
|
| 55 |
+
|
| 56 |
+
for i, col in enumerate(header):
|
| 57 |
+
val = result[i].strip()
|
| 58 |
+
|
| 59 |
+
if col.lower() == "date":
|
| 60 |
+
date, left, right = self.extract_by_pattern(val, self.date_pattern)
|
| 61 |
+
if date:
|
| 62 |
+
result[i] = date
|
| 63 |
+
if left and i > 0 and header[i-1] != "date":
|
| 64 |
+
result[i-1] = (result[i-1] + " " + left).strip()
|
| 65 |
+
if right and i < n - 1 and header[i+1] != "date":
|
| 66 |
+
result[i+1] = (right + " " + result[i+1]).strip()
|
| 67 |
+
continue
|
| 68 |
+
|
| 69 |
+
# Check previous column's last word
|
| 70 |
+
if i > 0 and header[i-1] != "date":
|
| 71 |
+
left_val = result[i-1].strip()
|
| 72 |
+
tokens = left_val.split()
|
| 73 |
+
if tokens:
|
| 74 |
+
last_word = tokens[-1]
|
| 75 |
+
date_check, _, _ = self.extract_by_pattern(last_word, self.date_pattern)
|
| 76 |
+
if date_check:
|
| 77 |
+
result[i] = date_check + " " + result[i]
|
| 78 |
+
tokens.pop() # remove matched date
|
| 79 |
+
result[i-1] = " ".join(tokens)
|
| 80 |
+
again_date, again_left, again_right = self.extract_by_pattern(result[i], self.date_pattern)
|
| 81 |
+
if again_date:
|
| 82 |
+
result[i] = again_date
|
| 83 |
+
if again_left:
|
| 84 |
+
result[i-1] = (result[i-1] + " " + again_left).strip()
|
| 85 |
+
if again_right:
|
| 86 |
+
result[i+1] = (again_right + " " + result[i+1]).strip()
|
| 87 |
+
continue
|
| 88 |
+
|
| 89 |
+
# Check next column's first word
|
| 90 |
+
if i < n - 1 and header[i+1] != "date":
|
| 91 |
+
right_val = result[i+1].strip()
|
| 92 |
+
tokens = right_val.split()
|
| 93 |
+
if tokens:
|
| 94 |
+
first_word = tokens[0]
|
| 95 |
+
date_check, _, _ = self.extract_by_pattern(first_word, self.date_pattern)
|
| 96 |
+
if date_check:
|
| 97 |
+
result[i] = result[i] + " " + date_check
|
| 98 |
+
tokens.pop(0)
|
| 99 |
+
result[i+1] = " ".join(tokens)
|
| 100 |
+
again_date, again_left, again_right = self.extract_by_pattern(result[i], self.date_pattern)
|
| 101 |
+
if again_date:
|
| 102 |
+
result[i] = again_date
|
| 103 |
+
if again_left:
|
| 104 |
+
result[i-1] = (result[i-1] + " " + again_left).strip()
|
| 105 |
+
if again_right:
|
| 106 |
+
result[i+1] = (again_right + " " + result[i+1]).strip()
|
| 107 |
+
continue
|
| 108 |
+
|
| 109 |
+
# Check if the entire value is a date
|
| 110 |
+
if not self.match_by_pattern(result[i].strip(), self.date_pattern):
|
| 111 |
+
result[i] = ""
|
| 112 |
+
# check left
|
| 113 |
+
if i > 0 and header[i-1] != "date":
|
| 114 |
+
result[i-1] = (result[i-1] + " " + val).strip()
|
| 115 |
+
elif i < n - 1 and header[i+1] != "date":
|
| 116 |
+
result[i+1] = (val + " " + result[i+1]).strip()
|
| 117 |
+
|
| 118 |
+
elif col.lower() in ["amount", "balance", "credits", "debits"]:
|
| 119 |
+
amt, left, right = self.extract_by_pattern(val, self.amount_pattern)
|
| 120 |
+
if amt:
|
| 121 |
+
result[i] = amt
|
| 122 |
+
if left and i > 0:
|
| 123 |
+
result[i-1] = (result[i-1] + " " + left).strip()
|
| 124 |
+
if right and i < n - 1:
|
| 125 |
+
result[i+1] = (right + " " + result[i+1]).strip()
|
| 126 |
+
continue
|
| 127 |
+
|
| 128 |
+
# Check previous column's last word
|
| 129 |
+
if i > 0 and (header[i-1] not in ["amount", "balance", "credits", "debits"]):
|
| 130 |
+
left_val = result[i-1].strip()
|
| 131 |
+
tokens = left_val.split()
|
| 132 |
+
if tokens:
|
| 133 |
+
last_word = tokens[-1]
|
| 134 |
+
amt_check, _, _ = self.extract_by_pattern(last_word, self.amount_pattern)
|
| 135 |
+
if amt_check:
|
| 136 |
+
result[i] = amt_check + " " + result[i]
|
| 137 |
+
tokens.pop()
|
| 138 |
+
result[i-1] = " ".join(tokens)
|
| 139 |
+
again_amt, again_left, again_right = self.extract_by_pattern(result[i], self.amount_pattern)
|
| 140 |
+
if again_amt:
|
| 141 |
+
result[i] = again_amt
|
| 142 |
+
if again_left:
|
| 143 |
+
result[i-1] = (result[i-1] + " " + again_left).strip()
|
| 144 |
+
if again_right:
|
| 145 |
+
result[i+1] = (again_right + " " + result[i+1]).strip()
|
| 146 |
+
continue
|
| 147 |
+
|
| 148 |
+
# Check next column's first word
|
| 149 |
+
if i < n - 1 and (header[i+1] not in ["amount", "balance", "credits", "debits"]):
|
| 150 |
+
right_val = result[i+1].strip()
|
| 151 |
+
tokens = right_val.split()
|
| 152 |
+
if tokens:
|
| 153 |
+
first_word = tokens[0]
|
| 154 |
+
amt_check, _, _ = self.extract_by_pattern(first_word, self.amount_pattern)
|
| 155 |
+
if amt_check:
|
| 156 |
+
result[i] = result[i] + " " + amt_check
|
| 157 |
+
tokens.pop(0)
|
| 158 |
+
result[i+1] = " ".join(tokens)
|
| 159 |
+
again_amt, again_left, again_right = self.extract_by_pattern(result[i], self.amount_pattern)
|
| 160 |
+
if again_amt:
|
| 161 |
+
result[i] = again_amt
|
| 162 |
+
if again_left:
|
| 163 |
+
result[i-1] = (result[i-1] + " " + again_left).strip()
|
| 164 |
+
if again_right:
|
| 165 |
+
result[i+1] = (again_right + " " + result[i+1]).strip()
|
| 166 |
+
continue
|
| 167 |
+
|
| 168 |
+
# Check if the entire value is an amount
|
| 169 |
+
if not self.match_by_pattern(result[i].strip(), self.amount_pattern):
|
| 170 |
+
result[i] = ""
|
| 171 |
+
# check left
|
| 172 |
+
if i > 0 and (header[i-1] not in ["amount", "balance", "credits", "debits"]):
|
| 173 |
+
result[i-1] = (result[i-1] + " " + val).strip()
|
| 174 |
+
elif i < n - 1 and (header[i+1] not in ["amount", "balance", "credits", "debits"]):
|
| 175 |
+
result[i+1] = (val + " " + result[i+1]).strip()
|
| 176 |
+
|
| 177 |
+
return result
|
| 178 |
+
|
| 179 |
+
def extract_amount_or_return(self, line: str) -> str:
|
| 180 |
+
"""Extract amount from line or return original line."""
|
| 181 |
+
matches = self.amount_pattern.findall(line)
|
| 182 |
+
if matches:
|
| 183 |
+
match = self.amount_pattern.search(line)
|
| 184 |
+
return match.group(0) if match else line
|
| 185 |
+
return line
|
| 186 |
+
|
| 187 |
+
def extract_date_or_return(self, line: str) -> str:
|
| 188 |
+
"""Extract date from line or return original line."""
|
| 189 |
+
matches = self.date_pattern.findall(line)
|
| 190 |
+
if matches:
|
| 191 |
+
match = self.date_pattern.search(line)
|
| 192 |
+
return match.group(0) if match else line
|
| 193 |
+
return line
|
| 194 |
+
|
| 195 |
+
def is_date_word(self, word: str) -> bool:
|
| 196 |
+
"""Check if word is a date."""
|
| 197 |
+
try:
|
| 198 |
+
return bool(self.date_pattern.fullmatch(word))
|
| 199 |
+
except ValueError:
|
| 200 |
+
return False
|
| 201 |
+
|
| 202 |
+
def detect_headers(self, line_data: Dict, gap_threshold_ratio: float = 0.1) -> List[str]:
    """Detect column headers from one OCR line.

    Words are grouped into header cells wherever the horizontal gap
    between consecutive words exceeds ``gap_threshold_ratio`` times the
    average gap; the resulting cells are then normalized to the canonical
    names "date", "description", "amount", "balance", "credits", "debits".

    Args:
        line_data: dict with keys "line" (joined text) and "words"
            (dicts with "word" and "bbox" = [x0, y0, x1, y1]).
        gap_threshold_ratio: ratio of the average inter-word gap used as
            the column-break threshold (may be overridden below).
    """
    # Widen grouping when there is no explicit description column.
    # NOTE(review): these ratio overrides look empirically tuned — confirm.
    if "description" not in line_data["line"]:
        gap_threshold_ratio = 0.2
    if "." in line_data["line"]:
        gap_threshold_ratio = 0.1

    word_data = sorted(line_data["words"], key=lambda w: w["bbox"][0])
    line = line_data["line"]

    if len(word_data) < 2:
        return [line.strip()]  # Treat whole line as one header if only 1 word

    # Compute horizontal gaps between consecutive words.
    gaps = []
    for i in range(len(word_data) - 1):
        x1 = word_data[i]["bbox"][2]  # end x of current word
        x2 = word_data[i + 1]["bbox"][0]  # start x of next word
        gaps.append(x2 - x1)

    avg_gap = sum(gaps) / len(gaps)
    threshold = avg_gap * gap_threshold_ratio

    # Split words into groups based on large gaps (assumed column breaks).
    headers = []
    current_header = [word_data[0]["word"]]
    for i in range(1, len(word_data)):
        gap = gaps[i - 1]
        if gap > threshold:
            headers.append(" ".join(current_header))
            current_header = []
        current_header.append(word_data[i]["word"])

    if current_header:
        headers.append(" ".join(current_header))

    # Special case: one cell containing both "date" and "description" is
    # split into two headers in their visual order.
    # NOTE(review): list.insert() inside a range(len(headers)) loop means
    # the last original element is never re-examined, and .index("date")
    # assumes the split tokens are lowercase — verify with mixed-case input.
    for i in range(len(headers)):
        if "date" in headers[i].lower() and "description" in headers[i].lower():
            header_checker = headers[i].split(" ")
            date_index = header_checker.index("date")
            description_index = header_checker.index("description")
            if date_index < description_index:
                headers[i] = "date"
                headers.insert(i + 1, "description")
            else:
                headers[i] = "description"
                headers.insert(i + 1, "date")

    # Handle check/draft numbers: merge "check"/"draft" with a following
    # "no"/"number" token into a single header cell.
    if "check" in headers or "draft" in headers:
        resulted_headers = []
        i = 0

        while i < len(headers):
            if (
                i + 1 < len(headers)
                and headers[i] == "check"
                and (headers[i + 1] == "no" or headers[i + 1] == "number")
            ):
                resulted_headers.append(headers[i] + " " + headers[i + 1])
                i += 2
            elif (
                i + 1 < len(headers)
                and headers[i] == "draft"
                and (headers[i + 1] == "no" or headers[i + 1] == "number")
            ):
                resulted_headers.append(headers[i] + " " + headers[i + 1])
                i += 2
            else:
                resulted_headers.append(headers[i])
                i += 1

        # Strip punctuation before keyword-based normalization.
        resulted_headers = list(map(lambda x: re.sub(r'[^\w\s]', '', x).strip(), resulted_headers))

        # Normalize header names to canonical column identifiers.
        # NOTE(review): this branch also matches the short forms "cr"/"dr",
        # unlike the non-check/draft path below — confirm the asymmetry is
        # intentional.
        for i in range(len(resulted_headers)):
            if any(keyword in resulted_headers[i].lower() for keyword in ["date", "day", "month", "year"]):
                resulted_headers[i] = "date"
            if any(keyword in resulted_headers[i].lower() for keyword in ["amount", "total", "sum", "price", "value", "cost", "amt"]):
                resulted_headers[i] = "amount"
            if any(keyword in resulted_headers[i].lower() for keyword in ["balance", "final", "closing", "current", "available", "running", "remaining", "left", "bal", "remain"]):
                resulted_headers[i] = "balance"
            if any(keyword in resulted_headers[i].lower() for keyword in ["credit", "deposit", "cr"]):
                resulted_headers[i] = "credits"
            if any(keyword in resulted_headers[i].lower() for keyword in ["debit", "withdrawal", "dr"]):
                resulted_headers[i] = "debits"

        return resulted_headers

    # Normalize header names (no check/draft merging needed).
    headers = list(map(lambda x: re.sub(r'[^\w\s]', '', x).strip(), headers))
    for i in range(len(headers)):
        if any(keyword in headers[i].lower() for keyword in ["date", "day", "month", "year"]):
            headers[i] = "date"
        if any(keyword in headers[i].lower() for keyword in ["amount", "total", "sum", "price", "value", "cost", "amt"]):
            headers[i] = "amount"
        if any(keyword in headers[i].lower() for keyword in ["balance", "final", "closing", "current", "available", "running", "remaining", "left", "bal", "remain"]):
            headers[i] = "balance"
        if any(keyword in headers[i].lower() for keyword in ["credit", "deposit"]):
            headers[i] = "credits"
        if any(keyword in headers[i].lower() for keyword in ["debit", "withdrawal"]):
            headers[i] = "debits"

    return headers
|
| 307 |
+
|
| 308 |
+
def detect_row_data(self, headers: List[str], header_data: List[Dict], row_data: List[Dict], gap_threshold: int = 10) -> List[str]:
    """Detect row data based on headers and word positions.

    Groups the row's words into horizontal segments (split on x-gaps wider
    than ``gap_threshold``), then assigns each segment to the column whose
    header x-range contains the segment's center, falling back to the
    nearest column. Returns one string per header.

    Args:
        headers: Normalized column names for the table.
        header_data: Word dicts (``{"word", "bbox"}``) of the header line.
        row_data: Word dicts of the data line to be segmented.
        gap_threshold: Minimum horizontal gap (same units as the bboxes)
            treated as a column separator.
    """
    if "description" not in headers:
        # Tables without a free-text column are tightly packed; use a smaller gap.
        gap_threshold = 5

    # Fix: previously an empty header line or empty row crashed with
    # IndexError (header_ranges[0] / row_data_sorted[0]).  Return a blank
    # row instead so callers can continue.
    if not headers or not header_data or not row_data:
        return [""] * len(headers)

    def flatten_bbox(bbox):
        # Accept both [[x0, y0], [x1, y1]] (OCR output) and flat [x0, y0, x1, y1].
        if isinstance(bbox[0], list):
            return [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]]
        return bbox

    # Step 1: Get all (x0, x1) spans for header words.
    header_ranges = []
    for word in header_data:
        x0, _, x1, _ = flatten_bbox(word["bbox"])
        header_ranges.append((x0, x1))

    # Step 2: Sort by x0.
    header_ranges.sort(key=lambda x: x[0])

    # Step 3: Merge only close headers (preserve wide gaps between columns).
    merged_ranges = []
    temp_x0, temp_x1 = header_ranges[0]
    for x0, x1 in header_ranges[1:]:
        gap = x0 - temp_x1
        if gap < gap_threshold:
            temp_x1 = max(temp_x1, x1)
        else:
            merged_ranges.append((temp_x0, temp_x1))
            temp_x0, temp_x1 = x0, x1
    merged_ranges.append((temp_x0, temp_x1))

    # Step 4: Segment row_data based on horizontal gaps.
    row_data_sorted = sorted(row_data, key=lambda w: flatten_bbox(w["bbox"])[0])
    segments = []
    current_segment = [row_data_sorted[0]]
    for i in range(1, len(row_data_sorted)):
        prev_x1 = flatten_bbox(row_data_sorted[i - 1]["bbox"])[2]
        curr_x0 = flatten_bbox(row_data_sorted[i]["bbox"])[0]
        if curr_x0 - prev_x1 > gap_threshold:
            segments.append(current_segment)
            current_segment = [row_data_sorted[i]]
        else:
            current_segment.append(row_data_sorted[i])
    if current_segment:
        segments.append(current_segment)

    # Step 5: Assign each segment to a column by its center x-coordinate.
    row_values = [""] * len(headers)
    for segment in segments:
        seg_x0 = flatten_bbox(segment[0]["bbox"])[0]
        seg_x1 = flatten_bbox(segment[-1]["bbox"])[2]
        seg_center = (seg_x0 + seg_x1) / 2
        seg_text = " ".join([w["word"] for w in segment])

        assigned = False
        for idx, (hx0, hx1) in enumerate(merged_ranges):
            if hx0 <= seg_center <= hx1:
                row_values[idx] += seg_text + " "
                assigned = True
                break

        if not assigned:
            # Center falls outside all header spans: assign to the nearest column.
            nearest_idx = min(
                range(len(merged_ranges)),
                key=lambda idx: abs(
                    (merged_ranges[idx][0] + merged_ranges[idx][1]) / 2 - seg_center
                ),
            )
            row_values[nearest_idx] += seg_text + " "

    final_row = self.repair_row_with_date_and_amount(headers, row_values)
    return [val.strip() for val in final_row]
|
| 381 |
+
|
| 382 |
+
def check_table_tags(self, line: str, headers: List[str]) -> str:
    """Infer a table tag from the line preceding a table and its headers.

    The line text is matched against ordered keyword groups (first match
    wins); header-based rules then override the text-derived tag.
    Returns "" when nothing matches.
    """
    lowered = line.lower()

    # Ordered (keywords, tag) rules mirroring the original if/elif chain.
    keyword_rules = (
        (("deposit", "credit"), "deposit"),
        (("withdrawal", "debit"), "withdrawal"),
        (("checks",), "checks"),
        (("drafts",), "drafts"),
        (("service fee", "fee"), "service fee"),
        (("daily balance", "balance"), "daily balance"),
        (("interest",), "interest"),
        (("transaction", "transfer"), "transaction"),
    )

    tag = ""
    for needles, candidate in keyword_rules:
        if any(needle in lowered for needle in needles):
            tag = candidate
            break

    # Header-derived overrides take precedence over the line text.
    if "credits" in headers or "debits" in headers:
        tag = "transaction"

    if any("check" in header.lower() for header in headers):
        tag = "checks"

    if any("draft" in header.lower() for header in headers):
        tag = "drafts"

    return tag
|
| 418 |
+
|
| 419 |
+
async def process_transaction_tables_with_bbox(self, extracted_text_list: List[List[Dict]]) -> Tuple[List[pd.DataFrame], List[str]]:
    """Process transaction tables with bounding box data.

    Walks every block of OCR line data, opens a table when a header line
    appears ("date"+"description", or "date"+"amount"/"balance"), maps each
    following line's words to columns via ``detect_row_data``, and flushes a
    DataFrame whenever a new header appears, a row fails validation, or the
    block ends.  Returns (list of DataFrames, parallel list of table tags).

    NOTE(review): the four header-handling branches are near-duplicates and
    could be factored into a helper — left as-is to preserve exact behavior.
    """
    def _process_tables():
        all_tables = []   # finished tables, one DataFrame each
        table_tags = []   # one tag per table, inferred from the line above its header

        for block in extracted_text_list:
            headers = []
            table_started = False
            current_table = []   # accumulated row dicts of the open table
            current_row = {}     # header -> list of pending cell strings
            header_words = []    # word/bbox entries of the current header line

            for line_idx, line_bbox in enumerate(block):
                line = line_bbox["line"]
                line = line.strip()

                # Case 1: open a "date + description" table.
                if not table_started and ("date" in line and "description" in line):
                    headers = self.detect_headers(line_bbox)
                    header_words = line_bbox["words"]
                    date_flag = False
                    description_flag = False
                    for header in headers:
                        if "date" in header.lower():
                            date_flag = True
                        if "description" in header.lower():
                            description_flag = True
                    if date_flag and description_flag:
                        table_started = True
                        current_row = {header: [] for header in headers}
                    else:
                        continue

                    # Tag the new table from the line just above the header;
                    # fall back to the previous tag, then to "transaction".
                    if line_idx - 1 >= 0:
                        prev_line = block[line_idx - 1]["line"]
                        tag = self.check_table_tags(prev_line, headers)
                        if tag:
                            table_tags.append(tag)
                        elif len(table_tags) > 0:
                            table_tags.append(table_tags[-1])
                        else:
                            table_tags.append("transaction")
                    continue

                # Case 2: open a "date + amount/balance" table.
                elif (not table_started and ("date" in line and "amount" in line)) or (
                    not table_started and ("date" in line and "balance" in line)
                ):
                    headers = self.detect_headers(line_bbox)
                    header_words = line_bbox["words"]
                    date_flag = False
                    amount_flag = False
                    balance_flag = False
                    for header in headers:
                        if "date" in header.lower():
                            date_flag = True
                        if "amount" in header.lower():
                            amount_flag = True
                        if "balance" in header.lower():
                            balance_flag = True
                    if date_flag and (amount_flag or balance_flag):
                        table_started = True
                        current_row = {header: [] for header in headers}
                    else:
                        continue

                    if line_idx - 1 >= 0:
                        prev_line = block[line_idx - 1]["line"]
                        tag = self.check_table_tags(prev_line, headers)
                        if tag:
                            table_tags.append(tag)
                        elif len(table_tags) > 0:
                            table_tags.append(table_tags[-1])
                        else:
                            table_tags.append("transaction")
                    continue

                # Case 3: a new "date + description" header while a table is
                # open — flush the open table and start a fresh one.
                if table_started and ("date" in line and "description" in line):
                    max_len = max(len(v) for v in current_row.values())
                    for i in range(max_len):
                        row_map = {}
                        for key in current_row:
                            row_map[key] = (
                                current_row[key][i] if i < len(current_row[key]) else ""
                            )
                        current_table.append(row_map)

                    df = pd.DataFrame(current_table)
                    all_tables.append(df)
                    current_table = []
                    headers = self.detect_headers(line_bbox)
                    header_words = line_bbox["words"]
                    date_flag = False
                    description_flag = False
                    for header in headers:
                        if "date" in header.lower():
                            date_flag = True
                        if "description" in header.lower():
                            description_flag = True
                    if date_flag and description_flag:
                        current_row = {header: [] for header in headers}
                    else:
                        continue

                    if line_idx - 1 >= 0:
                        prev_line = block[line_idx - 1]["line"]
                        tag = self.check_table_tags(prev_line, headers)
                        if tag:
                            table_tags.append(tag)
                        elif len(table_tags) > 0:
                            table_tags.append(table_tags[-1])
                        else:
                            table_tags.append("transaction")
                    continue

                # Case 4: same flush-and-restart for "date + amount/balance" headers.
                elif (table_started and ("date" in line and "amount" in line)) or (
                    table_started and ("date" in line and "balance" in line)
                ):
                    max_len = max(len(v) for v in current_row.values())
                    for i in range(max_len):
                        row_map = {}
                        for key in current_row:
                            row_map[key] = (
                                current_row[key][i] if i < len(current_row[key]) else ""
                            )
                        current_table.append(row_map)

                    df = pd.DataFrame(current_table)
                    all_tables.append(df)
                    current_table = []
                    headers = self.detect_headers(line_bbox)
                    header_words = line_bbox["words"]
                    date_flag = False
                    amount_flag = False
                    balance_flag = False
                    for header in headers:
                        if "date" in header.lower():
                            date_flag = True
                        if "amount" in header.lower():
                            amount_flag = True
                        if "balance" in header.lower():
                            balance_flag = True
                    if date_flag and (amount_flag or balance_flag):
                        current_row = {header: [] for header in headers}
                    else:
                        continue

                    if line_idx - 1 >= 0:
                        prev_line = block[line_idx - 1]["line"]
                        tag = self.check_table_tags(prev_line, headers)
                        if tag:
                            table_tags.append(tag)
                        elif len(table_tags) > 0:
                            table_tags.append(table_tags[-1])
                        else:
                            table_tags.append("transaction")
                    continue

                # Data line inside an open table.
                if table_started:
                    parts = self.detect_row_data(headers, header_words, line_bbox["words"])
                    for key, value in zip(headers, parts):
                        current_row[key].append(value)
                    max_len = max(len(v) for v in current_row.values())

                    # Normalize numeric and date cells in place.
                    for i in range(max_len):
                        if (
                            "amount" in headers
                            and current_row["amount"]
                            and i < len(current_row["amount"])
                            and current_row["amount"][i]
                        ):
                            amount = self.extract_amount_or_return(current_row["amount"][i])
                            current_row["amount"][i] = amount
                        if (
                            "balance" in headers
                            and current_row["balance"]
                            and i < len(current_row["balance"])
                            and current_row["balance"][i]
                        ):
                            amount = self.extract_amount_or_return(current_row["balance"][i])
                            current_row["balance"][i] = amount
                        if (
                            "credits" in headers
                            and current_row["credits"]
                            and i < len(current_row["credits"])
                            and current_row["credits"][i]
                        ):
                            amount = self.extract_amount_or_return(current_row["credits"][i])
                            current_row["credits"][i] = amount
                        if (
                            "debits" in headers
                            and current_row["debits"]
                            and i < len(current_row["debits"])
                            and current_row["debits"][i]
                        ):
                            amount = self.extract_amount_or_return(current_row["debits"][i])
                            current_row["debits"][i] = amount
                        if (
                            "date" in headers
                            and current_row["date"]
                            and i < len(current_row["date"])
                            and current_row["date"][i]
                        ):
                            current_row["date"][i] = self.extract_date_or_return(
                                current_row["date"][i]
                            )

                    # If the first cell of a key column does not validate, the
                    # table has ended: flush and reset all per-table state.
                    # NOTE(review): `and` binds tighter than `or` here, so the
                    # date clause groups with its own conditions only — confirm
                    # this precedence is intended.
                    if (
                        "date" in headers
                        and current_row["date"]
                        and current_row["date"][0]
                        and not self.is_date_word(current_row["date"][0])
                        or (
                            "amount" in headers
                            and current_row["amount"][0]
                            and not self.amount_pattern.match(current_row["amount"][0])
                        )
                        or (
                            "balance" in headers
                            and current_row["balance"][0]
                            and not self.amount_pattern.match(current_row["balance"][0])
                        )
                        or (
                            "credits" in headers
                            and current_row["credits"][0]
                            and not self.amount_pattern.match(current_row["credits"][0])
                        )
                        or (
                            "debits" in headers
                            and current_row["debits"][0]
                            and not self.amount_pattern.match(current_row["debits"][0])
                        )
                    ):
                        if not current_table and len(table_tags) > 0 and table_tags[-1]:
                            # Drop the tag reserved for a table that never got rows.
                            table_tags.pop()
                        # NOTE(review): an empty DataFrame may be appended here
                        # when current_table is empty — verify downstream handles it.
                        all_tables.append(pd.DataFrame(current_table))
                        current_table = []
                        current_row = {}
                        header_words = []
                        headers = []
                        table_started = False
                    else:
                        # Valid row(s): move them from current_row into the table.
                        for i in range(max_len):
                            row_map = {}
                            for key in current_row:
                                row_map[key] = (
                                    current_row[key][i] if i < len(current_row[key]) else ""
                                )
                            current_table.append(row_map)
                        current_row = {header: [] for header in headers}

            table_started = False

            # Flush any table still open at the end of the block.
            if current_table:
                df = pd.DataFrame(current_table)
                all_tables.append(df)

        return all_tables, table_tags

    # Run the CPU-bound table assembly off the event loop.
    return await asyncio.get_event_loop().run_in_executor(None, _process_tables)
|
| 678 |
+
|
| 679 |
+
async def process_tables(self, table: pd.DataFrame) -> pd.DataFrame:
    """Process the extracted table to clean and format it.

    Blanks out carry-over/summary rows, merges multi-line description rows
    into their "anchor" row (a row holding both a date and a value), and
    finally drops rows that ended up entirely blank.
    """
    def _process_table():
        # Rows containing these words are page-break / summary noise.
        keywords = ["continue", "continued", "page", "next page", "total", "subtotal"]
        table_copy = table.copy()
        is_balance_column = "balance" in table_copy.columns
        is_amount_column = "amount" in table_copy.columns
        is_credits_column = "credits" in table_copy.columns
        is_debits_column = "debits" in table_copy.columns

        for idx, row in table_copy.iterrows():
            # A monetary value without a date is a carried-forward line: blank it.
            if is_balance_column:
                if row["balance"] and not row["date"]:
                    table_copy.loc[idx] = [""] * len(table_copy.columns)
                    continue
            if is_amount_column:
                if row["amount"] and not row["date"]:
                    table_copy.loc[idx] = [""] * len(table_copy.columns)
                    continue
            if is_credits_column:
                if row["credits"] and not row["date"]:
                    table_copy.loc[idx] = [""] * len(table_copy.columns)
                    continue
            if is_debits_column:
                if row["debits"] and not row["date"]:
                    table_copy.loc[idx] = [""] * len(table_copy.columns)
                    continue
            # Blank rows carrying noise keywords anywhere in a cell.
            for cell in row:
                if any(keyword in cell.lower() for keyword in keywords):
                    table_copy.loc[idx] = [""] * len(table_copy.columns)
                    break

        df = table_copy.copy()
        df = df.fillna("")  # Fill NaNs with empty string for easier processing

        # Step 1: Identify key columns (case-insensitive match)
        lower_cols = [col.lower() for col in df.columns]  # NOTE(review): unused
        date_col = next((col for col in df.columns if re.search(r'date', col, re.IGNORECASE)), None)
        value_cols = [col for col in df.columns if re.search(r'amount|balance|credits|debits', col, re.IGNORECASE)]

        if not date_col or not value_cols:
            return df

        def is_anchor(row):
            # An anchor row owns a date plus at least one monetary value.
            return bool(row[date_col].strip()) and any(row[col].strip() for col in value_cols)

        # Step 2: Loop over rows and identify anchor indices
        anchor_indices = [i for i, row in df.iterrows() if is_anchor(row)]

        for anchor_idx in anchor_indices:
            # Merge upward: pull continuation text above the anchor into it.
            i = anchor_idx - 1
            while i >= 0:
                if is_anchor(df.iloc[i]) or df.iloc[i].isnull().all() or all(df.iloc[i] == ""):
                    break
                for col in df.columns:
                    if col != date_col and col not in value_cols:
                        df.at[anchor_idx, col] = (str(df.at[i, col]).strip() + " " + str(df.at[anchor_idx, col]).strip()).strip()
                df.iloc[i] = ""  # Blank the merged row
                i -= 1

            # Merge downward
            i = anchor_idx + 1
            while i < len(df):
                if is_anchor(df.iloc[i]) or df.iloc[i].isnull().all() or all(df.iloc[i] == ""):
                    break
                for col in df.columns:
                    if col != date_col and col not in value_cols:
                        df.at[anchor_idx, col] = (str(df.at[anchor_idx, col]).strip() + " " + str(df.at[i, col]).strip()).strip()
                df.iloc[i] = ""  # Blank the merged row
                i += 1

        df_copy = df.copy()
        # NOTE(review): assumes a "balance" or "amount" column exists; a table
        # with only credits/debits would raise KeyError below — confirm upstream
        # guarantees one of the two columns.
        col = "balance" if "balance" in df_copy.columns else "amount"

        for idx, row in df_copy.iterrows():
            if not row[col] and not row[date_col]:
                df_copy.loc[idx] = [""] * len(df_copy.columns)
        # Drop rows that ended up entirely blank.
        df_copy = df_copy[~df_copy.apply(lambda row: all(cell == "" for cell in row), axis=1)].reset_index(drop=True)
        return df_copy

    # Run the CPU-bound cleanup off the event loop.
    return await asyncio.get_event_loop().run_in_executor(None, _process_table)
|
src/models/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .account_models import AccountSummary, AccountDetails, BankStatementData
|
| 2 |
+
|
| 3 |
+
__all__ = ["AccountSummary", "AccountDetails", "BankStatementData"]
|
src/models/account_models.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict, Any, Optional
|
| 2 |
+
from pydantic import BaseModel, Field
|
| 3 |
+
from datetime import date
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class AccountDetails(BaseModel):
    """Model for individual account details.

    Balances are floats; statement dates are ISO "YYYY-MM-DD" strings.
    """
    account_name: str = Field(..., description="Name of the account")
    account_number: str = Field(..., description="Account number")
    starting_balance: float = Field(..., description="Starting balance of the account")
    ending_balance: float = Field(..., description="Ending balance of the account")
    statement_start_date: str = Field(..., description="Statement start date in YYYY-MM-DD format")
    statement_end_date: str = Field(..., description="Statement end date in YYYY-MM-DD format")


class AccountSummary(BaseModel):
    """Model for bank account summary extracted from LLM."""
    bank_name: str = Field(..., description="Name of the bank")
    account_holder: str = Field(..., description="Name of the account holder")
    accounts: List[AccountDetails] = Field(..., description="List of account details")


class BankStatementData(BaseModel):
    """Model for processed bank statement data."""
    account_summary: Dict[str, str] = Field(..., description="Account summary information")
    transaction_tables: Dict[str, Any] = Field(..., description="Extracted transaction tables")


class WordData(BaseModel):
    """Model for word data with bounding box."""
    word: str = Field(..., description="Extracted word text")
    # NOTE(review): OCR code builds bbox as a tuple; pydantic coerces it to a
    # list of floats — confirm all producers use the flat 4-value form.
    bbox: List[float] = Field(..., description="Bounding box coordinates [x0, y0, x1, y1]")


class LineData(BaseModel):
    """Model for line data with words."""
    line: str = Field(..., description="Complete line text")
    bbox: List[float] = Field(..., description="Line bounding box [x, y]")
    words: List[WordData] = Field(..., description="List of words in the line")


class ExtractedTextData(BaseModel):
    """Model for extracted text data from PDF."""
    pages: List[List[LineData]] = Field(..., description="List of pages, each containing lines")
|
src/ocr/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .pdf_processor import PDFProcessor
|
| 2 |
+
from .text_extractor import TextExtractor
|
| 3 |
+
|
| 4 |
+
__all__ = ["PDFProcessor", "TextExtractor"]
|
src/ocr/pdf_processor.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import fitz
|
| 3 |
+
import os
|
| 4 |
+
from typing import List, Dict, Any, Optional
|
| 5 |
+
import numpy as np
|
| 6 |
+
from pdf2image import convert_from_path
|
| 7 |
+
from doctr.models import ocr_predictor
|
| 8 |
+
from doctr.io import DocumentFile
|
| 9 |
+
import torch
|
| 10 |
+
from src.config.config import settings
|
| 11 |
+
from src.models.account_models import LineData, WordData
|
| 12 |
+
from src.utils import model_manager
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class PDFProcessor:
    """Async PDF processor for handling both digital and scanned PDFs.

    Heavy OCR models are shared through the global ``model_manager`` so
    multiple processors reuse the same loaded weights.
    """

    def __init__(self):
        # Use the centralized model manager
        self._ensure_models_loaded()

    def _ensure_models_loaded(self):
        """Ensure models are loaded via the model manager."""
        if not model_manager.models_loaded:
            print("π Models not loaded, initializing model manager...")
            # Accessing the property triggers lazy model loading if needed.
            _ = model_manager.doctr_model

    @property
    def doctr_model(self):
        """Get the loaded doctr model from model manager."""
        return model_manager.doctr_model

    @property
    def device(self):
        """Get the device being used from model manager."""
        return model_manager.device

    async def __aenter__(self):
        # No resources to acquire; provided for `async with` symmetry.
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # No cleanup required; models are owned by the model manager.
        pass

    async def is_pdf_scanned(self, pdf_path: str) -> bool:
        """Check if PDF is scanned (no extractable text on any page)."""
        def _check_scanned():
            doc = fitz.open(pdf_path)
            try:
                for page in doc:
                    text = page.get_text()
                    if text.strip():
                        # Any real text means the PDF is digital.
                        return False
                return True
            finally:
                # Fix: previously the document was never closed (and the early
                # `return False` leaked the file handle); always close it.
                doc.close()

        return await asyncio.get_event_loop().run_in_executor(None, _check_scanned)

    async def save_uploaded_file(self, uploaded_file) -> str:
        """Save uploaded file to the configured temporary location.

        Returns the path written to (``settings.temp_file_name``).
        """
        def _save_file():
            with open(settings.temp_file_name, "wb") as f:
                f.write(uploaded_file.read())
            return settings.temp_file_name

        return await asyncio.get_event_loop().run_in_executor(None, _save_file)

    async def extract_text_from_digital_pdf(self, pdf_path: str) -> List[List[str]]:
        """Extract text from digital PDF using PyPDF2, one list of lines per page."""
        from PyPDF2 import PdfReader

        def _extract_text():
            reader = PdfReader(pdf_path)
            extracted_data = []

            for page in reader.pages:
                ptext = page.extract_text()
                if ptext:
                    data = []
                    for line in ptext.splitlines():
                        # Collapse lines whose content repeats across wide
                        # space gaps (a common extraction artifact).
                        cleaned_line = self._split_on_repeated_pattern(line.strip())
                        if cleaned_line:
                            data.append(cleaned_line[0])
                    extracted_data.append(data)

            return extracted_data

        return await asyncio.get_event_loop().run_in_executor(None, _extract_text)

    def _split_on_repeated_pattern(self, line: str, min_space: int = 10) -> List[str]:
        """Collapse a line duplicated across wide space gaps to a single copy.

        Returns a single-element list: the de-duplicated text if the chunks
        separated by the dominant gap fuzzily match (>80% similar), otherwise
        the original line unchanged.
        """
        import re
        from difflib import SequenceMatcher

        original_line = line.strip()

        # Find all spans of spaces >= min_space
        space_spans = [
            (m.start(), len(m.group()))
            for m in re.finditer(r" {%d,}" % min_space, original_line)
        ]

        if not space_spans:
            return [original_line]

        # Count how often each gap size occurs
        gaps = [span[1] for span in space_spans]
        gap_counts = {}
        for g in gaps:
            gap_counts[g] = gap_counts.get(g, 0) + 1

        # Sort gaps by size x count (more dominant gaps first)
        sorted_gaps = sorted(gap_counts.items(), key=lambda x: x[1] * x[0], reverse=True)

        # No significant gaps, return original
        if not sorted_gaps:
            return [original_line]

        dominant_gap = sorted_gaps[0][0]

        # Use the dominant large gap to split
        chunks = re.split(rf" {{%d,}}" % dominant_gap, original_line)

        # Check if it's actually repeated using fuzzy match
        base = chunks[0].strip()
        repeated = False
        for chunk in chunks[1:]:
            chunk = chunk.strip()
            if chunk and SequenceMatcher(None, base, chunk).ratio() > 0.8:
                repeated = True
                break

        return [base] if repeated else [original_line]
|
src/ocr/text_extractor.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import fitz
|
| 3 |
+
import re
|
| 4 |
+
import numpy as np
|
| 5 |
+
from typing import List, Dict, Any, Optional
|
| 6 |
+
from pdf2image import convert_from_path
|
| 7 |
+
from src.config.config import settings
|
| 8 |
+
from src.models.account_models import LineData, WordData
|
| 9 |
+
from doctr.io import DocumentFile
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class TextExtractor:
|
| 13 |
+
"""Async text extractor for extracting text with bounding boxes."""
|
| 14 |
+
|
| 15 |
+
def __init__(self, doctr_model):
    # Pre-loaded docTR OCR predictor used for scanned-PDF recognition;
    # the caller owns the model's lifecycle.
    self.doctr_model = doctr_model

async def __aenter__(self):
    # Async context-manager entry; no resources to acquire.
    return self

async def __aexit__(self, exc_type, exc_value, traceback):
    # No cleanup required; the OCR model is owned by the caller.
    pass
|
| 23 |
+
|
| 24 |
+
def normalize_bbox(self, bbox, width: float, height: float) -> List[float]:
    """Scale an absolute (x0, y0, x1, y1) box into the unit square.

    x-coordinates are divided by ``width``, y-coordinates by ``height``;
    every value is rounded to 6 decimal places.
    """
    x0, y0, x1, y1 = bbox
    scaled = (x0 / width, y0 / height, x1 / width, y1 / height)
    return [round(value, 6) for value in scaled]
|
| 33 |
+
|
| 34 |
+
def remove_consecutive_items(self, line: List[str]) -> List[str]:
    """Collapse runs of identical adjacent items into a single occurrence.

    Non-adjacent repeats are preserved; an empty input is returned as-is.
    """
    if not line:
        return line
    deduped: List[str] = []
    for item in line:
        if not deduped or deduped[-1] != item:
            deduped.append(item)
    return deduped
|
| 43 |
+
|
| 44 |
+
def remove_consecutive_words(self, word_data: List[Dict]) -> List[Dict]:
    """Drop adjacent entries whose "word" text repeats the previous entry's.

    Only the "word" field is compared; bounding boxes are ignored.  The
    first of each run of duplicates is kept.  Empty input is returned as-is.
    """
    if not word_data:
        return word_data
    deduped = [word_data[0]]
    for entry in word_data[1:]:
        if entry["word"] != deduped[-1]["word"]:
            deduped.append(entry)
    return deduped
|
| 53 |
+
|
| 54 |
+
async def extract_lines_with_bbox(self, pdf_path: str, y_threshold: float = 3.0) -> List[List[LineData]]:
|
| 55 |
+
"""Extract lines with bounding boxes from digital PDF."""
|
| 56 |
+
def _extract_lines():
|
| 57 |
+
doc = fitz.open(pdf_path)
|
| 58 |
+
page_lines_with_bbox = []
|
| 59 |
+
|
| 60 |
+
for page in doc:
|
| 61 |
+
words = page.get_text("words") # (x0, y0, x1, y1, word, block_no, line_no, word_no)
|
| 62 |
+
words.sort(key=lambda w: (round(w[1], 1), w[0])) # sort by y then x
|
| 63 |
+
|
| 64 |
+
lines = []
|
| 65 |
+
current_line = []
|
| 66 |
+
current_y = None
|
| 67 |
+
current_word_data = []
|
| 68 |
+
|
| 69 |
+
for w in words:
|
| 70 |
+
x0, y0, x1, y1, word = w[:5]
|
| 71 |
+
if word == "|" or not word or word == "." or word == "#" or re.sub(r'[^\w\s]', '', word) == "":
|
| 72 |
+
continue
|
| 73 |
+
word = word.lower()
|
| 74 |
+
word_data = {"word": word.strip(), "bbox": (x0, y0, x1, y1)}
|
| 75 |
+
|
| 76 |
+
if current_y is None or abs(y0 - current_y) < y_threshold:
|
| 77 |
+
current_line.append((x0, y0, word))
|
| 78 |
+
current_y = y0
|
| 79 |
+
current_word_data.append(word_data)
|
| 80 |
+
else:
|
| 81 |
+
current_line.sort()
|
| 82 |
+
line_words = [w[2] for w in current_line]
|
| 83 |
+
clean_line = self.remove_consecutive_items(line_words)
|
| 84 |
+
current_word_data = sorted(current_word_data, key=lambda w: w["bbox"][0])
|
| 85 |
+
clean_word_data = self.remove_consecutive_words(current_word_data)
|
| 86 |
+
|
| 87 |
+
if clean_line:
|
| 88 |
+
x_start = min([w[0] for w in current_line])
|
| 89 |
+
y_start = min([w[1] for w in current_line])
|
| 90 |
+
lines.append({
|
| 91 |
+
"line": " ".join(clean_line),
|
| 92 |
+
"bbox": [x_start, y_start],
|
| 93 |
+
"words": clean_word_data,
|
| 94 |
+
})
|
| 95 |
+
current_line = [(x0, y0, word)]
|
| 96 |
+
current_y = y0
|
| 97 |
+
current_word_data = [word_data]
|
| 98 |
+
|
| 99 |
+
# Process remaining line
|
| 100 |
+
if current_line:
|
| 101 |
+
current_line.sort()
|
| 102 |
+
line_words = [w[2] for w in current_line]
|
| 103 |
+
clean_line = self.remove_consecutive_items(line_words)
|
| 104 |
+
current_word_data = sorted(current_word_data, key=lambda w: w["bbox"][0])
|
| 105 |
+
clean_word_data = self.remove_consecutive_words(current_word_data)
|
| 106 |
+
|
| 107 |
+
if clean_line:
|
| 108 |
+
x_start = min([w[0] for w in current_line])
|
| 109 |
+
y_start = min([w[1] for w in current_line])
|
| 110 |
+
lines.append({
|
| 111 |
+
"line": " ".join(clean_line),
|
| 112 |
+
"bbox": [x_start, y_start],
|
| 113 |
+
"words": clean_word_data,
|
| 114 |
+
})
|
| 115 |
+
|
| 116 |
+
page_lines_with_bbox.append(lines)
|
| 117 |
+
|
| 118 |
+
return page_lines_with_bbox
|
| 119 |
+
|
| 120 |
+
return await asyncio.get_event_loop().run_in_executor(None, _extract_lines)
|
| 121 |
+
|
| 122 |
+
async def extract_lines_with_bbox_from_scanned_pdf(self, pdf_path: str, y_threshold: float = 5.0, first_page: bool = False) -> List[List[LineData]]:
|
| 123 |
+
"""Extract lines with bounding boxes from scanned PDF using OCR."""
|
| 124 |
+
def _extract_from_scanned():
|
| 125 |
+
result = None
|
| 126 |
+
doc = None
|
| 127 |
+
|
| 128 |
+
if first_page:
|
| 129 |
+
pages = convert_from_path(pdf_path, dpi=settings.dpi, first_page=1, last_page=1)
|
| 130 |
+
first_page_img = pages[0].convert("RGB")
|
| 131 |
+
result = self.doctr_model([np.array(first_page_img)])
|
| 132 |
+
doc = np.array(first_page_img)
|
| 133 |
+
else:
|
| 134 |
+
doc = DocumentFile.from_pdf(pdf_path)
|
| 135 |
+
result = self.doctr_model(doc)
|
| 136 |
+
|
| 137 |
+
page_lines_with_bbox = []
|
| 138 |
+
|
| 139 |
+
for page in result.pages:
|
| 140 |
+
img_width, img_height = doc[0].shape[1], doc[0].shape[0]
|
| 141 |
+
words = []
|
| 142 |
+
|
| 143 |
+
for block in page.blocks:
|
| 144 |
+
for line in block.lines:
|
| 145 |
+
for word in line.words:
|
| 146 |
+
x0, y0 = word.geometry[0]
|
| 147 |
+
x1, y1 = word.geometry[1]
|
| 148 |
+
abs_x0 = x0 * img_width
|
| 149 |
+
abs_y0 = y0 * img_height
|
| 150 |
+
abs_x1 = x1 * img_width
|
| 151 |
+
abs_y1 = y1 * img_height
|
| 152 |
+
text = word.value.strip().lower()
|
| 153 |
+
text = re.sub(r'[#*]', ' ', text)
|
| 154 |
+
text = text.strip()
|
| 155 |
+
|
| 156 |
+
if text == "|" or not text or text == "." or text == "#" or re.sub(r'[^\w\s]', '', text) == "":
|
| 157 |
+
continue
|
| 158 |
+
words.append({"word": text, "bbox": [abs_x0, abs_y0, abs_x1, abs_y1]})
|
| 159 |
+
|
| 160 |
+
# Sort words by y then x
|
| 161 |
+
words.sort(key=lambda w: (round(w["bbox"][1], 3), w["bbox"][0]))
|
| 162 |
+
|
| 163 |
+
lines = []
|
| 164 |
+
current_line = []
|
| 165 |
+
current_word_data = []
|
| 166 |
+
current_y = None
|
| 167 |
+
|
| 168 |
+
for w in words:
|
| 169 |
+
y0 = w["bbox"][1]
|
| 170 |
+
if current_y is None or abs(y0 - current_y) < y_threshold:
|
| 171 |
+
current_line.append((w["bbox"][0], y0, w["word"]))
|
| 172 |
+
current_word_data.append(w)
|
| 173 |
+
current_y = y0
|
| 174 |
+
else:
|
| 175 |
+
current_line.sort()
|
| 176 |
+
line_words = [x[2] for x in current_line]
|
| 177 |
+
clean_line = self.remove_consecutive_items(line_words)
|
| 178 |
+
current_word_data = sorted(current_word_data, key=lambda w: w["bbox"][0])
|
| 179 |
+
clean_word_data = self.remove_consecutive_words(current_word_data)
|
| 180 |
+
|
| 181 |
+
if clean_line:
|
| 182 |
+
x_start = min(x[0] for x in current_line)
|
| 183 |
+
y_start = min(x[1] for x in current_line)
|
| 184 |
+
lines.append({
|
| 185 |
+
"line": " ".join(clean_line),
|
| 186 |
+
"bbox": [x_start, y_start],
|
| 187 |
+
"words": clean_word_data,
|
| 188 |
+
})
|
| 189 |
+
current_line = [(w["bbox"][0], y0, w["word"])]
|
| 190 |
+
current_word_data = [w]
|
| 191 |
+
current_y = y0
|
| 192 |
+
|
| 193 |
+
# Final remaining line
|
| 194 |
+
if current_line:
|
| 195 |
+
current_line.sort()
|
| 196 |
+
line_words = [x[2] for x in current_line]
|
| 197 |
+
clean_line = self.remove_consecutive_items(line_words)
|
| 198 |
+
current_word_data = sorted(current_word_data, key=lambda w: w["bbox"][0])
|
| 199 |
+
clean_word_data = self.remove_consecutive_words(current_word_data)
|
| 200 |
+
|
| 201 |
+
if clean_line:
|
| 202 |
+
x_start = min(x[0] for x in current_line)
|
| 203 |
+
y_start = min(x[1] for x in current_line)
|
| 204 |
+
lines.append({
|
| 205 |
+
"line": " ".join(clean_line),
|
| 206 |
+
"bbox": [x_start, y_start],
|
| 207 |
+
"words": clean_word_data,
|
| 208 |
+
})
|
| 209 |
+
|
| 210 |
+
page_lines_with_bbox.append(lines)
|
| 211 |
+
|
| 212 |
+
return page_lines_with_bbox
|
| 213 |
+
|
| 214 |
+
return await asyncio.get_event_loop().run_in_executor(None, _extract_from_scanned)
|
src/services/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .bank_statement_service import BankStatementService
|
| 2 |
+
|
| 3 |
+
__all__ = ["BankStatementService"]
|
src/services/bank_statement_service.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 5 |
+
from src.ocr import PDFProcessor, TextExtractor
|
| 6 |
+
from src.extractor import TableExtractor, AccountExtractor, BalanceExtractor
|
| 7 |
+
from src.utils import GroqClient
|
| 8 |
+
from src.models.account_models import BankStatementData
|
| 9 |
+
from src.config.config import settings
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class BankStatementService:
    """Main service for processing bank statements end-to-end.

    Pipeline: save the upload -> detect scanned vs digital PDF -> extract
    text lines with bounding boxes -> detect and clean transaction tables ->
    extract the account summary from the first page via the Groq LLM.
    """

    def __init__(self):
        # Sub-components are stateless per request; constructed once per service.
        self.pdf_processor = PDFProcessor()
        self.table_extractor = TableExtractor()
        self.account_extractor = AccountExtractor()
        self.balance_extractor = BalanceExtractor()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # No resources held directly; sub-components manage their own.
        pass

    async def process_bank_statement(self, uploaded_file) -> "BankStatementData":
        """Process an uploaded bank statement file.

        Args:
            uploaded_file: The uploaded file object (e.g. a Streamlit upload).

        Returns:
            BankStatementData with the LLM-derived account summary and the
            transaction tables grouped (and concatenated) by table tag.

        Raises:
            json.JSONDecodeError: if the LLM response is not valid JSON.
            KeyError / IndexError: if the LLM response lacks expected fields.
        """
        # Save uploaded file to disk so downstream readers can open it by path.
        pdf_path = await self.pdf_processor.save_uploaded_file(uploaded_file)

        # Scanned PDFs need OCR; digital PDFs have an embedded text layer.
        pdf_scanned = await self.pdf_processor.is_pdf_scanned(pdf_path)

        text_extractor = TextExtractor(self.pdf_processor.doctr_model)
        if pdf_scanned:
            print(f"{pdf_path} is likely a scanned PDF.")
            extracted_text_list = await text_extractor.extract_lines_with_bbox_from_scanned_pdf(pdf_path)
        else:
            print(f"{pdf_path} is not a scanned PDF. Extracting text...")
            extracted_text_list = await text_extractor.extract_lines_with_bbox(pdf_path)

        # Detect transaction tables and their tags.
        pre_processed_tables, table_tags = await self.table_extractor.process_transaction_tables_with_bbox(extracted_text_list)

        # Clean each raw table into a DataFrame.
        processed_tables = []
        for table in pre_processed_tables:
            processed_tables.append(await self.table_extractor.process_tables(table))

        # Group tables by tag, then concatenate same-tagged tables.
        final_table_dic = {}
        for tag, processed in zip(table_tags, processed_tables):
            final_table_dic.setdefault(tag, []).append(processed)
        for tag, tables in final_table_dic.items():
            final_table_dic[tag] = pd.concat(tables, ignore_index=True)

        # Account information lives on the first page. For digital PDFs we
        # re-run OCR on page 1 only (layout-faithful text for the LLM).
        if pdf_scanned:
            first_page = extracted_text_list
        else:
            first_page = await text_extractor.extract_lines_with_bbox_from_scanned_pdf(pdf_path, first_page=True)

        first_page_object = first_page[0]

        # BUGFIX: join lines with newlines. The original concatenated lines
        # with no separator, fusing the last word of one line with the first
        # word of the next and degrading the LLM's field extraction.
        starting_text = "\n".join(lines["line"] for lines in first_page_object)

        # Extract account details using the LLM.
        async with GroqClient() as groq_client:
            bank_summary = await groq_client.extract_account_details(starting_text)

        bank_summary = json.loads(bank_summary)

        # Summarize the last listed account (assumed to be the primary one
        # on multi-account statements -- TODO confirm with real samples).
        last_account = bank_summary["accounts"][-1]
        account_summary = {
            "Bank Name": bank_summary["bank_name"].upper(),
            "Account Number": last_account["account_number"],
            "Starting Balance": str(last_account["starting_balance"]),
            "Ending Balance": str(last_account["ending_balance"]),
            "Statement Start Date": last_account["statement_start_date"],
            "Statement End Date": last_account["statement_end_date"],
        }

        return BankStatementData(
            account_summary=account_summary,
            transaction_tables=final_table_dic
        )
|
src/utils/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .api_clients import GroqClient, HuggingFaceClient
|
| 2 |
+
from .model_manager import ModelManager, model_manager
|
| 3 |
+
|
| 4 |
+
__all__ = ["GroqClient", "HuggingFaceClient", "ModelManager", "model_manager"]
|
src/utils/api_clients.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
from typing import Dict, Any, Optional
|
| 4 |
+
from openai import AsyncOpenAI
|
| 5 |
+
from huggingface_hub import AsyncInferenceClient
|
| 6 |
+
from src.config.config import settings
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Shared system prompt for account-details extraction. Hoisted to module
# level because the exact same prompt was duplicated verbatim inside both
# GroqClient and HuggingFaceClient; a single constant keeps them in sync.
ACCOUNT_EXTRACTION_SYSTEM_PROMPT = """
You are a financial document parser that extracts structured data from bank statements.

Your task is to extract the following fields and return only valid JSON:

- Starting balance can also be referred with "Balance last statement" or "Balance previous statement" in pdfs.
- Ending balance can also be referred with "Balance this statement" in pdfs.

{
    "bank_name": "string",
    "account_holder": "string",
    "accounts": [{
        "account_name": "string",
        "account_number": "string",
        "starting_balance": float,
        "ending_balance": float,
        "statement_start_date": "YYYY-MM-DD",
        "statement_end_date": "YYYY-MM-DD"
    }]
}

Guidelines:
- Return strictly valid JSON (no markdown, comments, or extra explanation).
- `starting_balance` and `ending_balance` must be `float` (no currency symbol).
- Dates must follow the format `"YYYY-MM-DD"`.
- Do not respond with anything other than the JSON object.
- If multiple account are there then include all the account list in a list.
"""


class GroqClient:
    """Async client for the Groq API (OpenAI-compatible endpoint)."""

    def __init__(self):
        # Groq exposes an OpenAI-compatible API, so the OpenAI SDK is reused
        # with a custom base URL from settings.
        self.client = AsyncOpenAI(
            base_url=settings.groq_base_url,
            api_key=settings.groq_api_key,
        )

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # Close the underlying HTTP client when leaving the context.
        await self.client.close()

    async def extract_account_details(self, text: str) -> str:
        """Extract structured account details from statement text via the LLM.

        Args:
            text: Raw first-page text of the bank statement.

        Returns:
            The raw model response; per the system prompt this is expected to
            be a JSON string (callers parse it with ``json.loads``).
        """
        response = await self.client.chat.completions.create(
            model=settings.llm_model,
            messages=[
                {"role": "system", "content": ACCOUNT_EXTRACTION_SYSTEM_PROMPT},
                {"role": "user", "content": text},
            ],
        )

        return response.choices[0].message.content


class HuggingFaceClient:
    """Async client for the HuggingFace Inference API (placeholder)."""

    def __init__(self):
        self.client = AsyncInferenceClient(
            provider=settings.huggingface_provider,
            api_key=settings.huggingface_api_key,
        )

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # AsyncInferenceClient needs no explicit shutdown here.
        pass

    async def extract_account_details(self, text: str) -> str:
        """Extract account details using a HuggingFace model.

        NOTE(review): not implemented yet -- this returns a fixed empty
        structure regardless of input. When a concrete model call is wired
        in, reuse ACCOUNT_EXTRACTION_SYSTEM_PROMPT so both providers stay
        consistent.
        """
        return '{"bank_name": "Unknown", "account_holder": "Unknown", "accounts": []}'
|
src/utils/model_manager.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import torch
|
| 3 |
+
from typing import Optional
|
| 4 |
+
from doctr.models import ocr_predictor
|
| 5 |
+
import spacy
|
| 6 |
+
from src.config.config import settings
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ModelManager:
    """Singleton model manager for pre-loading all models at startup.

    Loading runs once, on first instantiation; every later ``ModelManager()``
    call returns the same instance with the already-loaded class-level state.
    Holds the docTR OCR predictor, the spaCy NER pipeline, and the torch
    device the OCR model was moved to.
    """

    # Class-level singleton state, shared by every instantiation.
    _instance = None        # the one ModelManager instance
    _doctr_model = None     # docTR OCR predictor
    _spacy_model = None     # spaCy pipeline, or None if no model could load
    _device = None          # torch.device used for the docTR sub-models
    _models_loaded = False  # guards against re-loading in __init__

    def __new__(cls):
        # Classic singleton: allocate at most one instance.
        if cls._instance is None:
            cls._instance = super(ModelManager, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        # __init__ runs on every ModelManager() call (even for the reused
        # singleton), so the expensive load is gated on the flag.
        if not self._models_loaded:
            self._load_models()

    def _load_models(self):
        """Load all models synchronously."""
        print("π Starting model pre-loading...")

        # Set device based on config; force_cpu overrides CUDA availability.
        if settings.force_cpu:
            self._device = torch.device("cpu")
            print("π± Using CPU (forced by config)")
        else:
            self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            print(f"π± Using device: {self._device}")

        # Load the docTR model and move both of its sub-models (detection
        # and recognition) onto the chosen device.
        print("π Loading doctr OCR model...")
        self._doctr_model = ocr_predictor(pretrained=True)
        self._doctr_model.det_predictor.model = self._doctr_model.det_predictor.model.to(self._device)
        self._doctr_model.reco_predictor.model = self._doctr_model.reco_predictor.model.to(self._device)
        print("✅ Doctr model loaded successfully!")

        # Load the configured spaCy NER model; fall back to known alternates
        # if it is not installed.
        print(f"π Loading spaCy NER model: {settings.spacy_model_name}...")
        try:
            self._spacy_model = spacy.load(settings.spacy_model_name)
            print(f"✅ spaCy model ({settings.spacy_model_name}) loaded successfully!")
        except OSError:
            print(f"β οΈ spaCy model '{settings.spacy_model_name}' not found.")
            # Try fallback models
            fallback_models = ["en_core_web_sm", "en_core_web_trf"]
            for fallback_model in fallback_models:
                if fallback_model != settings.spacy_model_name:
                    try:
                        print(f"π Trying fallback model: {fallback_model}")
                        self._spacy_model = spacy.load(fallback_model)
                        print(f"✅ spaCy model ({fallback_model}) loaded successfully!")
                        break
                    except OSError:
                        continue

        # Non-fatal if still None: downstream code must tolerate a missing
        # NER model (get_model_status reports it).
        if self._spacy_model is None:
            print("β οΈ No spaCy model found. Please install with: python -m spacy download en_core_web_sm")

        self._models_loaded = True
        print("π All models loaded successfully!")

    @property
    def doctr_model(self):
        """Get the loaded doctr model."""
        return self._doctr_model

    @property
    def spacy_model(self):
        """Get the loaded spaCy model (may be None if none installed)."""
        return self._spacy_model

    @property
    def device(self):
        """Get the torch device being used."""
        return self._device

    @property
    def models_loaded(self):
        """Check if models are loaded."""
        return self._models_loaded

    async def ensure_models_loaded(self):
        """Ensure models are loaded (async wrapper).

        Runs the blocking load in a thread executor so an event loop is not
        stalled by model downloads/initialization.
        """
        if not self._models_loaded:
            await asyncio.get_event_loop().run_in_executor(None, self._load_models)
        return True

    def get_model_status(self):
        """Get status of all models as a plain dict (for health endpoints/UI)."""
        return {
            "doctr_model": self._doctr_model is not None,
            "spacy_model": self._spacy_model is not None,
            "device": str(self._device),
            "models_loaded": self._models_loaded,
            "spacy_model_name": settings.spacy_model_name,
            "force_cpu": settings.force_cpu
        }


# Global model manager instance.
# NOTE: importing this module triggers model loading as a side effect.
model_manager = ModelManager()
|