Aryan Jain committed on
Commit
4e71548
·
0 Parent(s):

bank scrubber streamlit application

Browse files
.dockerignore ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Git
2
+ .git
3
+ .gitignore
4
+ .gitattributes
5
+
6
+ # Python
7
+ __pycache__
8
+ *.pyc
9
+ *.pyo
10
+ *.pyd
11
+ .Python
12
+ env
13
+ pip-log.txt
14
+ pip-delete-this-directory.txt
15
+ .tox
16
+ .coverage
17
+ .coverage.*
18
+ .cache
19
+ nosetests.xml
20
+ coverage.xml
21
+ *.cover
22
+ *.log
23
+ .mypy_cache
24
+ .pytest_cache
25
+ .hypothesis
26
+
27
+ # Virtual environments
28
+ venv/
29
+ env/
30
+ ENV/
31
+ env.bak/
32
+ venv.bak/
33
+ .venv/
34
+ .venv.bak/
35
+
36
+ # IDE
37
+ .vscode/
38
+ .idea/
39
+ *.swp
40
+ *.swo
41
+ *~
42
+
43
+ # OS
44
+ .DS_Store
45
+ .DS_Store?
46
+ ._*
47
+ .Spotlight-V100
48
+ .Trashes
49
+ ehthumbs.db
50
+ Thumbs.db
51
+
52
+ # Project specific
53
+ temp.pdf
54
+ *.pdf
55
+ .env
56
+ .env.local
57
+ .env.*.local
58
+ requirements.txt
59
+
60
+ # Documentation
61
+ README.md
62
+ *.md
63
+ docs/
64
+
65
+ # Testing
66
+ test_structure.py
67
+ setup_env.py
68
+ startup.py
69
+ docker-startup.py
70
+ check-build-context.py
71
+ build-docker.sh
72
+ tests/
73
+ test_*.py
74
+
75
+ # Original files
76
+ poc.py
77
+
78
+ # Docker
79
+ Dockerfile
80
+ .dockerignore
81
+ docker-compose*.yml
82
+ DOCKER_DEPLOYMENT.md
83
+
84
+ # Large files and directories
85
+ *.tar
86
+ *.tar.gz
87
+ *.zip
88
+ *.rar
89
+ *.7z
90
+ *.model
91
+ *.pkl
92
+ *.pickle
93
+ *.h5
94
+ *.hdf5
95
+ *.ckpt
96
+ *.pth
97
+ *.pt
98
+ *.bin
99
+ *.safetensors
100
+
101
+ # Model files and caches
102
+ .cache/
103
+ models/
104
+ checkpoints/
105
+ weights/
106
+ *.weights
107
+ *.cfg
108
+
109
+ # Logs and temporary files
110
+ logs/
111
+ *.log
112
+ tmp/
113
+ temp/
114
+ .tmp/
115
+
116
+ # Node modules (if any)
117
+ node_modules/
118
+
119
+ # Large data files
120
+ data/
121
+ datasets/
122
+ *.csv
123
+ *.json
124
+ *.xml
125
+ *.xlsx
126
+ *.xls
127
+
128
+ # Backup files
129
+ *.bak
130
+ *.backup
131
+ *.old
132
+
133
+ # Jupyter notebooks
134
+ *.ipynb
135
+ .ipynb_checkpoints/
136
+
137
+ # Large images
138
+ *.jpg
139
+ *.jpeg
140
+ *.png
141
+ *.gif
142
+ *.bmp
143
+ *.tiff
144
+ *.tif
145
+ images/
146
+ img/
147
+
148
+ # Audio/Video files
149
+ *.mp3
150
+ *.mp4
151
+ *.avi
152
+ *.mov
153
+ *.wav
154
+ *.flac
155
+
156
+ # Archives
157
+ *.tar
158
+ *.tar.gz
159
+ *.tar.bz2
160
+ *.zip
161
+ *.rar
162
+ *.7z
163
+
164
+ # System files
165
+ Thumbs.db
166
+ ehthumbs.db
167
+ Desktop.ini
168
+ $RECYCLE.BIN/
.env.example ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bank Statement Analyzer Configuration
2
+ # Copy this file to .env and update with your actual values
3
+
4
+ # API Keys
5
+ GROQ_API_KEY=your_groq_api_key_here
6
+ GROQ_BASE_URL=https://api.groq.com/openai/v1
7
+
8
+ HUGGINGFACE_API_KEY=your_huggingface_api_key_here
9
+ HUGGINGFACE_PROVIDER=novita
10
+
11
+ # Model Configuration
12
+ LLM_MODEL=llama-3.1-8b-instant
13
+
14
+ # OCR and Processing Settings
15
+ Y_THRESHOLD=3.0
16
+ GAP_THRESHOLD=10
17
+ GAP_THRESHOLD_RATIO=0.1
18
+
19
+ # File Processing Settings
20
+ TEMP_FILE_NAME=temp.pdf
21
+ DPI=300
22
+
23
+ # spaCy Model Settings
24
+ SPACY_MODEL_NAME=en_core_web_sm
25
+
26
+ # Device Settings
27
+ FORCE_CPU=false
.gitignore ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ pip-wheel-metadata/
20
+ share/python-wheels/
21
+ *.egg-info/
22
+ .installed.cfg
23
+ *.egg
24
+ PIPFILE.lock
25
+
26
+ # Virtual Environment
27
+ venv/
28
+ ENV/
29
+ env/
30
+ .venv/
31
+
32
+ # IDE
33
+ .vscode/
34
+ .idea/
35
+ *.swp
36
+ *.swo
37
+ *~
38
+ .project
39
+ .pydevproject
40
+
41
+ # Environment variables
42
+ .env
43
+ .env.local
44
+ .env.*.local
45
+
46
+ # Logs
47
+ logs/
48
+ *.log
49
+
50
+ # Debug
51
+ debug/
52
+ *.debug
53
+
54
+ # Cache
55
+ .cache/
56
+ *.cache
57
+ __pycache__/
58
+ .pytest_cache/
59
+ .mypy_cache/
60
+ .dmypy.json
61
+ dmypy.json
62
+
63
+ # Database
64
+ *.db
65
+ *.sqlite
66
+ *.sqlite3
67
+
68
+ # Output files
69
+ output/
70
+ results/
71
+ exports/
72
+ *.xlsx
73
+ *.csv
74
+ *.json
75
+
76
+ # Temporary files
77
+ tmp/
78
+ temp/
79
+ *.tmp
80
+ *.temp
81
+
82
+ # OS files
83
+ .DS_Store
84
+ Thumbs.db
85
+ ehthumbs.db
86
+
87
+ # Test coverage
88
+ htmlcov/
89
+ .tox/
90
+ .nox/
91
+ .coverage
92
+ .coverage.*
93
+ *.cover
94
+ *.py,cover
95
+ .hypothesis/
96
+
97
+ # Jupyter Notebook
98
+ .ipynb_checkpoints
99
+
100
+ # Redis
101
+ dump.rdb
102
+
103
+ # Secrets
104
+ secrets/
105
+ *.key
106
+ *.pem
107
+ *.crt
108
+
109
+ # Model files
110
+ models/*.pkl
111
+ models/*.h5
112
+ models/*.pt
113
+
114
+ # Large files
115
+ *.pdf
116
+ *.zip
117
+ *.tar.gz
118
+ *.rar
119
+
120
+ # Except test PDFs
121
+ !tests/fixtures/*.pdf
122
+
123
+ check-build-context.py
124
+ test_structure.py
125
+ startup.py
126
+ setup_env.py
127
+ poc.py
128
+ docker-startup.py
DOCKER_DEPLOYMENT.md ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Docker Deployment Guide
2
+
3
+ This guide explains how to deploy the Bank Statement Analyzer using Docker with Poetry dependency management.
4
+
5
+ ## Prerequisites
6
+
7
+ - Docker installed on your system
8
+ - Docker Compose (usually comes with Docker Desktop)
9
+ - API keys for Groq and HuggingFace
10
+
11
+ ## Quick Start
12
+
13
+ ### 1. Set up Environment Variables
14
+
15
+ Create a `.env` file in the project root:
16
+
17
+ ```bash
18
+ # Copy the example file
19
+ cp .env.example .env
20
+
21
+ # Edit with your actual API keys
22
+ nano .env
23
+ ```
24
+
25
+ Make sure to set:
26
+ - `GROQ_API_KEY=your_actual_groq_api_key`
27
+ - `HUGGINGFACE_API_KEY=your_actual_huggingface_api_key`
28
+
29
+ ### 2. Build and Run with Docker Compose
30
+
31
+ ```bash
32
+ # Build and start the application
33
+ docker-compose up --build
34
+
35
+ # Or run in detached mode
36
+ docker-compose up -d --build
37
+ ```
38
+
39
+ ### 3. Access the Application
40
+
41
+ Open your browser and go to: `http://localhost:8501`
42
+
43
+ ## Manual Docker Build
44
+
45
+ If you prefer to build manually:
46
+
47
+ ```bash
48
+ # Build the image
49
+ docker build -t bank-statement-analyzer .
50
+
51
+ # Run the container
52
+ docker run -p 8501:8501 \
53
+ --env-file .env \
54
+ -v $(pwd)/temp:/app/temp \
55
+ bank-statement-analyzer
56
+ ```
57
+
58
+ ## Docker Configuration
59
+
60
+ ### Dockerfile Features
61
+
62
+ - **Base Image**: Python 3.12 slim for smaller size
63
+ - **Dependency Management**: Poetry for reliable dependency resolution
64
+ - **System Dependencies**: Includes OCR and graphics libraries
65
+ - **PyTorch**: Pre-installed with CPU support (can be changed to CUDA)
66
+ - **spaCy Models**: Pre-downloaded for faster startup
67
+ - **Optimized Layers**: Efficient caching for faster rebuilds
68
+
69
+ ### Poetry Configuration
70
+
71
+ The project uses Poetry for dependency management:
72
+
73
+ ```toml
74
+ # pyproject.toml
75
+ [tool.poetry]
76
+ name = "bank-statement-analyzer"
77
+ version = "1.0.0"
78
+ description = "A comprehensive, async, class-based bank statement analyzer"
79
+
80
+ [tool.poetry.dependencies]
81
+ python = "^3.12"
82
+ streamlit = "^1.28.0"
83
+ pandas = "^2.0.0"
84
+ # ... other dependencies
85
+ ```
86
+
87
+ ### Environment Variables
88
+
89
+ The following environment variables can be set in your `.env` file:
90
+
91
+ | Variable | Description | Default |
92
+ |----------|-------------|---------|
93
+ | `GROQ_API_KEY` | Your Groq API key | Required |
94
+ | `HUGGINGFACE_API_KEY` | Your HuggingFace API key | Required |
95
+ | `LLM_MODEL` | Groq model to use | `llama-3.1-8b-instant` |
96
+ | `SPACY_MODEL_NAME` | spaCy model for NER | `en_core_web_sm` |
97
+ | `FORCE_CPU` | Force CPU usage | `false` |
98
+ | `DPI` | PDF processing DPI | `300` |
99
+ | `Y_THRESHOLD` | Text extraction threshold | `3.0` |
100
+
101
+ ### Volumes
102
+
103
+ - `./temp:/app/temp`: Shared temp directory for file processing
104
+ - `./.env:/app/.env:ro`: Read-only access to environment file
105
+
106
+ ## Production Deployment
107
+
108
+ ### Using Docker Compose (Recommended)
109
+
110
+ ```yaml
111
+ # docker-compose.prod.yml
112
+ version: '3.8'
113
+
114
+ services:
115
+ bank-statement-analyzer:
116
+ build: .
117
+ ports:
118
+ - "8501:8501"
119
+ environment:
120
+ - PYTHONUNBUFFERED=1
121
+ - POETRY_VENV_IN_PROJECT=1
122
+ - POETRY_NO_INTERACTION=1
123
+ env_file:
124
+ - .env
125
+ volumes:
126
+ - ./temp:/app/temp
127
+ restart: unless-stopped
128
+ healthcheck:
129
+ test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
130
+ interval: 30s
131
+ timeout: 10s
132
+ retries: 3
133
+ start_period: 40s
134
+ deploy:
135
+ resources:
136
+ limits:
137
+ memory: 4G
138
+ reservations:
139
+ memory: 2G
140
+ ```
141
+
142
+ ### Using Docker Swarm
143
+
144
+ ```bash
145
+ # Initialize swarm (if not already done)
146
+ docker swarm init
147
+
148
+ # Deploy the stack
149
+ docker stack deploy -c docker-compose.yml bank-analyzer
150
+ ```
151
+
152
+ ### Using Kubernetes
153
+
154
+ Create a deployment YAML:
155
+
156
+ ```yaml
157
+ apiVersion: apps/v1
158
+ kind: Deployment
159
+ metadata:
160
+ name: bank-statement-analyzer
161
+ spec:
162
+ replicas: 1
163
+ selector:
164
+ matchLabels:
165
+ app: bank-statement-analyzer
166
+ template:
167
+ metadata:
168
+ labels:
169
+ app: bank-statement-analyzer
170
+ spec:
171
+ containers:
172
+ - name: bank-statement-analyzer
173
+ image: bank-statement-analyzer:latest
174
+ ports:
175
+ - containerPort: 8501
176
+ env:
177
+ - name: GROQ_API_KEY
178
+ valueFrom:
179
+ secretKeyRef:
180
+ name: api-secrets
181
+ key: groq-api-key
182
+ - name: HUGGINGFACE_API_KEY
183
+ valueFrom:
184
+ secretKeyRef:
185
+ name: api-secrets
186
+ key: huggingface-api-key
187
+ - name: POETRY_VENV_IN_PROJECT
188
+ value: "1"
189
+ - name: POETRY_NO_INTERACTION
190
+ value: "1"
191
+ resources:
192
+ limits:
193
+ memory: "4Gi"
194
+ cpu: "2"
195
+ requests:
196
+ memory: "2Gi"
197
+ cpu: "1"
198
+ ---
199
+ apiVersion: v1
200
+ kind: Service
201
+ metadata:
202
+ name: bank-statement-analyzer-service
203
+ spec:
204
+ selector:
205
+ app: bank-statement-analyzer
206
+ ports:
207
+ - port: 80
208
+ targetPort: 8501
209
+ type: LoadBalancer
210
+ ```
211
+
212
+ ## Performance Optimization
213
+
214
+ ### GPU Support
215
+
216
+ To enable GPU support, modify the Dockerfile:
217
+
218
+ ```dockerfile
219
+ # Install PyTorch with CUDA support
220
+ RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
221
+ ```
222
+
223
+ And run with GPU access:
224
+
225
+ ```bash
226
+ docker run --gpus all -p 8501:8501 --env-file .env bank-statement-analyzer
227
+ ```
228
+
229
+ ### Memory Optimization
230
+
231
+ - Set `FORCE_CPU=true` in `.env` if GPU is not needed
232
+ - Use smaller spaCy model: `SPACY_MODEL_NAME=en_core_web_sm`
233
+ - Adjust memory limits in docker-compose.yml
234
+
235
+ ### Build Optimization
236
+
237
+ - Use `.dockerignore` to exclude unnecessary files
238
+ - Leverage Docker layer caching
239
+ - Use multi-stage builds for production
240
+ - Poetry lock file ensures reproducible builds
241
+
242
+ ## Development with Poetry
243
+
244
+ ### Local Development
245
+
246
+ ```bash
247
+ # Install Poetry (if not already installed)
248
+ curl -sSL https://install.python-poetry.org | python3 -
249
+
250
+ # Install dependencies
251
+ poetry install
252
+
253
+ # Activate virtual environment
254
+ poetry shell
255
+
256
+ # Run the application
257
+ poetry run streamlit run main.py
258
+ ```
259
+
260
+ ### Adding Dependencies
261
+
262
+ ```bash
263
+ # Add a new dependency
264
+ poetry add package-name
265
+
266
+ # Add a development dependency
267
+ poetry add --group dev package-name
268
+
269
+ # Update dependencies
270
+ poetry update
271
+ ```
272
+
273
+ ### Poetry Scripts
274
+
275
+ The project includes convenient Poetry scripts:
276
+
277
+ ```bash
278
+ # Start the application
279
+ poetry run start
280
+
281
+ # Run startup script
282
+ poetry run startup
283
+
284
+ # Run tests
285
+ poetry run test
286
+
287
+ # Setup environment
288
+ poetry run setup
289
+ ```
290
+
291
+ ## Troubleshooting
292
+
293
+ ### Common Issues
294
+
295
+ 1. **Port Already in Use**
296
+ ```bash
297
+ # Check what's using port 8501
298
+ lsof -i :8501
299
+
300
+ # Use different port
301
+ docker run -p 8502:8501 bank-statement-analyzer
302
+ ```
303
+
304
+ 2. **Permission Issues**
305
+ ```bash
306
+ # Fix temp directory permissions
307
+ sudo chown -R 1000:1000 ./temp
308
+ ```
309
+
310
+ 3. **Memory Issues**
311
+ ```bash
312
+ # Increase Docker memory limit
313
+ # In Docker Desktop: Settings > Resources > Memory
314
+ ```
315
+
316
+ 4. **API Key Issues**
317
+ ```bash
318
+ # Check environment variables
319
+ docker exec -it <container_id> env | grep API
320
+ ```
321
+
322
+ 5. **Poetry Issues**
323
+ ```bash
324
+ # Clear Poetry cache
325
+ poetry cache clear --all pypi
326
+
327
+ # Reinstall dependencies
328
+ poetry install --sync
329
+ ```
330
+
331
+ ### Logs
332
+
333
+ ```bash
334
+ # View container logs
335
+ docker-compose logs -f
336
+
337
+ # View specific service logs
338
+ docker-compose logs -f bank-statement-analyzer
339
+ ```
340
+
341
+ ### Health Check
342
+
343
+ The application includes a health check endpoint:
344
+
345
+ ```bash
346
+ # Test health endpoint
347
+ curl http://localhost:8501/_stcore/health
348
+ ```
349
+
350
+ ## Security Considerations
351
+
352
+ 1. **API Keys**: Never commit `.env` files to version control
353
+ 2. **Network**: Use internal networks for production
354
+ 3. **Volumes**: Limit volume access to necessary directories
355
+ 4. **User**: Run container as non-root user
356
+ 5. **Updates**: Regularly update base images and dependencies
357
+ 6. **Dependencies**: Poetry lock file ensures reproducible builds
358
+
359
+ ## Monitoring
360
+
361
+ ### Basic Monitoring
362
+
363
+ ```bash
364
+ # Check container status
365
+ docker ps
366
+
367
+ # Monitor resource usage
368
+ docker stats
369
+
370
+ # Check logs
371
+ docker-compose logs -f
372
+ ```
373
+
374
+ ### Advanced Monitoring
375
+
376
+ Consider using:
377
+ - Prometheus + Grafana for metrics
378
+ - ELK stack for log aggregation
379
+ - Docker Swarm or Kubernetes for orchestration
380
+
381
+ ## Backup and Recovery
382
+
383
+ ### Data Backup
384
+
385
+ ```bash
386
+ # Backup temp directory
387
+ tar -czf temp_backup.tar.gz ./temp
388
+
389
+ # Backup environment configuration
390
+ cp .env .env.backup
391
+
392
+ # Backup Poetry lock file
393
+ cp poetry.lock poetry.lock.backup
394
+ ```
395
+
396
+ ### Container Backup
397
+
398
+ ```bash
399
+ # Save container image
400
+ docker save bank-statement-analyzer > bank-analyzer.tar
401
+
402
+ # Load container image
403
+ docker load < bank-analyzer.tar
404
+ ```
405
+
406
+ ## Scaling
407
+
408
+ ### Horizontal Scaling
409
+
410
+ ```yaml
411
+ # docker-compose.scale.yml
412
+ version: '3.8'
413
+
414
+ services:
415
+ bank-statement-analyzer:
416
+ build: .
417
+ ports:
418
+ - "8501:8501"
419
+ deploy:
420
+ replicas: 3
421
+ environment:
422
+ - PYTHONUNBUFFERED=1
423
+ - POETRY_VENV_IN_PROJECT=1
424
+ ```
425
+
426
+ ### Load Balancing
427
+
428
+ Use a reverse proxy like Nginx:
429
+
430
+ ```nginx
431
+ upstream streamlit {
432
+ server bank-statement-analyzer:8501;
433
+ }
434
+
435
+ server {
436
+ listen 80;
437
+ location / {
438
+ proxy_pass http://streamlit;
439
+ proxy_set_header Host $host;
440
+ proxy_set_header X-Real-IP $remote_addr;
441
+ }
442
+ }
443
+ ```
Dockerfile ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use official Python 3.12 slim image
2
+ FROM python:3.12-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONUNBUFFERED=1
6
+ ENV PYTHONDONTWRITEBYTECODE=1
7
+ ENV POETRY_VERSION=1.8.2
8
+ ENV POETRY_HOME="/opt/poetry"
9
+ ENV POETRY_VENV_IN_PROJECT=1
10
+ ENV POETRY_NO_INTERACTION=1
11
+
12
+ # Set work directory
13
+ WORKDIR /app
14
+
15
+ # Install system dependencies in a single layer
16
+ RUN apt-get update && apt-get install -y \
17
+ curl \
18
+ build-essential \
19
+ tesseract-ocr \
20
+ libtesseract-dev \
21
+ poppler-utils \
22
+ libgl1-mesa-glx \
23
+ libglib2.0-0 \
24
+ libsm6 \
25
+ libxext6 \
26
+ libxrender-dev \
27
+ libgomp1 \
28
+ && apt-get clean \
29
+ && rm -rf /var/lib/apt/lists/*
30
+
31
+ # Install Poetry
32
+ RUN curl -sSL https://install.python-poetry.org | python3 - \
33
+ && export PATH="/opt/poetry/bin:$PATH" \
34
+ && poetry --version
35
+
36
+ # Add Poetry to PATH
37
+ ENV PATH="/opt/poetry/bin:$PATH"
38
+
39
+ # Copy only Poetry configuration files first (for better caching)
40
+ COPY pyproject.toml poetry.lock* /app/
41
+
42
+ # Configure Poetry and install dependencies
43
+ RUN poetry config virtualenvs.create false \
44
+ && poetry lock --no-update \
45
+ && poetry install --no-interaction --no-ansi --only main
46
+
47
+ # Install PyTorch with CPU support (adjust based on your needs)
48
+ RUN pip3 install torch torchvision torchaudio
49
+
50
+ # Install spaCy models
51
+ RUN python -m spacy download en_core_web_sm
52
+
53
+ # Create temp directory for file processing
54
+ RUN mkdir -p /app/temp && chmod 777 /app/temp
55
+
56
+ # Copy the source code (this layer will be rebuilt when code changes)
57
+ COPY src/ /app/src/
58
+ COPY main.py /app/
59
+
60
+ # Expose the port Streamlit will run on
61
+ EXPOSE 8501
62
+
63
+ # Set environment variables for Streamlit
64
+ ENV STREAMLIT_SERVER_PORT=8501
65
+ ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
66
+ ENV STREAMLIT_SERVER_HEADLESS=true
67
+ ENV STREAMLIT_SERVER_ENABLE_CORS=false
68
+ ENV STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false
69
+
70
+ # Run the Streamlit application
71
+ CMD ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"]
Dockerfile.alternative ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use official Python 3.12 slim image
2
+ FROM python:3.12-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONUNBUFFERED=1
6
+ ENV PYTHONDONTWRITEBYTECODE=1
7
+ ENV POETRY_VERSION=1.8.2
8
+ ENV POETRY_NO_INTERACTION=1
9
+
10
+ # Set work directory
11
+ WORKDIR /app
12
+
13
+ # Install system dependencies in a single layer
14
+ RUN apt-get update && apt-get install -y \
15
+ curl \
16
+ build-essential \
17
+ tesseract-ocr \
18
+ libtesseract-dev \
19
+ poppler-utils \
20
+ libgl1-mesa-glx \
21
+ libglib2.0-0 \
22
+ libsm6 \
23
+ libxext6 \
24
+ libxrender-dev \
25
+ libgomp1 \
26
+ && apt-get clean \
27
+ && rm -rf /var/lib/apt/lists/*
28
+
29
+ # Install Poetry using pip (alternative method)
30
+ RUN pip install poetry==$POETRY_VERSION
31
+
32
+ # Copy only Poetry configuration files first (for better caching)
33
+ COPY pyproject.toml poetry.lock* /app/
34
+
35
+ # Configure Poetry and install dependencies
36
+ RUN poetry config virtualenvs.create false \
37
+ && poetry install --no-interaction --no-ansi --only main
38
+
39
+ # Install PyTorch with CPU support (adjust based on your needs)
40
+ RUN pip3 install torch torchvision torchaudio
41
+
42
+ # Install spaCy models
43
+ RUN python -m spacy download en_core_web_sm
44
+
45
+ # Create temp directory for file processing
46
+ RUN mkdir -p /app/temp && chmod 777 /app/temp
47
+
48
+ # Copy the source code (this layer will be rebuilt when code changes)
49
+ COPY src/ /app/src/
50
+ COPY main.py /app/
51
+
52
+ # Expose the port Streamlit will run on
53
+ EXPOSE 8501
54
+
55
+ # Set environment variables for Streamlit
56
+ ENV STREAMLIT_SERVER_PORT=8501
57
+ ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
58
+ ENV STREAMLIT_SERVER_HEADLESS=true
59
+ ENV STREAMLIT_SERVER_ENABLE_CORS=false
60
+ ENV STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false
61
+
62
+ # Run the Streamlit application
63
+ CMD ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"]
Dockerfile.fallback ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use official Python 3.12 slim image
2
+ FROM python:3.12-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONUNBUFFERED=1
6
+ ENV PYTHONDONTWRITEBYTECODE=1
7
+
8
+ # Set work directory
9
+ WORKDIR /app
10
+
11
+ # Install system dependencies in a single layer
12
+ RUN apt-get update && apt-get install -y \
13
+ curl \
14
+ build-essential \
15
+ tesseract-ocr \
16
+ libtesseract-dev \
17
+ poppler-utils \
18
+ libgl1-mesa-glx \
19
+ libglib2.0-0 \
20
+ libsm6 \
21
+ libxext6 \
22
+ libxrender-dev \
23
+ libgomp1 \
24
+ && apt-get clean \
25
+ && rm -rf /var/lib/apt/lists/*
26
+
27
+ # Copy requirements file (if it exists)
28
+ COPY requirements.txt* /app/
29
+
30
+ # Install Python dependencies using pip
31
+ RUN pip install --no-cache-dir --upgrade pip
32
+
33
+ # Install dependencies from requirements.txt if it exists, otherwise install manually
34
+ RUN if [ -f "requirements.txt" ]; then \
35
+ pip install --no-cache-dir -r requirements.txt; \
36
+ else \
37
+ pip install --no-cache-dir \
38
+ streamlit>=1.28.0 \
39
+ pandas>=2.0.0 \
40
+ numpy>=1.24.0 \
41
+ PyMuPDF>=1.23.0 \
42
+ PyPDF2>=3.0.0 \
43
+ doctr>=2.4.0 \
44
+ pdf2image>=1.16.0 \
45
+ spacy>=3.7.0 \
46
+ torch>=2.0.0 \
47
+ fuzzywuzzy>=0.18.0 \
48
+ python-Levenshtein>=0.21.0 \
49
+ openai>=1.0.0 \
50
+ huggingface-hub>=0.19.0 \
51
+ pydantic>=2.0.0 \
52
+ pydantic-settings>=2.0.0 \
53
+ python-dateutil>=2.8.0 \
54
+ python-dotenv>=1.0.0; \
55
+ fi
56
+
57
+ # Install PyTorch with CPU support
58
+ RUN pip3 install torch torchvision torchaudio
59
+
60
+ # Install spaCy models
61
+ RUN python -m spacy download en_core_web_sm
62
+
63
+ # Create temp directory for file processing
64
+ RUN mkdir -p /app/temp && chmod 777 /app/temp
65
+
66
+ # Copy the source code
67
+ COPY src/ /app/src/
68
+ COPY main.py /app/
69
+
70
+ # Expose the port Streamlit will run on
71
+ EXPOSE 8501
72
+
73
+ # Set environment variables for Streamlit
74
+ ENV STREAMLIT_SERVER_PORT=8501
75
+ ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
76
+ ENV STREAMLIT_SERVER_HEADLESS=true
77
+ ENV STREAMLIT_SERVER_ENABLE_CORS=false
78
+ ENV STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false
79
+
80
+ # Run the Streamlit application
81
+ CMD ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bank Statement Analyzer
2
+
3
+ A comprehensive, async, class-based bank statement analyzer that extracts account information and transaction tables from PDF bank statements.
4
+
5
+ ## Features
6
+
7
+ - **Async Processing**: All operations are asynchronous for better performance
8
+ - **Class-Based Architecture**: Well-organized, maintainable code structure
9
+ - **Model Pre-loading**: Models are loaded once at startup for faster processing
10
+ - **Environment Configuration**: Flexible configuration via .env files
11
+ - **Multiple PDF Support**: Handles both digital and scanned PDFs
12
+ - **OCR Integration**: Uses doctr for scanned PDF processing
13
+ - **LLM Integration**: Uses Groq API for intelligent data extraction
14
+ - **Table Extraction**: Extracts and processes transaction tables
15
+ - **Account Information**: Extracts account numbers, balances, and bank names
16
+ - **Streamlit Interface**: User-friendly web interface
17
+
18
+ ## Project Structure
19
+
20
+ ```
21
+ bank-scrubber/
22
+ ├── src/
23
+ │ ├── config/
24
+ │ │ └── config.py # Configuration settings and API keys
25
+ │ ├── models/
26
+ │ │ ├── __init__.py
27
+ │ │ └── account_models.py # Pydantic models for data validation
28
+ │ ├── utils/
29
+ │ │ ├── __init__.py
30
+ │ │ ├── api_clients.py # Async API clients for Groq and HuggingFace
31
+ │ │ └── model_manager.py # Singleton model manager for pre-loading
32
+ │ ├── ocr/
33
+ │ │ ├── __init__.py
34
+ │ │ ├── pdf_processor.py # PDF processing and OCR setup
35
+ │ │ └── text_extractor.py # Text extraction with bounding boxes
36
+ │ ├── extractor/
37
+ │ │ ├── __init__.py
38
+ │ │ ├── table_extractor.py # Transaction table extraction and processing
39
+ │ │ ├── account_extractor.py # Account number and bank name extraction
40
+ │ │ └── balance_extractor.py # Balance information extraction
41
+ │ ├── services/
42
+ │ │ ├── __init__.py
43
+ │ │ └── bank_statement_service.py # Main service orchestrating all operations
44
+ │ └── __init__.py
45
+ ├── main.py # Streamlit application entry point
46
+ ├── startup.py # Model pre-loading script
47
+ ├── setup_env.py # Environment setup helper
48
+ ├── .env.example # Environment variables template
49
+ ├── test_structure.py # Structure testing script
50
+ ├── poc.py # Original monolithic file (preserved)
51
+ ├── requirements.txt # Python dependencies
52
+ └── README.md # This file
53
+ ```
54
+
55
+ ## Installation
56
+
57
+ 1. Clone the repository:
58
+ ```bash
59
+ git clone <repository-url>
60
+ cd bank-scrubber
61
+ ```
62
+
63
+ 2. Create a virtual environment:
64
+ ```bash
65
+ python -m venv venv
66
+ source venv/bin/activate # On Windows: venv\Scripts\activate
67
+ ```
68
+
69
+ 3. Install dependencies:
70
+ ```bash
71
+ pip install -r requirements.txt
72
+ ```
73
+
74
+ 4. Install spaCy models:
75
+ ```bash
76
+ python -m spacy download en_core_web_sm
77
+ # Optional: python -m spacy download en_core_web_trf
78
+ ```
79
+
80
+ ## Configuration
81
+
82
+ ### Quick Setup
83
+
84
+ Use the setup script to create your environment file:
85
+
86
+ ```bash
87
+ python setup_env.py
88
+ ```
89
+
90
+ This will:
91
+ - Create a `.env` file from the template
92
+ - Guide you through the setup process
93
+ - Show current configuration status
94
+
95
+ ### Manual Setup
96
+
97
+ 1. Copy the environment template:
98
+ ```bash
99
+ cp .env.example .env
100
+ ```
101
+
102
+ 2. Edit the `.env` file with your API keys and settings:
103
+
104
+ ```env
105
+ # API Keys
106
+ GROQ_API_KEY=your_actual_groq_api_key_here
107
+ HUGGINGFACE_API_KEY=your_actual_huggingface_api_key_here
108
+
109
+ # Model Configuration
110
+ LLM_MODEL=llama-3.1-8b-instant
111
+ SPACY_MODEL_NAME=en_core_web_sm
112
+
113
+ # Device Settings
114
+ FORCE_CPU=false
115
+
116
+ # Processing Settings
117
+ DPI=300
118
+ Y_THRESHOLD=3.0
119
+ ```
120
+
121
+ ### Configuration Options
122
+
123
+ | Variable | Description | Default |
124
+ |----------|-------------|---------|
125
+ | `GROQ_API_KEY` | Your Groq API key | Required |
126
+ | `HUGGINGFACE_API_KEY` | Your HuggingFace API key | Required |
127
+ | `LLM_MODEL` | Groq model to use | `llama-3.1-8b-instant` |
128
+ | `SPACY_MODEL_NAME` | spaCy model for NER | `en_core_web_sm` |
129
+ | `FORCE_CPU` | Force CPU usage | `false` |
130
+ | `DPI` | PDF processing DPI | `300` |
131
+ | `Y_THRESHOLD` | Text extraction threshold | `3.0` |
132
+ | `GAP_THRESHOLD` | Table gap threshold | `10` |
133
+ | `TEMP_FILE_NAME` | Temporary file name | `temp.pdf` |
134
+
135
+ ## Usage
136
+
137
+ ### Quick Start
138
+
139
+ 1. Set up environment:
140
+ ```bash
141
+ python setup_env.py
142
+ ```
143
+
144
+ 2. Pre-load models:
145
+ ```bash
146
+ python startup.py
147
+ ```
148
+
149
+ 3. Run the application:
150
+ ```bash
151
+ streamlit run main.py
152
+ ```
153
+
154
+ ### Advanced Usage
155
+
156
+ #### Model Pre-loading (Recommended)
157
+
158
+ For optimal performance, pre-load models before running the application:
159
+
160
+ ```bash
161
+ # Pre-load all models
162
+ python startup.py
163
+
164
+ # Then run the main application
165
+ streamlit run main.py
166
+ ```
167
+
168
+ #### Direct Application Run
169
+
170
+ You can also run the application directly, which will load models on first use:
171
+
172
+ ```bash
173
+ streamlit run main.py
174
+ ```
175
+
176
+ #### Using the Service Programmatically
177
+
178
+ ```python
179
+ import asyncio
180
+ from src.services import BankStatementService
181
+
182
+ async def process_statement(file_path):
183
+ async with BankStatementService() as service:
184
+ with open(file_path, 'rb') as f:
185
+ result = await service.process_bank_statement(f)
186
+ return result
187
+
188
+ # Usage
189
+ result = asyncio.run(process_statement('path/to/statement.pdf'))
190
+ print(result.account_summary)
191
+ print(result.transaction_tables)
192
+ ```
193
+
194
+ ## Architecture Overview
195
+
196
+ ### Configuration Management
197
+ - **Environment Variables**: All settings configurable via `.env` file
198
+ - **Pydantic Settings**: Type-safe configuration with validation
199
+ - **Fallback Values**: Sensible defaults for all settings
200
+ - **API Key Management**: Secure handling of API credentials
201
+
202
+ ### Model Management
203
+ - **ModelManager**: Singleton class that pre-loads and manages all ML models
204
+ - **Pre-loading**: Models are loaded once at startup and reused across the application
205
+ - **Device Optimization**: Automatic GPU detection and utilization
206
+ - **Configurable Models**: spaCy model selection via environment variables
207
+
208
+ ### Services Layer
209
+ - **BankStatementService**: Main orchestrator that coordinates all processing steps
210
+
211
+ ### OCR Layer
212
+ - **PDFProcessor**: Handles PDF file operations and uses pre-loaded OCR models
213
+ - **TextExtractor**: Extracts text with bounding boxes from both digital and scanned PDFs
214
+
215
+ ### Extractor Layer
216
+ - **TableExtractor**: Processes transaction tables with pattern matching and data cleaning
217
+ - **AccountExtractor**: Extracts account numbers and bank names using regex and NER
218
+ - **BalanceExtractor**: Extracts balance information using keyword matching
219
+
220
+ ### Utils Layer
221
+ - **GroqClient**: Async client for Groq LLM API
222
+ - **HuggingFaceClient**: Async client for HuggingFace Inference API
223
+ - **ModelManager**: Centralized model management and pre-loading
224
+
225
+ ### Models Layer
226
+ - **BankStatementData**: Main data model for processed results
227
+ - **AccountSummary**: Model for account information
228
+ - **AccountDetails**: Model for individual account details
229
+
230
+ ## Key Features
231
+
232
+ ### Environment Configuration
233
+ All settings are configurable via environment variables:
234
+
235
+ ```python
236
+ from src.config.config import settings
237
+
238
+ print(f"Using model: {settings.llm_model}")
239
+ print(f"Device: {'CPU' if settings.force_cpu else 'Auto'}")
240
+ ```
241
+
242
+ ### Model Pre-loading
243
+ Models are loaded once at startup and reused throughout the application:
244
+
245
+ ```python
246
+ from src.utils import model_manager
247
+
248
+ # Check model status
249
+ status = model_manager.get_model_status()
250
+ print(f"Models loaded: {status['models_loaded']}")
251
+ ```
252
+
253
+ ### Async Processing
254
+ All operations are asynchronous, allowing for better performance and resource utilization:
255
+
256
+ ```python
257
+ async with BankStatementService() as service:
258
+ result = await service.process_bank_statement(uploaded_file)
259
+ ```
260
+
261
+ ### Class-Based Design
262
+ Each component is a class with async context manager support:
263
+
264
+ ```python
265
+ class MyService:
266
+ async def __aenter__(self):
267
+ return self
268
+
269
+ async def __aexit__(self, exc_type, exc_value, traceback):
270
+ pass
271
+ ```
272
+
273
+ ### Error Handling
274
+ Comprehensive error handling throughout the pipeline with graceful fallbacks.
275
+
276
+ ## Performance Optimization
277
+
278
+ ### Model Pre-loading Benefits
279
+ - **Faster Processing**: Models are loaded once at startup, not on each file upload
280
+ - **Memory Efficiency**: Single instance of each model shared across the application
281
+ - **GPU Optimization**: Automatic GPU detection and utilization
282
+ - **Reduced Latency**: No model loading delays during file processing
283
+
284
+ ### Configuration Benefits
285
+ - **Flexible Settings**: Easy to adjust parameters without code changes
286
+ - **Environment-Specific**: Different settings for development/production
287
+ - **Secure**: API keys kept separate from code
288
+ - **Version Control Safe**: `.env` files can be excluded from git
289
+
290
+ ### Startup Process
291
+ 1. **Configuration Loading**: Loads settings from `.env` file
292
+ 2. **Model Detection**: Automatically detects available models (spaCy, doctr)
293
+ 3. **Device Selection**: Chooses optimal device (GPU/CPU) based on config
294
+ 4. **Pre-loading**: Loads all models into memory
295
+ 5. **Status Reporting**: Provides detailed loading status
296
+
297
+ ## Testing
298
+
299
+ Run the structure test to verify everything works:
300
+
301
+ ```bash
302
+ python test_structure.py
303
+ ```
304
+
305
+ This will test:
306
+ - All module imports
307
+ - Model manager functionality
308
+ - Service initialization
309
+ - Configuration access
310
+
311
+ ## Troubleshooting
312
+
313
+ ### Common Issues
314
+
315
+ 1. **API Keys Not Set**
316
+ ```bash
317
+ python setup_env.py
318
+ # Edit .env file with your actual API keys
319
+ ```
320
+
321
+ 2. **spaCy Model Not Found**
322
+ ```bash
323
+ python -m spacy download en_core_web_sm
324
+ ```
325
+
326
+ 3. **GPU Not Detected**
327
+ - Set `FORCE_CPU=true` in `.env` file
328
+ - Or install CUDA-compatible PyTorch
329
+
330
+ 4. **Configuration Issues**
331
+ ```bash
332
+ python setup_env.py
333
+ # Check current configuration
334
+ ```
335
+
336
+ ## Dependencies
337
+
338
+ - **Streamlit**: Web interface
339
+ - **PyMuPDF**: PDF processing
340
+ - **python-doctr (docTR)**: OCR for scanned PDFs
341
+ - **spaCy**: Natural language processing
342
+ - **torch**: Deep learning framework
343
+ - **pandas**: Data manipulation
344
+ - **openai**: Groq API client
345
+ - **huggingface-hub**: HuggingFace API client
346
+ - **pydantic**: Data validation
347
+ - **fuzzywuzzy**: Fuzzy string matching
348
+ - **python-dotenv**: Environment variable loading
349
+
350
+ ## Contributing
351
+
352
+ 1. Fork the repository
353
+ 2. Create a feature branch
354
+ 3. Make your changes
355
+ 4. Add tests if applicable
356
+ 5. Submit a pull request
357
+
358
+ ## License
359
+
360
+ This project is licensed under the MIT License.
build-docker.sh ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#
# Docker build script for the Bank Statement Analyzer.
#
# Tries each known Dockerfile in preference order (main -> alternative ->
# fallback) and stops at the first successful build. Exits non-zero when
# required project files are missing or every available Dockerfile fails.

IMAGE_NAME="bank-statement-analyzer"

echo "🐳 Building Bank Statement Analyzer Docker Image"
echo "=================================================="

# Must be run from the project root (where the Dockerfile lives).
if [ ! -f "Dockerfile" ]; then
    echo "❌ Error: Dockerfile not found in current directory"
    echo "   Please run this script from the project root directory"
    exit 1
fi

# Surface an oversized build context before invoking docker at all.
echo "πŸ” Checking build context size..."
python3 check-build-context.py

echo ""
echo "πŸ“¦ Building Docker image..."

# The Poetry-based Dockerfiles need a valid pyproject.toml.
if [ ! -f "pyproject.toml" ]; then
    echo "❌ Error: pyproject.toml not found!"
    echo "   Please ensure you have a valid pyproject.toml file"
    exit 1
fi

# print_success LABEL -- usage hints shown after a successful build.
print_success() {
    echo ""
    echo "βœ… Docker image built successfully${1}!"
    echo ""
    echo "πŸš€ To run the application:"
    echo "   docker run -p 8501:8501 --env-file .env ${IMAGE_NAME}"
    echo ""
    echo "   Or use docker-compose:"
    echo "   docker-compose up"
}

# Candidate Dockerfiles, in preference order. Missing candidates are
# skipped (instead of aborting) so the fallback still gets a chance.
for dockerfile in Dockerfile Dockerfile.alternative Dockerfile.fallback; do
    if [ ! -f "${dockerfile}" ]; then
        continue
    fi

    echo "πŸ”„ Attempting build with ${dockerfile}..."
    if docker build -f "${dockerfile}" -t "${IMAGE_NAME}" .; then
        if [ "${dockerfile}" = "Dockerfile" ]; then
            print_success ""
        elif [ "${dockerfile}" = "Dockerfile.alternative" ]; then
            print_success " with alternative method"
        else
            print_success " with fallback method"
        fi
        exit 0
    fi

    echo ""
    echo "⚠️  Build with ${dockerfile} failed. Trying next method..."
done

echo ""
echo "❌ All Dockerfile methods failed!"
echo ""
echo "πŸ’‘ Troubleshooting tips:"
echo "   - Check if Poetry is properly configured"
echo "   - Ensure pyproject.toml and poetry.lock are valid"
echo "   - Try running 'poetry install' locally first"
echo "   - Check if large files are being included in build context"
echo "   - Ensure .dockerignore is properly configured"
echo "   - Try running 'python3 check-build-context.py' to identify issues"
echo ""
echo "πŸ”§ Manual troubleshooting:"
echo "   docker build -t ${IMAGE_NAME} . 2>&1 | tee build.log"
exit 1
docker-compose.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Compose definition for the Bank Statement Analyzer Streamlit app.
# NOTE(review): the top-level `version` key is obsolete under Compose V2
# and ignored with a warning by recent releases — confirm before removing.
version: '3.8'

services:
  bank-statement-analyzer:
    build: .
    ports:
      - "8501:8501"  # Streamlit UI
    environment:
      - PYTHONUNBUFFERED=1  # flush logs immediately
      - STREAMLIT_SERVER_PORT=8501
      - STREAMLIT_SERVER_ADDRESS=0.0.0.0  # listen on all interfaces inside the container
      - STREAMLIT_SERVER_HEADLESS=true  # no browser auto-open
      - STREAMLIT_SERVER_ENABLE_CORS=false
      - STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION=false
      - POETRY_VENV_IN_PROJECT=1
      - POETRY_NO_INTERACTION=1
    volumes:
      # presumably scratch space for the app's temp.pdf output — confirm
      - ./temp:/app/temp
      # API keys mounted read-only so the image stays credential-free
      - ./.env:/app/.env:ro
    restart: unless-stopped
    healthcheck:
      # Streamlit's built-in health endpoint
      test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s  # allow time for model pre-loading on first boot
main.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import streamlit as st
3
+ import pandas as pd
4
+ from src.services import BankStatementService
5
+ from src.utils import model_manager
6
+
7
+
8
async def preload_models():
    """Pre-load all ML models once at application startup.

    Loading is idempotent: ``ensure_models_loaded`` returns immediately on
    subsequent reruns of the Streamlit script.

    Returns:
        dict: the model manager's status report (``models_loaded``, device,
        etc.) so callers may inspect it; the current UI ignores it.
    """
    await model_manager.ensure_models_loaded()
    # Previously this status was fetched and discarded inside a no-op
    # if/else; return it instead so loading problems are observable.
    return model_manager.get_model_status()
25
+
26
+
27
async def main():
    """Streamlit entry point: upload a bank statement PDF and render results."""
    st.set_page_config(page_title="Bank Statement Analyzer", layout="wide")
    st.title("πŸ“„ Bank Statement Analyzer")

    # Load models up front so the first upload is not slowed down.
    await preload_models()

    uploaded_file = st.file_uploader("Upload Bank Statement PDF", type=["pdf"])

    if not uploaded_file:
        st.warning("πŸ“€ Please upload a PDF file to begin.")
        return

    st.info("πŸ“₯ Processing uploaded file...")

    with st.spinner("Extracting data..."):
        async with BankStatementService() as service:
            result = await service.process_bank_statement(uploaded_file)

    if not result:
        st.error("⚠️ Unable to parse the statement correctly.")
        return

    # Account summary rendered as a two-column table.
    summary_frame = pd.DataFrame(result.account_summary.items(), columns=["Account Summary", "Data"])
    st.dataframe(summary_frame, use_container_width=True, hide_index=True)

    # One table per extracted transaction section, skipping empty ones.
    st.subheader("πŸ“Š Extracted Tables")
    for table_name, table_df in result.transaction_tables.items():
        if table_df.empty:
            continue
        st.markdown(f"### {table_name.capitalize()} Table")
        st.dataframe(table_df, use_container_width=True, hide_index=True)
60
+
61
+
62
if __name__ == "__main__":
    # Drive the async entry point to completion on a fresh event loop
    # (asyncio.run creates and closes one per script pass).
    asyncio.run(main())
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "bank-scrubber"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["Your Name <you@example.com>"]
6
+ readme = "README.md"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = ">=3.12,<3.14"
10
+ pypdf2 = "^3.0.1"
11
+ pymupdf = "^1.26.1"
12
+ pdf2image = "^1.17.0"
13
+ python-doctr = "^0.12.0"
14
+ numpy = "^2.3.1"
15
+ pandas = "^2.3.0"
16
+ streamlit = "^1.46.1"
17
+ openai = "^1.93.0"
18
+ fuzzywuzzy = "^0.18.0"
19
+ huggingface-hub = "^0.33.1"
20
+ pydantic = "^2.11.7"
21
+ python-dateutil = "^2.9.0.post0"
22
+ python-dotenv = "^1.1.1"
23
+ python-levenshtein = "^0.27.1"
24
+ pydantic-settings = "^2.10.1"
25
+ # NOTE(review): PyPI "doctr" (1.x) is an unrelated docs-deployment tool; the OCR
+ # library is "python-doctr", already listed above β€” confirm this pin or remove it.
+ doctr = "^1.9.0"
26
+ spacy = "^3.8.7"
27
+
28
+
29
+ [build-system]
30
+ requires = ["poetry-core"]
31
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.28.0
2
+ pandas>=2.0.0
3
+ numpy>=1.24.0
4
+ PyMuPDF>=1.23.0
5
+ PyPDF2>=3.0.0
6
+ python-doctr>=0.12.0  # docTR OCR library; bare "doctr" on PyPI is an unrelated package with no 2.4.0 release
7
+ pdf2image>=1.16.0
8
+ spacy>=3.7.0
9
+ torch>=2.0.0
10
+ fuzzywuzzy>=0.18.0
11
+ python-Levenshtein>=0.21.0
12
+ openai>=1.0.0
13
+ huggingface-hub>=0.19.0
14
+ pydantic>=2.0.0
15
+ pydantic-settings>=2.0.0
16
+ python-dateutil>=2.8.0
17
+ python-dotenv>=1.0.0
src/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .services import BankStatementService
2
+ from .models import BankStatementData, AccountSummary, AccountDetails
3
+
4
+ __all__ = ["BankStatementService", "BankStatementData", "AccountSummary", "AccountDetails"]
src/config/config.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ # from pydantic import BaseSettings
3
+ from pydantic_settings import BaseSettings
4
+ import os
5
+ from dotenv import load_dotenv
6
+
7
+ # Load environment variables from .env file
8
+ load_dotenv()
9
+
10
+
11
class Settings(BaseSettings):
    """Application configuration, sourced from the environment / ``.env`` file.

    pydantic-settings resolves each field from the environment variable of the
    same (case-insensitive) name, falling back to the default declared here, so
    explicit ``os.getenv()`` calls are unnecessary. The previous implementation
    read the environment at class-definition time, which froze values at import
    and let required ``str`` fields silently default to ``None``.
    """

    # API credentials — Optional so importing the app without keys does not
    # produce a None-typed-as-str field; clients must check for None before use.
    groq_api_key: Optional[str] = None
    groq_base_url: str = "https://api.groq.com/openai/v1"

    huggingface_api_key: Optional[str] = None
    huggingface_provider: str = "novita"

    # LLM model configuration
    llm_model: str = "llama-3.1-8b-instant"

    # OCR and processing thresholds (pydantic coerces env strings to the
    # annotated numeric types)
    y_threshold: float = 3.0
    gap_threshold: int = 10
    gap_threshold_ratio: float = 0.1

    # File processing settings
    temp_file_name: str = "temp.pdf"
    dpi: int = 300

    # spaCy pipeline to load
    spacy_model_name: str = "en_core_web_sm"

    # Force CPU inference even when a GPU is available (env FORCE_CPU=true)
    force_cpu: bool = False

    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"
        case_sensitive = False
43
+
44
+
45
+ # Global settings instance
46
+ settings = Settings()
src/extractor/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .table_extractor import TableExtractor
2
+ from .account_extractor import AccountExtractor
3
+ from .balance_extractor import BalanceExtractor
4
+
5
+ __all__ = ["TableExtractor", "AccountExtractor", "BalanceExtractor"]
src/extractor/account_extractor.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import re
3
+ import math
4
+ from typing import List, Dict, Any, Optional
5
+ from fuzzywuzzy import fuzz, process
6
+ import spacy
7
+ from src.models.account_models import LineData
8
+ from src.utils import model_manager
9
+
10
+
11
class AccountExtractor:
    """Async extractor for account numbers and bank names.

    spaCy NER is obtained from the shared ``model_manager``; regex and fuzzy
    matching handle the rest. CPU-bound work runs on the default executor so
    the event loop stays responsive.
    """

    def __init__(self):
        # Trigger spaCy loading via the centralized model manager so the
        # first extraction call does not pay the load cost.
        self._ensure_models_loaded()

    def _ensure_models_loaded(self):
        """Ensure the spaCy model is loaded via the model manager."""
        if not model_manager.models_loaded:
            print("πŸ”„ Models not loaded, initializing model manager...")
            # Accessing the property triggers loading if not already done.
            _ = model_manager.spacy_model

    @property
    def nlp(self):
        """The spaCy pipeline owned by the model manager (may be None)."""
        return model_manager.spacy_model

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        pass

    def euclidean_distance(self, b1: List[float], b2: List[float]) -> float:
        """Euclidean distance between the centers of two [x0, y0, x1, y1] boxes."""
        x1 = (b1[0] + b1[2]) / 2
        y1 = (b1[1] + b1[3]) / 2
        x2 = (b2[0] + b2[2]) / 2
        y2 = (b2[1] + b2[3]) / 2
        return math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

    def combine_bboxes(self, bboxes: List[List[float]]) -> List[float]:
        """Merge multiple bounding boxes into one box covering them all."""
        x_min = min(b[0] for b in bboxes)
        y_min = min(b[1] for b in bboxes)
        x_max = max(b[2] for b in bboxes)
        y_max = max(b[3] for b in bboxes)
        return [x_min, y_min, x_max, y_max]

    async def extract_account_number_regex_distance(self, lines: List[Dict]) -> Optional[Dict]:
        """Find an account number near the word 'account' in OCR line data.

        Each line mentioning 'account' is squashed (space/dash/underscore/comma
        separators removed) and scanned for a run of 6+ digits or X characters.
        The contributing words' boxes are merged and the distance from the
        'account' label is reported so callers can rank candidates.

        Returns:
            ``{"account_number", "bbox", "distance"}`` for the first hit,
            or None when no line yields a reconstructable match.
        """
        def _extract_account():
            for line in lines:
                words = line.get("words", [])

                # Squash the line so a number split across words still matches
                # one regex run; words containing '/' are kept space-separated.
                cleaned_line = ""
                for w in words:
                    if "/" not in w["word"]:
                        cleaned_line += re.sub(r"[\s\-\_\,\/]", "", w["word"])
                    else:
                        cleaned_line += " " + w["word"]

                # Only consider lines that actually mention 'account'.
                account_word = next((w for w in words if "account" in w["word"].lower()), None)
                if account_word is None:
                    continue

                # Scan only the text from the 'account' mention onward.
                cleaned_line = cleaned_line[cleaned_line.lower().find("account"):].strip()

                match = re.search(r"[0-9Xx]{6,}", cleaned_line)
                if not match:
                    continue

                matched_text = match.group(0)

                # Reconstruct which raw words make up the matched number so
                # their bounding boxes can be merged.
                joined_account = ""
                matched_bboxes = []
                for w in words:
                    clean_w = re.sub(r"[\s\-\_\,\/]", "", w["word"])
                    if not clean_w:
                        continue
                    if matched_text.startswith(joined_account + clean_w):
                        joined_account += clean_w
                        matched_bboxes.append(w["bbox"])
                        if joined_account == matched_text:
                            break

                if joined_account != matched_text or not matched_bboxes:
                    continue  # could not map the regex hit back onto words

                combined_bbox = self.combine_bboxes(matched_bboxes)
                distance = self.euclidean_distance(account_word["bbox"], combined_bbox)

                return {
                    "account_number": matched_text,
                    "bbox": combined_bbox,
                    "distance": distance
                }

            return None

        # get_running_loop() replaces the deprecated get_event_loop() call.
        return await asyncio.get_running_loop().run_in_executor(None, _extract_account)

    def match_keyword_bbox(self, keyword: str, words: List[Dict]) -> Optional[List[float]]:
        """Locate *keyword* as a consecutive word sequence; return its merged bbox."""
        keyword_tokens = keyword.lower().split()
        text_tokens = [w["word"].lower() for w in words]

        for i in range(len(text_tokens) - len(keyword_tokens) + 1):
            if text_tokens[i:i + len(keyword_tokens)] == keyword_tokens:
                matched_bboxes = [words[i + j]["bbox"] for j in range(len(keyword_tokens))]
                return self.combine_bboxes(matched_bboxes)
        return None

    async def extract_bank_name(self, text: str) -> str:
        """Extract a bank name from *text* via spaCy ORG entities.

        Returns the first ORG entity, or "Not Found" when the model is
        unavailable or no ORG entity is recognized.
        """
        def _extract_bank():
            if not self.nlp:
                return "Not Found"

            doc = self.nlp(text)
            # Debug prints of every entity were removed here.
            candidates = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
            return candidates[0] if candidates else "Not Found"

        return await asyncio.get_running_loop().run_in_executor(None, _extract_bank)

    async def extract_bank_name_using_fuzzy(self, text: str) -> str:
        """Match *text* against a known bank list with fuzzy partial-ratio scoring."""
        def _extract_fuzzy():
            bank_names = [
                "Bank Of America", "South State Bank", "Midstates Bank",
                "Synovus", "Shore United Bank", "Frost",
                "Bethpage Federal Credit Union"
            ]
            best_match = process.extractOne(text, bank_names, scorer=fuzz.partial_ratio)
            return best_match[0] if best_match else "Unknown"

        return await asyncio.get_running_loop().run_in_executor(None, _extract_fuzzy)
src/extractor/balance_extractor.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import re
3
+ from typing import List, Dict, Any, Optional, Tuple
4
+ from src.extractor.account_extractor import AccountExtractor
5
+
6
+
7
class BalanceExtractor:
    """Async extractor for beginning and ending statement balances."""

    # Labels that precede the opening balance on a statement.
    PREVIOUS_KEYWORDS = [
        "previous balance", "starting balance", "beginning balance",
        "balance last statement", "balance previous statement", "last statement",
        "beginning statement", "previous statement", "starting"
    ]

    # Labels that precede the closing balance on a statement.
    ENDING_KEYWORDS = [
        "ending balance", "current balance", "balance this statement",
        "balance ending statement", "this statement", "ending statement", "ending"
    ]

    def __init__(self):
        # Signed amounts with a mandatory decimal part; allows both Western
        # (1,234.56) and Indian (1,23,456.78) digit grouping, trailing '-'.
        self.amount_pattern = re.compile(r'-?(?:\d{1,3}(?:,\d{2}){1,}(?:,\d{3})?|\d{1,3}(?:,\d{3})+|\d+)?\.\d{1,2}-?')
        self.account_extractor = AccountExtractor()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        pass

    def _amount_near_keyword(self, keyword: str, line_obj: Dict, following_lines: List[Dict]) -> Optional[str]:
        """Find the amount labelled by *keyword* on this line or just below it.

        First looks for an amount after the keyword on the same line; for
        columnar layouts, falls back to words on *following_lines* that are
        horizontally aligned with the keyword's bounding box.

        Returns the amount with thousands separators stripped, or None.
        """
        line = line_obj['line']
        line_lower = line.lower()
        if keyword not in line_lower:
            return None

        # Same-line match: amount directly after the keyword text.
        after_keyword = line[line_lower.find(keyword) + len(keyword):]
        match = self.amount_pattern.search(after_keyword)
        if match:
            return match.group().replace(",", "")

        # Column-style layout: the amount sits on one of the next lines,
        # aligned under the keyword's bounding box (small x tolerance).
        keyword_bbox = self.account_extractor.match_keyword_bbox(keyword, line_obj["words"])
        if not keyword_bbox:
            return None
        x_min, _, x_max, _ = keyword_bbox
        for next_line in following_lines:
            final_amt = ""
            for w in next_line.get("words", []):
                wx_min, _, wx_max, _ = w["bbox"]
                if wx_min >= x_min - 0.1 and wx_max <= x_max + 0.1:
                    final_amt += w["word"]
            match = self.amount_pattern.search(final_amt)
            if match:
                return match.group().replace(",", "")
        return None

    def _find_balance(self, keywords: List[str], line_obj: Dict, following_lines: List[Dict]) -> Optional[str]:
        """Return the first amount found for any keyword in *keywords*, or None."""
        for keyword in keywords:
            amount = self._amount_near_keyword(keyword, line_obj, following_lines)
            if amount:
                return amount
        return None

    async def extract_balances(self, object_line: List[Dict]) -> Tuple[Optional[str], Optional[str]]:
        """Extract ``(beginning_balance, ending_balance)`` from OCR line data.

        Scans each line for balance keywords; amounts come from the same line
        or, for columnar layouts, from the next two lines aligned under the
        keyword. Either element is None when never found. The previous
        implementation duplicated this whole search for each balance type;
        both now share ``_find_balance``.
        """
        def _extract_balances():
            beginning_balance = None
            ending_balance = None

            for idx, line_obj in enumerate(object_line):
                # Up to two lines below the current one, for columnar layouts.
                following = object_line[idx + 1:idx + 3]

                if not beginning_balance:
                    beginning_balance = self._find_balance(self.PREVIOUS_KEYWORDS, line_obj, following)

                if not ending_balance:
                    ending_balance = self._find_balance(self.ENDING_KEYWORDS, line_obj, following)

                if beginning_balance and ending_balance:
                    break  # nothing left to find

            return beginning_balance, ending_balance

        # get_running_loop() replaces the deprecated get_event_loop() call.
        return await asyncio.get_running_loop().run_in_executor(None, _extract_balances)
src/extractor/table_extractor.py ADDED
@@ -0,0 +1,760 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import re
3
+ import pandas as pd
4
+ from typing import List, Dict, Any, Optional, Tuple
5
+ from src.config.config import settings
6
+
7
+
8
+ class TableExtractor:
9
+ """Async table extractor for processing transaction tables."""
10
+
11
    def __init__(self):
        # Common statement date formats: d/m/y, y/m/d, m/y, y/m and bare d/m
        # pairs, with '-' or '/' separators, bounded by word boundaries.
        self.date_pattern = re.compile(
            r"\b(?:"
            r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"
            r"|\d{2,4}[-/]\d{1,2}[-/]\d{1,2}"
            r"|\d{1,2}[-/]\d{2,4}"
            r"|\d{2,4}[-/]\d{1,2}"
            r"|\d{1,2}[-/]\d{1,2}"
            r")\b"
        )
        # Optionally signed amounts with a mandatory decimal part; allows
        # Western (1,234.56) and Indian (1,23,456.78) digit grouping, and a
        # trailing '-' used by some statements to mark debits.
        self.amount_pattern = re.compile(r'-?(?:\d{1,3}(?:,\d{2}){1,}(?:,\d{3})?|\d{1,3}(?:,\d{3})+|\d+)?\.\d{1,2}-?')
22
+
23
    async def __aenter__(self):
        """Enter the async context; no resources to acquire."""
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Exit the async context; nothing to release."""
        pass
28
+
29
+ def match_by_pattern(self, text: str, pattern) -> bool:
30
+ """Check if text matches a pattern."""
31
+ if pattern == self.amount_pattern and "-" not in text and len(text) > 6 and "," not in text:
32
+ return False
33
+ if pattern == self.amount_pattern and "-" in text and len(text) > 7 and "," not in text:
34
+ return False
35
+ return bool(pattern.fullmatch(text))
36
+
37
    def extract_by_pattern(self, text: str, pattern) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Search *text* for *pattern* and split around the first match.

        Returns ``(value, before, after)`` with the surrounding text stripped,
        or ``(None, None, None)`` when there is no match or the matched amount
        is rejected by the same over-long/no-separator heuristic used in
        ``match_by_pattern``.
        """
        match = pattern.search(text)
        if match:
            before = text[:match.start()].strip()
            value = match.group()
            after = text[match.end():].strip()
            # Heuristic: a long digit run without thousands separators is
            # probably not a monetary amount (e.g. a reference number).
            if pattern == self.amount_pattern and "-" not in value and len(value) > 6 and "," not in value:
                return None, None, None
            if pattern == self.amount_pattern and "-" in value and len(value) > 7 and "," not in value:
                return None, None, None
            return value, before, after
        return None, None, None
50
+
51
+ def repair_row_with_date_and_amount(self, header: List[str], row: List[str]) -> List[str]:
52
+ """Repair row data by extracting dates and amounts."""
53
+ result = row[:]
54
+ n = len(header)
55
+
56
+ for i, col in enumerate(header):
57
+ val = result[i].strip()
58
+
59
+ if col.lower() == "date":
60
+ date, left, right = self.extract_by_pattern(val, self.date_pattern)
61
+ if date:
62
+ result[i] = date
63
+ if left and i > 0 and header[i-1] != "date":
64
+ result[i-1] = (result[i-1] + " " + left).strip()
65
+ if right and i < n - 1 and header[i+1] != "date":
66
+ result[i+1] = (right + " " + result[i+1]).strip()
67
+ continue
68
+
69
+ # Check previous column's last word
70
+ if i > 0 and header[i-1] != "date":
71
+ left_val = result[i-1].strip()
72
+ tokens = left_val.split()
73
+ if tokens:
74
+ last_word = tokens[-1]
75
+ date_check, _, _ = self.extract_by_pattern(last_word, self.date_pattern)
76
+ if date_check:
77
+ result[i] = date_check + " " + result[i]
78
+ tokens.pop() # remove matched date
79
+ result[i-1] = " ".join(tokens)
80
+ again_date, again_left, again_right = self.extract_by_pattern(result[i], self.date_pattern)
81
+ if again_date:
82
+ result[i] = again_date
83
+ if again_left:
84
+ result[i-1] = (result[i-1] + " " + again_left).strip()
85
+ if again_right:
86
+ result[i+1] = (again_right + " " + result[i+1]).strip()
87
+ continue
88
+
89
+ # Check next column's first word
90
+ if i < n - 1 and header[i+1] != "date":
91
+ right_val = result[i+1].strip()
92
+ tokens = right_val.split()
93
+ if tokens:
94
+ first_word = tokens[0]
95
+ date_check, _, _ = self.extract_by_pattern(first_word, self.date_pattern)
96
+ if date_check:
97
+ result[i] = result[i] + " " + date_check
98
+ tokens.pop(0)
99
+ result[i+1] = " ".join(tokens)
100
+ again_date, again_left, again_right = self.extract_by_pattern(result[i], self.date_pattern)
101
+ if again_date:
102
+ result[i] = again_date
103
+ if again_left:
104
+ result[i-1] = (result[i-1] + " " + again_left).strip()
105
+ if again_right:
106
+ result[i+1] = (again_right + " " + result[i+1]).strip()
107
+ continue
108
+
109
+ # Check if the entire value is a date
110
+ if not self.match_by_pattern(result[i].strip(), self.date_pattern):
111
+ result[i] = ""
112
+ # check left
113
+ if i > 0 and header[i-1] != "date":
114
+ result[i-1] = (result[i-1] + " " + val).strip()
115
+ elif i < n - 1 and header[i+1] != "date":
116
+ result[i+1] = (val + " " + result[i+1]).strip()
117
+
118
+ elif col.lower() in ["amount", "balance", "credits", "debits"]:
119
+ amt, left, right = self.extract_by_pattern(val, self.amount_pattern)
120
+ if amt:
121
+ result[i] = amt
122
+ if left and i > 0:
123
+ result[i-1] = (result[i-1] + " " + left).strip()
124
+ if right and i < n - 1:
125
+ result[i+1] = (right + " " + result[i+1]).strip()
126
+ continue
127
+
128
+ # Check previous column's last word
129
+ if i > 0 and (header[i-1] not in ["amount", "balance", "credits", "debits"]):
130
+ left_val = result[i-1].strip()
131
+ tokens = left_val.split()
132
+ if tokens:
133
+ last_word = tokens[-1]
134
+ amt_check, _, _ = self.extract_by_pattern(last_word, self.amount_pattern)
135
+ if amt_check:
136
+ result[i] = amt_check + " " + result[i]
137
+ tokens.pop()
138
+ result[i-1] = " ".join(tokens)
139
+ again_amt, again_left, again_right = self.extract_by_pattern(result[i], self.amount_pattern)
140
+ if again_amt:
141
+ result[i] = again_amt
142
+ if again_left:
143
+ result[i-1] = (result[i-1] + " " + again_left).strip()
144
+ if again_right:
145
+ result[i+1] = (again_right + " " + result[i+1]).strip()
146
+ continue
147
+
148
+ # Check next column's first word
149
+ if i < n - 1 and (header[i+1] not in ["amount", "balance", "credits", "debits"]):
150
+ right_val = result[i+1].strip()
151
+ tokens = right_val.split()
152
+ if tokens:
153
+ first_word = tokens[0]
154
+ amt_check, _, _ = self.extract_by_pattern(first_word, self.amount_pattern)
155
+ if amt_check:
156
+ result[i] = result[i] + " " + amt_check
157
+ tokens.pop(0)
158
+ result[i+1] = " ".join(tokens)
159
+ again_amt, again_left, again_right = self.extract_by_pattern(result[i], self.amount_pattern)
160
+ if again_amt:
161
+ result[i] = again_amt
162
+ if again_left:
163
+ result[i-1] = (result[i-1] + " " + again_left).strip()
164
+ if again_right:
165
+ result[i+1] = (again_right + " " + result[i+1]).strip()
166
+ continue
167
+
168
+ # Check if the entire value is an amount
169
+ if not self.match_by_pattern(result[i].strip(), self.amount_pattern):
170
+ result[i] = ""
171
+ # check left
172
+ if i > 0 and (header[i-1] not in ["amount", "balance", "credits", "debits"]):
173
+ result[i-1] = (result[i-1] + " " + val).strip()
174
+ elif i < n - 1 and (header[i+1] not in ["amount", "balance", "credits", "debits"]):
175
+ result[i+1] = (val + " " + result[i+1]).strip()
176
+
177
+ return result
178
+
179
+ def extract_amount_or_return(self, line: str) -> str:
180
+ """Extract amount from line or return original line."""
181
+ matches = self.amount_pattern.findall(line)
182
+ if matches:
183
+ match = self.amount_pattern.search(line)
184
+ return match.group(0) if match else line
185
+ return line
186
+
187
+ def extract_date_or_return(self, line: str) -> str:
188
+ """Extract date from line or return original line."""
189
+ matches = self.date_pattern.findall(line)
190
+ if matches:
191
+ match = self.date_pattern.search(line)
192
+ return match.group(0) if match else line
193
+ return line
194
+
195
+ def is_date_word(self, word: str) -> bool:
196
+ """Check if word is a date."""
197
+ try:
198
+ return bool(self.date_pattern.fullmatch(word))
199
+ except ValueError:
200
+ return False
201
+
202
+ def detect_headers(self, line_data: Dict, gap_threshold_ratio: float = 0.1) -> List[str]:
203
+ """Detect headers from line data."""
204
+ if "description" not in line_data["line"]:
205
+ gap_threshold_ratio = 0.2
206
+ if "." in line_data["line"]:
207
+ gap_threshold_ratio = 0.1
208
+
209
+ word_data = sorted(line_data["words"], key=lambda w: w["bbox"][0])
210
+ line = line_data["line"]
211
+
212
+ if len(word_data) < 2:
213
+ return [line.strip()] # Treat whole line as one header if only 1 word
214
+
215
+ # Compute horizontal gaps between words
216
+ gaps = []
217
+ for i in range(len(word_data) - 1):
218
+ x1 = word_data[i]["bbox"][2] # end x of current word
219
+ x2 = word_data[i + 1]["bbox"][0] # start x of next word
220
+ gaps.append(x2 - x1)
221
+
222
+ avg_gap = sum(gaps) / len(gaps)
223
+ threshold = avg_gap * gap_threshold_ratio
224
+
225
+ # Split words into groups based on large gaps (assumed column breaks)
226
+ headers = []
227
+ current_header = [word_data[0]["word"]]
228
+ for i in range(1, len(word_data)):
229
+ gap = gaps[i - 1]
230
+ if gap > threshold:
231
+ headers.append(" ".join(current_header))
232
+ current_header = []
233
+ current_header.append(word_data[i]["word"])
234
+
235
+ if current_header:
236
+ headers.append(" ".join(current_header))
237
+
238
+ # Process special cases
239
+ for i in range(len(headers)):
240
+ if "date" in headers[i].lower() and "description" in headers[i].lower():
241
+ header_checker = headers[i].split(" ")
242
+ date_index = header_checker.index("date")
243
+ description_index = header_checker.index("description")
244
+ if date_index < description_index:
245
+ headers[i] = "date"
246
+ headers.insert(i + 1, "description")
247
+ else:
248
+ headers[i] = "description"
249
+ headers.insert(i + 1, "date")
250
+
251
+ # Handle check/draft numbers
252
+ if "check" in headers or "draft" in headers:
253
+ resulted_headers = []
254
+ i = 0
255
+
256
+ while i < len(headers):
257
+ if (
258
+ i + 1 < len(headers)
259
+ and headers[i] == "check"
260
+ and (headers[i + 1] == "no" or headers[i + 1] == "number")
261
+ ):
262
+ resulted_headers.append(headers[i] + " " + headers[i + 1])
263
+ i += 2
264
+ elif (
265
+ i + 1 < len(headers)
266
+ and headers[i] == "draft"
267
+ and (headers[i + 1] == "no" or headers[i + 1] == "number")
268
+ ):
269
+ resulted_headers.append(headers[i] + " " + headers[i + 1])
270
+ i += 2
271
+ else:
272
+ resulted_headers.append(headers[i])
273
+ i += 1
274
+
275
+ resulted_headers = list(map(lambda x: re.sub(r'[^\w\s]', '', x).strip(), resulted_headers))
276
+
277
+ # Normalize header names
278
+ for i in range(len(resulted_headers)):
279
+ if any(keyword in resulted_headers[i].lower() for keyword in ["date", "day", "month", "year"]):
280
+ resulted_headers[i] = "date"
281
+ if any(keyword in resulted_headers[i].lower() for keyword in ["amount", "total", "sum", "price", "value", "cost", "amt"]):
282
+ resulted_headers[i] = "amount"
283
+ if any(keyword in resulted_headers[i].lower() for keyword in ["balance", "final", "closing", "current", "available", "running", "remaining", "left", "bal", "remain"]):
284
+ resulted_headers[i] = "balance"
285
+ if any(keyword in resulted_headers[i].lower() for keyword in ["credit", "deposit", "cr"]):
286
+ resulted_headers[i] = "credits"
287
+ if any(keyword in resulted_headers[i].lower() for keyword in ["debit", "withdrawal", "dr"]):
288
+ resulted_headers[i] = "debits"
289
+
290
+ return resulted_headers
291
+
292
+ # Normalize header names
293
+ headers = list(map(lambda x: re.sub(r'[^\w\s]', '', x).strip(), headers))
294
+ for i in range(len(headers)):
295
+ if any(keyword in headers[i].lower() for keyword in ["date", "day", "month", "year"]):
296
+ headers[i] = "date"
297
+ if any(keyword in headers[i].lower() for keyword in ["amount", "total", "sum", "price", "value", "cost", "amt"]):
298
+ headers[i] = "amount"
299
+ if any(keyword in headers[i].lower() for keyword in ["balance", "final", "closing", "current", "available", "running", "remaining", "left", "bal", "remain"]):
300
+ headers[i] = "balance"
301
+ if any(keyword in headers[i].lower() for keyword in ["credit", "deposit"]):
302
+ headers[i] = "credits"
303
+ if any(keyword in headers[i].lower() for keyword in ["debit", "withdrawal"]):
304
+ headers[i] = "debits"
305
+
306
+ return headers
307
+
308
    def detect_row_data(self, headers: List[str], header_data: List[Dict], row_data: List[Dict], gap_threshold: int = 10) -> List[str]:
        """Assign a row's OCR words to columns using header x-positions.

        Header word boxes are merged into column x-ranges; row words are
        split into segments at horizontal gaps, and each segment is assigned
        to the column whose range contains (or is nearest to) the segment
        center.  The assembled row is then passed through
        ``repair_row_with_date_and_amount`` to fix misplaced dates/amounts.

        Args:
            headers: Normalized column names for the current table.
            header_data: Word dicts ("word", "bbox") of the header line.
            row_data: Word dicts of the data line being parsed.
            gap_threshold: Pixel gap treated as a column break (tightened
                to 5 when the table has no "description" column).

        Returns:
            One string value per header, stripped.

        NOTE(review): empty ``header_data`` or ``row_data`` would raise
        IndexError on the [0] accesses below -- confirm callers always pass
        non-empty lists.  Likewise, if more column ranges survive merging
        than there are headers, ``row_values[idx]`` can go out of range.
        """
        if "description" not in headers:
            gap_threshold = 5

        def flatten_bbox(bbox):
            # Accept both [[x0, y0], [x1, y1]] and flat [x0, y0, x1, y1].
            if isinstance(bbox[0], list):
                return [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]]
            return bbox

        # Step 1: Get all x0, x1 for header words.
        header_ranges = []
        for word in header_data:
            x0, _, x1, _ = flatten_bbox(word["bbox"])
            header_ranges.append((x0, x1))

        # Step 2: Sort by x0.
        header_ranges.sort(key=lambda x: x[0])

        # Step 3: Merge only close headers (preserve wide gaps) so multi-word
        # headers collapse into a single column range.
        merged_ranges = []
        temp_x0, temp_x1 = header_ranges[0]
        for x0, x1 in header_ranges[1:]:
            gap = x0 - temp_x1
            if gap < gap_threshold:
                temp_x1 = max(temp_x1, x1)
            else:
                merged_ranges.append((temp_x0, temp_x1))
                temp_x0, temp_x1 = x0, x1
        merged_ranges.append((temp_x0, temp_x1))

        # Step 4: Segment row_data wherever the horizontal gap between
        # consecutive words exceeds the threshold.
        row_data_sorted = sorted(row_data, key=lambda w: flatten_bbox(w["bbox"])[0])
        segments = []
        current_segment = [row_data_sorted[0]]
        for i in range(1, len(row_data_sorted)):
            prev_x1 = flatten_bbox(row_data_sorted[i - 1]["bbox"])[2]
            curr_x0 = flatten_bbox(row_data_sorted[i]["bbox"])[0]
            if curr_x0 - prev_x1 > gap_threshold:
                segments.append(current_segment)
                current_segment = [row_data_sorted[i]]
            else:
                current_segment.append(row_data_sorted[i])
        if current_segment:
            segments.append(current_segment)

        # Step 5: Assign each segment to the column whose x-range contains
        # the segment's center point.
        row_values = [""] * len(headers)
        for segment in segments:
            seg_x0 = flatten_bbox(segment[0]["bbox"])[0]
            seg_x1 = flatten_bbox(segment[-1]["bbox"])[2]
            seg_center = (seg_x0 + seg_x1) / 2
            seg_text = " ".join([w["word"] for w in segment])

            assigned = False
            for idx, (hx0, hx1) in enumerate(merged_ranges):
                if hx0 <= seg_center <= hx1:
                    row_values[idx] += seg_text + " "
                    assigned = True
                    break

            if not assigned:
                # Center fell outside every range: assign to the column whose
                # midpoint is nearest.
                nearest_idx = min(
                    range(len(merged_ranges)),
                    key=lambda idx: abs(
                        (merged_ranges[idx][0] + merged_ranges[idx][1]) / 2 - seg_center
                    ),
                )
                row_values[nearest_idx] += seg_text + " "

        # Let the repair pass pull stray dates/amounts into their columns.
        final_row = self.repair_row_with_date_and_amount(headers, row_values)
        return [val.strip() for val in final_row]
381
+
382
+ def check_table_tags(self, line: str, headers: List[str]) -> str:
383
+ """Check and return table tag based on line content and headers."""
384
+ available_tags = ["transaction", "deposit", "withdrawal", "checks", "daily balance", "drafts", "service fee", "interest"]
385
+ tag = ""
386
+
387
+ if "deposit" in line.lower() or "credit" in line.lower():
388
+ tag = "deposit"
389
+ elif "withdrawal" in line.lower() or "debit" in line.lower():
390
+ tag = "withdrawal"
391
+ elif "checks" in line.lower():
392
+ tag = "checks"
393
+ elif "drafts" in line.lower():
394
+ tag = "drafts"
395
+ elif "service fee" in line.lower() or "fee" in line.lower():
396
+ tag = "service fee"
397
+ elif "daily balance" in line.lower() or "balance" in line.lower():
398
+ tag = "daily balance"
399
+ elif "interest" in line.lower():
400
+ tag = "interest"
401
+ elif "transaction" in line.lower() or "transfer" in line.lower():
402
+ tag = "transaction"
403
+
404
+ if "credits" in headers or "debits" in headers:
405
+ tag = "transaction"
406
+
407
+ for h in headers:
408
+ if "check" in h.lower():
409
+ tag = "checks"
410
+ break
411
+
412
+ for h in headers:
413
+ if "draft" in h.lower():
414
+ tag = "drafts"
415
+ break
416
+
417
+ return tag
418
+
419
    async def process_transaction_tables_with_bbox(self, extracted_text_list: List[List[Dict]]) -> Tuple[List[pd.DataFrame], List[str]]:
        """Scan OCR line blocks for transaction tables and build DataFrames.

        A table opens at a header line containing "date" together with
        "description", "amount", or "balance".  Subsequent lines are parsed
        into columns via ``detect_row_data`` until a row fails date/amount
        validation, at which point the table is closed.  The line directly
        above each header is classified with ``check_table_tags`` to tag
        the table (deposit, withdrawal, checks, ...).

        Args:
            extracted_text_list: Blocks (e.g. pages) of line dicts, each
                with "line" text and "words" (word + bbox) entries.

        Returns:
            Tuple of (list of DataFrames, parallel list of table tags).

        NOTE(review): when a header is the very first line of a block, no
        tag is appended (the ``line_idx - 1 >= 0`` guard skips tagging), so
        tags and tables can drift out of sync -- confirm intended.
        """
        def _process_tables():
            all_tables = []
            table_tags = []

            for block in extracted_text_list:
                # Per-block parsing state.
                headers = []
                table_started = False
                current_table = []
                current_row = {}
                header_words = []

                for line_idx, line_bbox in enumerate(block):
                    line = line_bbox["line"]
                    line = line.strip()

                    # Case 1: table opens with a date + description header.
                    if not table_started and ("date" in line and "description" in line):
                        headers = self.detect_headers(line_bbox)
                        header_words = line_bbox["words"]
                        date_flag = False
                        description_flag = False
                        for header in headers:
                            if "date" in header.lower():
                                date_flag = True
                            if "description" in header.lower():
                                description_flag = True
                        if date_flag and description_flag:
                            table_started = True
                            current_row = {header: [] for header in headers}
                        else:
                            continue

                        # Tag the table from the line above the header; fall
                        # back to the previous tag, then to "transaction".
                        if line_idx - 1 >= 0:
                            prev_line = block[line_idx - 1]["line"]
                            tag = self.check_table_tags(prev_line, headers)
                            if tag:
                                table_tags.append(tag)
                            elif len(table_tags) > 0:
                                table_tags.append(table_tags[-1])
                            else:
                                table_tags.append("transaction")
                        continue

                    # Case 2: table opens with a date + amount/balance header.
                    elif (not table_started and ("date" in line and "amount" in line)) or (
                        not table_started and ("date" in line and "balance" in line)
                    ):
                        headers = self.detect_headers(line_bbox)
                        header_words = line_bbox["words"]
                        date_flag = False
                        amount_flag = False
                        balance_flag = False
                        for header in headers:
                            if "date" in header.lower():
                                date_flag = True
                            if "amount" in header.lower():
                                amount_flag = True
                            if "balance" in header.lower():
                                balance_flag = True
                        if date_flag and (amount_flag or balance_flag):
                            table_started = True
                            current_row = {header: [] for header in headers}
                        else:
                            continue

                        if line_idx - 1 >= 0:
                            prev_line = block[line_idx - 1]["line"]
                            tag = self.check_table_tags(prev_line, headers)
                            if tag:
                                table_tags.append(tag)
                            elif len(table_tags) > 0:
                                table_tags.append(table_tags[-1])
                            else:
                                table_tags.append("transaction")
                        continue

                    # Case 3: a new date + description header while a table is
                    # open -- flush accumulated rows and start a new table.
                    if table_started and ("date" in line and "description" in line):
                        max_len = max(len(v) for v in current_row.values())
                        for i in range(max_len):
                            row_map = {}
                            for key in current_row:
                                row_map[key] = (
                                    current_row[key][i] if i < len(current_row[key]) else ""
                                )
                            current_table.append(row_map)

                        df = pd.DataFrame(current_table)
                        all_tables.append(df)
                        current_table = []
                        headers = self.detect_headers(line_bbox)
                        header_words = line_bbox["words"]
                        date_flag = False
                        description_flag = False
                        for header in headers:
                            if "date" in header.lower():
                                date_flag = True
                            if "description" in header.lower():
                                description_flag = True
                        if date_flag and description_flag:
                            current_row = {header: [] for header in headers}
                        else:
                            continue

                        if line_idx - 1 >= 0:
                            prev_line = block[line_idx - 1]["line"]
                            tag = self.check_table_tags(prev_line, headers)
                            if tag:
                                table_tags.append(tag)
                            elif len(table_tags) > 0:
                                table_tags.append(table_tags[-1])
                            else:
                                table_tags.append("transaction")
                        continue

                    # Case 4: a new date + amount/balance header while a table
                    # is open -- flush and restart likewise.
                    elif (table_started and ("date" in line and "amount" in line)) or (
                        table_started and ("date" in line and "balance" in line)
                    ):
                        max_len = max(len(v) for v in current_row.values())
                        for i in range(max_len):
                            row_map = {}
                            for key in current_row:
                                row_map[key] = (
                                    current_row[key][i] if i < len(current_row[key]) else ""
                                )
                            current_table.append(row_map)

                        df = pd.DataFrame(current_table)
                        all_tables.append(df)
                        current_table = []
                        headers = self.detect_headers(line_bbox)
                        header_words = line_bbox["words"]
                        date_flag = False
                        amount_flag = False
                        balance_flag = False
                        for header in headers:
                            if "date" in header.lower():
                                date_flag = True
                            if "amount" in header.lower():
                                amount_flag = True
                            if "balance" in header.lower():
                                balance_flag = True
                        if date_flag and (amount_flag or balance_flag):
                            current_row = {header: [] for header in headers}
                        else:
                            continue

                        if line_idx - 1 >= 0:
                            prev_line = block[line_idx - 1]["line"]
                            tag = self.check_table_tags(prev_line, headers)
                            if tag:
                                table_tags.append(tag)
                            elif len(table_tags) > 0:
                                table_tags.append(table_tags[-1])
                            else:
                                table_tags.append("transaction")
                        continue

                    # Case 5: a data line inside an open table.
                    if table_started:
                        parts = self.detect_row_data(headers, header_words, line_bbox["words"])
                        for key, value in zip(headers, parts):
                            current_row[key].append(value)
                        max_len = max(len(v) for v in current_row.values())

                        # Normalize value cells to bare amounts/dates.
                        for i in range(max_len):
                            if (
                                "amount" in headers
                                and current_row["amount"]
                                and i < len(current_row["amount"])
                                and current_row["amount"][i]
                            ):
                                amount = self.extract_amount_or_return(current_row["amount"][i])
                                current_row["amount"][i] = amount
                            if (
                                "balance" in headers
                                and current_row["balance"]
                                and i < len(current_row["balance"])
                                and current_row["balance"][i]
                            ):
                                amount = self.extract_amount_or_return(current_row["balance"][i])
                                current_row["balance"][i] = amount
                            if (
                                "credits" in headers
                                and current_row["credits"]
                                and i < len(current_row["credits"])
                                and current_row["credits"][i]
                            ):
                                amount = self.extract_amount_or_return(current_row["credits"][i])
                                current_row["credits"][i] = amount
                            if (
                                "debits" in headers
                                and current_row["debits"]
                                and i < len(current_row["debits"])
                                and current_row["debits"][i]
                            ):
                                amount = self.extract_amount_or_return(current_row["debits"][i])
                                current_row["debits"][i] = amount
                            if (
                                "date" in headers
                                and current_row["date"]
                                and i < len(current_row["date"])
                                and current_row["date"][i]
                            ):
                                current_row["date"][i] = self.extract_date_or_return(
                                    current_row["date"][i]
                                )

                        # Sanity check: if the first cell of any key column is
                        # not a valid date/amount, assume the table has ended.
                        # Note "and" binds tighter than "or": the date clause
                        # is one term among the or'ed value clauses.
                        # NOTE(review): current_row["amount"][0] (and peers)
                        # are read without a length guard; an empty list here
                        # raises IndexError -- confirm detect_row_data always
                        # yields one part per header.
                        if (
                            "date" in headers
                            and current_row["date"]
                            and current_row["date"][0]
                            and not self.is_date_word(current_row["date"][0])
                            or (
                                "amount" in headers
                                and current_row["amount"][0]
                                and not self.amount_pattern.match(current_row["amount"][0])
                            )
                            or (
                                "balance" in headers
                                and current_row["balance"][0]
                                and not self.amount_pattern.match(current_row["balance"][0])
                            )
                            or (
                                "credits" in headers
                                and current_row["credits"][0]
                                and not self.amount_pattern.match(current_row["credits"][0])
                            )
                            or (
                                "debits" in headers
                                and current_row["debits"][0]
                                and not self.amount_pattern.match(current_row["debits"][0])
                            )
                        ):
                            # Close the table; drop its tag if no rows were
                            # actually collected for it.
                            if not current_table and len(table_tags) > 0 and table_tags[-1]:
                                table_tags.pop()
                            all_tables.append(pd.DataFrame(current_table))
                            current_table = []
                            current_row = {}
                            header_words = []
                            headers = []
                            table_started = False
                        else:
                            # Commit the parsed line(s) and reset the row
                            # accumulator for the next physical line.
                            for i in range(max_len):
                                row_map = {}
                                for key in current_row:
                                    row_map[key] = (
                                        current_row[key][i] if i < len(current_row[key]) else ""
                                    )
                                current_table.append(row_map)
                            current_row = {header: [] for header in headers}

                # Reset table state between blocks.
                table_started = False

                if current_table:
                    df = pd.DataFrame(current_table)
                    all_tables.append(df)

            return all_tables, table_tags

        return await asyncio.get_event_loop().run_in_executor(None, _process_tables)
678
+
679
+ async def process_tables(self, table: pd.DataFrame) -> pd.DataFrame:
680
+ """Process the extracted table to clean and format it."""
681
+ def _process_table():
682
+ keywords = ["continue", "continued", "page", "next page", "total", "subtotal"]
683
+ table_copy = table.copy()
684
+ is_balance_column = "balance" in table_copy.columns
685
+ is_amount_column = "amount" in table_copy.columns
686
+ is_credits_column = "credits" in table_copy.columns
687
+ is_debits_column = "debits" in table_copy.columns
688
+
689
+ for idx, row in table_copy.iterrows():
690
+ if is_balance_column:
691
+ if row["balance"] and not row["date"]:
692
+ table_copy.loc[idx] = [""] * len(table_copy.columns)
693
+ continue
694
+ if is_amount_column:
695
+ if row["amount"] and not row["date"]:
696
+ table_copy.loc[idx] = [""] * len(table_copy.columns)
697
+ continue
698
+ if is_credits_column:
699
+ if row["credits"] and not row["date"]:
700
+ table_copy.loc[idx] = [""] * len(table_copy.columns)
701
+ continue
702
+ if is_debits_column:
703
+ if row["debits"] and not row["date"]:
704
+ table_copy.loc[idx] = [""] * len(table_copy.columns)
705
+ continue
706
+ for cell in row:
707
+ if any(keyword in cell.lower() for keyword in keywords):
708
+ table_copy.loc[idx] = [""] * len(table_copy.columns)
709
+ break
710
+
711
+ df = table_copy.copy()
712
+ df = df.fillna("") # Fill NaNs with empty string for easier processing
713
+
714
+ # Step 1: Identify key columns (case-insensitive match)
715
+ lower_cols = [col.lower() for col in df.columns]
716
+ date_col = next((col for col in df.columns if re.search(r'date', col, re.IGNORECASE)), None)
717
+ value_cols = [col for col in df.columns if re.search(r'amount|balance|credits|debits', col, re.IGNORECASE)]
718
+
719
+ if not date_col or not value_cols:
720
+ return df
721
+
722
+ def is_anchor(row):
723
+ return bool(row[date_col].strip()) and any(row[col].strip() for col in value_cols)
724
+
725
+ # Step 2: Loop over rows and identify anchor indices
726
+ anchor_indices = [i for i, row in df.iterrows() if is_anchor(row)]
727
+
728
+ for anchor_idx in anchor_indices:
729
+ # Merge upward
730
+ i = anchor_idx - 1
731
+ while i >= 0:
732
+ if is_anchor(df.iloc[i]) or df.iloc[i].isnull().all() or all(df.iloc[i] == ""):
733
+ break
734
+ for col in df.columns:
735
+ if col != date_col and col not in value_cols:
736
+ df.at[anchor_idx, col] = (str(df.at[i, col]).strip() + " " + str(df.at[anchor_idx, col]).strip()).strip()
737
+ df.iloc[i] = "" # Blank the merged row
738
+ i -= 1
739
+
740
+ # Merge downward
741
+ i = anchor_idx + 1
742
+ while i < len(df):
743
+ if is_anchor(df.iloc[i]) or df.iloc[i].isnull().all() or all(df.iloc[i] == ""):
744
+ break
745
+ for col in df.columns:
746
+ if col != date_col and col not in value_cols:
747
+ df.at[anchor_idx, col] = (str(df.at[anchor_idx, col]).strip() + " " + str(df.at[i, col]).strip()).strip()
748
+ df.iloc[i] = "" # Blank the merged row
749
+ i += 1
750
+
751
+ df_copy = df.copy()
752
+ col = "balance" if "balance" in df_copy.columns else "amount"
753
+
754
+ for idx, row in df_copy.iterrows():
755
+ if not row[col] and not row[date_col]:
756
+ df_copy.loc[idx] = [""] * len(df_copy.columns)
757
+ df_copy = df_copy[~df_copy.apply(lambda row: all(cell == "" for cell in row), axis=1)].reset_index(drop=True)
758
+ return df_copy
759
+
760
+ return await asyncio.get_event_loop().run_in_executor(None, _process_table)
src/models/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .account_models import AccountSummary, AccountDetails, BankStatementData
2
+
3
+ __all__ = ["AccountSummary", "AccountDetails", "BankStatementData"]
src/models/account_models.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any, Optional
2
+ from pydantic import BaseModel, Field
3
+ from datetime import date
4
+
5
+
6
class AccountDetails(BaseModel):
    """Model for individual account details.

    Dates are kept as YYYY-MM-DD strings (not ``datetime.date``) so values
    can be passed straight through from LLM output without extra parsing.
    """
    account_name: str = Field(..., description="Name of the account")
    account_number: str = Field(..., description="Account number")
    starting_balance: float = Field(..., description="Starting balance of the account")
    ending_balance: float = Field(..., description="Ending balance of the account")
    statement_start_date: str = Field(..., description="Statement start date in YYYY-MM-DD format")
    statement_end_date: str = Field(..., description="Statement end date in YYYY-MM-DD format")


class AccountSummary(BaseModel):
    """Model for bank account summary extracted from LLM."""
    bank_name: str = Field(..., description="Name of the bank")
    account_holder: str = Field(..., description="Name of the account holder")
    accounts: List[AccountDetails] = Field(..., description="List of account details")


class BankStatementData(BaseModel):
    """Model for processed bank statement data."""
    # NOTE(review): values are typed str here -- confirm numeric summary
    # fields are stringified before this model is constructed.
    account_summary: Dict[str, str] = Field(..., description="Account summary information")
    transaction_tables: Dict[str, Any] = Field(..., description="Extracted transaction tables")


class WordData(BaseModel):
    """Model for a single OCR word with its bounding box."""
    word: str = Field(..., description="Extracted word text")
    bbox: List[float] = Field(..., description="Bounding box coordinates [x0, y0, x1, y1]")


class LineData(BaseModel):
    """Model for a line of text with its constituent words."""
    line: str = Field(..., description="Complete line text")
    # NOTE(review): described as [x, y] while WordData uses a 4-value box --
    # confirm the intended shape of a line-level bbox.
    bbox: List[float] = Field(..., description="Line bounding box [x, y]")
    words: List[WordData] = Field(..., description="List of words in the line")


class ExtractedTextData(BaseModel):
    """Model for extracted text data from a PDF."""
    pages: List[List[LineData]] = Field(..., description="List of pages, each containing lines")
src/ocr/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .pdf_processor import PDFProcessor
2
+ from .text_extractor import TextExtractor
3
+
4
+ __all__ = ["PDFProcessor", "TextExtractor"]
src/ocr/pdf_processor.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import fitz
3
+ import os
4
+ from typing import List, Dict, Any, Optional
5
+ import numpy as np
6
+ from pdf2image import convert_from_path
7
+ from doctr.models import ocr_predictor
8
+ from doctr.io import DocumentFile
9
+ import torch
10
+ from src.config.config import settings
11
+ from src.models.account_models import LineData, WordData
12
+ from src.utils import model_manager
13
+
14
+
15
class PDFProcessor:
    """Async PDF processor for handling both digital and scanned PDFs.

    Heavy models (doctr OCR) are owned by the shared ``model_manager``;
    this class only ensures they are loaded and exposes them as properties.
    """

    def __init__(self):
        # Use the centralized model manager so models are loaded once per
        # process, not once per PDFProcessor instance.
        self._ensure_models_loaded()

    def _ensure_models_loaded(self):
        """Ensure models are loaded via the model manager."""
        if not model_manager.models_loaded:
            print("πŸ”„ Models not loaded, initializing model manager...")
            # Accessing the property triggers lazy model loading.
            _ = model_manager.doctr_model

    @property
    def doctr_model(self):
        """Get the loaded doctr model from the model manager."""
        return model_manager.doctr_model

    @property
    def device(self):
        """Get the device being used from the model manager."""
        return model_manager.device

    async def __aenter__(self):
        """Enter the async context; no resources are acquired."""
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Exit the async context; nothing to release."""
        pass

    async def is_pdf_scanned(self, pdf_path: str) -> bool:
        """Check if PDF is scanned (i.e. has no extractable text layer).

        Returns True when no page yields non-whitespace text.
        """
        def _check_scanned():
            doc = fitz.open(pdf_path)
            # BUGFIX: close the document on every exit path; the original
            # leaked the fitz document handle on the early return.
            try:
                for page in doc:
                    text = page.get_text()
                    if text.strip():
                        return False
                return True
            finally:
                doc.close()

        return await asyncio.get_event_loop().run_in_executor(None, _check_scanned)

    async def save_uploaded_file(self, uploaded_file) -> str:
        """Save an uploaded file-like object to the configured temp path."""
        def _save_file():
            with open(settings.temp_file_name, "wb") as f:
                f.write(uploaded_file.read())
            return settings.temp_file_name

        return await asyncio.get_event_loop().run_in_executor(None, _save_file)

    async def extract_text_from_digital_pdf(self, pdf_path: str) -> List[List[str]]:
        """Extract text lines per page from a digital (non-scanned) PDF."""
        from PyPDF2 import PdfReader

        def _extract_text():
            reader = PdfReader(pdf_path)
            extracted_data = []

            for page in reader.pages:
                ptext = page.extract_text()
                if ptext:
                    data = []
                    for line in ptext.splitlines():
                        # Collapse lines that repeat the same content across
                        # wide gaps (two-up statement layouts).
                        cleaned_line = self._split_on_repeated_pattern(line.strip())
                        if cleaned_line:
                            data.append(cleaned_line[0])
                    extracted_data.append(data)

            return extracted_data

        return await asyncio.get_event_loop().run_in_executor(None, _extract_text)

    def _split_on_repeated_pattern(self, line: str, min_space: int = 10) -> List[str]:
        """Collapse a line whose content repeats across wide space gaps.

        Args:
            line: Raw text line.
            min_space: Minimum run of spaces considered a column gap.

        Returns:
            A one-element list: the first chunk when the line is a repeat
            (e.g. ``"foo bar        foo bar"`` -> ``["foo bar"]``),
            otherwise the original stripped line.
        """
        import re
        from difflib import SequenceMatcher

        original_line = line.strip()

        # Find all spans of `min_space` or more consecutive spaces.
        space_spans = [
            (m.start(), len(m.group()))
            for m in re.finditer(r" {%d,}" % min_space, original_line)
        ]

        if not space_spans:
            return [original_line]

        # Count how often each gap size occurs.
        gaps = [span[1] for span in space_spans]
        gap_counts = {}
        for g in gaps:
            gap_counts[g] = gap_counts.get(g, 0) + 1

        # Sort gaps by size x count (more dominant gaps first).
        sorted_gaps = sorted(gap_counts.items(), key=lambda x: x[1] * x[0], reverse=True)

        # No significant gaps, return original.
        if not sorted_gaps:
            return [original_line]

        dominant_gap = sorted_gaps[0][0]

        # Split on the dominant gap size.  CLEANUP: the original used
        # rf" {{%d,}}" % n -- an f-prefixed raw string combined with
        # %-formatting; r" {%d,}" % n produces the identical pattern.
        chunks = re.split(r" {%d,}" % dominant_gap, original_line)

        # Treat as repeated when a later chunk fuzzily matches the first.
        base = chunks[0].strip()
        repeated = False
        for chunk in chunks[1:]:
            chunk = chunk.strip()
            if chunk and SequenceMatcher(None, base, chunk).ratio() > 0.8:
                repeated = True
                break

        return [base] if repeated else [original_line]
src/ocr/text_extractor.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import fitz
3
+ import re
4
+ import numpy as np
5
+ from typing import List, Dict, Any, Optional
6
+ from pdf2image import convert_from_path
7
+ from src.config.config import settings
8
+ from src.models.account_models import LineData, WordData
9
+ from doctr.io import DocumentFile
10
+
11
+
12
class TextExtractor:
    """Async text extractor that returns per-page lines with bounding boxes.

    Two extraction paths are supported:
      * digital PDFs via PyMuPDF (``fitz``) word extraction, and
      * scanned PDFs via the injected doctr OCR model.

    Both paths produce the same structure: one entry per page, where each
    entry is a list of line dicts of the form
    ``{"line": str, "bbox": [x_start, y_start], "words": [{"word", "bbox"}]}``.
    """

    def __init__(self, doctr_model):
        # The OCR model is injected so the heavyweight weights are loaded
        # once (by the model manager) and shared across extractors.
        self.doctr_model = doctr_model

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # No owned resources; context-manager support is for API symmetry.
        pass

    def normalize_bbox(self, bbox, width: float, height: float) -> List[float]:
        """Normalize an absolute (x0, y0, x1, y1) box to the [0, 1] range."""
        x0, y0, x1, y1 = bbox
        return [
            round(x0 / width, 6),
            round(y0 / height, 6),
            round(x1 / width, 6),
            round(y1 / height, 6),
        ]

    def remove_consecutive_items(self, line: List[str]) -> List[str]:
        """Collapse runs of identical adjacent items (e.g. OCR double-reads)."""
        if not line:
            return line
        result = [line[0]]
        for item in line[1:]:
            if item != result[-1]:
                result.append(item)
        return result

    def remove_consecutive_words(self, word_data: List[Dict]) -> List[Dict]:
        """Collapse adjacent word dicts whose ``word`` text repeats."""
        if not word_data:
            return word_data
        result = [word_data[0]]
        for i in range(1, len(word_data)):
            if word_data[i]["word"] != result[-1]["word"]:
                result.append(word_data[i])
        return result

    def _flush_line(self, current_line, current_word_data, lines) -> None:
        """Finalize one visual line: order left-to-right, dedupe, record origin.

        ``current_line`` holds (x0, y0, word) tuples and ``current_word_data``
        the matching word dicts. Appends the assembled line dict to ``lines``.
        Factored out of the four duplicated flush sites in the original.
        """
        current_line.sort()
        clean_line = self.remove_consecutive_items([entry[2] for entry in current_line])
        clean_word_data = self.remove_consecutive_words(
            sorted(current_word_data, key=lambda w: w["bbox"][0])
        )
        if clean_line:
            lines.append({
                "line": " ".join(clean_line),
                "bbox": [
                    min(entry[0] for entry in current_line),
                    min(entry[1] for entry in current_line),
                ],
                "words": clean_word_data,
            })

    async def extract_lines_with_bbox(self, pdf_path: str, y_threshold: float = 3.0) -> List[List[LineData]]:
        """Extract lines with bounding boxes from a digital (text-layer) PDF.

        Words whose y-origins differ by less than ``y_threshold`` points are
        grouped into the same visual line. The blocking PyMuPDF work runs in
        a thread executor so it does not stall the event loop.
        """
        def _extract_lines():
            doc = fitz.open(pdf_path)
            page_lines_with_bbox = []

            for page in doc:
                # Each tuple: (x0, y0, x1, y1, word, block_no, line_no, word_no)
                words = page.get_text("words")
                # Reading order: top-to-bottom (rounded y), then left-to-right.
                words.sort(key=lambda w: (round(w[1], 1), w[0]))

                lines = []
                current_line = []
                current_word_data = []
                current_y = None

                for w in words:
                    x0, y0, x1, y1, word = w[:5]
                    # Drop separators and punctuation-only tokens.
                    if word == "|" or not word or word == "." or word == "#" or re.sub(r'[^\w\s]', '', word) == "":
                        continue
                    word = word.lower()
                    word_data = {"word": word.strip(), "bbox": (x0, y0, x1, y1)}

                    if current_y is None or abs(y0 - current_y) < y_threshold:
                        current_line.append((x0, y0, word))
                        current_word_data.append(word_data)
                        current_y = y0
                    else:
                        # y jumped past the threshold: flush and start a new line.
                        self._flush_line(current_line, current_word_data, lines)
                        current_line = [(x0, y0, word)]
                        current_word_data = [word_data]
                        current_y = y0

                # Flush the trailing line of the page.
                if current_line:
                    self._flush_line(current_line, current_word_data, lines)

                page_lines_with_bbox.append(lines)

            return page_lines_with_bbox

        return await asyncio.get_event_loop().run_in_executor(None, _extract_lines)

    async def extract_lines_with_bbox_from_scanned_pdf(self, pdf_path: str, y_threshold: float = 5.0, first_page: bool = False) -> List[List[LineData]]:
        """Extract lines with bounding boxes from a scanned PDF via OCR.

        When ``first_page`` is True, only page 1 is rasterized and OCR'd
        (used for account-summary extraction); otherwise every page is
        processed. OCR runs in a thread executor because it is compute-bound.
        """
        def _extract_from_scanned():
            if first_page:
                pages = convert_from_path(pdf_path, dpi=settings.dpi, first_page=1, last_page=1)
                first_page_img = pages[0].convert("RGB")
                # BUG FIX: keep the page image inside a list. The original
                # assigned doc = np.array(img), so doc[0] below was a single
                # pixel ROW of shape (W, 3) and img_width came out as 3 (the
                # channel count). As a list, doc[0] is the full (H, W, 3) page.
                doc = [np.array(first_page_img)]
                result = self.doctr_model(doc)
            else:
                doc = DocumentFile.from_pdf(pdf_path)
                result = self.doctr_model(doc)

            page_lines_with_bbox = []

            for page in result.pages:
                # doctr geometries are relative [0, 1]; scale back to pixels.
                # NOTE(review): dimensions of page 1 are used for every page —
                # assumes all pages share the first page's size; confirm.
                img_width, img_height = doc[0].shape[1], doc[0].shape[0]
                words = []

                for block in page.blocks:
                    for line in block.lines:
                        for word in line.words:
                            x0, y0 = word.geometry[0]
                            x1, y1 = word.geometry[1]
                            text = word.value.strip().lower()
                            # Strip markdown-ish noise characters OCR emits.
                            text = re.sub(r'[#*]', ' ', text)
                            text = text.strip()

                            # Drop separators and punctuation-only tokens.
                            if text == "|" or not text or text == "." or text == "#" or re.sub(r'[^\w\s]', '', text) == "":
                                continue
                            words.append({
                                "word": text,
                                "bbox": [x0 * img_width, y0 * img_height, x1 * img_width, y1 * img_height],
                            })

                # Reading order: top-to-bottom (rounded y), then left-to-right.
                words.sort(key=lambda w: (round(w["bbox"][1], 3), w["bbox"][0]))

                lines = []
                current_line = []
                current_word_data = []
                current_y = None

                for w in words:
                    y0 = w["bbox"][1]
                    if current_y is None or abs(y0 - current_y) < y_threshold:
                        current_line.append((w["bbox"][0], y0, w["word"]))
                        current_word_data.append(w)
                        current_y = y0
                    else:
                        # y jumped past the threshold: flush and start a new line.
                        self._flush_line(current_line, current_word_data, lines)
                        current_line = [(w["bbox"][0], y0, w["word"])]
                        current_word_data = [w]
                        current_y = y0

                # Flush the trailing line of the page.
                if current_line:
                    self._flush_line(current_line, current_word_data, lines)

                page_lines_with_bbox.append(lines)

            return page_lines_with_bbox

        return await asyncio.get_event_loop().run_in_executor(None, _extract_from_scanned)
src/services/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .bank_statement_service import BankStatementService
2
+
3
+ __all__ = ["BankStatementService"]
src/services/bank_statement_service.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import pandas as pd
4
+ from typing import List, Dict, Any, Optional, Tuple
5
+ from src.ocr import PDFProcessor, TextExtractor
6
+ from src.extractor import TableExtractor, AccountExtractor, BalanceExtractor
7
+ from src.utils import GroqClient
8
+ from src.models.account_models import BankStatementData
9
+ from src.config.config import settings
10
+
11
+
12
class BankStatementService:
    """Orchestrates end-to-end bank statement processing.

    Pipeline: save the upload -> detect scanned vs. digital -> extract lines
    (OCR or text layer) -> detect and clean transaction tables -> ask the LLM
    for account metadata from page 1 -> assemble a BankStatementData result.
    """

    def __init__(self):
        # Pipeline components; PDFProcessor also owns the shared doctr model.
        self.pdf_processor = PDFProcessor()
        self.table_extractor = TableExtractor()
        self.account_extractor = AccountExtractor()
        self.balance_extractor = BalanceExtractor()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # No owned resources; context-manager support is for API symmetry.
        pass

    async def process_bank_statement(self, uploaded_file) -> BankStatementData:
        """Process an uploaded bank statement and return structured data.

        Args:
            uploaded_file: File-like upload (e.g. a Streamlit UploadedFile).

        Returns:
            BankStatementData holding an account-summary dict and a mapping
            of table tag -> concatenated pandas DataFrame of transactions.

        Raises:
            json.JSONDecodeError: if the LLM reply is not valid JSON.
            KeyError / IndexError: if the LLM reply lacks expected fields.
        """
        pdf_path = await self.pdf_processor.save_uploaded_file(uploaded_file)
        pdf_scanned = await self.pdf_processor.is_pdf_scanned(pdf_path)

        # Choose the extraction path based on whether a text layer exists.
        text_extractor = TextExtractor(self.pdf_processor.doctr_model)
        if pdf_scanned:
            print(f"{pdf_path} is likely a scanned PDF.")
            extracted_text_list = await text_extractor.extract_lines_with_bbox_from_scanned_pdf(pdf_path)
        else:
            print(f"{pdf_path} is not a scanned PDF. Extracting text...")
            extracted_text_list = await text_extractor.extract_lines_with_bbox(pdf_path)

        # Locate raw transaction tables, then clean each one.
        pre_processed_tables, table_tags = await self.table_extractor.process_transaction_tables_with_bbox(extracted_text_list)
        processed_tables = []
        for table in pre_processed_tables:
            processed_tables.append(await self.table_extractor.process_tables(table))

        # Group the cleaned tables by tag, then merge same-tag tables into a
        # single DataFrame each.
        final_table_dic = {}
        for tag, processed_table in zip(table_tags, processed_tables):
            final_table_dic.setdefault(tag, []).append(processed_table)
        for tag, tables in final_table_dic.items():
            final_table_dic[tag] = pd.concat(tables, ignore_index=True)

        # Account metadata lives on page 1. Scanned PDFs were already OCR'd
        # in full; digital PDFs get a one-page OCR pass here — presumably
        # because the text layer can miss header artwork (bank logo/name);
        # TODO confirm this intent.
        if pdf_scanned:
            first_page = extracted_text_list
        else:
            first_page = await text_extractor.extract_lines_with_bbox_from_scanned_pdf(pdf_path, first_page=True)
        first_page_object = first_page[0]

        # BUG FIX: join lines with newlines. The original concatenated the
        # raw line strings back-to-back, fusing the last word of one line
        # with the first word of the next and degrading LLM extraction.
        starting_text = "\n".join(line_data["line"] for line_data in first_page_object)

        # Ask the LLM for structured account details (contract: strict JSON).
        async with GroqClient() as groq_client:
            bank_summary = await groq_client.extract_account_details(starting_text)
        bank_summary = json.loads(bank_summary)

        # Summarize the last account entry in the LLM's list.
        account = bank_summary["accounts"][-1]
        account_summary = {
            "Bank Name": bank_summary["bank_name"].upper(),
            "Account Number": account["account_number"],
            "Starting Balance": str(account["starting_balance"]),
            "Ending Balance": str(account["ending_balance"]),
            "Statement Start Date": account["statement_start_date"],
            "Statement End Date": account["statement_end_date"]
        }

        return BankStatementData(
            account_summary=account_summary,
            transaction_tables=final_table_dic
        )
src/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .api_clients import GroqClient, HuggingFaceClient
2
+ from .model_manager import ModelManager, model_manager
3
+
4
+ __all__ = ["GroqClient", "HuggingFaceClient", "ModelManager", "model_manager"]
src/utils/api_clients.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ from typing import Dict, Any, Optional
4
+ from openai import AsyncOpenAI
5
+ from huggingface_hub import AsyncInferenceClient
6
+ from src.config.config import settings
7
+
8
+
9
class GroqClient:
    """Async wrapper around Groq's OpenAI-compatible chat completion API."""

    def __init__(self):
        # Groq exposes an OpenAI-compatible endpoint, so the stock
        # AsyncOpenAI client is simply pointed at the Groq base URL.
        self.client = AsyncOpenAI(
            base_url=settings.groq_base_url,
            api_key=settings.groq_api_key,
        )

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # Release the client's underlying HTTP connection pool.
        await self.client.close()

    async def extract_account_details(self, text: str) -> str:
        """Ask the LLM to parse account metadata out of ``text``.

        Returns the raw model reply, which the system prompt instructs to
        be a strict JSON document (caller is responsible for parsing it).
        """
        system_prompt = """
        You are a financial document parser that extracts structured data from bank statements.

        Your task is to extract the following fields and return only valid JSON:

        - Starting balance can also be referred with "Balance last statement" or "Balance previous statement" in pdfs.
        - Ending balance can also be referred with "Balance this statement" in pdfs.

        {
        "bank_name": "string",
        "account_holder": "string",
        "accounts": [{
        "account_name": "string",
        "account_number": "string",
        "starting_balance": float,
        "ending_balance": float,
        "statement_start_date": "YYYY-MM-DD",
        "statement_end_date": "YYYY-MM-DD"
        }]
        }

        Guidelines:
        - Return strictly valid JSON (no markdown, comments, or extra explanation).
        - `starting_balance` and `ending_balance` must be `float` (no currency symbol).
        - Dates must follow the format `"YYYY-MM-DD"`.
        - Do not respond with anything other than the JSON object.
        - If multiple account are there then include all the account list in a list.
        """

        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ]
        completion = await self.client.chat.completions.create(
            model=settings.llm_model,
            messages=conversation,
        )
        return completion.choices[0].message.content
64
+
65
+
66
class HuggingFaceClient:
    """Async client for the HuggingFace Inference API.

    NOTE(review): ``extract_account_details`` is currently a stub that
    ignores its input and returns an empty summary; the prompt template is
    kept for the eventual real implementation.
    """

    # Prompt template for the future real implementation (mirrors the one
    # used by GroqClient). Hoisted to a class attribute so it is no longer
    # rebuilt — and discarded unused — on every call.
    _SYSTEM_PROMPT = """
    You are a financial document parser that extracts structured data from bank statements.

    Your task is to extract the following fields and return only valid JSON:

    - Starting balance can also be referred with "Balance last statement" or "Balance previous statement" in pdfs.
    - Ending balance can also be referred with "Balance this statement" in pdfs.

    {
    "bank_name": "string",
    "account_holder": "string",
    "accounts": [{
    "account_name": "string",
    "account_number": "string",
    "starting_balance": float,
    "ending_balance": float,
    "statement_start_date": "YYYY-MM-DD",
    "statement_end_date": "YYYY-MM-DD"
    }]
    }

    Guidelines:
    - Return strictly valid JSON (no markdown, comments, or extra explanation).
    - `starting_balance` and `ending_balance` must be `float` (no currency symbol).
    - Dates must follow the format `"YYYY-MM-DD"`.
    - Do not respond with anything other than the JSON object.
    - If multiple account are there then include all the account list in a list.
    """

    def __init__(self):
        self.client = AsyncInferenceClient(
            provider=settings.huggingface_provider,
            api_key=settings.huggingface_api_key,
        )

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        pass

    async def extract_account_details(self, text: str) -> str:
        """Extract account details using a HuggingFace model.

        Placeholder implementation: ``text`` is ignored and a fixed empty
        summary is returned. A real implementation should send
        ``_SYSTEM_PROMPT`` plus ``text`` to ``self.client``.
        """
        return '{"bank_name": "Unknown", "account_holder": "Unknown", "accounts": []}'
src/utils/model_manager.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import torch
3
+ from typing import Optional
4
+ from doctr.models import ocr_predictor
5
+ import spacy
6
+ from src.config.config import settings
7
+
8
+
9
class ModelManager:
    """Singleton model manager for pre-loading all models at startup.

    All state is class-level and shared: the first ``ModelManager()`` call
    synchronously loads the doctr OCR model and a spaCy NER pipeline;
    subsequent constructions are cheap no-ops guarded by ``_models_loaded``.
    """

    # Class-level (shared) singleton state.
    _instance = None        # the one ModelManager instance
    _doctr_model = None     # loaded doctr OCR predictor
    _spacy_model = None     # loaded spaCy pipeline; may remain None (see _load_models)
    _device = None          # torch.device actually selected
    _models_loaded = False  # guards against repeated loading

    def __new__(cls):
        # Classic singleton: always hand back the same instance.
        if cls._instance is None:
            cls._instance = super(ModelManager, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        # __init__ runs on every ModelManager() call; the flag ensures the
        # expensive synchronous load happens only once per process.
        if not self._models_loaded:
            self._load_models()

    def _load_models(self):
        """Load all models synchronously (device pick, doctr OCR, spaCy NER)."""
        print("πŸš€ Starting model pre-loading...")

        # Set device based on config; CUDA is used only when available and
        # not overridden by settings.force_cpu.
        if settings.force_cpu:
            self._device = torch.device("cpu")
            print("πŸ“± Using CPU (forced by config)")
        else:
            self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            print(f"πŸ“± Using device: {self._device}")

        # Load doctr model and move both sub-models (text detection and
        # text recognition) onto the chosen device.
        print("πŸ”„ Loading doctr OCR model...")
        self._doctr_model = ocr_predictor(pretrained=True)
        self._doctr_model.det_predictor.model = self._doctr_model.det_predictor.model.to(self._device)
        self._doctr_model.reco_predictor.model = self._doctr_model.reco_predictor.model.to(self._device)
        print("βœ… Doctr model loaded successfully!")

        # Load the configured spaCy model; on failure, try fallbacks.
        print(f"πŸ”„ Loading spaCy NER model: {settings.spacy_model_name}...")
        try:
            self._spacy_model = spacy.load(settings.spacy_model_name)
            print(f"βœ… spaCy model ({settings.spacy_model_name}) loaded successfully!")
        except OSError:
            # spacy.load raises OSError when the model package is missing.
            print(f"⚠️ spaCy model '{settings.spacy_model_name}' not found.")
            # Try fallback models
            fallback_models = ["en_core_web_sm", "en_core_web_trf"]
            for fallback_model in fallback_models:
                if fallback_model != settings.spacy_model_name:
                    try:
                        print(f"πŸ”„ Trying fallback model: {fallback_model}")
                        self._spacy_model = spacy.load(fallback_model)
                        print(f"βœ… spaCy model ({fallback_model}) loaded successfully!")
                        break
                    except OSError:
                        continue

            if self._spacy_model is None:
                print("⚠️ No spaCy model found. Please install with: python -m spacy download en_core_web_sm")

        # NOTE(review): the flag is set even when no spaCy model could be
        # loaded; callers needing NER should check get_model_status().
        self._models_loaded = True
        print("πŸŽ‰ All models loaded successfully!")

    @property
    def doctr_model(self):
        """Get the loaded doctr OCR model."""
        return self._doctr_model

    @property
    def spacy_model(self):
        """Get the loaded spaCy model (None if no pipeline was found)."""
        return self._spacy_model

    @property
    def device(self):
        """Get the torch device being used."""
        return self._device

    @property
    def models_loaded(self):
        """Check if models are loaded."""
        return self._models_loaded

    async def ensure_models_loaded(self):
        """Ensure models are loaded (async wrapper around _load_models).

        Off-loads the blocking load to the default thread executor so the
        event loop is not stalled. Always returns True.
        """
        if not self._models_loaded:
            await asyncio.get_event_loop().run_in_executor(None, self._load_models)
        return True

    def get_model_status(self):
        """Get a JSON-serializable status snapshot of all models."""
        return {
            "doctr_model": self._doctr_model is not None,
            "spacy_model": self._spacy_model is not None,
            "device": str(self._device),
            "models_loaded": self._models_loaded,
            "spacy_model_name": settings.spacy_model_name,
            "force_cpu": settings.force_cpu
        }
107
+
108
+
109
# Global model manager instance. NOTE: importing this module constructs
# ModelManager(), which synchronously loads the OCR and NER models as an
# import-time side effect.
model_manager = ModelManager()