File size: 37,230 Bytes
4e71548
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
import asyncio
import re
import pandas as pd
from typing import List, Dict, Any, Optional, Tuple
from src.config.config import settings


class TableExtractor:
    """Async table extractor for processing transaction tables."""
    
    def __init__(self):
        self.date_pattern = re.compile(
            r"\b(?:"
            r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"
            r"|\d{2,4}[-/]\d{1,2}[-/]\d{1,2}"
            r"|\d{1,2}[-/]\d{2,4}"
            r"|\d{2,4}[-/]\d{1,2}"
            r"|\d{1,2}[-/]\d{1,2}"
            r")\b"
        )
        self.amount_pattern = re.compile(r'-?(?:\d{1,3}(?:,\d{2}){1,}(?:,\d{3})?|\d{1,3}(?:,\d{3})+|\d+)?\.\d{1,2}-?')
    
    async def __aenter__(self):
        return self
    
    async def __aexit__(self, exc_type, exc_value, traceback):
        pass
    
    def match_by_pattern(self, text: str, pattern) -> bool:
        """Check if text matches a pattern."""
        if pattern == self.amount_pattern and "-" not in text and len(text) > 6 and "," not in text:
            return False
        if pattern == self.amount_pattern and "-" in text and len(text) > 7 and "," not in text:
            return False
        return bool(pattern.fullmatch(text))
    
    def extract_by_pattern(self, text: str, pattern) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Extract value by pattern and return value, before, after."""
        match = pattern.search(text)
        if match:
            before = text[:match.start()].strip()
            value = match.group()
            after = text[match.end():].strip()
            if pattern == self.amount_pattern and "-" not in value and len(value) > 6 and "," not in value:
                return None, None, None
            if pattern == self.amount_pattern and "-" in value and len(value) > 7 and "," not in value:
                return None, None, None
            return value, before, after
        return None, None, None
    
    def repair_row_with_date_and_amount(self, header: List[str], row: List[str]) -> List[str]:
        """Repair row data by extracting dates and amounts."""
        result = row[:]
        n = len(header)
        
        for i, col in enumerate(header):
            val = result[i].strip()
            
            if col.lower() == "date":
                date, left, right = self.extract_by_pattern(val, self.date_pattern)
                if date:
                    result[i] = date
                    if left and i > 0 and header[i-1] != "date":
                        result[i-1] = (result[i-1] + " " + left).strip()
                    if right and i < n - 1 and header[i+1] != "date":
                        result[i+1] = (right + " " + result[i+1]).strip()
                    continue
                
                # Check previous column's last word
                if i > 0 and header[i-1] != "date":
                    left_val = result[i-1].strip()
                    tokens = left_val.split()
                    if tokens:
                        last_word = tokens[-1]
                        date_check, _, _ = self.extract_by_pattern(last_word, self.date_pattern)
                        if date_check:
                            result[i] = date_check + " " + result[i]
                            tokens.pop()  # remove matched date
                            result[i-1] = " ".join(tokens)
                            again_date, again_left, again_right = self.extract_by_pattern(result[i], self.date_pattern)
                            if again_date:
                                result[i] = again_date
                                if again_left:
                                    result[i-1] = (result[i-1] + " " + again_left).strip()
                                if again_right:
                                    result[i+1] = (again_right + " " + result[i+1]).strip()
                            continue
                
                # Check next column's first word
                if i < n - 1 and header[i+1] != "date":
                    right_val = result[i+1].strip()
                    tokens = right_val.split()
                    if tokens:
                        first_word = tokens[0]
                        date_check, _, _ = self.extract_by_pattern(first_word, self.date_pattern)
                        if date_check:
                            result[i] = result[i] + " " + date_check
                            tokens.pop(0)
                            result[i+1] = " ".join(tokens)
                            again_date, again_left, again_right = self.extract_by_pattern(result[i], self.date_pattern)
                            if again_date:
                                result[i] = again_date
                                if again_left:
                                    result[i-1] = (result[i-1] + " " + again_left).strip()
                                if again_right:
                                    result[i+1] = (again_right + " " + result[i+1]).strip()
                            continue
                
                # Check if the entire value is a date
                if not self.match_by_pattern(result[i].strip(), self.date_pattern):
                    result[i] = ""
                    # check left
                    if i > 0 and header[i-1] != "date":
                        result[i-1] = (result[i-1] + " " + val).strip()
                    elif i < n - 1 and header[i+1] != "date":
                        result[i+1] = (val + " " + result[i+1]).strip()
            
            elif col.lower() in ["amount", "balance", "credits", "debits"]:
                amt, left, right = self.extract_by_pattern(val, self.amount_pattern)
                if amt:
                    result[i] = amt
                    if left and i > 0:
                        result[i-1] = (result[i-1] + " " + left).strip()
                    if right and i < n - 1:
                        result[i+1] = (right + " " + result[i+1]).strip()
                    continue
                
                # Check previous column's last word
                if i > 0 and (header[i-1] not in ["amount", "balance", "credits", "debits"]):
                    left_val = result[i-1].strip()
                    tokens = left_val.split()
                    if tokens:
                        last_word = tokens[-1]
                        amt_check, _, _ = self.extract_by_pattern(last_word, self.amount_pattern)
                        if amt_check:
                            result[i] = amt_check + " " + result[i]
                            tokens.pop()
                            result[i-1] = " ".join(tokens)
                            again_amt, again_left, again_right = self.extract_by_pattern(result[i], self.amount_pattern)
                            if again_amt:
                                result[i] = again_amt
                                if again_left:
                                    result[i-1] = (result[i-1] + " " + again_left).strip()
                                if again_right:
                                    result[i+1] = (again_right + " " + result[i+1]).strip()
                            continue
                
                # Check next column's first word
                if i < n - 1 and (header[i+1] not in ["amount", "balance", "credits", "debits"]):
                    right_val = result[i+1].strip()
                    tokens = right_val.split()
                    if tokens:
                        first_word = tokens[0]
                        amt_check, _, _ = self.extract_by_pattern(first_word, self.amount_pattern)
                        if amt_check:
                            result[i] = result[i] + " " + amt_check
                            tokens.pop(0)
                            result[i+1] = " ".join(tokens)
                            again_amt, again_left, again_right = self.extract_by_pattern(result[i], self.amount_pattern)
                            if again_amt:
                                result[i] = again_amt
                                if again_left:
                                    result[i-1] = (result[i-1] + " " + again_left).strip()
                                if again_right:
                                    result[i+1] = (again_right + " " + result[i+1]).strip()
                            continue
                
                # Check if the entire value is an amount
                if not self.match_by_pattern(result[i].strip(), self.amount_pattern):
                    result[i] = ""
                    # check left
                    if i > 0 and (header[i-1] not in ["amount", "balance", "credits", "debits"]):
                        result[i-1] = (result[i-1] + " " + val).strip()
                    elif i < n - 1 and (header[i+1] not in ["amount", "balance", "credits", "debits"]):
                        result[i+1] = (val + " " + result[i+1]).strip()
        
        return result
    
    def extract_amount_or_return(self, line: str) -> str:
        """Extract amount from line or return original line."""
        matches = self.amount_pattern.findall(line)
        if matches:
            match = self.amount_pattern.search(line)
            return match.group(0) if match else line
        return line
    
    def extract_date_or_return(self, line: str) -> str:
        """Extract date from line or return original line."""
        matches = self.date_pattern.findall(line)
        if matches:
            match = self.date_pattern.search(line)
            return match.group(0) if match else line
        return line
    
    def is_date_word(self, word: str) -> bool:
        """Check if word is a date."""
        try:
            return bool(self.date_pattern.fullmatch(word))
        except ValueError:
            return False
    
    def detect_headers(self, line_data: Dict, gap_threshold_ratio: float = 0.1) -> List[str]:
        """Detect headers from line data."""
        if "description" not in line_data["line"]:
            gap_threshold_ratio = 0.2
        if "." in line_data["line"]:
            gap_threshold_ratio = 0.1
        
        word_data = sorted(line_data["words"], key=lambda w: w["bbox"][0])
        line = line_data["line"]
        
        if len(word_data) < 2:
            return [line.strip()]  # Treat whole line as one header if only 1 word
        
        # Compute horizontal gaps between words
        gaps = []
        for i in range(len(word_data) - 1):
            x1 = word_data[i]["bbox"][2]  # end x of current word
            x2 = word_data[i + 1]["bbox"][0]  # start x of next word
            gaps.append(x2 - x1)
        
        avg_gap = sum(gaps) / len(gaps)
        threshold = avg_gap * gap_threshold_ratio
        
        # Split words into groups based on large gaps (assumed column breaks)
        headers = []
        current_header = [word_data[0]["word"]]
        for i in range(1, len(word_data)):
            gap = gaps[i - 1]
            if gap > threshold:
                headers.append(" ".join(current_header))
                current_header = []
            current_header.append(word_data[i]["word"])
        
        if current_header:
            headers.append(" ".join(current_header))
        
        # Process special cases
        for i in range(len(headers)):
            if "date" in headers[i].lower() and "description" in headers[i].lower():
                header_checker = headers[i].split(" ")
                date_index = header_checker.index("date")
                description_index = header_checker.index("description")
                if date_index < description_index:
                    headers[i] = "date"
                    headers.insert(i + 1, "description")
                else:
                    headers[i] = "description"
                    headers.insert(i + 1, "date")
        
        # Handle check/draft numbers
        if "check" in headers or "draft" in headers:
            resulted_headers = []
            i = 0
            
            while i < len(headers):
                if (
                    i + 1 < len(headers)
                    and headers[i] == "check"
                    and (headers[i + 1] == "no" or headers[i + 1] == "number")
                ):
                    resulted_headers.append(headers[i] + " " + headers[i + 1])
                    i += 2
                elif (
                    i + 1 < len(headers)
                    and headers[i] == "draft"
                    and (headers[i + 1] == "no" or headers[i + 1] == "number")
                ):
                    resulted_headers.append(headers[i] + " " + headers[i + 1])
                    i += 2
                else:
                    resulted_headers.append(headers[i])
                    i += 1
            
            resulted_headers = list(map(lambda x: re.sub(r'[^\w\s]', '', x).strip(), resulted_headers))
            
            # Normalize header names
            for i in range(len(resulted_headers)):
                if any(keyword in resulted_headers[i].lower() for keyword in ["date", "day", "month", "year"]):
                    resulted_headers[i] = "date"
                if any(keyword in resulted_headers[i].lower() for keyword in ["amount", "total", "sum", "price", "value", "cost", "amt"]):
                    resulted_headers[i] = "amount"
                if any(keyword in resulted_headers[i].lower() for keyword in ["balance", "final", "closing", "current", "available", "running", "remaining", "left", "bal", "remain"]):
                    resulted_headers[i] = "balance"
                if any(keyword in resulted_headers[i].lower() for keyword in ["credit", "deposit", "cr"]):
                    resulted_headers[i] = "credits"
                if any(keyword in resulted_headers[i].lower() for keyword in ["debit", "withdrawal", "dr"]):
                    resulted_headers[i] = "debits"
            
            return resulted_headers
        
        # Normalize header names
        headers = list(map(lambda x: re.sub(r'[^\w\s]', '', x).strip(), headers))
        for i in range(len(headers)):
            if any(keyword in headers[i].lower() for keyword in ["date", "day", "month", "year"]):
                headers[i] = "date"
            if any(keyword in headers[i].lower() for keyword in ["amount", "total", "sum", "price", "value", "cost", "amt"]):
                headers[i] = "amount"
            if any(keyword in headers[i].lower() for keyword in ["balance", "final", "closing", "current", "available", "running", "remaining", "left", "bal", "remain"]):
                headers[i] = "balance"
            if any(keyword in headers[i].lower() for keyword in ["credit", "deposit"]):
                headers[i] = "credits"
            if any(keyword in headers[i].lower() for keyword in ["debit", "withdrawal"]):
                headers[i] = "debits"
        
        return headers
    
    def detect_row_data(self, headers: List[str], header_data: List[Dict], row_data: List[Dict], gap_threshold: int = 10) -> List[str]:
        """Detect row data based on headers and word positions."""
        if "description" not in headers:
            gap_threshold = 5
        
        def flatten_bbox(bbox):
            if isinstance(bbox[0], list):  # [[x0, y0], [x1, y1]]
                return [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]]
            return bbox
        
        # Step 1: Get all x0, x1 for header words
        header_ranges = []
        for word in header_data:
            x0, _, x1, _ = flatten_bbox(word["bbox"])
            header_ranges.append((x0, x1))
        
        # Step 2: Sort by x0
        header_ranges.sort(key=lambda x: x[0])
        
        # Step 3: Merge only close headers (preserve wide gaps)
        merged_ranges = []
        temp_x0, temp_x1 = header_ranges[0]
        for x0, x1 in header_ranges[1:]:
            gap = x0 - temp_x1
            if gap < gap_threshold:
                temp_x1 = max(temp_x1, x1)
            else:
                merged_ranges.append((temp_x0, temp_x1))
                temp_x0, temp_x1 = x0, x1
        merged_ranges.append((temp_x0, temp_x1))
        
        # Step 4: Segment row_data based on horizontal gaps
        row_data_sorted = sorted(row_data, key=lambda w: flatten_bbox(w["bbox"])[0])
        segments = []
        current_segment = [row_data_sorted[0]]
        for i in range(1, len(row_data_sorted)):
            prev_x1 = flatten_bbox(row_data_sorted[i - 1]["bbox"])[2]
            curr_x0 = flatten_bbox(row_data_sorted[i]["bbox"])[0]
            if curr_x0 - prev_x1 > gap_threshold:
                segments.append(current_segment)
                current_segment = [row_data_sorted[i]]
            else:
                current_segment.append(row_data_sorted[i])
        if current_segment:
            segments.append(current_segment)
        
        # Step 5: Assign each segment to a column
        row_values = [""] * len(headers)
        for segment in segments:
            seg_x0 = flatten_bbox(segment[0]["bbox"])[0]
            seg_x1 = flatten_bbox(segment[-1]["bbox"])[2]
            seg_center = (seg_x0 + seg_x1) / 2
            seg_text = " ".join([w["word"] for w in segment])
            
            assigned = False
            for idx, (hx0, hx1) in enumerate(merged_ranges):
                if hx0 <= seg_center <= hx1:
                    row_values[idx] += seg_text + " "
                    assigned = True
                    break
            
            if not assigned:
                # Optionally assign to nearest column if center is outside range
                nearest_idx = min(
                    range(len(merged_ranges)),
                    key=lambda idx: abs(
                        (merged_ranges[idx][0] + merged_ranges[idx][1]) / 2 - seg_center
                    ),
                )
                row_values[nearest_idx] += seg_text + " "
        
        final_row = self.repair_row_with_date_and_amount(headers, row_values)
        return [val.strip() for val in final_row]
    
    def check_table_tags(self, line: str, headers: List[str]) -> str:
        """Check and return table tag based on line content and headers."""
        available_tags = ["transaction", "deposit", "withdrawal", "checks", "daily balance", "drafts", "service fee", "interest"]
        tag = ""
        
        if "deposit" in line.lower() or "credit" in line.lower():
            tag = "deposit"
        elif "withdrawal" in line.lower() or "debit" in line.lower():
            tag = "withdrawal"
        elif "checks" in line.lower():
            tag = "checks"
        elif "drafts" in line.lower():
            tag = "drafts"
        elif "service fee" in line.lower() or "fee" in line.lower():
            tag = "service fee"
        elif "daily balance" in line.lower() or "balance" in line.lower():
            tag = "daily balance"
        elif "interest" in line.lower():
            tag = "interest"
        elif "transaction" in line.lower() or "transfer" in line.lower():
            tag = "transaction"
        
        if "credits" in headers or "debits" in headers:
            tag = "transaction"
        
        for h in headers:
            if "check" in h.lower():
                tag = "checks"
                break
        
        for h in headers:
            if "draft" in h.lower():
                tag = "drafts"
                break
        
        return tag
    
    async def process_transaction_tables_with_bbox(self, extracted_text_list: List[List[Dict]]) -> Tuple[List[pd.DataFrame], List[str]]:
        """Process transaction tables with bounding box data."""
        def _process_tables():
            all_tables = []
            table_tags = []
            
            for block in extracted_text_list:
                headers = []
                table_started = False
                current_table = []
                current_row = {}
                header_words = []
                
                for line_idx, line_bbox in enumerate(block):
                    line = line_bbox["line"]
                    line = line.strip()
                    
                    if not table_started and ("date" in line and "description" in line):
                        headers = self.detect_headers(line_bbox)
                        header_words = line_bbox["words"]
                        date_flag = False
                        description_flag = False
                        for header in headers:
                            if "date" in header.lower():
                                date_flag = True
                            if "description" in header.lower():
                                description_flag = True
                        if date_flag and description_flag:
                            table_started = True
                            current_row = {header: [] for header in headers}
                        else:
                            continue
                        
                        if line_idx - 1 >= 0:
                            prev_line = block[line_idx - 1]["line"]
                            tag = self.check_table_tags(prev_line, headers)
                            if tag:
                                table_tags.append(tag)
                            elif len(table_tags) > 0:
                                table_tags.append(table_tags[-1])
                            else:
                                table_tags.append("transaction")
                        continue
                    
                    elif (not table_started and ("date" in line and "amount" in line)) or (
                        not table_started and ("date" in line and "balance" in line)
                    ):
                        headers = self.detect_headers(line_bbox)
                        header_words = line_bbox["words"]
                        date_flag = False
                        amount_flag = False
                        balance_flag = False
                        for header in headers:
                            if "date" in header.lower():
                                date_flag = True
                            if "amount" in header.lower():
                                amount_flag = True
                            if "balance" in header.lower():
                                balance_flag = True
                        if date_flag and (amount_flag or balance_flag):
                            table_started = True
                            current_row = {header: [] for header in headers}
                        else:
                            continue
                        
                        if line_idx - 1 >= 0:
                            prev_line = block[line_idx - 1]["line"]
                            tag = self.check_table_tags(prev_line, headers)
                            if tag:
                                table_tags.append(tag)
                            elif len(table_tags) > 0:
                                table_tags.append(table_tags[-1])
                            else:
                                table_tags.append("transaction")
                        continue
                    
                    if table_started and ("date" in line and "description" in line):
                        max_len = max(len(v) for v in current_row.values())
                        for i in range(max_len):
                            row_map = {}
                            for key in current_row:
                                row_map[key] = (
                                    current_row[key][i] if i < len(current_row[key]) else ""
                                )
                            current_table.append(row_map)
                        
                        df = pd.DataFrame(current_table)
                        all_tables.append(df)
                        current_table = []
                        headers = self.detect_headers(line_bbox)
                        header_words = line_bbox["words"]
                        date_flag = False
                        description_flag = False
                        for header in headers:
                            if "date" in header.lower():
                                date_flag = True
                            if "description" in header.lower():
                                description_flag = True
                        if date_flag and description_flag:
                            current_row = {header: [] for header in headers}
                        else:
                            continue
                        
                        if line_idx - 1 >= 0:
                            prev_line = block[line_idx - 1]["line"]
                            tag = self.check_table_tags(prev_line, headers)
                            if tag:
                                table_tags.append(tag)
                            elif len(table_tags) > 0:
                                table_tags.append(table_tags[-1])
                            else:
                                table_tags.append("transaction")
                        continue
                    
                    elif (table_started and ("date" in line and "amount" in line)) or (
                        table_started and ("date" in line and "balance" in line)
                    ):
                        max_len = max(len(v) for v in current_row.values())
                        for i in range(max_len):
                            row_map = {}
                            for key in current_row:
                                row_map[key] = (
                                    current_row[key][i] if i < len(current_row[key]) else ""
                                )
                            current_table.append(row_map)
                        
                        df = pd.DataFrame(current_table)
                        all_tables.append(df)
                        current_table = []
                        headers = self.detect_headers(line_bbox)
                        header_words = line_bbox["words"]
                        date_flag = False
                        amount_flag = False
                        balance_flag = False
                        for header in headers:
                            if "date" in header.lower():
                                date_flag = True
                            if "amount" in header.lower():
                                amount_flag = True
                            if "balance" in header.lower():
                                balance_flag = True
                        if date_flag and (amount_flag or balance_flag):
                            current_row = {header: [] for header in headers}
                        else:
                            continue
                        
                        if line_idx - 1 >= 0:
                            prev_line = block[line_idx - 1]["line"]
                            tag = self.check_table_tags(prev_line, headers)
                            if tag:
                                table_tags.append(tag)
                            elif len(table_tags) > 0:
                                table_tags.append(table_tags[-1])
                            else:
                                table_tags.append("transaction")
                        continue
                    
                    if table_started:
                        parts = self.detect_row_data(headers, header_words, line_bbox["words"])
                        for key, value in zip(headers, parts):
                            current_row[key].append(value)
                        max_len = max(len(v) for v in current_row.values())
                        
                        for i in range(max_len):
                            if (
                                "amount" in headers
                                and current_row["amount"]
                                and i < len(current_row["amount"])
                                and current_row["amount"][i]
                            ):
                                amount = self.extract_amount_or_return(current_row["amount"][i])
                                current_row["amount"][i] = amount
                            if (
                                "balance" in headers
                                and current_row["balance"]
                                and i < len(current_row["balance"])
                                and current_row["balance"][i]
                            ):
                                amount = self.extract_amount_or_return(current_row["balance"][i])
                                current_row["balance"][i] = amount
                            if (
                                "credits" in headers
                                and current_row["credits"]
                                and i < len(current_row["credits"])
                                and current_row["credits"][i]
                            ):
                                amount = self.extract_amount_or_return(current_row["credits"][i])
                                current_row["credits"][i] = amount
                            if (
                                "debits" in headers
                                and current_row["debits"]
                                and i < len(current_row["debits"])
                                and current_row["debits"][i]
                            ):
                                amount = self.extract_amount_or_return(current_row["debits"][i])
                                current_row["debits"][i] = amount
                            if (
                                "date" in headers
                                and current_row["date"]
                                and i < len(current_row["date"])
                                and current_row["date"][i]
                            ):
                                current_row["date"][i] = self.extract_date_or_return(
                                    current_row["date"][i]
                                )
                        
                        if (
                            "date" in headers
                            and current_row["date"]
                            and current_row["date"][0]
                            and not self.is_date_word(current_row["date"][0])
                            or (
                                "amount" in headers
                                and current_row["amount"][0]
                                and not self.amount_pattern.match(current_row["amount"][0])
                            )
                            or (
                                "balance" in headers
                                and current_row["balance"][0]
                                and not self.amount_pattern.match(current_row["balance"][0])
                            )
                            or (
                                "credits" in headers
                                and current_row["credits"][0]
                                and not self.amount_pattern.match(current_row["credits"][0])
                            )
                            or (
                                "debits" in headers
                                and current_row["debits"][0]
                                and not self.amount_pattern.match(current_row["debits"][0])
                            )
                        ):
                            if not current_table and len(table_tags) > 0 and table_tags[-1]:
                                table_tags.pop()
                            all_tables.append(pd.DataFrame(current_table))
                            current_table = []
                            current_row = {}
                            header_words = []
                            headers = []
                            table_started = False
                        else:
                            for i in range(max_len):
                                row_map = {}
                                for key in current_row:
                                    row_map[key] = (
                                        current_row[key][i] if i < len(current_row[key]) else ""
                                    )
                                current_table.append(row_map)
                            current_row = {header: [] for header in headers}
                
                table_started = False
                
                if current_table:
                    df = pd.DataFrame(current_table)
                    all_tables.append(df)
            
            return all_tables, table_tags
        
        return await asyncio.get_event_loop().run_in_executor(None, _process_tables)
    
    async def process_tables(self, table: pd.DataFrame) -> pd.DataFrame:
        """Process the extracted table to clean and format it."""
        def _process_table():
            keywords = ["continue", "continued", "page", "next page", "total", "subtotal"]
            table_copy = table.copy()
            is_balance_column = "balance" in table_copy.columns
            is_amount_column = "amount" in table_copy.columns
            is_credits_column = "credits" in table_copy.columns
            is_debits_column = "debits" in table_copy.columns
            
            for idx, row in table_copy.iterrows():
                if is_balance_column:
                    if row["balance"] and not row["date"]:
                        table_copy.loc[idx] = [""] * len(table_copy.columns)
                        continue
                if is_amount_column:
                    if row["amount"] and not row["date"]:
                        table_copy.loc[idx] = [""] * len(table_copy.columns)
                        continue
                if is_credits_column:
                    if row["credits"] and not row["date"]:
                        table_copy.loc[idx] = [""] * len(table_copy.columns)
                        continue
                if is_debits_column:
                    if row["debits"] and not row["date"]:
                        table_copy.loc[idx] = [""] * len(table_copy.columns)
                        continue
                for cell in row:
                    if any(keyword in cell.lower() for keyword in keywords):
                        table_copy.loc[idx] = [""] * len(table_copy.columns)
                        break
            
            df = table_copy.copy()
            df = df.fillna("")  # Fill NaNs with empty string for easier processing
            
            # Step 1: Identify key columns (case-insensitive match)
            lower_cols = [col.lower() for col in df.columns]
            date_col = next((col for col in df.columns if re.search(r'date', col, re.IGNORECASE)), None)
            value_cols = [col for col in df.columns if re.search(r'amount|balance|credits|debits', col, re.IGNORECASE)]
            
            if not date_col or not value_cols:
                return df
            
            def is_anchor(row):
                return bool(row[date_col].strip()) and any(row[col].strip() for col in value_cols)
            
            # Step 2: Loop over rows and identify anchor indices
            anchor_indices = [i for i, row in df.iterrows() if is_anchor(row)]
            
            for anchor_idx in anchor_indices:
                # Merge upward
                i = anchor_idx - 1
                while i >= 0:
                    if is_anchor(df.iloc[i]) or df.iloc[i].isnull().all() or all(df.iloc[i] == ""):
                        break
                    for col in df.columns:
                        if col != date_col and col not in value_cols:
                            df.at[anchor_idx, col] = (str(df.at[i, col]).strip() + " " + str(df.at[anchor_idx, col]).strip()).strip()
                    df.iloc[i] = ""  # Blank the merged row
                    i -= 1
                
                # Merge downward
                i = anchor_idx + 1
                while i < len(df):
                    if is_anchor(df.iloc[i]) or df.iloc[i].isnull().all() or all(df.iloc[i] == ""):
                        break
                    for col in df.columns:
                        if col != date_col and col not in value_cols:
                            df.at[anchor_idx, col] = (str(df.at[anchor_idx, col]).strip() + " " + str(df.at[i, col]).strip()).strip()
                    df.iloc[i] = ""  # Blank the merged row
                    i += 1
            
            df_copy = df.copy()
            col = "balance" if "balance" in df_copy.columns else "amount"
            
            for idx, row in df_copy.iterrows():
                if not row[col] and not row[date_col]:
                    df_copy.loc[idx] = [""] * len(df_copy.columns)
            df_copy = df_copy[~df_copy.apply(lambda row: all(cell == "" for cell in row), axis=1)].reset_index(drop=True)
            return df_copy
        
        return await asyncio.get_event_loop().run_in_executor(None, _process_table)