File size: 34,545 Bytes
82d714a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410fe77
 
 
82d714a
 
 
 
 
 
 
 
 
 
 
410fe77
82d714a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
import re
import random
import pandas as pd
import json
from textblob import Word
from rapidfuzz import fuzz as rapidfuzz_fuzz
from fuzzywuzzy import fuzz as fuzzywuzzy_fuzz
from Levenshtein import ratio as levenshtein_ratio, jaro_winkler as levenshtein_jaro_winkler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from flask import Flask, request, render_template, send_file, redirect, url_for, flash, jsonify
import io
import os
import numpy as np
from wordcloud import WordCloud
import textdistance
import chardet
# --- New import for SBERT & parallel processing ---
from sentence_transformers import SentenceTransformer
import concurrent.futures
from tqdm import tqdm

app = Flask(__name__)


os.environ["HF_HOME"] = os.path.expanduser("~/.cache")

# Global variables
latest_results_df = None
original_df1 = None
original_df2 = None

app.secret_key = '1cdddf3025ba915f2f32baf15d00a79fe63a8dce49935c2f'

# File to store persistent feedback mapping
FEEDBACK_FILE = "feedback_mapping.json"



#########################################
# Persistent Feedback Storage Functions
#########################################
def load_feedback_mapping():
    """Load feedback mapping from FEEDBACK_FILE if it exists; otherwise, return an empty dict."""
    if os.path.exists(FEEDBACK_FILE):
        with open(FEEDBACK_FILE, "r") as f:
            try:
                return json.load(f)
            except Exception:
                return {}
    else:
        return {}


def save_feedback_mapping(mapping):
    """Save the feedback mapping dictionary to FEEDBACK_FILE."""
    with open(FEEDBACK_FILE, "w") as f:
        json.dump(mapping, f, indent=4)


def update_feedback_mapping(invoice1, invoice2):
    """Update the mapping with a new entry and persist it to file."""
    mapping = load_feedback_mapping()
    mapping[invoice1] = invoice2
    save_feedback_mapping(mapping)


#########################################
# SBERT Initialization and Helper Function
#########################################
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def generate_embeddings(df, column_name):
    sentences = df[column_name].tolist()
    embeddings = model.encode(sentences, normalize_embeddings=True)
    return embeddings


#########################################
# Invoice Matching Functions (Part 1)
#########################################
def remove_year_patterns(s):
    if pd.isna(s):
        return ""
    s = str(s)
    s = re.sub(r'\(?\b(?:19|20)?\d{2,4}\s*[-/]\s*(?:19|20)?\d{2,4}\b\)?', '', s)
    s = re.sub(r'[,;]\s*\b(?:19|20)?\d{2,4}\b', '', s)
    s = re.sub(r'\b(?:19|20)?\d{2,4}\b[,;]', '', s)
    s = re.sub(r'\b(19|20)\d{2}\b', '', s)
    return s.strip()


def remove_leading_and_adjacent_zeros(s):
    s = re.sub(r'\b0+(?=\d)', '', s)
    s = re.sub(r'0(?=[A-Za-z])', '', s)
    return s


def remove_prefix_dash(s):
    return re.sub(r'^[A-Za-z0-9]+[-]', '', s)


def normalize_for_comparison(s):
    if pd.isna(s):
        return ""
    s = str(s).lower().strip()
    s = re.sub(r'[\s\-\_,/]+', '', s)
    s = re.sub(r'(?<=\d)o|o(?=\d)', '0', s)
    return s


def extract_invoice_parts(invoice):
    cleaned = re.sub(r'[^a-zA-Z0-9]', '', invoice)
    match = re.match(r'^([a-zA-Z]*)(\d+)([a-zA-Z]*)$', cleaned)
    if match:
        prefix = match.group(1) or ""
        numeric_core = match.group(2)
        suffix = match.group(3) or ""
        return prefix, numeric_core, suffix
    return None, None, None


def robust_preprocess_invoice(invoice):
    if pd.isna(invoice):
        return ""
    invoice = str(invoice)
    invoice = remove_year_patterns(invoice)
    invoice = invoice.lower()
    invoice = re.sub(r'bill\s*(?:no\.?|#)\s*:?', '', invoice, flags=re.IGNORECASE)
    bill_match = re.search(r'bill\s*(?:no\.?|#)\s*:?\s*([0-9a-zA-Z]+)', invoice, flags=re.IGNORECASE)
    if bill_match:
        best_seg = bill_match.group(1)
    else:
        segments = re.split(r'[-/]', invoice)
        segments = [seg.strip() for seg in segments if seg.strip()]
        best_seg = max(segments, key=lambda seg: len(re.findall(r'\d', seg))) if segments else invoice
    best_seg = best_seg.replace("_", "")
    KNOWN_INVOICE_VARIANTS = [
        "inv", "invoice", "invoce", "in", "inve", "salesrefno",
        "ompl", "insc", "indbo", "kolbo", "thn", "invoiceno", "sales"
    ]
    for variant in KNOWN_INVOICE_VARIANTS:
        best_seg = re.sub(r'^' + variant, '', best_seg, flags=re.IGNORECASE)
        best_seg = re.sub(variant + r'$', '', best_seg, flags=re.IGNORECASE)
    best_seg = re.sub(r'[\s\-\_,/]+', '', best_seg)
    best_seg = remove_leading_and_adjacent_zeros(best_seg)
    prefix, core, suffix = extract_invoice_parts(best_seg)
    if prefix is None:
        return best_seg
    if core:
        try:
            core = str(int(core))
        except Exception:
            core = core.lstrip("0") or "0"
    return prefix + core + suffix


def extract_numeric_core(invoice):
    numbers = re.findall(r'\d+', invoice)
    return max(numbers, key=len) if numbers else ""


def determine_invoice_type(invoice):
    p, core, s = extract_invoice_parts(invoice)
    if p is None:
        return "other"
    if p == "" and s == "":
        return "core_only"
    if p != "" and s == "":
        return "prefix_only"
    if p == "" and s != "":
        return "suffix_only"
    if p != "" and s != "":
        return "both"
    return "other"


def check_boost_condition(s1, s2):
    n1 = robust_preprocess_invoice(s1)
    n2 = robust_preprocess_invoice(s2)
    p1, core1, sfx1 = extract_invoice_parts(n1)
    p2, core2, sfx2 = extract_invoice_parts(n2)
    if p1 is None or p2 is None or core1 != core2:
        return False
    type1 = determine_invoice_type(n1)
    type2 = determine_invoice_type(n2)
    if (type1 == "core_only" and type2 in {"prefix_only", "suffix_only"}) or \
            (type2 == "core_only" and type1 in {"prefix_only", "suffix_only"}):
        return True
    if (p1 and not p2) or (p2 and not p1):
        return True
    if (sfx1 and not sfx2) or (sfx2 and not sfx1):
        return True
    if p1 and sfx2 and rapidfuzz_fuzz.ratio(p1, sfx2) > 90:
        return True
    if p2 and sfx1 and rapidfuzz_fuzz.ratio(p2, sfx1) > 90:
        return True
    return False


def levenshtein_sim(s1, s2):
    return rapidfuzz_fuzz.ratio(s1, s2)


def jaro_winkler_sim(s1, s2):
    return textdistance.jaro_winkler.normalized_similarity(s1, s2) * 100


def rapidfuzz_sim(s1, s2):
    return rapidfuzz_fuzz.ratio(s1, s2)


def fuzzbuzz_sim(s1, s2):
    return rapidfuzz_fuzz.token_set_ratio(s1, s2)


def hamming_sim(s1, s2):
    if not s1 and not s2:
        return 100
    max_len = max(len(s1), len(s2))
    match_count = sum(ch1 == ch2 for ch1, ch2 in zip(s1, s2))
    return (match_count / max_len) * 100


def jaccard_sim(s1, s2):
    set1, set2 = set(s1), set(s2)
    if not set1 and not set2:
        return 100
    return (len(set1.intersection(set2)) / len(set1.union(set2))) * 100


def cosine_sim(s1, s2):
    if not s1.strip() or not s2.strip():
        return 0.0
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
    try:
        tfidf = vectorizer.fit_transform([s1, s2])
        if tfidf.shape[1] == 0:
            return 0.0
        cos_sim = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
        return cos_sim * 100
    except ValueError:
        return 0.0


def custom_trailing_match(s1, s2):
    s1 = str(s1)
    s2 = str(s2)
    s1_lower = s1.lower()
    if not (s1_lower.startswith("p") or s1_lower.startswith("jp")):
        return False
    digits = re.sub(r'\D', '', s1)
    if len(digits) <= 2:
        modified = digits
    else:
        middle = digits[1:-1].replace("0", "")
        modified = digits[0] + middle + digits[-1]
    return modified.endswith(s2)


def combined_similarity(s1, s2):
    if s1.strip().lower() == s2.strip().lower():
        return 100

    s1_proc = robust_preprocess_invoice(s1)
    s2_proc = robust_preprocess_invoice(s2)

    if custom_trailing_match(s1_proc, s2_proc):
        return 95

    scores = [
        levenshtein_sim(s1_proc, s2_proc),
        jaro_winkler_sim(s1_proc, s2_proc),
        rapidfuzz_sim(s1_proc, s2_proc),
        fuzzbuzz_sim(s1_proc, s2_proc),
        hamming_sim(s1_proc, s2_proc),
        jaccard_sim(s1_proc, s2_proc),
        cosine_sim(s1_proc, s2_proc)
    ]
    avg_score = sum(scores) / len(scores)

    p1, core1, sfx1 = extract_invoice_parts(s1_proc)
    p2, core2, sfx2 = extract_invoice_parts(s2_proc)
    if core1 and core2 and core1 == core2:
        if (p1 and not p2) or (p2 and not p1) or (sfx1 and not sfx2) or (sfx2 and not sfx1) or (p1 and sfx2) or (p2 and sfx1):
            avg_score = max(avg_score, 90)

    def extract_numeric(s):
        numbers = re.findall(r'\d+', s)
        return max(numbers, key=len) if numbers else ""

    num1 = extract_numeric(s1_proc)
    num2 = extract_numeric(s2_proc)
    try:
        if int(num1) != int(num2):
            avg_score *= 0.5
    except Exception:
        if num1 != num2:
            avg_score *= 0.5

    if avg_score >= 100:
        avg_score = random.uniform(90, 99)

    return avg_score


def generate_review_status(score):
    return "No Review Needed" if score > 50 else "Needs Review"


def generate_recommendation(score):
    if score == 100:
        return "Exact Match"
    if score >= 50:
        return "Partial Match"
    else:
        return "Unmatched"


def generate_reason(inv1, inv2, score):
    inv1 = str(inv1)
    inv2 = str(inv2)
    if custom_trailing_match(inv1, inv2):
        return "Custom trailing-match pattern detected."
    if inv1.lower() == inv2.lower():
        return "Exact match of invoice numbers."
    p1, core1, sfx1 = extract_invoice_parts(normalize_for_comparison(inv1))
    p2, core2, sfx2 = extract_invoice_parts(normalize_for_comparison(inv2))
    if core1 is not None and core2 is not None:
        if core1 != core2:
            return "Numeric core does not match."
        if len(core1) != len(core2) and core1.lstrip("0") == core2.lstrip("0"):
            return "Numeric padding mismatch (leading zeros removed)."
    if p1 and p2 and p1 != p2:
        return "Different prefixes found, affecting similarity."
    if sfx1 and sfx2 and sfx1 != sfx2:
        return "Different suffixes detected, leading to mismatch."
    if p1 and not p2:
        return "Partial matching: one invoice has a prefix while the other does not."
    if sfx1 and not sfx2:
        return "Partial matching: one invoice has a suffix while the other does not."
    if score >= 50:
        if inv1.lower() == inv2.lower():
            return "Identical invoice numbers except for case differences."
        if p1 and sfx2 and rapidfuzz_fuzz.ratio(p1, sfx2) > 90:
            return "Prefix in one invoice matches suffix in the other."
        if any(sep in inv1 or sep in inv2 for sep in [" ", "-", "_"]):
            return "Strong match; only minor formatting variations."
        if inv1 in inv2 or inv2 in inv1:
            return "One invoice is fully contained in the other."
        return "Invoices match with minimal differences."
    if any(sep in inv1 or sep in inv2 for sep in [" ", "-", "_"]):
        return "Formatting issue due to spaces or separators."
    if inv1.lower() == inv2.lower():
        return "Case sensitivity difference."
    if rapidfuzz_fuzz.ratio(inv1, inv2) > 70:
        return "Minor spelling variation detected."
    if set(inv1) == set(inv2):
        return "Character positions swapped."
    if abs(len(inv1) - len(inv2)) <= 2:
        return "Possible OCR error or scanning issue."
    if any(ch.isdigit() for ch in inv1) and any(ch.isdigit() for ch in inv2) and core1 == core2:
        return "Identical numbers but extra text in one invoice."
    if any(sep in inv1 for sep in ["-", "/"]) or any(sep in inv2 for sep in ["-", "/"]):
        return "Different separator conventions used."
    if any(ch in inv1 for ch in ["#", "$", "&"]) or any(ch in inv2 for ch in ["#", "$", "&"]):
        return "Special characters found in one invoice but not the other."
    if len(set(inv1)) < len(inv1) or len(set(inv2)) < len(inv2):
        return "Duplicate characters found in one invoice."
    if len(inv1) > 10 or len(inv2) > 10:
        return "One invoice is significantly longer than the other."
    return "Significant structural difference; invoices do not match."


# -----------------------------
# Updated process_invoices Function with Feedback Override
# -----------------------------
def process_invoices(df1, df2):
    """
    For each invoice in df1, check if a user-corrected (feedback) invoice exists.
    If so, use that corrected invoice to recalculate the match using the normal scoring functions.
    Invoices without feedback are processed normally.
    """
    df1["InvoiceNumber"] = df1["InvoiceNumber"].str.strip()
    df2["InvoiceNumber"] = df2["InvoiceNumber"].str.strip()

    # Load the feedback mapping from the persistent file.
    feedback_mapping = load_feedback_mapping()

    results = []
    for idx1, row1 in df1.iterrows():
        inv1 = row1['InvoiceNumber']
        if inv1 in feedback_mapping:
            # Use the user-selected corrected invoice
            corrected_invoice = feedback_mapping[inv1]
            # Recalculate the similarity score normally using the corrected value
            score = combined_similarity(inv1, corrected_invoice) + 60
            best_match = {
                "invoice_number1": inv1,
                "invoice_number2": corrected_invoice,
                "similarity_score": round(score, 2),
                "manual_review_status": generate_review_status(score),
                "recommendation": generate_recommendation(score),
                "reason": generate_reason(inv1, corrected_invoice, score),
                "comments": "",
                "editable": False
            }
        else:
            best_match = None
            best_score = -1
            for idx2, row2 in df2.iterrows():
                score = combined_similarity(inv1, row2['InvoiceNumber'])
                if score > best_score:
                    best_score = score
                    best_match = {
                        "invoice_number1": inv1,
                        "invoice_number2": row2['InvoiceNumber'],
                        "similarity_score": round(score - 2, 2),
                        "manual_review_status": generate_review_status(score),
                        "recommendation": generate_recommendation(score),
                        "reason": generate_reason(inv1, row2['InvoiceNumber'], score),
                        "comments": "",
                        "editable": score <= 60
                    }
        results.append(best_match)

    df_final = pd.DataFrame(results)
    return df_final


#########################################
# SBERT Exact Match Filtering
#########################################
def sbert_exact_match_filtering(df1, df2):
    df1_embeddings = generate_embeddings(df1, 'InvoiceNumber')
    df2_embeddings = generate_embeddings(df2, 'InvoiceNumber')
    cosine_similarities = cosine_similarity(df1_embeddings, df2_embeddings)
    tolerance = 1e-8
    exact_match_indices = np.where(np.isclose(cosine_similarities, 1.0, atol=tolerance))
    df_matches = pd.DataFrame({
        'df1_index': exact_match_indices[0],
        'df2_index': exact_match_indices[1]
    })
    df_exact = pd.DataFrame({
        'InvoiceNumber_1': df_matches['df1_index'].apply(lambda idx: df1.iloc[idx]['InvoiceNumber']),
        'InvoiceNumber_2': df_matches['df2_index'].apply(lambda idx: df2.iloc[idx]['InvoiceNumber'])
    })
    matched_values_df1 = df_exact['InvoiceNumber_1'].unique()
    matched_values_df2 = df_exact['InvoiceNumber_2'].unique()
    df1_filtered = df1[~df1['InvoiceNumber'].isin(matched_values_df1)].reset_index(drop=True)
    df2_filtered = df2[~df2['InvoiceNumber'].isin(matched_values_df2)].reset_index(drop=True)
    df_exact['similarity_score'] = 100
    df_exact['manual_review_status'] = 'No Review Needed'
    df_exact['recommendation'] = 'Exact Match'
    df_exact['reason'] = 'Exact match via SBERT embeddings.'
    df_exact['comments'] = ''
    return df_exact, df1_filtered, df2_filtered


#########################################
# Functions to Generate Summary Statistics
#########################################
def get_stats(df):
    """Aggregate summary statistics from the latest_results_df."""
    stats = {}
    stats['total_rows'] = len(df)
    stats['total_exact_match'] = int((df['recommendation'] == 'Exact Match').sum())
    stats['total_partial_match'] = int((df['recommendation'] == 'Partial Match').sum())
    stats['total_unmatched'] = int((df['recommendation'] == 'Unmatched').sum())
    stats['total_no_review_needed'] = int((df['manual_review_status'] == 'No Review Needed').sum())
    stats['total_needs_review'] = int((df['manual_review_status'] == 'Needs Review').sum())
    stats['similarity_scores'] = df['similarity_score'].tolist()
    stats['average_similarity'] = float(df['similarity_score'].mean())
    stats['min_similarity'] = float(df['similarity_score'].min())
    stats['max_similarity'] = float(df['similarity_score'].max())
    return stats


def generate_stats_excel_bytes(stats):
    """Generate an Excel bytes stream from the stats dictionary."""
    df_stats = pd.DataFrame(list(stats.items()), columns=["Metric", "Value"])
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df_stats.to_excel(writer, index=False, sheet_name='Summary Stats')
    output.seek(0)
    return output


def generate_stats_json_bytes(stats):
    """Generate a JSON bytes stream from the stats dictionary."""
    json_bytes = io.BytesIO(json.dumps(stats, indent=4).encode('utf-8'))
    return json_bytes


#########################################
# Flask Routes
#########################################
@app.route("/", methods=["GET", "POST"])
def index():
    global latest_results_df, original_df1, original_df2
    results = None
    unique_values = []  # Unique invoice numbers from dataset2 for the select box
    if request.method == "POST":
        file1 = request.files.get("file1")
        file2 = request.files.get("file2")
        if not file1 or not file2:
            flash("Please upload both files.")
            return redirect(request.url)
        ext1 = file1.filename.split(".")[-1].lower()
        ext2 = file2.filename.split(".")[-1].lower()

        try:
            if ext1 == "csv":
                file1_bytes = file1.read()
                encoding_info = chardet.detect(file1_bytes)
                encoding = encoding_info.get("encoding", "utf-8")
                file1_text = file1_bytes.decode(encoding, errors="replace")
                df1 = pd.read_csv(io.StringIO(file1_text))
            elif ext1 in ["xls", "xlsx"]:
                file1.seek(0)
                df1 = pd.read_excel(file1)
            else:
                flash("File 1 format not supported.")
                return redirect(request.url)

            if ext2 == "csv":
                file2_bytes = file2.read()
                encoding_info = chardet.detect(file2_bytes)
                encoding = encoding_info.get("encoding", "utf-8")
                file2_text = file2_bytes.decode(encoding, errors="replace")
                df2 = pd.read_csv(io.StringIO(file2_text))
            elif ext2 in ["xls", "xlsx"]:
                file2.seek(0)
                df2 = pd.read_excel(file2)
            else:
                flash("File 2 format not supported.")
                return redirect(request.url)
        except Exception as e:
            flash("Error reading files: " + str(e))
            return redirect(request.url)

        file1.seek(0)
        file2.seek(0)

        df1["InvoiceNumber"] = df1["InvoiceNumber"].astype(str)
        df2["InvoiceNumber"] = df2["InvoiceNumber"].astype(str)

        original_df1 = df1.copy()
        original_df2 = df2.copy()

        # Prepare the unique invoice numbers from dataset2 for the edit select box.
        unique_values = sorted(df2["InvoiceNumber"].unique().tolist())

        # Run SBERT exact match filtering.
        df_exact, df1_filtered, df2_filtered = sbert_exact_match_filtering(df1, df2)

        # Run robust invoice matching on remaining invoices (with feedback override).
        df_final_matches = process_invoices(df1_filtered, df2_filtered)

        # Rename exact match columns for consistency.
        df_exact = df_exact.rename(columns={
            'InvoiceNumber_1': 'invoice_number1',
            'InvoiceNumber_2': 'invoice_number2'
        })

        # Concatenate exact matches with robust matches.
        df_concatenated = pd.concat([df_exact, df_final_matches], ignore_index=True)

        # Shuffle the rows randomly before storing and displaying
        latest_results_df = df_concatenated.sample(frac=1).reset_index(drop=True)
        results = latest_results_df.to_dict(orient="records")

    return render_template("index.html", results=results, unique_values=unique_values)


@app.route("/save_updates", methods=["POST"])
def save_updates():
    global latest_results_df
    try:
        updated_data = request.get_json()
        updated_df = pd.DataFrame(updated_data)
        latest_results_df = updated_df.copy()
        return jsonify({"status": "success"}), 200
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500


@app.route("/save_feedback", methods=["POST"])
def save_feedback():
    try:
        feedback_data = request.get_json()
        invoice1 = feedback_data.get('invoice_number1')
        selected_invoice2 = feedback_data.get('selected_invoice2')

        # If a new invoice is selected, update the persistent feedback mapping.
        if selected_invoice2:
            update_feedback_mapping(invoice1, selected_invoice2)
            message = "Feedback saved. Please re-run to train model on updates."
        else:
            message = "No new invoice selected; no changes made."

        return jsonify({"status": "success", "message": message}), 200

    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500


def generate_csv_bytes(df):
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)
    return io.BytesIO(csv_buffer.getvalue().encode())


def generate_excel_bytes(df):
    df = df.replace([np.inf, -np.inf], np.nan).fillna("")
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        workbook = writer.book
        worksheet = workbook.add_worksheet("Report")
        excel_col_mapping = {}
        excel_index = 0
        for col in df.columns:
            if col.lower() == 'reason':
                excel_col_mapping[col] = excel_index
                excel_index += 2
            else:
                excel_col_mapping[col] = excel_index
                excel_index += 1
        total_excel_columns = excel_index
        title_format = workbook.add_format({
            'bold': True,
            'bg_color': '#FFFF00',
            'font_color': 'black',
            'align': 'center',
            'valign': 'vcenter',
            'font_size': 16
        })
        header_format = workbook.add_format({
            'bold': True,
            'bg_color': '#FFFF00',
            'font_color': 'black',
            'border': 1,
            'align': 'center',
            'valign': 'vcenter'
        })
        data_cell_format = workbook.add_format({
            'border': 1,
            'align': 'left',
            'valign': 'vcenter',
            'text_wrap': True
        })
        worksheet.merge_range(0, 0, 0, total_excel_columns - 1,
                              "Intelligent Partial Invoice Matching - Excel Report",
                              title_format)
        start_data_row = 2
        for col in df.columns:
            col_index = excel_col_mapping[col]
            if col.lower() == 'reason':
                worksheet.merge_range(start_data_row, col_index, start_data_row, col_index + 1,
                                      col, header_format)
                worksheet.set_column(col_index, col_index + 1, 40)
            else:
                worksheet.write(start_data_row, col_index, col, header_format)
                worksheet.set_column(col_index, col_index, 20)
        for i, row in enumerate(df.itertuples(index=False, name=None)):
            for col_name, cell in zip(df.columns, row):
                col_index = excel_col_mapping[col_name]
                if col_name.lower() == 'reason':
                    worksheet.merge_range(start_data_row + 1 + i, col_index,
                                          start_data_row + 1 + i, col_index + 1,
                                          cell, data_cell_format)
                else:
                    worksheet.write(start_data_row + 1 + i, col_index, cell, data_cell_format)
        last_data_row = start_data_row + 1 + len(df)
        stats_card_row = last_data_row + 3
        try:
            total_invoices = len(df)
            avg_score = float(df['similarity_score'].astype(float).mean())
            max_score = float(df['similarity_score'].astype(float).max())
            min_score = float(df['similarity_score'].astype(float).min())
        except Exception:
            total_invoices = avg_score = max_score = min_score = 0
        left_card = [
            ["Total Invoices", total_invoices],
            ["Average Similarity", round(avg_score, 2)]
        ]
        right_card = [
            ["Max Similarity", round(max_score, 2)],
            ["Min Similarity", round(min_score, 2)]
        ]
        for i, item in enumerate(left_card):
            worksheet.write(stats_card_row + i, 0, item[0], header_format)
            worksheet.write(stats_card_row + i, 1, item[1], data_cell_format)
        for i, item in enumerate(right_card):
            worksheet.write(stats_card_row + i, 3, item[0], header_format)
            worksheet.write(stats_card_row + i, 4, item[1], data_cell_format)
        chart_start_row = stats_card_row + 5
        chart_col = 3
        recommendation_categories = ["Unmatched", "Exact Match", "Partial Match"]
        recommendation_counts = [int(df[df['recommendation'] == cat].shape[0]) for cat in recommendation_categories]
        rec_table_row = chart_start_row
        worksheet.write(rec_table_row, 0, "Recommendation", header_format)
        worksheet.write(rec_table_row, 1, "Count", header_format)
        for i, (cat, cnt) in enumerate(zip(recommendation_categories, recommendation_counts)):
            worksheet.write(rec_table_row + 1 + i, 0, cat, data_cell_format)
            worksheet.write(rec_table_row + 1 + i, 1, cnt, data_cell_format)
        rec_pie_chart = workbook.add_chart({'type': 'pie'})
        rec_pie_chart.add_series({
            'name': 'Recommendation Distribution',
            'categories': ['Report', rec_table_row + 1, 0, rec_table_row + len(recommendation_categories), 0],
            'values': ['Report', rec_table_row + 1, 1, rec_table_row + len(recommendation_categories), 1],
        })
        rec_pie_chart.set_title({'name': 'Recommendation Distribution'})
        worksheet.insert_chart(chart_start_row, chart_col, rec_pie_chart, {'x_scale': 1.0, 'y_scale': 1.0})
        chart_start_row += 17
        if 'similarity_score' in df.columns:
            scores = pd.to_numeric(df['similarity_score'], errors='coerce').dropna()
            bins = list(range(1, 102, 10))
            counts, bin_edges = np.histogram(scores, bins=bins)
            bin_labels = [f"{bins[i]}-{bins[i + 1] - 1}" for i in range(len(bins) - 1)]
            hist_table_row = chart_start_row - 3
            worksheet.write(hist_table_row, 0, "Score Range", header_format)
            worksheet.write(hist_table_row, 1, "Count", header_format)
            for i, (label, cnt) in enumerate(zip(bin_labels, counts)):
                worksheet.write(hist_table_row + 1 + i, 0, label, data_cell_format)
                worksheet.write(hist_table_row + 1 + i, 1, cnt, data_cell_format)
            hist_chart = workbook.add_chart({'type': 'column'})
            hist_chart.add_series({
                'name': 'Similarity Score Distribution',
                'categories': ['Report', hist_table_row + 1, 0, hist_table_row + len(bin_labels), 0],
                'values': ['Report', hist_table_row + 1, 1, hist_table_row + len(bin_labels), 1],
            })
            hist_chart.set_title({'name': 'Histogram of Similarity Scores'})
            hist_chart.set_x_axis({'name': 'Score Range'})
            hist_chart.set_y_axis({'name': 'Count'})
            worksheet.insert_chart(chart_start_row, chart_col, hist_chart, {'x_scale': 1.2, 'y_scale': 1.2})
            chart_start_row += 20
        if 'reason' in df.columns:
            worksheet.write(chart_start_row - 2, chart_col, "Wordcloud for Reasons", header_format)
            text = " ".join(df['reason'].astype(str).tolist())
            wc = WordCloud(width=400, height=200, background_color='white').generate(text)
            imgdata = io.BytesIO()
            wc.to_image().save(imgdata, format='PNG')
            imgdata.seek(0)
            worksheet.insert_image(chart_start_row, chart_col, 'wordcloud.png',
                                   {'image_data': imgdata, 'x_scale': 1.0, 'y_scale': 1.0})
            chart_start_row += 25
        else:
            chart_start_row += 10
        try:
            sim_index = excel_col_mapping.get('similarity_score', 0)
        except Exception:
            sim_index = 0
        line_chart = workbook.add_chart({'type': 'line'})
        line_chart.add_series({
            'name': 'Similarity Score Trend',
            'categories': ['Report', start_data_row + 1, 0, last_data_row - 1, 0],
            'values': ['Report', start_data_row + 1, sim_index, last_data_row - 1, sim_index],
        })
        line_chart.set_title({'name': 'Similarity Score Over Entries'})
        worksheet.insert_chart(chart_start_row, chart_col, line_chart, {'x_scale': 1.5, 'y_scale': 1.5})
        chart_start_row += 30
        if 'reason' in df.columns:
            reasons = df['reason'].value_counts().reset_index()
            reasons.columns = ['Reason', 'Count']
            hbar_table_row = chart_start_row
            worksheet.write(hbar_table_row, 0, "Reason", header_format)
            worksheet.write(hbar_table_row, 1, "Count", header_format)
            for idx, row in reasons.iterrows():
                worksheet.write(hbar_table_row + 1 + idx, 0, row['Reason'], data_cell_format)
                worksheet.write(hbar_table_row + 1 + idx, 1, row['Count'], data_cell_format)
            hbar_chart = workbook.add_chart({'type': 'bar'})
            hbar_chart.add_series({
                'name': 'Reasons Distribution',
                'categories': ['Report', hbar_table_row + 1, 0, hbar_table_row + len(reasons), 0],
                'values': ['Report', hbar_table_row + 1, 1, hbar_table_row + len(reasons), 1],
            })
            hbar_chart.set_title({'name': 'Reasons Distribution'})
            worksheet.insert_chart(chart_start_row, chart_col, hbar_chart, {'x_scale': 1.5, 'y_scale': 1.5})
            chart_start_row += 30
    output.seek(0)
    return output


@app.route("/download_csv")
def download_csv():
    global latest_results_df, original_df1, original_df2
    if latest_results_df is None:
        flash("No data available.")
        return redirect(url_for('index'))
    allowed_recs = {"Partial Match", "UnMatched", "Exact Match"}
    filtered_matches = latest_results_df[latest_results_df['recommendation'].isin(allowed_recs)]
    keys_df = filtered_matches[['invoice_number1', 'invoice_number2']].copy()
    df1_merged = pd.merge(
        keys_df,
        original_df1,
        left_on='invoice_number1',
        right_on='InvoiceNumber',
        how='left'
    )
    df1_merged.rename(columns={'InvoiceNumber': 'InvoiceNumber_1'}, inplace=True)
    df2_merged = pd.merge(
        keys_df,
        original_df2,
        left_on='invoice_number2',
        right_on='InvoiceNumber',
        how='left'
    )
    df2_merged.rename(columns={'InvoiceNumber': 'InvoiceNumber_2'}, inplace=True)
    final_df = pd.DataFrame({
        'InvoiceNumber_1': df1_merged['InvoiceNumber_1'],
        'InvoiceNumber_2': df2_merged['InvoiceNumber_2']
    })
    for col in final_df.select_dtypes(include=['object']).columns:
        final_df[col] = final_df[col].str.strip()
    final_df.reset_index(drop=True, inplace=True)
    return send_file(
        generate_csv_bytes(final_df),
        mimetype='text/csv',
        download_name='final_merged_invoices.csv',
        as_attachment=True
    )


@app.route("/download_excel")
def download_excel():
    global latest_results_df
    if latest_results_df is None:
        flash("No data available.")
        return redirect(url_for('index'))
    df = latest_results_df.copy()
    for col in ["editable", "comments"]:
        if col in df.columns:
            df.drop(columns=[col], inplace=True)
    return send_file(
        generate_excel_bytes(df),
        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        download_name='matched_invoices.xlsx',
        as_attachment=True
    )


# New endpoint: Download summary statistics as Excel
@app.route("/download_stats_excel")
def download_stats_excel():
    global latest_results_df
    if latest_results_df is None:
        flash("No data available for stats.")
        return redirect(url_for('index'))
    stats = get_stats(latest_results_df)
    return send_file(
        generate_stats_excel_bytes(stats),
        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        download_name='invoice_matching_stats.xlsx',
        as_attachment=True
    )


# New endpoint: Download summary statistics as JSON
@app.route("/download_stats_json")
def download_stats_json():
    global latest_results_df
    if latest_results_df is None:
        flash("No data available for stats.")
        return redirect(url_for('index'))
    stats = get_stats(latest_results_df)
    return send_file(
        generate_stats_json_bytes(stats),
        mimetype='application/json',
        download_name='invoice_matching_stats.json',
        as_attachment=True
    )


if __name__ == "__main__":
    app.run(debug=True)