rajkhanke committed on
Commit
82d714a
·
verified ·
1 Parent(s): 588bc02

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +874 -874
app.py CHANGED
@@ -1,874 +1,874 @@
1
- import re
2
- import random
3
- import pandas as pd
4
- import json
5
- from textblob import Word
6
- from rapidfuzz import fuzz as rapidfuzz_fuzz
7
- from fuzzywuzzy import fuzz as fuzzywuzzy_fuzz
8
- from Levenshtein import ratio as levenshtein_ratio, jaro_winkler as levenshtein_jaro_winkler
9
- from sklearn.feature_extraction.text import TfidfVectorizer
10
- from sklearn.metrics.pairwise import cosine_similarity
11
- from flask import Flask, request, render_template, send_file, redirect, url_for, flash, jsonify
12
- import io
13
- import os
14
- import numpy as np
15
- from wordcloud import WordCloud
16
- import textdistance
17
- import chardet
18
- # --- New import for SBERT & parallel processing ---
19
- from sentence_transformers import SentenceTransformer
20
- import concurrent.futures
21
- from tqdm import tqdm
22
-
23
app = Flask(__name__)

# Module-level state: the most recent result table and the two uploaded
# datasets it was computed from. All three stay None until an upload succeeds.
latest_results_df = None
original_df1 = None
original_df2 = None

# Session-signing key. The SECRET_KEY environment variable takes precedence so
# deployments do not have to rely on a value that lives in the source tree; the
# literal is kept only as a backward-compatible fallback.
# NOTE(review): the fallback value is published with the source and should be rotated.
app.secret_key = os.environ.get("SECRET_KEY") or '1cdddf3025ba915f2f32baf15d00a79fe63a8dce49935c2f'

# File to store persistent feedback mapping
FEEDBACK_FILE = "feedback_mapping.json"
34
-
35
-
36
- #########################################
37
- # Persistent Feedback Storage Functions
38
- #########################################
39
def load_feedback_mapping():
    """Load the feedback mapping from FEEDBACK_FILE.

    Returns:
        dict: The stored mapping, or an empty dict when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object.
    """
    # Open directly instead of checking os.path.exists first: the file can
    # disappear between the check and the open.
    try:
        with open(FEEDBACK_FILE, "r") as f:
            mapping = json.load(f)
    except FileNotFoundError:
        return {}
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {}
    # Callers index the result by invoice number, so anything but a dict
    # (e.g. a top-level JSON list) is treated as "no feedback".
    return mapping if isinstance(mapping, dict) else {}
49
-
50
-
51
def save_feedback_mapping(mapping):
    """Save the feedback mapping dictionary to FEEDBACK_FILE.

    The mapping is serialized before the file is opened: opening with "w"
    truncates the file, so a serialization error raised afterwards would wipe
    the feedback that was already stored.

    Raises:
        TypeError: If the mapping contains values json cannot serialize.
    """
    serialized = json.dumps(mapping, indent=4)
    with open(FEEDBACK_FILE, "w") as f:
        f.write(serialized)
55
-
56
-
57
def update_feedback_mapping(invoice1, invoice2):
    """Record ``invoice1 -> invoice2`` in the persisted feedback mapping.

    Reads the current mapping, sets (or overwrites) the entry for
    ``invoice1`` and writes the whole mapping back.
    """
    current = load_feedback_mapping()
    current.update({invoice1: invoice2})
    save_feedback_mapping(current)
62
-
63
-
64
- #########################################
65
- # SBERT Initialization and Helper Function
66
- #########################################
67
# Module-level SBERT model, instantiated when this module is imported.
model = SentenceTransformer('all-mpnet-base-v2')


def generate_embeddings(df, column_name):
    """Encode every value of ``df[column_name]`` with the module-level model.

    The column is converted to a plain list and passed to ``model.encode``
    with ``normalize_embeddings=True``; the encoder's result is returned as-is.
    """
    return model.encode(df[column_name].tolist(), normalize_embeddings=True)
74
-
75
-
76
- #########################################
77
- # Invoice Matching Functions (Part 1)
78
- #########################################
79
def remove_year_patterns(s):
    """Strip year-like number patterns from ``s`` and trim the result.

    Missing values (``pd.isna``) yield an empty string. Otherwise four
    substitutions run in order: number ranges joined by ``-`` or ``/``
    (optionally parenthesised), numbers preceded by ``,``/``;``, numbers
    followed by ``,``/``;``, and stand-alone four-digit 19xx/20xx years.
    """
    if pd.isna(s):
        return ""
    text = str(s)
    year_like_patterns = (
        r'\(?\b(?:19|20)?\d{2,4}\s*[-/]\s*(?:19|20)?\d{2,4}\b\)?',
        r'[,;]\s*\b(?:19|20)?\d{2,4}\b',
        r'\b(?:19|20)?\d{2,4}\b[,;]',
        r'\b(19|20)\d{2}\b',
    )
    for pattern in year_like_patterns:
        text = re.sub(pattern, '', text)
    return text.strip()
88
-
89
-
90
def remove_leading_and_adjacent_zeros(s):
    """Drop zero padding from ``s``.

    First removes runs of zeros that start at a word boundary and are
    followed by another digit, then removes any single zero that sits
    directly before a letter.
    """
    without_leading = re.sub(r'\b0+(?=\d)', '', s)
    return re.sub(r'0(?=[A-Za-z])', '', without_leading)
94
-
95
-
96
def remove_prefix_dash(s):
    """Remove a leading alphanumeric run together with the dash that follows it."""
    leading_segment = r'^[A-Za-z0-9]+[-]'
    return re.sub(leading_segment, '', s)
98
-
99
-
100
def normalize_for_comparison(s):
    """Return a lowercase, separator-free form of ``s`` for comparisons.

    Missing values become ``""``. Whitespace, ``-``, ``_``, ``,`` and ``/``
    are removed, and a letter ``o`` touching a digit is rewritten as ``0``.
    """
    if pd.isna(s):
        return ""
    lowered = str(s).lower().strip()
    compact = re.sub(r'[\s\-\_,/]+', '', lowered)
    return re.sub(r'(?<=\d)o|o(?=\d)', '0', compact)
107
-
108
-
109
def extract_invoice_parts(invoice):
    """Split an invoice string into ``(prefix, numeric_core, suffix)``.

    Non-alphanumeric characters are discarded first. The remainder must be
    optional letters, one run of digits, optional letters; any other shape
    (no digits, or several digit runs) yields ``(None, None, None)``.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9]', '', invoice)
    parsed = re.match(r'^([a-zA-Z]*)(\d+)([a-zA-Z]*)$', alnum_only)
    if parsed is None:
        return None, None, None
    leading, digits, trailing = parsed.groups()
    return leading or "", digits, trailing or ""
118
-
119
-
120
def robust_preprocess_invoice(invoice):
    """Reduce a raw invoice string to a canonical ``prefix + core + suffix`` form.

    Steps: drop year-like patterns, lowercase, strip "bill no"/"bill #"
    labels, keep the ``-``/``/``-separated segment with the most digits,
    strip known invoice keywords from both ends, remove separators and zero
    padding, then normalise the numeric core.

    Returns:
        str: ``""`` for missing input; the cleaned segment unchanged when it
        does not parse as letters-digits-letters; otherwise the rebuilt
        ``prefix + core + suffix``.
    """
    if pd.isna(invoice):
        return ""
    invoice = str(invoice)
    invoice = remove_year_patterns(invoice)
    invoice = invoice.lower()
    invoice = re.sub(r'bill\s*(?:no\.?|#)\s*:?', '', invoice, flags=re.IGNORECASE)
    # NOTE(review): the label was just removed above, so this search can only
    # match a label that the removal itself exposed; in practice the else
    # branch runs. Left in place to keep the existing behavior.
    bill_match = re.search(r'bill\s*(?:no\.?|#)\s*:?\s*([0-9a-zA-Z]+)', invoice, flags=re.IGNORECASE)
    if bill_match:
        best_seg = bill_match.group(1)
    else:
        segments = re.split(r'[-/]', invoice)
        segments = [seg.strip() for seg in segments if seg.strip()]
        best_seg = max(segments, key=lambda seg: len(re.findall(r'\d', seg))) if segments else invoice
    best_seg = best_seg.replace("_", "")
    KNOWN_INVOICE_VARIANTS = [
        "inv", "invoice", "invoce", "in", "inve", "salesrefno",
        "ompl", "insc", "indbo", "kolbo", "thn", "invoiceno", "sales"
    ]
    # Strip longest variants first. In list order "inv" would consume the head
    # of "invoice"/"invoiceno"/"inve" (leaving "oice...") and "in" would
    # consume the head of "insc"/"indbo", so the longer keywords never matched.
    for variant in sorted(KNOWN_INVOICE_VARIANTS, key=len, reverse=True):
        best_seg = re.sub(r'^' + variant, '', best_seg, flags=re.IGNORECASE)
        best_seg = re.sub(variant + r'$', '', best_seg, flags=re.IGNORECASE)
    best_seg = re.sub(r'[\s\-\_,/]+', '', best_seg)
    best_seg = remove_leading_and_adjacent_zeros(best_seg)
    prefix, core, suffix = extract_invoice_parts(best_seg)
    if prefix is None:
        return best_seg
    if core:
        try:
            # Round-trip through int to drop any remaining zero padding.
            core = str(int(core))
        except ValueError:
            core = core.lstrip("0") or "0"
    return prefix + core + suffix
153
-
154
-
155
def extract_numeric_core(invoice):
    """Return the longest run of digits in ``invoice`` (first one on ties), or ``""``."""
    digit_runs = re.findall(r'\d+', invoice)
    if not digit_runs:
        return ""
    return max(digit_runs, key=len)
158
-
159
-
160
def determine_invoice_type(invoice):
    """Classify an invoice by which parts surround its numeric core.

    Returns ``"other"`` when the invoice does not parse, otherwise one of
    ``"core_only"``, ``"prefix_only"``, ``"suffix_only"`` or ``"both"``.
    """
    prefix, _core, suffix = extract_invoice_parts(invoice)
    if prefix is None:
        return "other"
    # Keyed by (has prefix, has suffix).
    labels = {
        (False, False): "core_only",
        (True, False): "prefix_only",
        (False, True): "suffix_only",
        (True, True): "both",
    }
    return labels[(prefix != "", suffix != "")]
173
-
174
-
175
def check_boost_condition(s1, s2):
    """Tell whether two invoices share a numeric core but differ in decoration.

    Both inputs are preprocessed and split into prefix/core/suffix. The result
    is False when either fails to parse or the cores differ. It is True when
    one side is a bare core and the other has a prefix or suffix, when only
    one side has a prefix (or a suffix), or when one side's prefix closely
    matches (ratio > 90) the other side's suffix.
    """
    norm1 = robust_preprocess_invoice(s1)
    norm2 = robust_preprocess_invoice(s2)
    prefix1, core1, suffix1 = extract_invoice_parts(norm1)
    prefix2, core2, suffix2 = extract_invoice_parts(norm2)
    if prefix1 is None or prefix2 is None or core1 != core2:
        return False

    one_sided = {"prefix_only", "suffix_only"}
    kind1 = determine_invoice_type(norm1)
    kind2 = determine_invoice_type(norm2)
    if kind1 == "core_only" and kind2 in one_sided:
        return True
    if kind2 == "core_only" and kind1 in one_sided:
        return True

    # Exactly one side carries a prefix / a suffix.
    if bool(prefix1) != bool(prefix2):
        return True
    if bool(suffix1) != bool(suffix2):
        return True

    # Prefix on one side mirrored as suffix on the other.
    cross_pairs = ((prefix1, suffix2), (prefix2, suffix1))
    return any(
        bool(head) and bool(tail) and rapidfuzz_fuzz.ratio(head, tail) > 90
        for head, tail in cross_pairs
    )
196
-
197
-
198
def levenshtein_sim(s1, s2):
    """Return ``rapidfuzz_fuzz.ratio`` for the two strings."""
    similarity = rapidfuzz_fuzz.ratio(s1, s2)
    return similarity
200
-
201
-
202
def jaro_winkler_sim(s1, s2):
    """Return textdistance's normalized Jaro-Winkler similarity multiplied by 100."""
    normalized = textdistance.jaro_winkler.normalized_similarity(s1, s2)
    return normalized * 100
204
-
205
-
206
def rapidfuzz_sim(s1, s2):
    """Return ``rapidfuzz_fuzz.ratio`` for the two strings (same metric as ``levenshtein_sim``)."""
    similarity = rapidfuzz_fuzz.ratio(s1, s2)
    return similarity
208
-
209
-
210
def fuzzbuzz_sim(s1, s2):
    """Return ``rapidfuzz_fuzz.token_set_ratio`` for the two strings."""
    similarity = rapidfuzz_fuzz.token_set_ratio(s1, s2)
    return similarity
212
-
213
-
214
def hamming_sim(s1, s2):
    """Percentage of positions at which the two strings hold the same character.

    Positions are compared pairwise up to the shorter string and the count is
    divided by the longer length. Two empty strings score 100.
    """
    if not (s1 or s2):
        return 100
    longest = max(len(s1), len(s2))
    same_position_hits = 0
    for left, right in zip(s1, s2):
        if left == right:
            same_position_hits += 1
    return (same_position_hits / longest) * 100
220
-
221
-
222
def jaccard_sim(s1, s2):
    """Jaccard similarity (0-100) between the character sets of the two strings.

    Two empty strings score 100.
    """
    chars1 = set(s1)
    chars2 = set(s2)
    combined = chars1 | chars2
    if not combined:
        return 100
    shared = chars1 & chars2
    return (len(shared) / len(combined)) * 100
227
-
228
-
229
def cosine_sim(s1, s2):
    """Cosine similarity (0-100) of character 2-4-gram TF-IDF vectors.

    Returns 0.0 when either string is blank, when the fitted vocabulary is
    empty, or when vectorization raises ValueError.
    """
    if not (s1.strip() and s2.strip()):
        return 0.0
    ngram_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
    try:
        matrix = ngram_vectorizer.fit_transform([s1, s2])
        if matrix.shape[1] == 0:
            return 0.0
        similarity = cosine_similarity(matrix[0:1], matrix[1:2])[0][0]
        return similarity * 100
    except ValueError:
        return 0.0
241
-
242
-
243
def custom_trailing_match(s1, s2):
    """Detect the "p…"/"jp…" invoice pattern whose squeezed digits end with ``s2``.

    ``s1`` must start (case-insensitively) with ``p`` or ``jp``. Its digits
    are extracted and every zero strictly between the first and last digit
    is removed; the match succeeds when that string ends with ``s2``.

    An empty ``s2`` never matches: ``str.endswith("")`` is always True, which
    would otherwise make every "p…" invoice match a blank counterpart.

    Returns:
        bool: True when the pattern applies.
    """
    s1 = str(s1)
    s2 = str(s2)
    if not s2:
        return False
    if not s1.lower().startswith(("p", "jp")):
        return False
    digits = re.sub(r'\D', '', s1)
    if len(digits) <= 2:
        modified = digits
    else:
        # Keep the outer digits, drop interior zeros only.
        modified = digits[0] + digits[1:-1].replace("0", "") + digits[-1]
    return modified.endswith(s2)
256
-
257
-
258
def combined_similarity(s1, s2):
    """Blend several string metrics into one 0-100 similarity score.

    Returns 100 for inputs equal after trimming and lowercasing, and 95 when
    the preprocessed pair satisfies ``custom_trailing_match``. Otherwise the
    seven metric scores are averaged, raised to at least 90 when the numeric
    cores agree but prefix/suffix decoration differs, halved when the longest
    digit runs differ, and replaced by a random value in [90, 99] if the
    average still reaches 100.
    """
    if s1.strip().lower() == s2.strip().lower():
        return 100

    left = robust_preprocess_invoice(s1)
    right = robust_preprocess_invoice(s2)

    if custom_trailing_match(left, right):
        return 95

    metrics = (
        levenshtein_sim,
        jaro_winkler_sim,
        rapidfuzz_sim,
        fuzzbuzz_sim,
        hamming_sim,
        jaccard_sim,
        cosine_sim,
    )
    scores = [metric(left, right) for metric in metrics]
    blended = sum(scores) / len(scores)

    prefix1, core1, suffix1 = extract_invoice_parts(left)
    prefix2, core2, suffix2 = extract_invoice_parts(right)
    if core1 and core2 and core1 == core2:
        decoration_differs = (
            (prefix1 and not prefix2) or (prefix2 and not prefix1)
            or (suffix1 and not suffix2) or (suffix2 and not suffix1)
            or (prefix1 and suffix2) or (prefix2 and suffix1)
        )
        if decoration_differs:
            blended = max(blended, 90)

    def _longest_digit_run(text):
        runs = re.findall(r'\d+', text)
        return max(runs, key=len) if runs else ""

    num1 = _longest_digit_run(left)
    num2 = _longest_digit_run(right)
    try:
        numbers_differ = int(num1) != int(num2)
    except Exception:
        # int("") fails when a side has no digits; compare the raw strings.
        numbers_differ = num1 != num2
    if numbers_differ:
        blended *= 0.5

    if blended >= 100:
        blended = random.uniform(90, 99)

    return blended
302
-
303
-
304
def generate_review_status(score):
    """Return "No Review Needed" for scores strictly above 50, else "Needs Review"."""
    if score > 50:
        return "No Review Needed"
    return "Needs Review"
306
-
307
-
308
def generate_recommendation(score):
    """Map a score to "Exact Match" (exactly 100), "Partial Match" (>= 50) or "Unmatched"."""
    if score == 100:
        return "Exact Match"
    return "Partial Match" if score >= 50 else "Unmatched"
315
-
316
-
317
def generate_reason(inv1, inv2, score):
    """Produce a human-readable explanation for how two invoices compare.

    Checks run in priority order and the first one that applies wins: the
    custom trailing-match pattern, case-insensitive equality, structural
    differences in prefix/core/suffix, then heuristics that depend on whether
    ``score`` is at least 50.

    Args:
        inv1: Invoice number from the first dataset (coerced to str).
        inv2: Invoice number from the second dataset (coerced to str).
        score: Similarity score for the pair.

    Returns:
        str: One of the fixed reason sentences.
    """
    inv1 = str(inv1)
    inv2 = str(inv2)
    if custom_trailing_match(inv1, inv2):
        return "Custom trailing-match pattern detected."
    if inv1.lower() == inv2.lower():
        return "Exact match of invoice numbers."
    p1, core1, sfx1 = extract_invoice_parts(normalize_for_comparison(inv1))
    p2, core2, sfx2 = extract_invoice_parts(normalize_for_comparison(inv2))
    if core1 is not None and core2 is not None:
        if core1 != core2:
            # Test padding before reporting a mismatch: cores that differ only
            # by leading zeros are unequal strings, so checking this after the
            # mismatch return could never fire.
            if core1.lstrip("0") == core2.lstrip("0"):
                return "Numeric padding mismatch (leading zeros removed)."
            return "Numeric core does not match."
        if p1 and p2 and p1 != p2:
            return "Different prefixes found, affecting similarity."
        if sfx1 and sfx2 and sfx1 != sfx2:
            return "Different suffixes detected, leading to mismatch."
        # The messages describe either side lacking the part, so test both directions.
        if bool(p1) != bool(p2):
            return "Partial matching: one invoice has a prefix while the other does not."
        if bool(sfx1) != bool(sfx2):
            return "Partial matching: one invoice has a suffix while the other does not."
    has_separator = any(sep in inv1 or sep in inv2 for sep in [" ", "-", "_"])
    if score >= 50:
        if (p1 and sfx2 and rapidfuzz_fuzz.ratio(p1, sfx2) > 90) or \
                (p2 and sfx1 and rapidfuzz_fuzz.ratio(p2, sfx1) > 90):
            return "Prefix in one invoice matches suffix in the other."
        if has_separator:
            return "Strong match; only minor formatting variations."
        if inv1 in inv2 or inv2 in inv1:
            return "One invoice is fully contained in the other."
        return "Invoices match with minimal differences."
    if has_separator:
        return "Formatting issue due to spaces or separators."
    if rapidfuzz_fuzz.ratio(inv1, inv2) > 70:
        return "Minor spelling variation detected."
    if set(inv1) == set(inv2):
        return "Character positions swapped."
    if abs(len(inv1) - len(inv2)) <= 2:
        return "Possible OCR error or scanning issue."
    if any(ch.isdigit() for ch in inv1) and any(ch.isdigit() for ch in inv2) and core1 == core2:
        return "Identical numbers but extra text in one invoice."
    if any(sep in inv1 for sep in ["-", "/"]) or any(sep in inv2 for sep in ["-", "/"]):
        return "Different separator conventions used."
    if any(ch in inv1 for ch in ["#", "$", "&"]) or any(ch in inv2 for ch in ["#", "$", "&"]):
        return "Special characters found in one invoice but not the other."
    if len(set(inv1)) < len(inv1) or len(set(inv2)) < len(inv2):
        return "Duplicate characters found in one invoice."
    if len(inv1) > 10 or len(inv2) > 10:
        return "One invoice is significantly longer than the other."
    return "Significant structural difference; invoices do not match."
370
-
371
-
372
- # -----------------------------
373
- # Updated process_invoices Function with Feedback Override
374
- # -----------------------------
375
def process_invoices(df1, df2):
    """Match every invoice in ``df1`` to its best counterpart in ``df2``.

    For each invoice in df1, check if a user-corrected (feedback) invoice
    exists. If so, that corrected invoice is scored directly (with a +60
    bonus, capped at 100) and the row is marked non-editable. Invoices
    without feedback are compared against every invoice in df2 and the
    highest-scoring candidate is kept.

    Both frames have their ``InvoiceNumber`` column stripped in place.

    Returns:
        pandas.DataFrame: One row per df1 invoice with the columns
        invoice_number1, invoice_number2, similarity_score,
        manual_review_status, recommendation, reason, comments, editable.
    """
    df1["InvoiceNumber"] = df1["InvoiceNumber"].str.strip()
    df2["InvoiceNumber"] = df2["InvoiceNumber"].str.strip()

    # Load the feedback mapping from the persistent file.
    feedback_mapping = load_feedback_mapping()
    candidates = df2["InvoiceNumber"].tolist()

    results = []
    for inv1 in df1["InvoiceNumber"]:
        if inv1 in feedback_mapping:
            # Use the user-selected corrected invoice.
            corrected_invoice = feedback_mapping[inv1]
            # The bonus could push the score past 100, which is outside the
            # range the reports and the recommendation logic work with.
            score = min(combined_similarity(inv1, corrected_invoice) + 60, 100)
            results.append({
                "invoice_number1": inv1,
                "invoice_number2": corrected_invoice,
                "similarity_score": round(score, 2),
                "manual_review_status": generate_review_status(score),
                "recommendation": generate_recommendation(score),
                "reason": generate_reason(inv1, corrected_invoice, score),
                "comments": "",
                "editable": False
            })
            continue

        best_invoice = None
        best_score = -1
        for candidate in candidates:
            score = combined_similarity(inv1, candidate)
            if score > best_score:
                best_invoice, best_score = candidate, score

        if best_invoice is None:
            # df2 is empty: emit an explicit unmatched row instead of a None
            # entry, which would corrupt the resulting frame's columns.
            results.append({
                "invoice_number1": inv1,
                "invoice_number2": "",
                "similarity_score": 0,
                "manual_review_status": generate_review_status(0),
                "recommendation": generate_recommendation(0),
                "reason": "No candidate invoices available in the second dataset.",
                "comments": "",
                "editable": True
            })
            continue

        # Build the row once for the winner rather than for every candidate
        # that temporarily led. The displayed score is the raw score minus 2,
        # while status and recommendation use the raw score.
        results.append({
            "invoice_number1": inv1,
            "invoice_number2": best_invoice,
            "similarity_score": round(best_score - 2, 2),
            "manual_review_status": generate_review_status(best_score),
            "recommendation": generate_recommendation(best_score),
            "reason": generate_reason(inv1, best_invoice, best_score),
            "comments": "",
            "editable": best_score <= 60
        })

    df_final = pd.DataFrame(results)
    return df_final
426
-
427
-
428
- #########################################
429
- # SBERT Exact Match Filtering
430
- #########################################
431
def sbert_exact_match_filtering(df1, df2):
    """Separate invoice pairs whose SBERT embeddings coincide from the rest.

    Embeds the ``InvoiceNumber`` column of both frames and treats every pair
    whose cosine similarity is close to 1.0 (``np.isclose`` with
    ``atol=1e-8``) as an exact match.

    Returns:
        tuple: ``(df_exact, df1_filtered, df2_filtered)`` — the matched pairs
        with fixed score/status/recommendation/reason/comments columns, and
        the two inputs with every matched invoice number removed and the
        index reset.
    """
    left_vectors = generate_embeddings(df1, 'InvoiceNumber')
    right_vectors = generate_embeddings(df2, 'InvoiceNumber')
    similarity_matrix = cosine_similarity(left_vectors, right_vectors)
    tolerance = 1e-8
    row_hits, col_hits = np.where(np.isclose(similarity_matrix, 1.0, atol=tolerance))
    df_matches = pd.DataFrame({
        'df1_index': row_hits,
        'df2_index': col_hits
    })

    def invoice_lookup(frame):
        # Positional lookup of the invoice number in the given frame.
        return lambda position: frame.iloc[position]['InvoiceNumber']

    df_exact = pd.DataFrame({
        'InvoiceNumber_1': df_matches['df1_index'].apply(invoice_lookup(df1)),
        'InvoiceNumber_2': df_matches['df2_index'].apply(invoice_lookup(df2))
    })

    matched_left = df_exact['InvoiceNumber_1'].unique()
    matched_right = df_exact['InvoiceNumber_2'].unique()
    keep_left = ~df1['InvoiceNumber'].isin(matched_left)
    keep_right = ~df2['InvoiceNumber'].isin(matched_right)
    df1_filtered = df1[keep_left].reset_index(drop=True)
    df2_filtered = df2[keep_right].reset_index(drop=True)

    constant_columns = {
        'similarity_score': 100,
        'manual_review_status': 'No Review Needed',
        'recommendation': 'Exact Match',
        'reason': 'Exact match via SBERT embeddings.',
        'comments': '',
    }
    for column, value in constant_columns.items():
        df_exact[column] = value
    return df_exact, df1_filtered, df2_filtered
455
-
456
-
457
- #########################################
458
- # Functions to Generate Summary Statistics
459
- #########################################
460
def get_stats(df):
    """Aggregate summary statistics from a results frame.

    Expects the columns ``recommendation``, ``manual_review_status`` and
    ``similarity_score``. Returns a dict with the row count, per-category
    counts, the list of scores and their mean, minimum and maximum.
    """
    recommendation = df['recommendation']
    review_status = df['manual_review_status']
    scores = df['similarity_score']

    def count_equal(series, label):
        return int((series == label).sum())

    return {
        'total_rows': len(df),
        'total_exact_match': count_equal(recommendation, 'Exact Match'),
        'total_partial_match': count_equal(recommendation, 'Partial Match'),
        'total_unmatched': count_equal(recommendation, 'Unmatched'),
        'total_no_review_needed': count_equal(review_status, 'No Review Needed'),
        'total_needs_review': count_equal(review_status, 'Needs Review'),
        'similarity_scores': scores.tolist(),
        'average_similarity': float(scores.mean()),
        'min_similarity': float(scores.min()),
        'max_similarity': float(scores.max()),
    }
474
-
475
-
476
def generate_stats_excel_bytes(stats):
    """Write the stats dictionary to an in-memory Excel workbook.

    Each dictionary item becomes one row of a two-column ("Metric", "Value")
    sheet named "Summary Stats". The returned BytesIO is rewound to position 0.
    """
    rows = list(stats.items())
    table = pd.DataFrame(rows, columns=["Metric", "Value"])
    buffer = io.BytesIO()
    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
        table.to_excel(writer, index=False, sheet_name='Summary Stats')
    buffer.seek(0)
    return buffer
484
-
485
-
486
def generate_stats_json_bytes(stats):
    """Serialize the stats dictionary as 4-space-indented JSON in a UTF-8 BytesIO."""
    serialized = json.dumps(stats, indent=4)
    return io.BytesIO(serialized.encode('utf-8'))
490
-
491
-
492
- #########################################
493
- # Flask Routes
494
- #########################################
495
def _read_uploaded_table(upload):
    """Parse an uploaded CSV/XLS/XLSX file into a DataFrame; None if the extension is unsupported."""
    ext = upload.filename.split(".")[-1].lower()
    if ext == "csv":
        raw = upload.read()
        # chardet reports {"encoding": None} when it cannot decide, so a
        # dict.get default is not enough to guarantee a usable codec name.
        encoding = chardet.detect(raw).get("encoding") or "utf-8"
        return pd.read_csv(io.StringIO(raw.decode(encoding, errors="replace")))
    if ext in ("xls", "xlsx"):
        upload.seek(0)
        return pd.read_excel(upload)
    return None


@app.route("/", methods=["GET", "POST"])
def index():
    """Render the upload page and, on POST, run the full matching pipeline.

    On POST both uploads are parsed, exact matches are separated via SBERT,
    the remaining invoices go through the fuzzy matcher, and the shuffled
    combined result is stored in ``latest_results_df`` and rendered. Any
    problem with the uploads is flashed and the user is redirected back.
    """
    global latest_results_df, original_df1, original_df2
    results = None
    unique_values = []  # Unique invoice numbers from dataset2 for the select box
    if request.method == "POST":
        file1 = request.files.get("file1")
        file2 = request.files.get("file2")
        if not file1 or not file2:
            flash("Please upload both files.")
            return redirect(request.url)

        try:
            df1 = _read_uploaded_table(file1)
            if df1 is None:
                flash("File 1 format not supported.")
                return redirect(request.url)
            df2 = _read_uploaded_table(file2)
            if df2 is None:
                flash("File 2 format not supported.")
                return redirect(request.url)
        except Exception as e:
            flash("Error reading files: " + str(e))
            return redirect(request.url)

        # Everything below indexes this column; fail with a message instead
        # of an unhandled KeyError.
        if "InvoiceNumber" not in df1.columns or "InvoiceNumber" not in df2.columns:
            flash("Both files must contain an 'InvoiceNumber' column.")
            return redirect(request.url)

        df1["InvoiceNumber"] = df1["InvoiceNumber"].astype(str)
        df2["InvoiceNumber"] = df2["InvoiceNumber"].astype(str)

        original_df1 = df1.copy()
        original_df2 = df2.copy()

        # Prepare the unique invoice numbers from dataset2 for the edit select box.
        unique_values = sorted(df2["InvoiceNumber"].unique().tolist())

        # Run SBERT exact match filtering.
        df_exact, df1_filtered, df2_filtered = sbert_exact_match_filtering(df1, df2)

        # Run robust invoice matching on remaining invoices (with feedback override).
        df_final_matches = process_invoices(df1_filtered, df2_filtered)

        # Rename exact match columns for consistency.
        df_exact = df_exact.rename(columns={
            'InvoiceNumber_1': 'invoice_number1',
            'InvoiceNumber_2': 'invoice_number2'
        })

        # Concatenate exact matches with robust matches.
        df_concatenated = pd.concat([df_exact, df_final_matches], ignore_index=True)

        # Shuffle the rows randomly before storing and displaying
        latest_results_df = df_concatenated.sample(frac=1).reset_index(drop=True)
        results = latest_results_df.to_dict(orient="records")

    return render_template("index.html", results=results, unique_values=unique_values)
571
-
572
-
573
@app.route("/save_updates", methods=["POST"])
def save_updates():
    """Replace ``latest_results_df`` with the table posted as JSON.

    Responds with ``{"status": "success"}`` (200), or with status "error"
    and the exception text (500) if building the frame fails.
    """
    global latest_results_df
    try:
        payload = request.get_json()
        latest_results_df = pd.DataFrame(payload).copy()
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500
    return jsonify({"status": "success"}), 200
583
-
584
-
585
@app.route("/save_feedback", methods=["POST"])
def save_feedback():
    """Persist a user's corrected match for one invoice.

    Expects a JSON object with ``invoice_number1`` and ``selected_invoice2``.
    When a replacement is selected, the pair is stored in the feedback
    mapping; otherwise nothing changes. Malformed requests get a 400,
    unexpected failures a 500.
    """
    try:
        feedback_data = request.get_json()
        if not isinstance(feedback_data, dict):
            return jsonify({"status": "error", "message": "Expected a JSON object."}), 400
        invoice1 = feedback_data.get('invoice_number1')
        selected_invoice2 = feedback_data.get('selected_invoice2')

        # If a new invoice is selected, update the persistent feedback mapping.
        if selected_invoice2:
            if not invoice1:
                # Without a source invoice the entry would be stored under a
                # "null" key and could never be looked up again.
                return jsonify({"status": "error", "message": "invoice_number1 is required."}), 400
            update_feedback_mapping(invoice1, selected_invoice2)
            message = "Feedback saved. Please re-run to train model on updates."
        else:
            message = "No new invoice selected; no changes made."

        return jsonify({"status": "success", "message": message}), 200

    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500
603
-
604
-
605
def generate_csv_bytes(df):
    """Render ``df`` as CSV (no index column) and return it as an encoded BytesIO."""
    csv_text = df.to_csv(index=False)
    return io.BytesIO(csv_text.encode())
610
-
611
-
612
def generate_excel_bytes(df):
    """Build the formatted "Report" workbook for a results frame.

    The single worksheet contains, from top to bottom: a merged title row,
    the data table (the ``reason`` column spans two Excel columns), a small
    statistics card, a recommendation pie chart, a similarity-score
    histogram, a word cloud of the reasons, a line chart of the scores and a
    bar chart of reason frequencies. Each section's row offset is derived
    from the previous one, so statement order matters.

    Returns:
        io.BytesIO: The workbook bytes, rewound to position 0.
    """
    # Replace infinities and NaN with "" before writing the cells.
    df = df.replace([np.inf, -np.inf], np.nan).fillna("")
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        workbook = writer.book
        worksheet = workbook.add_worksheet("Report")
        # Map each DataFrame column to its first Excel column; "reason"
        # reserves two Excel columns because its cells are merged.
        excel_col_mapping = {}
        excel_index = 0
        for col in df.columns:
            if col.lower() == 'reason':
                excel_col_mapping[col] = excel_index
                excel_index += 2
            else:
                excel_col_mapping[col] = excel_index
                excel_index += 1
        total_excel_columns = excel_index
        title_format = workbook.add_format({
            'bold': True,
            'bg_color': '#FFFF00',
            'font_color': 'black',
            'align': 'center',
            'valign': 'vcenter',
            'font_size': 16
        })
        header_format = workbook.add_format({
            'bold': True,
            'bg_color': '#FFFF00',
            'font_color': 'black',
            'border': 1,
            'align': 'center',
            'valign': 'vcenter'
        })
        data_cell_format = workbook.add_format({
            'border': 1,
            'align': 'left',
            'valign': 'vcenter',
            'text_wrap': True
        })
        # Title across the full table width on row 0.
        worksheet.merge_range(0, 0, 0, total_excel_columns - 1,
                              "Intelligent Partial Invoice Matching - Excel Report",
                              title_format)
        # Header row sits on row 2; data rows start on the row after it.
        start_data_row = 2
        for col in df.columns:
            col_index = excel_col_mapping[col]
            if col.lower() == 'reason':
                worksheet.merge_range(start_data_row, col_index, start_data_row, col_index + 1,
                                      col, header_format)
                worksheet.set_column(col_index, col_index + 1, 40)
            else:
                worksheet.write(start_data_row, col_index, col, header_format)
                worksheet.set_column(col_index, col_index, 20)
        for i, row in enumerate(df.itertuples(index=False, name=None)):
            for col_name, cell in zip(df.columns, row):
                col_index = excel_col_mapping[col_name]
                if col_name.lower() == 'reason':
                    worksheet.merge_range(start_data_row + 1 + i, col_index,
                                          start_data_row + 1 + i, col_index + 1,
                                          cell, data_cell_format)
                else:
                    worksheet.write(start_data_row + 1 + i, col_index, cell, data_cell_format)
        # First row after the data table.
        last_data_row = start_data_row + 1 + len(df)
        stats_card_row = last_data_row + 3
        # Statistics card; all values fall back to 0 if the score column is
        # missing or cannot be converted to float (e.g. "" from the fillna above).
        try:
            total_invoices = len(df)
            avg_score = float(df['similarity_score'].astype(float).mean())
            max_score = float(df['similarity_score'].astype(float).max())
            min_score = float(df['similarity_score'].astype(float).min())
        except Exception:
            total_invoices = avg_score = max_score = min_score = 0
        left_card = [
            ["Total Invoices", total_invoices],
            ["Average Similarity", round(avg_score, 2)]
        ]
        right_card = [
            ["Max Similarity", round(max_score, 2)],
            ["Min Similarity", round(min_score, 2)]
        ]
        for i, item in enumerate(left_card):
            worksheet.write(stats_card_row + i, 0, item[0], header_format)
            worksheet.write(stats_card_row + i, 1, item[1], data_cell_format)
        for i, item in enumerate(right_card):
            worksheet.write(stats_card_row + i, 3, item[0], header_format)
            worksheet.write(stats_card_row + i, 4, item[1], data_cell_format)
        # Charts are anchored in Excel column 3; their source tables go in columns 0-1.
        chart_start_row = stats_card_row + 5
        chart_col = 3
        # Recommendation counts table and pie chart.
        recommendation_categories = ["Unmatched", "Exact Match", "Partial Match"]
        recommendation_counts = [int(df[df['recommendation'] == cat].shape[0]) for cat in recommendation_categories]
        rec_table_row = chart_start_row
        worksheet.write(rec_table_row, 0, "Recommendation", header_format)
        worksheet.write(rec_table_row, 1, "Count", header_format)
        for i, (cat, cnt) in enumerate(zip(recommendation_categories, recommendation_counts)):
            worksheet.write(rec_table_row + 1 + i, 0, cat, data_cell_format)
            worksheet.write(rec_table_row + 1 + i, 1, cnt, data_cell_format)
        rec_pie_chart = workbook.add_chart({'type': 'pie'})
        rec_pie_chart.add_series({
            'name': 'Recommendation Distribution',
            'categories': ['Report', rec_table_row + 1, 0, rec_table_row + len(recommendation_categories), 0],
            'values': ['Report', rec_table_row + 1, 1, rec_table_row + len(recommendation_categories), 1],
        })
        rec_pie_chart.set_title({'name': 'Recommendation Distribution'})
        worksheet.insert_chart(chart_start_row, chart_col, rec_pie_chart, {'x_scale': 1.0, 'y_scale': 1.0})
        chart_start_row += 17
        # Histogram of scores in ten-wide bins starting at 1 (edges 1, 11, ..., 101).
        if 'similarity_score' in df.columns:
            scores = pd.to_numeric(df['similarity_score'], errors='coerce').dropna()
            bins = list(range(1, 102, 10))
            counts, bin_edges = np.histogram(scores, bins=bins)
            bin_labels = [f"{bins[i]}-{bins[i + 1] - 1}" for i in range(len(bins) - 1)]
            # The histogram's source table starts three rows above the chart anchor.
            hist_table_row = chart_start_row - 3
            worksheet.write(hist_table_row, 0, "Score Range", header_format)
            worksheet.write(hist_table_row, 1, "Count", header_format)
            for i, (label, cnt) in enumerate(zip(bin_labels, counts)):
                worksheet.write(hist_table_row + 1 + i, 0, label, data_cell_format)
                worksheet.write(hist_table_row + 1 + i, 1, cnt, data_cell_format)
            hist_chart = workbook.add_chart({'type': 'column'})
            hist_chart.add_series({
                'name': 'Similarity Score Distribution',
                'categories': ['Report', hist_table_row + 1, 0, hist_table_row + len(bin_labels), 0],
                'values': ['Report', hist_table_row + 1, 1, hist_table_row + len(bin_labels), 1],
            })
            hist_chart.set_title({'name': 'Histogram of Similarity Scores'})
            hist_chart.set_x_axis({'name': 'Score Range'})
            hist_chart.set_y_axis({'name': 'Count'})
            worksheet.insert_chart(chart_start_row, chart_col, hist_chart, {'x_scale': 1.2, 'y_scale': 1.2})
            chart_start_row += 20
        # Word cloud image generated from all reason texts joined together.
        if 'reason' in df.columns:
            worksheet.write(chart_start_row - 2, chart_col, "Wordcloud for Reasons", header_format)
            text = " ".join(df['reason'].astype(str).tolist())
            wc = WordCloud(width=400, height=200, background_color='white').generate(text)
            imgdata = io.BytesIO()
            wc.to_image().save(imgdata, format='PNG')
            imgdata.seek(0)
            worksheet.insert_image(chart_start_row, chart_col, 'wordcloud.png',
                                   {'image_data': imgdata, 'x_scale': 1.0, 'y_scale': 1.0})
            chart_start_row += 25
        else:
            chart_start_row += 10
        # Line chart over the data table's score column (column 0 if absent).
        try:
            sim_index = excel_col_mapping.get('similarity_score', 0)
        except Exception:
            sim_index = 0
        line_chart = workbook.add_chart({'type': 'line'})
        line_chart.add_series({
            'name': 'Similarity Score Trend',
            'categories': ['Report', start_data_row + 1, 0, last_data_row - 1, 0],
            'values': ['Report', start_data_row + 1, sim_index, last_data_row - 1, sim_index],
        })
        line_chart.set_title({'name': 'Similarity Score Over Entries'})
        worksheet.insert_chart(chart_start_row, chart_col, line_chart, {'x_scale': 1.5, 'y_scale': 1.5})
        chart_start_row += 30
        # Reason frequency table and horizontal bar chart.
        if 'reason' in df.columns:
            reasons = df['reason'].value_counts().reset_index()
            reasons.columns = ['Reason', 'Count']
            hbar_table_row = chart_start_row
            worksheet.write(hbar_table_row, 0, "Reason", header_format)
            worksheet.write(hbar_table_row, 1, "Count", header_format)
            for idx, row in reasons.iterrows():
                worksheet.write(hbar_table_row + 1 + idx, 0, row['Reason'], data_cell_format)
                worksheet.write(hbar_table_row + 1 + idx, 1, row['Count'], data_cell_format)
            hbar_chart = workbook.add_chart({'type': 'bar'})
            hbar_chart.add_series({
                'name': 'Reasons Distribution',
                'categories': ['Report', hbar_table_row + 1, 0, hbar_table_row + len(reasons), 0],
                'values': ['Report', hbar_table_row + 1, 1, hbar_table_row + len(reasons), 1],
            })
            hbar_chart.set_title({'name': 'Reasons Distribution'})
            worksheet.insert_chart(chart_start_row, chart_col, hbar_chart, {'x_scale': 1.5, 'y_scale': 1.5})
            chart_start_row += 30
    output.seek(0)
    return output
781
-
782
-
783
@app.route("/download_csv")
def download_csv():
    """Send the matched invoice pairs as ``final_merged_invoices.csv``.

    Each result pair is left-joined against the originally uploaded datasets,
    so a number that does not occur in the corresponding upload comes out
    empty. Redirects to the index page with a flash message when there is
    nothing to export.
    """
    global latest_results_df, original_df1, original_df2
    # The originals are only set by an upload, whereas the results can also
    # be replaced through /save_updates; require all three.
    if latest_results_df is None or original_df1 is None or original_df2 is None:
        flash("No data available.")
        return redirect(url_for('index'))
    # generate_recommendation emits "Unmatched"; the old "UnMatched" spelling
    # never matched it, so unmatched rows were silently dropped. It is kept
    # alongside in case stored results use it.
    allowed_recs = {"Partial Match", "Unmatched", "UnMatched", "Exact Match"}
    filtered_matches = latest_results_df[latest_results_df['recommendation'].isin(allowed_recs)]
    keys_df = filtered_matches[['invoice_number1', 'invoice_number2']].copy()
    # De-duplicate the right-hand keys: a repeated invoice number would
    # multiply rows in one merge but not the other and misalign the two
    # columns combined below.
    df1_merged = pd.merge(
        keys_df,
        original_df1.drop_duplicates(subset='InvoiceNumber'),
        left_on='invoice_number1',
        right_on='InvoiceNumber',
        how='left'
    )
    df1_merged.rename(columns={'InvoiceNumber': 'InvoiceNumber_1'}, inplace=True)
    df2_merged = pd.merge(
        keys_df,
        original_df2.drop_duplicates(subset='InvoiceNumber'),
        left_on='invoice_number2',
        right_on='InvoiceNumber',
        how='left'
    )
    df2_merged.rename(columns={'InvoiceNumber': 'InvoiceNumber_2'}, inplace=True)
    final_df = pd.DataFrame({
        'InvoiceNumber_1': df1_merged['InvoiceNumber_1'],
        'InvoiceNumber_2': df2_merged['InvoiceNumber_2']
    })
    for col in final_df.select_dtypes(include=['object']).columns:
        final_df[col] = final_df[col].str.strip()
    final_df.reset_index(drop=True, inplace=True)
    return send_file(
        generate_csv_bytes(final_df),
        mimetype='text/csv',
        download_name='final_merged_invoices.csv',
        as_attachment=True
    )
821
-
822
-
823
@app.route("/download_excel")
def download_excel():
    """Send the latest match results as a formatted Excel report.

    The UI-only ``editable`` and ``comments`` columns are left out of the
    export.  Redirects to the index page when no results exist yet.
    """
    global latest_results_df
    if latest_results_df is None:
        flash("No data available.")
        return redirect(url_for('index'))
    ui_only_columns = ["editable", "comments"]
    # drop() returns a new frame, so the stored results are left untouched.
    export_df = latest_results_df.drop(columns=ui_only_columns, errors="ignore")
    workbook_stream = generate_excel_bytes(export_df)
    return send_file(
        workbook_stream,
        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        download_name='matched_invoices.xlsx',
        as_attachment=True
    )
839
-
840
-
841
- # New endpoint: Download summary statistics as Excel
842
@app.route("/download_stats_excel")
def download_stats_excel():
    """Send the summary statistics of the latest results as an Excel file.

    Redirects to the index page when no results exist yet.
    """
    global latest_results_df
    if latest_results_df is None:
        flash("No data available for stats.")
        return redirect(url_for('index'))
    summary = get_stats(latest_results_df)
    workbook_stream = generate_stats_excel_bytes(summary)
    return send_file(
        workbook_stream,
        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        download_name='invoice_matching_stats.xlsx',
        as_attachment=True
    )
855
-
856
-
857
- # New endpoint: Download summary statistics as JSON
858
@app.route("/download_stats_json")
def download_stats_json():
    """Send the summary statistics of the latest results as a JSON file.

    Redirects to the index page when no results exist yet.
    """
    global latest_results_df
    if latest_results_df is None:
        flash("No data available for stats.")
        return redirect(url_for('index'))
    summary = get_stats(latest_results_df)
    json_stream = generate_stats_json_bytes(summary)
    return send_file(
        json_stream,
        mimetype='application/json',
        download_name='invoice_matching_stats.json',
        as_attachment=True
    )
871
-
872
-
873
# Script entry point: start Flask's built-in development server.
# NOTE(review): debug=True enables the interactive Werkzeug debugger, which
# allows arbitrary code execution from the browser -- confirm this file is
# never launched this way on a publicly reachable host.
if __name__ == "__main__":
    app.run(debug=True)
 
1
+ import re
2
+ import random
3
+ import pandas as pd
4
+ import json
5
+ from textblob import Word
6
+ from rapidfuzz import fuzz as rapidfuzz_fuzz
7
+ from fuzzywuzzy import fuzz as fuzzywuzzy_fuzz
8
+ from Levenshtein import ratio as levenshtein_ratio, jaro_winkler as levenshtein_jaro_winkler
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+ from flask import Flask, request, render_template, send_file, redirect, url_for, flash, jsonify
12
+ import io
13
+ import os
14
+ import numpy as np
15
+ from wordcloud import WordCloud
16
+ import textdistance
17
+ import chardet
18
+ # --- New import for SBERT & parallel processing ---
19
+ from sentence_transformers import SentenceTransformer
20
+ import concurrent.futures
21
+ from tqdm import tqdm
22
+
23
app = Flask(__name__)

# Module-level state shared between requests: the most recent match results
# and the two datasets exactly as they were uploaded.
latest_results_df = None
original_df1 = None
original_df2 = None

# Session/flash signing key.  Prefer a key injected through the environment so
# the real secret does not have to live in source control; the literal below is
# kept only as a backward-compatible fallback.
app.secret_key = os.environ.get(
    "FLASK_SECRET_KEY",
    '1cdddf3025ba915f2f32baf15d00a79fe63a8dce49935c2f',
)

# File to store persistent feedback mapping
FEEDBACK_FILE = "feedback_mapping.json"
34
+
35
+
36
+ #########################################
37
+ # Persistent Feedback Storage Functions
38
+ #########################################
39
def load_feedback_mapping(path=None):
    """Load the persisted invoice feedback mapping.

    Args:
        path: Optional location of the JSON file; defaults to FEEDBACK_FILE.

    Returns:
        The stored ``{invoice_number1: invoice_number2}`` dict, or an empty
        dict when the file is missing, is not valid JSON, or does not hold a
        JSON object.
    """
    target = FEEDBACK_FILE if path is None else path
    try:
        with open(target, "r", encoding="utf-8") as handle:
            data = json.load(handle)
    except FileNotFoundError:
        # No feedback has been saved yet.
        return {}
    except ValueError:
        # Corrupt JSON or undecodable bytes: fall back to "no feedback",
        # matching the previous best-effort behaviour.
        return {}
    # Callers index and assign into the result, so anything but a dict is unusable.
    return data if isinstance(data, dict) else {}
49
+
50
+
51
def save_feedback_mapping(mapping, path=None):
    """Persist the feedback mapping dictionary as indented JSON.

    The data is written to a sibling temporary file first and then moved over
    the destination, so an interrupted write cannot leave a truncated file
    (which the loader would silently treat as "no feedback").

    Args:
        mapping: Dict of ``{invoice_number1: invoice_number2}`` to store.
        path: Optional destination; defaults to FEEDBACK_FILE.
    """
    target = FEEDBACK_FILE if path is None else path
    staging = f"{target}.tmp"
    with open(staging, "w", encoding="utf-8") as handle:
        json.dump(mapping, handle, indent=4)
    os.replace(staging, target)
55
+
56
+
57
def update_feedback_mapping(invoice1, invoice2):
    """Record that *invoice1* should map to *invoice2* and save the mapping to disk."""
    current = load_feedback_mapping()
    current.update({invoice1: invoice2})
    save_feedback_mapping(current)
62
+
63
+
64
+ #########################################
65
+ # SBERT Initialization and Helper Function
66
+ #########################################
67
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def generate_embeddings(df, column_name):
    """Encode every value of ``df[column_name]`` with the module-level SBERT model.

    The model is asked for normalized embeddings; the encoder's result is
    returned unchanged.
    """
    texts = df[column_name].tolist()
    return model.encode(texts, normalize_embeddings=True)
74
+
75
+
76
+ #########################################
77
+ # Invoice Matching Functions (Part 1)
78
+ #########################################
79
def remove_year_patterns(s):
    """Strip year-like tokens from an invoice string.

    Removed, in this order: year ranges such as ``2021-22`` (optionally in
    parentheses), 2-4 digit numbers that follow or precede a comma/semicolon,
    and stand-alone four-digit years starting with 19 or 20.

    Returns:
        The cleaned, whitespace-trimmed string; "" for missing values.
    """
    if pd.isna(s):
        return ""
    year_patterns = (
        r'\(?\b(?:19|20)?\d{2,4}\s*[-/]\s*(?:19|20)?\d{2,4}\b\)?',
        r'[,;]\s*\b(?:19|20)?\d{2,4}\b',
        r'\b(?:19|20)?\d{2,4}\b[,;]',
        r'\b(19|20)\d{2}\b',
    )
    text = str(s)
    for pattern in year_patterns:
        text = re.sub(pattern, '', text)
    return text.strip()
88
+
89
+
90
def remove_leading_and_adjacent_zeros(s):
    """Drop zero padding from an invoice string.

    First removes runs of zeros that start at a word boundary and are followed
    by another digit, then removes any single zero directly followed by a letter.
    """
    without_padding = re.sub(r'\b0+(?=\d)', '', s)
    return re.sub(r'0(?=[A-Za-z])', '', without_padding)
94
+
95
+
96
def remove_prefix_dash(s):
    """Remove a leading alphanumeric run together with the dash that follows it."""
    leading = re.match(r'^[A-Za-z0-9]+[-]', s)
    if leading is None:
        return s
    return s[leading.end():]
98
+
99
+
100
def normalize_for_comparison(s):
    """Return a canonical lowercase form of an invoice number.

    Whitespace, dashes, underscores, commas and slashes are removed, and a
    letter "o" adjacent to a digit is replaced by the digit "0".  Missing
    values become "".
    """
    if pd.isna(s):
        return ""
    lowered = str(s).lower().strip()
    compact = re.sub(r'[\s\-\_,/]+', '', lowered)
    return re.sub(r'(?<=\d)o|o(?=\d)', '0', compact)
107
+
108
+
109
def extract_invoice_parts(invoice):
    """Split an invoice into ``(prefix, numeric_core, suffix)``.

    Non-alphanumeric characters are ignored.  The invoice must consist of
    optional letters, one run of digits, then optional letters; any other
    shape yields ``(None, None, None)``.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9]', '', invoice)
    parsed = re.match(r'^([a-zA-Z]*)(\d+)([a-zA-Z]*)$', alnum_only)
    if parsed is None:
        return None, None, None
    lead, digits, trail = parsed.groups()
    return lead or "", digits, trail or ""
118
+
119
+
120
def robust_preprocess_invoice(invoice):
    """Reduce a raw invoice string to a compact comparable key.

    Steps: strip year patterns, lowercase, drop a "bill no"/"bill #" label,
    keep the dash/slash-separated segment containing the most digits, strip
    known invoice labels from its start and end, remove separators and zero
    padding, and finally rebuild ``prefix + core + suffix`` with the numeric
    core normalised through ``int``.

    Returns:
        The normalised key, or "" for missing values.  When the cleaned
        segment is not of the letters-digits-letters shape it is returned as is.
    """
    if pd.isna(invoice):
        return ""
    invoice = remove_year_patterns(str(invoice)).lower()
    invoice = re.sub(r'bill\s*(?:no\.?|#)\s*:?', '', invoice, flags=re.IGNORECASE)

    # The previous version searched for a "bill no <value>" token at this point,
    # but the label has just been removed, so that branch could never run.
    # The segment with the most digits is therefore always the one used.
    segments = [seg.strip() for seg in re.split(r'[-/]', invoice) if seg.strip()]
    best_seg = max(segments, key=lambda seg: len(re.findall(r'\d', seg))) if segments else invoice
    best_seg = best_seg.replace("_", "")

    known_invoice_variants = [
        "inv", "invoice", "invoce", "in", "inve", "salesrefno",
        "ompl", "insc", "indbo", "kolbo", "thn", "invoiceno", "sales"
    ]
    # Try longer labels first: stripping "inv"/"in" before "invoice", "inve",
    # "insc", "indbo" or "invoiceno" used to leave fragments such as "oice123".
    labels = "|".join(sorted(known_invoice_variants, key=len, reverse=True))
    best_seg = re.sub(r'^(?:' + labels + r')+', '', best_seg, flags=re.IGNORECASE)
    best_seg = re.sub(r'(?:' + labels + r')+$', '', best_seg, flags=re.IGNORECASE)

    best_seg = re.sub(r'[\s\-\_,/]+', '', best_seg)
    best_seg = remove_leading_and_adjacent_zeros(best_seg)
    prefix, core, suffix = extract_invoice_parts(best_seg)
    if prefix is None:
        return best_seg
    if core:
        try:
            core = str(int(core))
        except ValueError:
            core = core.lstrip("0") or "0"
    return prefix + core + suffix
153
+
154
+
155
def extract_numeric_core(invoice):
    """Return the longest run of digits in *invoice* (the first one on ties), or ""."""
    longest = ""
    for run in re.findall(r'\d+', invoice):
        if len(run) > len(longest):
            longest = run
    return longest
158
+
159
+
160
def determine_invoice_type(invoice):
    """Classify an invoice by which affixes surround its numeric core.

    Returns one of "core_only", "prefix_only", "suffix_only", "both", or
    "other" when the invoice cannot be split into prefix/core/suffix.
    """
    prefix, _core, suffix = extract_invoice_parts(invoice)
    if prefix is None:
        return "other"
    kinds = {
        (False, False): "core_only",
        (True, False): "prefix_only",
        (False, True): "suffix_only",
        (True, True): "both",
    }
    return kinds[(prefix != "", suffix != "")]
173
+
174
+
175
def check_boost_condition(s1, s2):
    """Tell whether two invoices share a numeric core but differ only in affixes.

    Both inputs are preprocessed first.  The result is True when the cores are
    equal and either exactly one side carries a prefix (or a suffix), or the
    prefix of one side closely matches (ratio > 90) the suffix of the other.
    """
    norm_a = robust_preprocess_invoice(s1)
    norm_b = robust_preprocess_invoice(s2)
    pre_a, core_a, suf_a = extract_invoice_parts(norm_a)
    pre_b, core_b, suf_b = extract_invoice_parts(norm_b)
    if pre_a is None or pre_b is None or core_a != core_b:
        return False

    one_sided = {"prefix_only", "suffix_only"}
    kind_a = determine_invoice_type(norm_a)
    kind_b = determine_invoice_type(norm_b)
    if kind_a == "core_only" and kind_b in one_sided:
        return True
    if kind_b == "core_only" and kind_a in one_sided:
        return True
    if bool(pre_a) != bool(pre_b):
        return True
    if bool(suf_a) != bool(suf_b):
        return True
    for affix_x, affix_y in ((pre_a, suf_b), (pre_b, suf_a)):
        if affix_x and affix_y and rapidfuzz_fuzz.ratio(affix_x, affix_y) > 90:
            return True
    return False
196
+
197
+
198
def levenshtein_sim(s1, s2):
    """Return the rapidfuzz ``fuzz.ratio`` of the two strings."""
    similarity = rapidfuzz_fuzz.ratio(s1, s2)
    return similarity
200
+
201
+
202
def jaro_winkler_sim(s1, s2):
    """Return textdistance's normalized Jaro-Winkler similarity multiplied by 100."""
    normalized = textdistance.jaro_winkler.normalized_similarity(s1, s2)
    return normalized * 100
204
+
205
+
206
def rapidfuzz_sim(s1, s2):
    """Return the rapidfuzz ``fuzz.ratio`` of the two strings."""
    similarity = rapidfuzz_fuzz.ratio(s1, s2)
    return similarity
208
+
209
+
210
def fuzzbuzz_sim(s1, s2):
    """Return the rapidfuzz ``fuzz.token_set_ratio`` of the two strings."""
    similarity = rapidfuzz_fuzz.token_set_ratio(s1, s2)
    return similarity
212
+
213
+
214
def hamming_sim(s1, s2):
    """Percentage of aligned positions holding the same character.

    Positions are compared pairwise from the start; the count is divided by
    the length of the longer string.  Two empty inputs score 100.
    """
    if not (s1 or s2):
        return 100
    longest = max(len(s1), len(s2))
    same = 0
    for left, right in zip(s1, s2):
        if left == right:
            same += 1
    return (same / longest) * 100
220
+
221
+
222
def jaccard_sim(s1, s2):
    """Jaccard similarity (0-100) of the character sets of the two strings.

    Two empty inputs score 100.
    """
    chars_a = set(s1)
    chars_b = set(s2)
    combined = chars_a | chars_b
    if not combined:
        return 100
    shared = chars_a & chars_b
    return (len(shared) / len(combined)) * 100
227
+
228
+
229
def cosine_sim(s1, s2):
    """Cosine similarity (0-100) of character 2-4-gram TF-IDF vectors.

    Returns 0.0 when either string is blank, when no n-gram features are
    produced, or when vectorisation raises ``ValueError``.
    """
    if not (s1.strip() and s2.strip()):
        return 0.0
    char_ngrams = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
    try:
        matrix = char_ngrams.fit_transform([s1, s2])
        if matrix.shape[1] == 0:
            return 0.0
        return cosine_similarity(matrix[0:1], matrix[1:2])[0][0] * 100
    except ValueError:
        return 0.0
241
+
242
+
243
def custom_trailing_match(s1, s2):
    """Detect the "p"/"jp" invoice pattern whose condensed digits end with *s2*.

    Only applies when *s1* starts with "p" or "jp" (case-insensitive).  All
    digits of *s1* are collected; when there are more than two, the zeros
    between the first and last digit are dropped.  The match succeeds when the
    resulting digit string ends with *s2*.

    Args:
        s1: Candidate invoice carrying the "p"/"jp" prefix.
        s2: Expected trailing value.

    Returns:
        True on a match; False otherwise, including when *s2* is empty.
    """
    s1 = str(s1)
    s2 = str(s2)
    if not s2:
        # Every string ends with "", so a blank counterpart used to match any
        # "p..." invoice; an empty value is never a real match.
        return False
    if not s1.lower().startswith(("p", "jp")):
        return False
    digits = re.sub(r'\D', '', s1)
    if len(digits) <= 2:
        modified = digits
    else:
        modified = digits[0] + digits[1:-1].replace("0", "") + digits[-1]
    return modified.endswith(s2)
256
+
257
+
258
def combined_similarity(s1, s2):
    """Score how likely two invoice numbers refer to the same invoice (0-100).

    Case-insensitive equality of the trimmed inputs scores exactly 100 and the
    custom trailing-match pattern scores 95.  Otherwise seven string metrics
    are averaged over the preprocessed invoices, lifted to at least 90 when the
    numeric cores agree and only the affixes differ, and halved when the
    longest digit runs differ.  An average that still reaches 100 is replaced
    by a random value in [90, 99], so 100 is only returned for the equality case.
    """
    if s1.strip().lower() == s2.strip().lower():
        return 100

    left = robust_preprocess_invoice(s1)
    right = robust_preprocess_invoice(s2)

    if custom_trailing_match(left, right):
        return 95

    metrics = (
        levenshtein_sim,
        jaro_winkler_sim,
        rapidfuzz_sim,
        fuzzbuzz_sim,
        hamming_sim,
        jaccard_sim,
        cosine_sim,
    )
    scores = [metric(left, right) for metric in metrics]
    avg_score = sum(scores) / len(scores)

    pre_a, core_a, suf_a = extract_invoice_parts(left)
    pre_b, core_b, suf_b = extract_invoice_parts(right)
    if core_a and core_b and core_a == core_b:
        affixes_differ = (
            bool(pre_a) != bool(pre_b)
            or bool(suf_a) != bool(suf_b)
            or (pre_a and suf_b)
            or (pre_b and suf_a)
        )
        if affixes_differ:
            avg_score = max(avg_score, 90)

    digits_a = extract_numeric_core(left)
    digits_b = extract_numeric_core(right)
    try:
        numbers_differ = int(digits_a) != int(digits_b)
    except ValueError:
        # At least one side has no digits at all; compare the raw strings.
        numbers_differ = digits_a != digits_b
    if numbers_differ:
        avg_score *= 0.5

    if avg_score >= 100:
        avg_score = random.uniform(90, 99)

    return avg_score
302
+
303
+
304
def generate_review_status(score):
    """Return "No Review Needed" for scores above 50, otherwise "Needs Review"."""
    if score > 50:
        return "No Review Needed"
    return "Needs Review"
306
+
307
+
308
def generate_recommendation(score):
    """Map a similarity score to a recommendation label.

    Exactly 100 is "Exact Match", any other score of at least 50 is
    "Partial Match", and everything below 50 is "Unmatched".
    """
    if score == 100:
        return "Exact Match"
    return "Partial Match" if score >= 50 else "Unmatched"
315
+
316
+
317
def generate_reason(inv1, inv2, score):
    """Return a human-readable explanation for a match result.

    The checks below run in order and the first one that applies decides the
    message, so their sequence is significant.

    Args:
        inv1: Invoice number from the first dataset (converted with ``str``).
        inv2: Invoice number from the second dataset (converted with ``str``).
        score: Similarity score of the pair; scores of 50 and above use the
            "match" wording, lower scores the "mismatch" wording.

    Returns:
        One of the fixed explanation strings.
    """
    inv1 = str(inv1)
    inv2 = str(inv2)
    if custom_trailing_match(inv1, inv2):
        return "Custom trailing-match pattern detected."
    if inv1.lower() == inv2.lower():
        return "Exact match of invoice numbers."
    # Structural comparison on the normalised prefix / numeric core / suffix.
    p1, core1, sfx1 = extract_invoice_parts(normalize_for_comparison(inv1))
    p2, core2, sfx2 = extract_invoice_parts(normalize_for_comparison(inv2))
    if core1 is not None and core2 is not None:
        if core1 != core2:
            return "Numeric core does not match."
        # NOTE(review): the cores are equal here, so their lengths cannot
        # differ and this branch looks unreachable -- confirm.
        if len(core1) != len(core2) and core1.lstrip("0") == core2.lstrip("0"):
            return "Numeric padding mismatch (leading zeros removed)."
        if p1 and p2 and p1 != p2:
            return "Different prefixes found, affecting similarity."
        if sfx1 and sfx2 and sfx1 != sfx2:
            return "Different suffixes detected, leading to mismatch."
        if p1 and not p2:
            return "Partial matching: one invoice has a prefix while the other does not."
        if sfx1 and not sfx2:
            return "Partial matching: one invoice has a suffix while the other does not."
    # Explanations for pairs that scored as a (partial) match.
    if score >= 50:
        # NOTE(review): case-insensitive equality already returned above, so
        # this check cannot succeed here.
        if inv1.lower() == inv2.lower():
            return "Identical invoice numbers except for case differences."
        if p1 and sfx2 and rapidfuzz_fuzz.ratio(p1, sfx2) > 90:
            return "Prefix in one invoice matches suffix in the other."
        if any(sep in inv1 or sep in inv2 for sep in [" ", "-", "_"]):
            return "Strong match; only minor formatting variations."
        if inv1 in inv2 or inv2 in inv1:
            return "One invoice is fully contained in the other."
        return "Invoices match with minimal differences."
    # Explanations for low-scoring pairs.
    if any(sep in inv1 or sep in inv2 for sep in [" ", "-", "_"]):
        return "Formatting issue due to spaces or separators."
    # NOTE(review): same as above -- equality was handled at the top.
    if inv1.lower() == inv2.lower():
        return "Case sensitivity difference."
    if rapidfuzz_fuzz.ratio(inv1, inv2) > 70:
        return "Minor spelling variation detected."
    if set(inv1) == set(inv2):
        return "Character positions swapped."
    if abs(len(inv1) - len(inv2)) <= 2:
        return "Possible OCR error or scanning issue."
    if any(ch.isdigit() for ch in inv1) and any(ch.isdigit() for ch in inv2) and core1 == core2:
        return "Identical numbers but extra text in one invoice."
    if any(sep in inv1 for sep in ["-", "/"]) or any(sep in inv2 for sep in ["-", "/"]):
        return "Different separator conventions used."
    if any(ch in inv1 for ch in ["#", "$", "&"]) or any(ch in inv2 for ch in ["#", "$", "&"]):
        return "Special characters found in one invoice but not the other."
    if len(set(inv1)) < len(inv1) or len(set(inv2)) < len(inv2):
        return "Duplicate characters found in one invoice."
    if len(inv1) > 10 or len(inv2) > 10:
        return "One invoice is significantly longer than the other."
    return "Significant structural difference; invoices do not match."
370
+
371
+
372
+ # -----------------------------
373
+ # Updated process_invoices Function with Feedback Override
374
+ # -----------------------------
375
def process_invoices(df1, df2):
    """Find the best df2 match for every invoice in df1.

    For each invoice in df1, check if a user-corrected (feedback) invoice
    exists.  If so, that corrected invoice is scored with the normal scoring
    functions plus a fixed boost of 60.  Invoices without feedback are compared
    against every df2 invoice and the highest-scoring candidate wins (the first
    one on ties).

    Args:
        df1: DataFrame with a string "InvoiceNumber" column; the column is
            stripped in place.
        df2: DataFrame with a string "InvoiceNumber" column; the column is
            stripped in place.

    Returns:
        DataFrame with one row per df1 invoice and the columns
        invoice_number1, invoice_number2, similarity_score,
        manual_review_status, recommendation, reason, comments, editable.
    """
    df1["InvoiceNumber"] = df1["InvoiceNumber"].str.strip()
    df2["InvoiceNumber"] = df2["InvoiceNumber"].str.strip()

    # Load the feedback mapping from the persistent file.
    feedback_mapping = load_feedback_mapping()
    candidates = df2["InvoiceNumber"].tolist()

    results = []
    for inv1 in df1["InvoiceNumber"]:
        if inv1 in feedback_mapping:
            # Use the user-selected corrected invoice.
            corrected_invoice = feedback_mapping[inv1]
            score = combined_similarity(inv1, corrected_invoice) + 60
            results.append({
                "invoice_number1": inv1,
                "invoice_number2": corrected_invoice,
                # The boost can push the value past 100; cap what is stored so
                # the statistics and histogram stay on the 0-100 scale.
                "similarity_score": round(min(score, 100), 2),
                "manual_review_status": generate_review_status(score),
                "recommendation": generate_recommendation(score),
                "reason": generate_reason(inv1, corrected_invoice, score),
                "comments": "",
                "editable": False
            })
            continue

        best_score = -1
        best_candidate = None
        for candidate in candidates:
            score = combined_similarity(inv1, candidate)
            if score > best_score:
                best_score = score
                best_candidate = candidate

        if best_candidate is None:
            # Nothing to compare against: emit an explicit unmatched row
            # instead of a None entry that would corrupt the result frame.
            results.append({
                "invoice_number1": inv1,
                "invoice_number2": "",
                "similarity_score": 0,
                "manual_review_status": generate_review_status(0),
                "recommendation": generate_recommendation(0),
                "reason": "No candidate invoices available in the second dataset.",
                "comments": "",
                "editable": True
            })
            continue

        # Labels are derived once, for the winning candidate only.
        results.append({
            "invoice_number1": inv1,
            "invoice_number2": best_candidate,
            # The displayed score is 2 points below the raw score; never negative.
            "similarity_score": round(max(best_score - 2, 0), 2),
            "manual_review_status": generate_review_status(best_score),
            "recommendation": generate_recommendation(best_score),
            "reason": generate_reason(inv1, best_candidate, best_score),
            "comments": "",
            "editable": best_score <= 60
        })

    df_final = pd.DataFrame(results)
    return df_final
426
+
427
+
428
+ #########################################
429
+ # SBERT Exact Match Filtering
430
+ #########################################
431
def sbert_exact_match_filtering(df1, df2):
    """Separate invoice pairs whose SBERT embeddings have cosine similarity ~1.

    Returns:
        A tuple ``(df_exact, df1_filtered, df2_filtered)``: the matched pairs
        annotated as exact matches, and copies of both inputs without any
        invoice number that took part in such a pair (index reset).
    """
    left_vectors = generate_embeddings(df1, 'InvoiceNumber')
    right_vectors = generate_embeddings(df2, 'InvoiceNumber')
    similarity_matrix = cosine_similarity(left_vectors, right_vectors)

    tolerance = 1e-8
    left_pos, right_pos = np.where(np.isclose(similarity_matrix, 1.0, atol=tolerance))
    df_matches = pd.DataFrame({'df1_index': left_pos, 'df2_index': right_pos})

    # Translate positional matches back to the invoice numbers they refer to.
    df_exact = pd.DataFrame({
        'InvoiceNumber_1': df_matches['df1_index'].apply(lambda pos: df1.iloc[pos]['InvoiceNumber']),
        'InvoiceNumber_2': df_matches['df2_index'].apply(lambda pos: df2.iloc[pos]['InvoiceNumber'])
    })

    used_left = df_exact['InvoiceNumber_1'].unique()
    used_right = df_exact['InvoiceNumber_2'].unique()
    df1_filtered = df1[~df1['InvoiceNumber'].isin(used_left)].reset_index(drop=True)
    df2_filtered = df2[~df2['InvoiceNumber'].isin(used_right)].reset_index(drop=True)

    df_exact = df_exact.assign(
        similarity_score=100,
        manual_review_status='No Review Needed',
        recommendation='Exact Match',
        reason='Exact match via SBERT embeddings.',
        comments='',
    )
    return df_exact, df1_filtered, df2_filtered
455
+
456
+
457
+ #########################################
458
+ # Functions to Generate Summary Statistics
459
+ #########################################
460
def get_stats(df):
    """Build the summary-statistics dictionary for a results DataFrame.

    Expects the columns ``recommendation``, ``manual_review_status`` and
    ``similarity_score``.  Counts are plain ints, aggregates plain floats and
    the individual scores a plain list.
    """
    recommendation = df['recommendation']
    review_status = df['manual_review_status']
    scores = df['similarity_score']
    return {
        'total_rows': len(df),
        'total_exact_match': int((recommendation == 'Exact Match').sum()),
        'total_partial_match': int((recommendation == 'Partial Match').sum()),
        'total_unmatched': int((recommendation == 'Unmatched').sum()),
        'total_no_review_needed': int((review_status == 'No Review Needed').sum()),
        'total_needs_review': int((review_status == 'Needs Review').sum()),
        'similarity_scores': scores.tolist(),
        'average_similarity': float(scores.mean()),
        'min_similarity': float(scores.min()),
        'max_similarity': float(scores.max()),
    }
474
+
475
+
476
def generate_stats_excel_bytes(stats):
    """Generate an Excel bytes stream from the stats dictionary.

    Each entry becomes one "Metric"/"Value" row on a "Summary Stats" sheet.
    Container values (such as the list of individual similarity scores) are
    written as their JSON text, because a spreadsheet cell can only hold a
    scalar and the xlsxwriter engine rejects list objects.

    Returns:
        ``io.BytesIO`` positioned at the start of the workbook bytes.
    """
    rows = [
        (metric, json.dumps(value) if isinstance(value, (list, tuple, dict)) else value)
        for metric, value in stats.items()
    ]
    df_stats = pd.DataFrame(rows, columns=["Metric", "Value"])
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df_stats.to_excel(writer, index=False, sheet_name='Summary Stats')
    output.seek(0)
    return output
484
+
485
+
486
def generate_stats_json_bytes(stats):
    """Serialise the stats dictionary to 4-space-indented JSON in a UTF-8 byte stream."""
    payload = json.dumps(stats, indent=4)
    return io.BytesIO(payload.encode('utf-8'))
490
+
491
+
492
+ #########################################
493
+ # Flask Routes
494
+ #########################################
495
def _read_uploaded_table(upload):
    """Parse an uploaded CSV/XLS/XLSX file into a DataFrame; None if the extension is unsupported."""
    extension = upload.filename.split(".")[-1].lower()
    if extension == "csv":
        raw = upload.read()
        # chardet reports {"encoding": None} when it cannot guess, so a
        # dict.get default alone would still hand None to decode().
        encoding = chardet.detect(raw).get("encoding") or "utf-8"
        return pd.read_csv(io.StringIO(raw.decode(encoding, errors="replace")))
    if extension in ("xls", "xlsx"):
        upload.seek(0)
        return pd.read_excel(upload)
    return None


@app.route("/", methods=["GET", "POST"])
def index():
    """Render the upload page and, on POST, match the two uploaded invoice files.

    On POST both files are parsed, exact matches are separated with SBERT
    embeddings, the remaining invoices go through the fuzzy matcher (with the
    stored feedback applied), and the shuffled combined result is kept in
    ``latest_results_df`` and rendered.  Any problem with the upload is
    reported via ``flash`` followed by a redirect back to the form.
    """
    global latest_results_df, original_df1, original_df2
    results = None
    unique_values = []  # Unique invoice numbers from dataset2 for the select box
    if request.method == "POST":
        file1 = request.files.get("file1")
        file2 = request.files.get("file2")
        if not file1 or not file2:
            flash("Please upload both files.")
            return redirect(request.url)

        try:
            df1 = _read_uploaded_table(file1)
            if df1 is None:
                flash("File 1 format not supported.")
                return redirect(request.url)
            df2 = _read_uploaded_table(file2)
            if df2 is None:
                flash("File 2 format not supported.")
                return redirect(request.url)
        except Exception as e:
            flash("Error reading files: " + str(e))
            return redirect(request.url)

        # Validate the inputs up front instead of failing with a server error
        # further down the pipeline.
        if "InvoiceNumber" not in df1.columns or "InvoiceNumber" not in df2.columns:
            flash("Both files must contain an 'InvoiceNumber' column.")
            return redirect(request.url)
        if df1.empty or df2.empty:
            flash("Both files must contain at least one invoice.")
            return redirect(request.url)

        df1["InvoiceNumber"] = df1["InvoiceNumber"].astype(str)
        df2["InvoiceNumber"] = df2["InvoiceNumber"].astype(str)

        original_df1 = df1.copy()
        original_df2 = df2.copy()

        # Prepare the unique invoice numbers from dataset2 for the edit select box.
        unique_values = sorted(df2["InvoiceNumber"].unique().tolist())

        # Run SBERT exact match filtering.
        df_exact, df1_filtered, df2_filtered = sbert_exact_match_filtering(df1, df2)

        # Run robust invoice matching on remaining invoices (with feedback override).
        df_final_matches = process_invoices(df1_filtered, df2_filtered)

        # Rename exact match columns for consistency.
        df_exact = df_exact.rename(columns={
            'InvoiceNumber_1': 'invoice_number1',
            'InvoiceNumber_2': 'invoice_number2'
        })

        # Concatenate exact matches with robust matches.
        df_concatenated = pd.concat([df_exact, df_final_matches], ignore_index=True)

        # Shuffle the rows randomly before storing and displaying
        latest_results_df = df_concatenated.sample(frac=1).reset_index(drop=True)
        results = latest_results_df.to_dict(orient="records")

    return render_template("index.html", results=results, unique_values=unique_values)
571
+
572
+
573
@app.route("/save_updates", methods=["POST"])
def save_updates():
    """Replace the stored results with the table posted as JSON by the client.

    Responds with ``{"status": "success"}`` (200), or with an error status and
    message (500) when the payload cannot be turned into a DataFrame.
    """
    global latest_results_df
    try:
        payload = request.get_json()
        latest_results_df = pd.DataFrame(payload).copy()
        return jsonify({"status": "success"}), 200
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500
583
+
584
+
585
@app.route("/save_feedback", methods=["POST"])
def save_feedback():
    """Store a user-selected invoice pairing in the persistent feedback mapping.

    Reads ``invoice_number1`` and ``selected_invoice2`` from the JSON body.
    Nothing is stored when no replacement invoice was selected.  Responds with
    a status and message (200), or an error status (500) on failure.
    """
    try:
        payload = request.get_json()
        source_invoice = payload.get('invoice_number1')
        chosen_invoice = payload.get('selected_invoice2')

        if not chosen_invoice:
            message = "No new invoice selected; no changes made."
        else:
            # A replacement was picked: persist it for future runs.
            update_feedback_mapping(source_invoice, chosen_invoice)
            message = "Feedback saved. Please re-run to train model on updates."

        return jsonify({"status": "success", "message": message}), 200

    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500
603
+
604
+
605
def generate_csv_bytes(df):
    """Render *df* as CSV (without the index) into an in-memory byte stream."""
    csv_text = df.to_csv(index=False)
    return io.BytesIO(csv_text.encode())
610
+
611
+
612
def generate_excel_bytes(df):
    """Build the formatted Excel report for a results DataFrame.

    Everything is written to a single "Report" worksheet: a merged title row,
    the data table (the ``reason`` column spans two Excel columns), a small
    statistics card, and a series of helper tables with charts placed below
    the data (recommendation pie, score histogram, reason word cloud, score
    trend line, reason bar chart).

    Args:
        df: Results to export.  Infinite values and NaN are replaced by "".

    Returns:
        ``io.BytesIO`` positioned at the start of the workbook bytes.
    """
    df = df.replace([np.inf, -np.inf], np.nan).fillna("")
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        workbook = writer.book
        worksheet = workbook.add_worksheet("Report")
        # Map each DataFrame column to its Excel column index; 'reason' takes
        # two Excel columns because its cells are merged for extra width.
        excel_col_mapping = {}
        excel_index = 0
        for col in df.columns:
            if col.lower() == 'reason':
                excel_col_mapping[col] = excel_index
                excel_index += 2
            else:
                excel_col_mapping[col] = excel_index
                excel_index += 1
        total_excel_columns = excel_index
        title_format = workbook.add_format({
            'bold': True,
            'bg_color': '#FFFF00',
            'font_color': 'black',
            'align': 'center',
            'valign': 'vcenter',
            'font_size': 16
        })
        header_format = workbook.add_format({
            'bold': True,
            'bg_color': '#FFFF00',
            'font_color': 'black',
            'border': 1,
            'align': 'center',
            'valign': 'vcenter'
        })
        data_cell_format = workbook.add_format({
            'border': 1,
            'align': 'left',
            'valign': 'vcenter',
            'text_wrap': True
        })
        # Title banner across the full table width (row 0).
        worksheet.merge_range(0, 0, 0, total_excel_columns - 1,
                              "Intelligent Partial Invoice Matching - Excel Report",
                              title_format)
        # Header row at row 2; data rows follow directly below it.
        start_data_row = 2
        for col in df.columns:
            col_index = excel_col_mapping[col]
            if col.lower() == 'reason':
                worksheet.merge_range(start_data_row, col_index, start_data_row, col_index + 1,
                                      col, header_format)
                worksheet.set_column(col_index, col_index + 1, 40)
            else:
                worksheet.write(start_data_row, col_index, col, header_format)
                worksheet.set_column(col_index, col_index, 20)
        for i, row in enumerate(df.itertuples(index=False, name=None)):
            for col_name, cell in zip(df.columns, row):
                col_index = excel_col_mapping[col_name]
                if col_name.lower() == 'reason':
                    worksheet.merge_range(start_data_row + 1 + i, col_index,
                                          start_data_row + 1 + i, col_index + 1,
                                          cell, data_cell_format)
                else:
                    worksheet.write(start_data_row + 1 + i, col_index, cell, data_cell_format)
        # First row after the data table; the statistics card starts 3 rows lower.
        last_data_row = start_data_row + 1 + len(df)
        stats_card_row = last_data_row + 3
        try:
            total_invoices = len(df)
            avg_score = float(df['similarity_score'].astype(float).mean())
            max_score = float(df['similarity_score'].astype(float).max())
            min_score = float(df['similarity_score'].astype(float).min())
        except Exception:
            # Missing or non-numeric scores: show zeros instead of failing the export.
            total_invoices = avg_score = max_score = min_score = 0
        left_card = [
            ["Total Invoices", total_invoices],
            ["Average Similarity", round(avg_score, 2)]
        ]
        right_card = [
            ["Max Similarity", round(max_score, 2)],
            ["Min Similarity", round(min_score, 2)]
        ]
        for i, item in enumerate(left_card):
            worksheet.write(stats_card_row + i, 0, item[0], header_format)
            worksheet.write(stats_card_row + i, 1, item[1], data_cell_format)
        for i, item in enumerate(right_card):
            worksheet.write(stats_card_row + i, 3, item[0], header_format)
            worksheet.write(stats_card_row + i, 4, item[1], data_cell_format)
        # Charts are stacked downwards from here; helper tables go in columns
        # 0-1 and the charts themselves are anchored at chart_col.
        chart_start_row = stats_card_row + 5
        chart_col = 3
        # --- Recommendation distribution (pie chart) ---
        recommendation_categories = ["Unmatched", "Exact Match", "Partial Match"]
        recommendation_counts = [int(df[df['recommendation'] == cat].shape[0]) for cat in recommendation_categories]
        rec_table_row = chart_start_row
        worksheet.write(rec_table_row, 0, "Recommendation", header_format)
        worksheet.write(rec_table_row, 1, "Count", header_format)
        for i, (cat, cnt) in enumerate(zip(recommendation_categories, recommendation_counts)):
            worksheet.write(rec_table_row + 1 + i, 0, cat, data_cell_format)
            worksheet.write(rec_table_row + 1 + i, 1, cnt, data_cell_format)
        rec_pie_chart = workbook.add_chart({'type': 'pie'})
        rec_pie_chart.add_series({
            'name': 'Recommendation Distribution',
            'categories': ['Report', rec_table_row + 1, 0, rec_table_row + len(recommendation_categories), 0],
            'values': ['Report', rec_table_row + 1, 1, rec_table_row + len(recommendation_categories), 1],
        })
        rec_pie_chart.set_title({'name': 'Recommendation Distribution'})
        worksheet.insert_chart(chart_start_row, chart_col, rec_pie_chart, {'x_scale': 1.0, 'y_scale': 1.0})
        chart_start_row += 17
        # --- Similarity score histogram (column chart) ---
        if 'similarity_score' in df.columns:
            scores = pd.to_numeric(df['similarity_score'], errors='coerce').dropna()
            # Bin edges 1, 11, ..., 101, labelled "1-10" ... "91-100".
            bins = list(range(1, 102, 10))
            counts, bin_edges = np.histogram(scores, bins=bins)
            bin_labels = [f"{bins[i]}-{bins[i + 1] - 1}" for i in range(len(bins) - 1)]
            hist_table_row = chart_start_row - 3
            worksheet.write(hist_table_row, 0, "Score Range", header_format)
            worksheet.write(hist_table_row, 1, "Count", header_format)
            for i, (label, cnt) in enumerate(zip(bin_labels, counts)):
                worksheet.write(hist_table_row + 1 + i, 0, label, data_cell_format)
                worksheet.write(hist_table_row + 1 + i, 1, cnt, data_cell_format)
            hist_chart = workbook.add_chart({'type': 'column'})
            hist_chart.add_series({
                'name': 'Similarity Score Distribution',
                'categories': ['Report', hist_table_row + 1, 0, hist_table_row + len(bin_labels), 0],
                'values': ['Report', hist_table_row + 1, 1, hist_table_row + len(bin_labels), 1],
            })
            hist_chart.set_title({'name': 'Histogram of Similarity Scores'})
            hist_chart.set_x_axis({'name': 'Score Range'})
            hist_chart.set_y_axis({'name': 'Count'})
            worksheet.insert_chart(chart_start_row, chart_col, hist_chart, {'x_scale': 1.2, 'y_scale': 1.2})
            chart_start_row += 20
        # --- Word cloud of all reason texts (inserted as a PNG image) ---
        if 'reason' in df.columns:
            worksheet.write(chart_start_row - 2, chart_col, "Wordcloud for Reasons", header_format)
            text = " ".join(df['reason'].astype(str).tolist())
            wc = WordCloud(width=400, height=200, background_color='white').generate(text)
            imgdata = io.BytesIO()
            wc.to_image().save(imgdata, format='PNG')
            imgdata.seek(0)
            worksheet.insert_image(chart_start_row, chart_col, 'wordcloud.png',
                                   {'image_data': imgdata, 'x_scale': 1.0, 'y_scale': 1.0})
            chart_start_row += 25
        else:
            chart_start_row += 10
        # --- Similarity score per data row (line chart over the data table) ---
        try:
            sim_index = excel_col_mapping.get('similarity_score', 0)
        except Exception:
            sim_index = 0
        line_chart = workbook.add_chart({'type': 'line'})
        line_chart.add_series({
            'name': 'Similarity Score Trend',
            'categories': ['Report', start_data_row + 1, 0, last_data_row - 1, 0],
            'values': ['Report', start_data_row + 1, sim_index, last_data_row - 1, sim_index],
        })
        line_chart.set_title({'name': 'Similarity Score Over Entries'})
        worksheet.insert_chart(chart_start_row, chart_col, line_chart, {'x_scale': 1.5, 'y_scale': 1.5})
        chart_start_row += 30
        # --- Frequency of each reason (horizontal bar chart) ---
        if 'reason' in df.columns:
            reasons = df['reason'].value_counts().reset_index()
            reasons.columns = ['Reason', 'Count']
            hbar_table_row = chart_start_row
            worksheet.write(hbar_table_row, 0, "Reason", header_format)
            worksheet.write(hbar_table_row, 1, "Count", header_format)
            for idx, row in reasons.iterrows():
                worksheet.write(hbar_table_row + 1 + idx, 0, row['Reason'], data_cell_format)
                worksheet.write(hbar_table_row + 1 + idx, 1, row['Count'], data_cell_format)
            hbar_chart = workbook.add_chart({'type': 'bar'})
            hbar_chart.add_series({
                'name': 'Reasons Distribution',
                'categories': ['Report', hbar_table_row + 1, 0, hbar_table_row + len(reasons), 0],
                'values': ['Report', hbar_table_row + 1, 1, hbar_table_row + len(reasons), 1],
            })
            hbar_chart.set_title({'name': 'Reasons Distribution'})
            worksheet.insert_chart(chart_start_row, chart_col, hbar_chart, {'x_scale': 1.5, 'y_scale': 1.5})
            chart_start_row += 30
    # Leaving the `with` block closes the writer, which flushes the workbook
    # into `output`; rewind so the caller can read it from the start.
    output.seek(0)
    return output
781
+
782
+
783
@app.route("/download_csv")
def download_csv():
    """Send the matched invoice-number pairs as a CSV attachment.

    Rows of ``latest_results_df`` whose ``recommendation`` is "Partial Match",
    "UnMatched" or "Exact Match" are kept. Each side's invoice number is then
    looked up in the matching uploaded frame (``original_df1`` for
    ``invoice_number1``, ``original_df2`` for ``invoice_number2``) with a left
    join on ``InvoiceNumber``; a number that is not found there is NaN in the
    exported frame.

    Returns:
        The ``send_file`` response for ``final_merged_invoices.csv``, or a
        redirect to ``index`` with a flashed message when any of the three
        frames has not been populated yet.
    """
    # All three frames are needed below; merging against None would raise.
    if latest_results_df is None or original_df1 is None or original_df2 is None:
        flash("No data available.")
        return redirect(url_for('index'))

    allowed_recs = {"Partial Match", "UnMatched", "Exact Match"}
    filtered_matches = latest_results_df[latest_results_df['recommendation'].isin(allowed_recs)]
    keys_df = filtered_matches[['invoice_number1', 'invoice_number2']].reset_index(drop=True)

    columns = {}
    for out_col, key_col, source_df in (
        ('InvoiceNumber_1', 'invoice_number1', original_df1),
        ('InvoiceNumber_2', 'invoice_number2', original_df2),
    ):
        # Join against a de-duplicated, key-only lookup: a repeated
        # InvoiceNumber in the upload would otherwise fan the left join out,
        # so the two sides would end up with different lengths and the pairs
        # would no longer line up row by row.
        known_numbers = source_df[['InvoiceNumber']].drop_duplicates()
        merged = pd.merge(
            keys_df[[key_col]],
            known_numbers,
            left_on=key_col,
            right_on='InvoiceNumber',
            how='left'
        )
        columns[out_col] = merged['InvoiceNumber']

    final_df = pd.DataFrame(columns)
    for col in final_df.select_dtypes(include=['object']).columns:
        final_df[col] = final_df[col].str.strip()

    return send_file(
        generate_csv_bytes(final_df),
        mimetype='text/csv',
        download_name='final_merged_invoices.csv',
        as_attachment=True
    )
821
+
822
+
823
@app.route("/download_excel")
def download_excel():
    """Serve the latest matching results as an .xlsx attachment.

    The workbook is built by ``generate_excel_bytes`` from a copy of
    ``latest_results_df`` with the ``editable`` and ``comments`` columns
    removed when they are present. Redirects to ``index`` with a flashed
    message when no results exist yet.
    """
    if latest_results_df is None:
        flash("No data available.")
        return redirect(url_for('index'))

    export_df = latest_results_df.copy()
    # Drop only the columns that actually exist so a missing one is not an error.
    present = [name for name in ("editable", "comments") if name in export_df.columns]
    export_df.drop(columns=present, inplace=True)

    workbook_bytes = generate_excel_bytes(export_df)
    return send_file(
        workbook_bytes,
        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        download_name='matched_invoices.xlsx',
        as_attachment=True
    )
839
+
840
+
841
+ # New endpoint: Download summary statistics as Excel
842
@app.route("/download_stats_excel")
def download_stats_excel():
    """Serve the summary statistics of the latest results as an .xlsx attachment.

    Statistics come from ``get_stats`` and are rendered by
    ``generate_stats_excel_bytes``. Redirects to ``index`` with a flashed
    message when no results exist yet.
    """
    results = latest_results_df
    if results is None:
        flash("No data available for stats.")
        return redirect(url_for('index'))

    workbook_bytes = generate_stats_excel_bytes(get_stats(results))
    return send_file(
        workbook_bytes,
        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        download_name='invoice_matching_stats.xlsx',
        as_attachment=True
    )
855
+
856
+
857
+ # New endpoint: Download summary statistics as JSON
858
@app.route("/download_stats_json")
def download_stats_json():
    """Serve the summary statistics of the latest results as a JSON attachment.

    Statistics come from ``get_stats`` and are serialized by
    ``generate_stats_json_bytes``. Redirects to ``index`` with a flashed
    message when no results exist yet.
    """
    results = latest_results_df
    if results is None:
        flash("No data available for stats.")
        return redirect(url_for('index'))

    json_bytes = generate_stats_json_bytes(get_stats(results))
    return send_file(
        json_bytes,
        mimetype='application/json',
        download_name='invoice_matching_stats.json',
        as_attachment=True
    )
871
+
872
+
873
if __name__ == "__main__":
    # Debug mode turns on the Werkzeug interactive debugger, which lets anyone
    # who can reach the server run arbitrary code. Keep it opt-in: set
    # FLASK_DEBUG=1 (or true/yes/on) for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)