Spaces:

ACA050
/

ReconAI

Sleeping

App Files Files Community

ReconAI / reconciliation.py

ACA050

Upload 14 files

64e5ee2 verified 20 days ago

raw

history blame contribute delete

6.11 kB

	import pandas as pd
	import numpy as np
	from rapidfuzz import fuzz
	from sentence_transformers import SentenceTransformer
	import faiss
	import os
	import pickle
	import logging

	# Configure logging when this module is imported: basicConfig sets the root
	# logger to INFO (it does nothing if the root logger already has handlers).
	logging.basicConfig(level=logging.INFO)
	# Module-level logger, named after this module, used by ReconciliationEngine.
	logger = logging.getLogger(__name__)

	class ReconciliationEngine:
	    """Reconcile invoice rows from two DataFrames by InvoiceID, amount and vendor name.

	    Vendor names are embedded with a SentenceTransformer model and stored in a
	    FAISS ``IndexFlatL2``. The index is written to ``index_path`` and the list
	    of vendor names (entry ``i`` corresponds to vector ``i`` of the index) is
	    pickled to ``vendor_mapping_path``.
	    """

	    def __init__(self, threshold=85.0, model_name='all-MiniLM-L6-v2',
	                 index_path='vendor_index.faiss',
	                 vendor_mapping_path='vendor_mapping.pkl'):
	        """Load the embedding model and load or create the vendor index.

	        Args:
	            threshold: Minimum score (0-100) for a fuzzy or semantic vendor match.
	            model_name: Name passed to ``SentenceTransformer``.
	            index_path: File the FAISS index is read from / written to.
	            vendor_mapping_path: File the pickled vendor-name list is read
	                from / written to. Defaults to the path that was previously
	                hard-coded, so existing callers are unaffected.
	        """
	        self.threshold = threshold
	        self.model = SentenceTransformer(model_name)
	        self.index_path = index_path
	        self.vendor_mapping_path = vendor_mapping_path
	        self.index = None
	        self.vendor_names = []
	        # vendor name -> unit-length embedding, filled by learn_vendors/get_embedding.
	        self.embedding_cache = {}
	        self._load_or_create_index()

	    @staticmethod
	    def _normalize(vec):
	        """Return ``vec`` scaled to unit length; a zero vector is returned unchanged."""
	        norm = np.linalg.norm(vec)
	        # Guard the division: a zero norm would otherwise produce NaNs.
	        return vec / norm if norm > 0 else vec

	    def _load_or_create_index(self):
	        """Load the persisted index and vendor list, or start with empty ones.

	        The persisted pair is only used when the index dimension equals the
	        model's embedding dimension and the index holds exactly one vector per
	        stored vendor name; otherwise a fresh empty index is created.
	        """
	        dim = self.model.get_sentence_embedding_dimension()

	        if os.path.exists(self.index_path) and os.path.exists(self.vendor_mapping_path):
	            logger.info("Loading existing FAISS index.")
	            index = faiss.read_index(self.index_path)
	            # SECURITY: pickle.load can execute arbitrary code contained in the
	            # file. Only point vendor_mapping_path at files this application wrote.
	            with open(self.vendor_mapping_path, 'rb') as f:
	                names = pickle.load(f)
	            if index.d == dim and index.ntotal == len(names):
	                self.index = index
	                self.vendor_names = list(names)
	                return
	            logger.warning(
	                "Persisted vendor index does not match the model dimension or "
	                "the vendor mapping; rebuilding it from scratch."
	            )

	        logger.info("Creating new FAISS index.")
	        self.index = faiss.IndexFlatL2(dim)
	        self.vendor_names = []

	    def _save_index(self):
	        """Write the FAISS index and the pickled vendor-name list to disk."""
	        faiss.write_index(self.index, self.index_path)
	        with open(self.vendor_mapping_path, 'wb') as f:
	            pickle.dump(self.vendor_names, f)

	    def learn_vendors(self, vendors):
	        """Add previously unseen vendor names to the index and persist it.

	        Missing values (per ``pd.isna``) are skipped, names are converted to
	        ``str``, and new names are added in first-seen order.

	        Args:
	            vendors: Iterable of vendor names.
	        """
	        known = set(self.vendor_names)  # set membership instead of list scans
	        new_vendors = []
	        for vendor in vendors:
	            if pd.isna(vendor):
	                continue
	            name = str(vendor)
	            if name not in known:
	                known.add(name)
	                new_vendors.append(name)

	        if not new_vendors:
	            return

	        logger.info("Learning %d new vendors.", len(new_vendors))
	        embeddings = np.asarray(self.model.encode(new_vendors), dtype='float32')
	        self.index.add(embeddings)
	        self.vendor_names.extend(new_vendors)

	        # Pre-cache unit-length embeddings to speed up pair-wise matching later.
	        for name, emb in zip(new_vendors, embeddings):
	            self.embedding_cache[name] = self._normalize(emb)

	        self._save_index()

	    def get_embedding(self, vendor):
	        """Return the cached unit-length embedding of ``vendor``, encoding it on a cache miss."""
	        cached = self.embedding_cache.get(vendor)
	        if cached is None:
	            cached = self._normalize(self.model.encode([vendor])[0])
	            self.embedding_cache[vendor] = cached
	        return cached

	    def get_semantic_similarity(self, vendor1, vendor2):
	        """Return the cosine similarity of two vendor names scaled to 0-100.

	        Returns:
	            A plain ``float`` in [0.0, 100.0]; 0.0 when either name is missing
	            or the similarity is negative.
	        """
	        if pd.isna(vendor1) or pd.isna(vendor2):
	            return 0.0
	        sim = float(np.dot(self.get_embedding(vendor1), self.get_embedding(vendor2)))
	        # Clamp: rounding error can push the dot product of unit vectors past 1.
	        return min(100.0, max(0.0, sim * 100.0))

	    def search_similar_vendor(self, query_vendor, top_k=1):
	        """Find the nearest learned vendor to ``query_vendor`` in the FAISS index.

	        Args:
	            query_vendor: Vendor name to look up.
	            top_k: Number of neighbours requested from FAISS (at least 1 is
	                used). Only the nearest one is returned.

	        Returns:
	            ``(best_match, score)`` where ``score`` is the ``fuzz.ratio`` of the
	            lower-cased query and match, or ``(None, 0.0)`` when no vendors
	            are known, the query is missing, or FAISS returns no valid hit.
	        """
	        if not self.vendor_names or pd.isna(query_vendor):
	            return None, 0.0

	        query = str(query_vendor)
	        query_emb = np.asarray(self.model.encode([query]), dtype='float32')
	        _, indices = self.index.search(query_emb, max(1, int(top_k)))

	        best_idx = int(indices[0][0])
	        # FAISS reports -1 for "no result"; also reject positions outside the name list.
	        if not 0 <= best_idx < len(self.vendor_names):
	            return None, 0.0

	        best_match = self.vendor_names[best_idx]
	        # The L2 distance only selects the candidate; the reported score is the
	        # character-level fuzz ratio between the query and that candidate.
	        fuzz_score = fuzz.ratio(query.lower(), str(best_match).lower())
	        return best_match, fuzz_score

	    def _vendor_status(self, b_vendor, g_vendor):
	        """Classify a pair of vendor-name strings ('' meaning missing) as a match status."""
	        # A missing name on either side can never be a match. Without this guard
	        # two empty names would be compared by the fuzzy/semantic scorers.
	        if not b_vendor or not g_vendor:
	            return "Vendor Mismatch"

	        if b_vendor.lower() == g_vendor.lower():
	            return "Exact Match"

	        fuzz_score = fuzz.ratio(b_vendor.lower(), g_vendor.lower())
	        if fuzz_score >= self.threshold:
	            return f"Fuzzy Match ({fuzz_score:.1f}%)"

	        sem_score = self.get_semantic_similarity(b_vendor, g_vendor)
	        if sem_score >= self.threshold:
	            return f"Semantic Match ({sem_score:.1f}%)"

	        return "Vendor Mismatch"

	    def reconcile(self, source_df, target_df, source_key='VendorName', target_key='VendorName',
	                  amount_col='Amount', amount_tolerance=0.01):
	        """Outer-join both frames on ``InvoiceID`` and label every row with a status.

	        Duplicate ``InvoiceID`` rows are dropped from each frame (first kept).
	        Vendor names from both frames are learned into the index, which is
	        persisted, before matching.

	        Args:
	            source_df: "Books" records; overlapping columns get the ``_books`` suffix.
	            target_df: "GST" records; overlapping columns get the ``_gst`` suffix.
	            source_key: Vendor-name column in ``source_df``.
	            target_key: Vendor-name column in ``target_df``.
	            amount_col: Amount column name, looked up in both frames.
	            amount_tolerance: Largest absolute amount difference still treated as equal.

	        Returns:
	            The merged DataFrame with an added ``MatchStatus`` column holding one
	            of "Missing in Books", "Missing in GST", "Amount Mismatch",
	            "Exact Match", "Fuzzy Match (x%)", "Semantic Match (x%)" or
	            "Vendor Mismatch".

	        Raises:
	            ValueError: If either frame has no ``InvoiceID`` column.
	        """
	        logger.info("Starting reconciliation process.")

	        # Validate before learning vendors so a rejected call does not modify
	        # the persisted index.
	        if 'InvoiceID' not in source_df.columns or 'InvoiceID' not in target_df.columns:
	            raise ValueError("InvoiceID column is required for current reconciliation logic.")

	        # Learn vendors from both datasets
	        self.learn_vendors(source_df[source_key].tolist())
	        self.learn_vendors(target_df[target_key].tolist())

	        source_df = source_df.drop_duplicates(subset=['InvoiceID'])
	        target_df = target_df.drop_duplicates(subset=['InvoiceID'])
	        merged = pd.merge(source_df, target_df, on='InvoiceID', how='outer',
	                          suffixes=('_books', '_gst'))

	        source_cols = set(source_df.columns)
	        target_cols = set(target_df.columns)

	        def merged_name(column, suffix, own_cols, other_cols):
	            # pd.merge only suffixes non-key columns present in BOTH frames;
	            # a column unique to one frame keeps its plain name.
	            if column not in own_cols:
	                return None
	            if column in other_cols and column != 'InvoiceID':
	                return f'{column}{suffix}'
	            return column

	        books_amount = merged_name(amount_col, '_books', source_cols, target_cols)
	        gst_amount = merged_name(amount_col, '_gst', target_cols, source_cols)
	        books_vendor = merged_name(source_key, '_books', source_cols, target_cols)
	        gst_vendor = merged_name(target_key, '_gst', target_cols, source_cols)

	        def cell(row, column):
	            return row[column] if column is not None else None

	        def determine_status(row):
	            b_amt = cell(row, books_amount)
	            g_amt = cell(row, gst_amount)
	            if pd.isna(b_amt):
	                return "Missing in Books"
	            if pd.isna(g_amt):
	                return "Missing in GST"

	            if abs(float(b_amt) - float(g_amt)) > amount_tolerance:
	                return "Amount Mismatch"

	            b_vendor_val = cell(row, books_vendor)
	            g_vendor_val = cell(row, gst_vendor)
	            b_vendor = str(b_vendor_val) if pd.notna(b_vendor_val) else ''
	            g_vendor = str(g_vendor_val) if pd.notna(g_vendor_val) else ''
	            return self._vendor_status(b_vendor, g_vendor)

	        # Build the column explicitly: DataFrame.apply(axis=1) on an empty frame
	        # returns a DataFrame, which cannot be assigned to a single column.
	        merged['MatchStatus'] = pd.Series(
	            [determine_status(row) for _, row in merged.iterrows()],
	            index=merged.index,
	            dtype=object,
	        )
	        return merged