InvoiceAgenticAI / bounding_box.py
PARTHASAKHAPAUL
Restructure project and sync local as source of truth
9107a5d
import fitz # PyMuPDF
import pandas as pd
import os
import re
# === File paths ===
DATA_DIR = os.path.join(os.getcwd(), "data")
PDF_PATH = os.path.join(DATA_DIR, "invoices/Invoice-26.pdf") # Update for new PDF if needed
CSV_PATH = os.path.join(DATA_DIR, "purchase_orders.csv")
OUTPUT_PATH = os.path.join(DATA_DIR, "annotated_invoice.pdf")
# === Field coordinate map (from your data) ===
FIELD_BOXES = {
"invoice_number": (525, 55, 575, 75),
"order_id": (45, 470, 230, 490),
"customer_name": (40, 135, 100, 155),
"quantity": (370, 235, 385, 250),
"rate": (450, 235, 500, 250),
"expected_amount": (520, 360, 570, 375),
}
# === Step 1: Open PDF and extract text ===
pdf = fitz.open(PDF_PATH)
page = pdf[0]
pdf_text = page.get_text()
# === Step 2: Helper to extract fields ===
def extract_field(pattern, text, group=1):
match = re.search(pattern, text, re.IGNORECASE)
return match.group(group).strip() if match else None
# Extract key identifiers
invoice_number_pdf = extract_field(r"#\s*(\d+)", pdf_text)
order_id_pdf = extract_field(r"Order ID\s*[:\-]?\s*(\S+)", pdf_text)
customer_name_pdf = extract_field(r"Bill To:\s*(.*)", pdf_text)
# === Step 3: Read CSV and match correct row ===
po_df = pd.read_csv(CSV_PATH)
matched_row = po_df[
(po_df['invoice_number'].astype(str) == str(invoice_number_pdf))
| (po_df['order_id'] == order_id_pdf)
]
if matched_row.empty:
raise ValueError(f"No matching CSV row found for Invoice {invoice_number_pdf} / Order {order_id_pdf}")
expected = matched_row.iloc[0].to_dict()
expected = {k.lower(): str(v).strip() for k, v in expected.items()}
print("✅ Loaded expected data from CSV for this PDF:")
for k, v in expected.items():
print(f" {k}: {v}")
# === Step 4: Extract fields from PDF ===
invoice_data = {
"invoice_number": invoice_number_pdf,
"customer_name": customer_name_pdf,
"order_id": order_id_pdf,
}
# Numeric fields
amounts = re.findall(r"\$?([\d,]+\.\d{2})", pdf_text)
invoice_data["expected_amount"] = amounts[-1] if amounts else None
# Extract first item (quantity, rate)
item_lines = re.findall(
r"([A-Za-z0-9 ,\-]+)\s+(\d+)\s+\$?([\d,]+\.\d{2})\s+\$?([\d,]+\.\d{2})",
pdf_text,
)
if item_lines:
invoice_data["quantity"] = item_lines[0][1]
invoice_data["rate"] = item_lines[0][2]
print("\n✅ Extracted data from PDF:")
for k, v in invoice_data.items():
print(f" {k}: {v}")
# === Step 5: Compare PDF vs CSV ===
discrepancies = []
def add_discrepancy(field, expected_val, found_val):
discrepancies.append({"field": field, "expected": expected_val, "found": found_val})
# Compare string fields
for field in ["invoice_number", "order_id", "customer_name"]:
if str(invoice_data.get(field, "")).strip() != str(expected.get(field, "")).strip():
add_discrepancy(field, expected.get(field, ""), invoice_data.get(field, ""))
# Compare numeric fields
for field in ["quantity", "rate", "expected_amount"]:
try:
found_val = float(str(invoice_data.get(field, 0)).replace(",", "").replace("$", ""))
expected_val = float(str(expected.get(field, 0)).replace(",", "").replace("$", ""))
if round(found_val, 2) != round(expected_val, 2):
add_discrepancy(field, expected_val, found_val)
except:
if str(invoice_data.get(field, "")) != str(expected.get(field, "")):
add_discrepancy(field, expected.get(field, ""), invoice_data.get(field, ""))
# === Step 6: Annotate mismatched fields using fixed coordinates ===
for d in discrepancies:
field = d["field"]
if field not in FIELD_BOXES:
print(f"⚠️ No coordinates found for field '{field}' — skipping annotation.")
continue
rect_coords = FIELD_BOXES[field]
rect = fitz.Rect(rect_coords)
expected_text = (
f"{float(d['expected']):,.2f}"
if field in ["quantity", "rate", "expected_amount"]
else str(d["expected"])
)
# Draw red bounding box
page.draw_rect(rect, color=(1, 0, 0), width=1.5)
# Add expected value below box
page.insert_text(
(rect.x0, rect.y1 + 10),
expected_text,
fontsize=9,
color=(1, 0, 0),
)
pdf.save(OUTPUT_PATH)
pdf.close()
print("\n✅ Annotated invoice saved at:", OUTPUT_PATH)
if discrepancies:
print("\n⚠️ Mismatches found:")
for d in discrepancies:
print(f" - {d['field']}: expected {d['expected']}, found {d['found']}")
else:
print("\n✅ No mismatches found! Invoice matches CSV.")