Spaces:
Running
Running
File size: 4,540 Bytes
2b44e69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import fitz # PyMuPDF
import pandas as pd
import os
import re
# === File paths ===
DATA_DIR = os.path.join(os.getcwd(), "data")
PDF_PATH = os.path.join(DATA_DIR, "invoices/Invoice-26.pdf") # Update for new PDF if needed
CSV_PATH = os.path.join(DATA_DIR, "purchase_orders.csv")
OUTPUT_PATH = os.path.join(DATA_DIR, "annotated_invoice.pdf")
# === Field coordinate map (from your data) ===
FIELD_BOXES = {
"invoice_number": (525, 55, 575, 75),
"order_id": (45, 470, 230, 490),
"customer_name": (40, 135, 100, 155),
"quantity": (370, 235, 385, 250),
"rate": (450, 235, 500, 250),
"expected_amount": (520, 360, 570, 375),
}
# === Step 1: Open PDF and extract text ===
pdf = fitz.open(PDF_PATH)
page = pdf[0]
pdf_text = page.get_text()
# === Step 2: Helper to extract fields ===
def extract_field(pattern, text, group=1):
match = re.search(pattern, text, re.IGNORECASE)
return match.group(group).strip() if match else None
# Extract key identifiers
invoice_number_pdf = extract_field(r"#\s*(\d+)", pdf_text)
order_id_pdf = extract_field(r"Order ID\s*[:\-]?\s*(\S+)", pdf_text)
customer_name_pdf = extract_field(r"Bill To:\s*(.*)", pdf_text)
# === Step 3: Read CSV and match correct row ===
po_df = pd.read_csv(CSV_PATH)
matched_row = po_df[
(po_df['invoice_number'].astype(str) == str(invoice_number_pdf))
| (po_df['order_id'] == order_id_pdf)
]
if matched_row.empty:
raise ValueError(f"No matching CSV row found for Invoice {invoice_number_pdf} / Order {order_id_pdf}")
expected = matched_row.iloc[0].to_dict()
expected = {k.lower(): str(v).strip() for k, v in expected.items()}
print("✅ Loaded expected data from CSV for this PDF:")
for k, v in expected.items():
print(f" {k}: {v}")
# === Step 4: Extract fields from PDF ===
invoice_data = {
"invoice_number": invoice_number_pdf,
"customer_name": customer_name_pdf,
"order_id": order_id_pdf,
}
# Numeric fields
amounts = re.findall(r"\$?([\d,]+\.\d{2})", pdf_text)
invoice_data["expected_amount"] = amounts[-1] if amounts else None
# Extract first item (quantity, rate)
item_lines = re.findall(
r"([A-Za-z0-9 ,\-]+)\s+(\d+)\s+\$?([\d,]+\.\d{2})\s+\$?([\d,]+\.\d{2})",
pdf_text,
)
if item_lines:
invoice_data["quantity"] = item_lines[0][1]
invoice_data["rate"] = item_lines[0][2]
print("\n✅ Extracted data from PDF:")
for k, v in invoice_data.items():
print(f" {k}: {v}")
# === Step 5: Compare PDF vs CSV ===
discrepancies = []
def add_discrepancy(field, expected_val, found_val):
discrepancies.append({"field": field, "expected": expected_val, "found": found_val})
# Compare string fields
for field in ["invoice_number", "order_id", "customer_name"]:
if str(invoice_data.get(field, "")).strip() != str(expected.get(field, "")).strip():
add_discrepancy(field, expected.get(field, ""), invoice_data.get(field, ""))
# Compare numeric fields
for field in ["quantity", "rate", "expected_amount"]:
try:
found_val = float(str(invoice_data.get(field, 0)).replace(",", "").replace("$", ""))
expected_val = float(str(expected.get(field, 0)).replace(",", "").replace("$", ""))
if round(found_val, 2) != round(expected_val, 2):
add_discrepancy(field, expected_val, found_val)
except:
if str(invoice_data.get(field, "")) != str(expected.get(field, "")):
add_discrepancy(field, expected.get(field, ""), invoice_data.get(field, ""))
# === Step 6: Annotate mismatched fields using fixed coordinates ===
for d in discrepancies:
field = d["field"]
if field not in FIELD_BOXES:
print(f"⚠️ No coordinates found for field '{field}' — skipping annotation.")
continue
rect_coords = FIELD_BOXES[field]
rect = fitz.Rect(rect_coords)
expected_text = (
f"{float(d['expected']):,.2f}"
if field in ["quantity", "rate", "expected_amount"]
else str(d["expected"])
)
# Draw red bounding box
page.draw_rect(rect, color=(1, 0, 0), width=1.5)
# Add expected value below box
page.insert_text(
(rect.x0, rect.y1 + 10),
expected_text,
fontsize=9,
color=(1, 0, 0),
)
pdf.save(OUTPUT_PATH)
pdf.close()
print("\n✅ Annotated invoice saved at:", OUTPUT_PATH)
if discrepancies:
print("\n⚠️ Mismatches found:")
for d in discrepancies:
print(f" - {d['field']}: expected {d['expected']}, found {d['found']}")
else:
print("\n✅ No mismatches found! Invoice matches CSV.")
|