Spaces:
Running
Running
| import fitz # PyMuPDF | |
| import pandas as pd | |
| import os | |
| import re | |
| # === File paths === | |
| DATA_DIR = os.path.join(os.getcwd(), "data") | |
| PDF_PATH = os.path.join(DATA_DIR, "invoices/Invoice-26.pdf") # Update for new PDF if needed | |
| CSV_PATH = os.path.join(DATA_DIR, "purchase_orders.csv") | |
| OUTPUT_PATH = os.path.join(DATA_DIR, "annotated_invoice.pdf") | |
| # === Field coordinate map (from your data) === | |
| FIELD_BOXES = { | |
| "invoice_number": (525, 55, 575, 75), | |
| "order_id": (45, 470, 230, 490), | |
| "customer_name": (40, 135, 100, 155), | |
| "quantity": (370, 235, 385, 250), | |
| "rate": (450, 235, 500, 250), | |
| "expected_amount": (520, 360, 570, 375), | |
| } | |
| # === Step 1: Open PDF and extract text === | |
| pdf = fitz.open(PDF_PATH) | |
| page = pdf[0] | |
| pdf_text = page.get_text() | |
| # === Step 2: Helper to extract fields === | |
| def extract_field(pattern, text, group=1): | |
| match = re.search(pattern, text, re.IGNORECASE) | |
| return match.group(group).strip() if match else None | |
| # Extract key identifiers | |
| invoice_number_pdf = extract_field(r"#\s*(\d+)", pdf_text) | |
| order_id_pdf = extract_field(r"Order ID\s*[:\-]?\s*(\S+)", pdf_text) | |
| customer_name_pdf = extract_field(r"Bill To:\s*(.*)", pdf_text) | |
| # === Step 3: Read CSV and match correct row === | |
| po_df = pd.read_csv(CSV_PATH) | |
| matched_row = po_df[ | |
| (po_df['invoice_number'].astype(str) == str(invoice_number_pdf)) | |
| | (po_df['order_id'] == order_id_pdf) | |
| ] | |
| if matched_row.empty: | |
| raise ValueError(f"No matching CSV row found for Invoice {invoice_number_pdf} / Order {order_id_pdf}") | |
| expected = matched_row.iloc[0].to_dict() | |
| expected = {k.lower(): str(v).strip() for k, v in expected.items()} | |
| print("✅ Loaded expected data from CSV for this PDF:") | |
| for k, v in expected.items(): | |
| print(f" {k}: {v}") | |
| # === Step 4: Extract fields from PDF === | |
| invoice_data = { | |
| "invoice_number": invoice_number_pdf, | |
| "customer_name": customer_name_pdf, | |
| "order_id": order_id_pdf, | |
| } | |
| # Numeric fields | |
| amounts = re.findall(r"\$?([\d,]+\.\d{2})", pdf_text) | |
| invoice_data["expected_amount"] = amounts[-1] if amounts else None | |
| # Extract first item (quantity, rate) | |
| item_lines = re.findall( | |
| r"([A-Za-z0-9 ,\-]+)\s+(\d+)\s+\$?([\d,]+\.\d{2})\s+\$?([\d,]+\.\d{2})", | |
| pdf_text, | |
| ) | |
| if item_lines: | |
| invoice_data["quantity"] = item_lines[0][1] | |
| invoice_data["rate"] = item_lines[0][2] | |
| print("\n✅ Extracted data from PDF:") | |
| for k, v in invoice_data.items(): | |
| print(f" {k}: {v}") | |
| # === Step 5: Compare PDF vs CSV === | |
| discrepancies = [] | |
| def add_discrepancy(field, expected_val, found_val): | |
| discrepancies.append({"field": field, "expected": expected_val, "found": found_val}) | |
| # Compare string fields | |
| for field in ["invoice_number", "order_id", "customer_name"]: | |
| if str(invoice_data.get(field, "")).strip() != str(expected.get(field, "")).strip(): | |
| add_discrepancy(field, expected.get(field, ""), invoice_data.get(field, "")) | |
| # Compare numeric fields | |
| for field in ["quantity", "rate", "expected_amount"]: | |
| try: | |
| found_val = float(str(invoice_data.get(field, 0)).replace(",", "").replace("$", "")) | |
| expected_val = float(str(expected.get(field, 0)).replace(",", "").replace("$", "")) | |
| if round(found_val, 2) != round(expected_val, 2): | |
| add_discrepancy(field, expected_val, found_val) | |
| except: | |
| if str(invoice_data.get(field, "")) != str(expected.get(field, "")): | |
| add_discrepancy(field, expected.get(field, ""), invoice_data.get(field, "")) | |
| # === Step 6: Annotate mismatched fields using fixed coordinates === | |
| for d in discrepancies: | |
| field = d["field"] | |
| if field not in FIELD_BOXES: | |
| print(f"⚠️ No coordinates found for field '{field}' — skipping annotation.") | |
| continue | |
| rect_coords = FIELD_BOXES[field] | |
| rect = fitz.Rect(rect_coords) | |
| expected_text = ( | |
| f"{float(d['expected']):,.2f}" | |
| if field in ["quantity", "rate", "expected_amount"] | |
| else str(d["expected"]) | |
| ) | |
| # Draw red bounding box | |
| page.draw_rect(rect, color=(1, 0, 0), width=1.5) | |
| # Add expected value below box | |
| page.insert_text( | |
| (rect.x0, rect.y1 + 10), | |
| expected_text, | |
| fontsize=9, | |
| color=(1, 0, 0), | |
| ) | |
| pdf.save(OUTPUT_PATH) | |
| pdf.close() | |
| print("\n✅ Annotated invoice saved at:", OUTPUT_PATH) | |
| if discrepancies: | |
| print("\n⚠️ Mismatches found:") | |
| for d in discrepancies: | |
| print(f" - {d['field']}: expected {d['expected']}, found {d['found']}") | |
| else: | |
| print("\n✅ No mismatches found! Invoice matches CSV.") | |