Spaces:
Build error
Build error
| import pdfplumber | |
| from typing import Optional, Callable, Literal | |
| import base64 | |
| import io | |
| from PIL import Image | |
| from remittance_pdf_processing_utils import remittance_logger, format_amount_str_to_decimal | |
| from vertex_api_invoice_extractor import extract_invoice_numbers_with_vertex_ai, extract_invoice_numbers_from_text_with_vertex_ai, extract_payment_amounts_with_vertex_ai, extract_payment_amounts_from_text_with_vertex_ai | |
| # from dspy_invoice_extractors import SinglePageInvoiceExtractor, MultiPageInvoiceExtractor | |
| from remittance_pdf_processing_types import InvoiceNumbers, InvoiceVerifier, DocumentType, ExtractorFunction, PaymentAmount, Candidate, ProcessedPDFResult, InvoiceListAndAmountVerifier | |
| from anthropic_api_invoice_extractor import extract_invoice_numbers_with_anthropic_ai, extract_payment_amounts_with_anthropic_ai | |
def is_text_based_pdf(pdf: pdfplumber.PDF, text_threshold: int = 100) -> bool:
    """Return True if any page of *pdf* carries more than *text_threshold* characters.

    Args:
        pdf: An open pdfplumber document.
        text_threshold: Minimum number of extracted characters a single page
            must exceed for the document to be considered text-based.

    Returns:
        True as soon as one page exceeds the threshold, otherwise False
        (including for a document with no pages).
    """
    for page in pdf.pages:
        # `or ""`: extract_text() may yield None for a page without a text
        # layer (seen in some pdfplumber releases); treat that as empty text
        # instead of crashing on len(None).
        page_text = page.extract_text() or ""
        if len(page_text) > text_threshold:
            return True
    return False
def determine_document_type(pdf: pdfplumber.PDF) -> DocumentType:
    """Classify *pdf* by page count: 'single' for exactly one page, else 'multi'."""
    page_count = len(pdf.pages)
    if page_count == 1:
        return 'single'
    # Anything other than exactly one page (including zero) is reported as 'multi'.
    return 'multi'
def extract_text_from_pdf(pdf_path: str, wrap_pages: bool = False) -> str:
    """Extract the text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.
        wrap_pages: When False, page texts are joined with newlines. When
            True, each page is wrapped in ``<page_N>...</page_N>`` tags
            (N starting at 1) and the whole document in
            ``<remittance>...</remittance>``.

    Returns:
        The extracted text in the requested layout.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""`: extract_text() may yield None for a page without a text
        # layer (seen in some pdfplumber releases). Without the guard the
        # plain join raises TypeError and the wrapped form embeds "None".
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    if not wrap_pages:
        return "\n".join(page_texts)

    wrapped_pages = [
        f"<page_{number}>\n{page_text}\n</page_{number}>"
        for number, page_text in enumerate(page_texts, start=1)
    ]
    all_pages_text = "\n".join(wrapped_pages)
    return f"<remittance>\n{all_pages_text}\n</remittance>"
| # def InvoiceExtractor(doc_type: DocumentType) -> ExtractorFunction: | |
| # if doc_type == 'single': | |
| # def single_page_extractor(text: str) -> list[InvoiceNumbers]: | |
| # return [] | |
| # return single_page_extractor | |
| # else: | |
| # def multi_page_extractor(text: str) -> list[InvoiceNumbers]: | |
| # return [] | |
| # return multi_page_extractor | |
def extract_invoice_numbers_from_text(
    text: str,
    doc_type: DocumentType,
    multi_hop: bool = False,
) -> list[InvoiceNumbers]:
    """Extract invoice-number candidates from document text via the Vertex AI text extractor.

    Args:
        text: Text of the document.
        doc_type: Document type; only used in the log message here.
        multi_hop: Forwarded unchanged to the Vertex AI extractor.

    Returns:
        Whatever extract_invoice_numbers_from_text_with_vertex_ai returns.
    """
    log_message = f"Extracting invoice numbers from {doc_type}-page text-based document (multi_hop: {multi_hop})"
    remittance_logger.info(log_message)
    candidates = extract_invoice_numbers_from_text_with_vertex_ai(text, multi_hop)
    return candidates
def extract_invoice_numbers_from_single_base64_image(base64_image: str, multi_hop: bool = False) -> list[InvoiceNumbers]:
    """Extract invoice-number candidates from one base64-encoded image via Vertex AI.

    Both arguments are forwarded unchanged to
    extract_invoice_numbers_with_vertex_ai, whose result is returned as-is.
    """
    log_message = f"Extracting invoice numbers from a single base64 image using Vertex AI (multi_hop: {multi_hop})"
    remittance_logger.debug(log_message)
    candidates = extract_invoice_numbers_with_vertex_ai(base64_image, multi_hop)
    return candidates
def extract_invoice_numbers_from_multi_page_images(base64_images: list[str], multi_hop: bool = False) -> list[InvoiceNumbers]:
    """Extract invoice-number candidates from several base64-encoded images via Anthropic AI.

    Both arguments are forwarded unchanged to
    extract_invoice_numbers_with_anthropic_ai, whose result is returned as-is.
    """
    image_count = len(base64_images)
    remittance_logger.debug(f"Extracting invoice numbers from {image_count} base64 images using Anthropic AI (multi_hop: {multi_hop})")
    candidates = extract_invoice_numbers_with_anthropic_ai(base64_images, multi_hop)
    return candidates
def extract_invoice_numbers_from_base64_images(base64_images: list[str], multi_hop: bool = False) -> list[InvoiceNumbers]:
    """Dispatch invoice-number extraction based on how many images were supplied.

    Exactly one image goes to the single-image extractor; any other count
    goes to the multi-page extractor.
    """
    image_count = len(base64_images)
    remittance_logger.info(f"Extracting invoice numbers from {image_count} base64 image(s) (multi_hop: {multi_hop})")
    if image_count != 1:
        return extract_invoice_numbers_from_multi_page_images(base64_images, multi_hop)
    only_image = base64_images[0]
    return extract_invoice_numbers_from_single_base64_image(only_image, multi_hop)
def extract_invoice_numbers_from_image(
    pdf: pdfplumber.PDF,
    multi_hop: bool = False,
    dpi: int = 257,  # Number chosen for optimal resolution for Gemini Flash 1.5 model
) -> list[InvoiceNumbers]:
    """Render every page of *pdf* to a PNG and extract invoice-number candidates from the images.

    Args:
        pdf: An open pdfplumber document.
        multi_hop: Forwarded to the image-based extractor.
        dpi: Resolution passed to ``page.to_image``.

    Returns:
        The result of extract_invoice_numbers_from_base64_images for the
        rendered pages, in page order.
    """
    remittance_logger.info(f"Extracting invoice numbers from {len(pdf.pages)}-page image-based document (multi_hop: {multi_hop})")

    def render_page_as_base64(page) -> str:
        # Render the page, serialise it as PNG in memory, then base64-encode.
        buffer = io.BytesIO()
        rendered = page.to_image(resolution=dpi)
        rendered.save(buffer, format='PNG')
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

    base64_images = [render_page_as_base64(page) for page in pdf.pages]
    return extract_invoice_numbers_from_base64_images(base64_images, multi_hop)
def extract_invoices_from_pdf(pdf_path: str, force_image_processing: bool = False, invoice_verifier: InvoiceVerifier | None = None, force_multi_hop: bool = False) -> tuple[list[InvoiceNumbers], list[InvoiceNumbers]]:
    """Extract invoice-number candidates from a PDF, trying single-hop before multi-hop.

    Text extraction is used when the PDF is text-based and image processing
    is not forced; otherwise the pages are rendered and processed as images.
    The first hop mode that yields any candidates ends the search.

    Args:
        pdf_path: Path of the PDF file.
        force_image_processing: Skip the text path and always use images.
        invoice_verifier: Optional callable applied to each candidate; falsy
            results are dropped from the verified list.
        force_multi_hop: Only try multi-hop extraction.

    Returns:
        ``(verified, candidates)``. ``verified`` is empty when no verifier is
        supplied. Both are empty when no hop mode produced candidates.
    """
    hop_modes = [True] if force_multi_hop else [False, True]

    with pdfplumber.open(pdf_path) as pdf:
        doc_type = determine_document_type(pdf)

        # These do not depend on the hop mode, so compute them once instead of
        # re-scanning (and, for the text, re-opening) the PDF on every iteration.
        use_text = not force_image_processing and is_text_based_pdf(pdf)
        text = extract_text_from_pdf(pdf_path, wrap_pages=True) if use_text else None

        for multi_hop in hop_modes:
            if use_text:
                invoice_numbers_candidates = extract_invoice_numbers_from_text(text, doc_type, multi_hop=multi_hop)
            else:
                invoice_numbers_candidates = extract_invoice_numbers_from_image(pdf, multi_hop=multi_hop)

            if invoice_verifier:
                # Keep only the candidates for which the verifier returns something truthy.
                verifier_outputs = (invoice_verifier(candidate) for candidate in invoice_numbers_candidates)
                verified_result = [output for output in verifier_outputs if output]
            else:
                # When there's no verifier, the verified list should be empty.
                verified_result = []

            remittance_logger.info(f"Extracted invoice numbers (post verification, multi_hop={multi_hop}): {verified_result}")

            # If we found invoices (either verified or unverified), return them.
            if verified_result or invoice_numbers_candidates:
                return verified_result, invoice_numbers_candidates

    # Every requested hop mode came back empty.
    remittance_logger.warning("No invoice numbers found after trying both single-hop and multi-hop processing.")
    return [], []
def extract_payment_amounts_from_single_base64_image(base64_image: str) -> list[PaymentAmount]:
    """Extract payment-amount candidates from one base64-encoded image via Vertex AI.

    The image is forwarded unchanged to extract_payment_amounts_with_vertex_ai,
    whose result is returned as-is.
    """
    remittance_logger.debug("Extracting payment amounts from a single base64 image using Vertex AI")
    amounts = extract_payment_amounts_with_vertex_ai(base64_image)
    return amounts
def extract_payment_amounts_from_multi_page_images(base64_images: list[str]) -> list[PaymentAmount]:
    """Extract payment-amount candidates from several base64-encoded images via Anthropic AI.

    The images are forwarded unchanged to
    extract_payment_amounts_with_anthropic_ai, whose result is returned as-is.
    """
    image_count = len(base64_images)
    remittance_logger.debug(f"Extracting payment amounts from {image_count} base64 images using Anthropic AI")
    amounts = extract_payment_amounts_with_anthropic_ai(base64_images)
    return amounts
def extract_payment_amounts_from_base64_images(base64_images: list[str]) -> list[PaymentAmount]:
    """Dispatch payment-amount extraction based on how many images were supplied.

    Exactly one image goes to the single-image extractor; any other count
    goes to the multi-page extractor.
    """
    image_count = len(base64_images)
    remittance_logger.info(f"Extracting payment amounts from {image_count} base64 image(s)")
    if image_count != 1:
        return extract_payment_amounts_from_multi_page_images(base64_images)
    only_image = base64_images[0]
    return extract_payment_amounts_from_single_base64_image(only_image)
def extract_payment_amounts_from_pdf(
    pdf_path: str,
    force_image_processing: bool = False,
    payment_amount_formatter: Callable[[str], str] | None = None,
) -> list[PaymentAmount]:
    """Extract payment-amount candidates from the PDF at *pdf_path*.

    Image processing is used for single-page documents, when forced, or when
    the PDF is not text-based; otherwise the wrapped page text is processed.

    Args:
        pdf_path: Path of the PDF file.
        force_image_processing: Always use the image path.
        payment_amount_formatter: Optional callable applied to every
            extracted amount before returning.

    Returns:
        The extracted amounts, formatted if a formatter was given.
    """
    with pdfplumber.open(pdf_path) as pdf:
        doc_type = determine_document_type(pdf)
        # Short-circuit order matters: the text-layer scan only runs for
        # multi-page documents when image processing is not forced.
        use_images = doc_type == 'single' or force_image_processing or not is_text_based_pdf(pdf)
        if use_images:
            payment_amounts = extract_payment_amounts_from_image(pdf)
        else:
            text = extract_text_from_pdf(pdf_path, wrap_pages=True)
            payment_amounts = extract_payment_amounts_from_text(text, doc_type)

    if payment_amount_formatter:
        payment_amounts = list(map(payment_amount_formatter, payment_amounts))
    return payment_amounts
def extract_payment_amounts_from_text(text: str, doc_type: DocumentType) -> list[PaymentAmount]:
    """Extract payment-amount candidates from document text via the Vertex AI text extractor.

    *doc_type* is only used in the log message; *text* is forwarded unchanged
    to extract_payment_amounts_from_text_with_vertex_ai.
    """
    log_message = f"Extracting payment amounts from {doc_type}-page text-based document"
    remittance_logger.info(log_message)
    amounts = extract_payment_amounts_from_text_with_vertex_ai(text)
    return amounts
def extract_payment_amounts_from_image(pdf: pdfplumber.PDF, dpi: int = 257) -> list[PaymentAmount]:
    """Render every page of *pdf* to a PNG and extract payment-amount candidates from the images.

    Args:
        pdf: An open pdfplumber document.
        dpi: Resolution passed to ``page.to_image``.

    Returns:
        The result of extract_payment_amounts_from_base64_images for the
        rendered pages, in page order.
    """
    remittance_logger.info(f"Extracting payment amounts from {len(pdf.pages)}-page image-based document")

    def render_page_as_base64(page) -> str:
        # Render the page, serialise it as PNG in memory, then base64-encode.
        buffer = io.BytesIO()
        rendered = page.to_image(resolution=dpi)
        rendered.save(buffer, format='PNG')
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

    base64_images = [render_page_as_base64(page) for page in pdf.pages]
    return extract_payment_amounts_from_base64_images(base64_images)
def process_pdf(
    pdf_path: str,
    force_image_processing: bool = False,
    force_multi_hop: bool = False,
    invoice_verifier: InvoiceVerifier | None = None,
    invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None = None,
) -> ProcessedPDFResult:
    """Extract invoice numbers and payment amounts from a PDF and verify them.

    Args:
        pdf_path: Path of the PDF file.
        force_image_processing: Forwarded to both extraction steps.
        force_multi_hop: Forwarded to invoice-number extraction.
        invoice_verifier: Optional verifier forwarded to invoice-number extraction.
        invoice_and_amount_verifier: Optional callable taking an invoice list
            and an amount. Only consulted when exactly one verified invoice
            list exists.

    Returns:
        ``(verified_candidate, unverified_candidate)``, each a tuple of
        ``(invoice_numbers, payment_amounts)``. The verified amounts hold at
        most one entry: the first amount the pair verifier accepts.
    """
    verified_invoice_numbers, unverified_invoice_numbers = extract_invoices_from_pdf(
        pdf_path,
        force_image_processing,
        invoice_verifier,
        force_multi_hop=force_multi_hop,
    )
    payment_amounts = extract_payment_amounts_from_pdf(
        pdf_path,
        force_image_processing,
        payment_amount_formatter=format_amount_str_to_decimal,
    )
    remittance_logger.debug(f"Extracted payment amounts: {payment_amounts}")

    verified_payment_amounts = []
    if invoice_and_amount_verifier and len(verified_invoice_numbers) == 1:
        # Lazy generator: the verifier stops being called after the first match.
        not_found = object()
        first_accepted = next(
            (
                amount
                for amount in payment_amounts
                if invoice_and_amount_verifier(verified_invoice_numbers[0], amount)
            ),
            not_found,
        )
        if first_accepted is not not_found:
            verified_payment_amounts = [first_accepted]

    return (
        (verified_invoice_numbers, verified_payment_amounts),
        (unverified_invoice_numbers, payment_amounts),
    )
| # from typing import list, tuple | |
def process_pdf_with_flow(
    pdf_path: str,
    invoice_verifier: InvoiceVerifier | None = None,
    invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None = None
) -> ProcessedPDFResult:
    """
    Run a fixed sequence of extraction attempts over a PDF until one is conclusive.

    The attempts are, in order: single-hop text, multi-hop text (both only when
    the PDF is text-based), single-hop image, multi-hop image. The first attempt
    yielding exactly one verified invoice list and one verified amount is
    returned immediately; otherwise candidates from all attempts are merged.

    Args:
        pdf_path (str): Path to the PDF file.
        invoice_verifier (InvoiceVerifier | None): Function to verify invoice numbers.
        invoice_and_amount_verifier (InvoiceListAndAmountVerifier | None): Function to verify invoice numbers and amount pairs.

    Returns:
        ProcessedPDFResult: A tuple containing verified and unverified candidates.
    """
    all_verified_invoices: list[InvoiceNumbers] = []
    all_verified_amounts: list[PaymentAmount] = []
    all_unverified_invoices: list[InvoiceNumbers] = []
    all_unverified_amounts: list[PaymentAmount] = []

    with pdfplumber.open(pdf_path) as pdf:
        # Ordered (label, thunk) pairs; each thunk runs only when its turn
        # comes, so later stages are skipped after a conclusive result.
        attempts = []

        if is_text_based_pdf(pdf):
            text = extract_text_from_pdf(pdf_path, wrap_pages=True)
            attempts.append((
                "single hop text processing",
                lambda: process_text_based(text, invoice_verifier, invoice_and_amount_verifier, multi_hop=False),
            ))
            attempts.append((
                "multi hop text processing",
                lambda: process_text_based(text, invoice_verifier, invoice_and_amount_verifier, multi_hop=True),
            ))

        attempts.append((
            "single hop image processing",
            lambda: process_image_based(pdf, invoice_verifier, invoice_and_amount_verifier, multi_hop=False),
        ))
        attempts.append((
            "multi hop image processing",
            lambda: process_image_based(pdf, invoice_verifier, invoice_and_amount_verifier, multi_hop=True),
        ))

        for label, run_attempt in attempts:
            result = run_attempt()
            if has_single_verified_pair(result):
                return result
            accumulate_candidates(result, all_verified_invoices, all_verified_amounts, all_unverified_invoices, all_unverified_amounts)
            remittance_logger.debug(f"Result snapshot - {label}: {result}")

    # If no single verified pair is found, return all accumulated candidates
    return (all_verified_invoices, all_verified_amounts), (all_unverified_invoices, all_unverified_amounts)
def process_text_based(
    text: str,
    invoice_verifier: InvoiceVerifier | None,
    invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None,
    multi_hop: bool
) -> ProcessedPDFResult:
    """Extract invoice numbers and payment amounts from *text* and verify them.

    Both extractors are called with the document type fixed to 'multi'.
    *multi_hop* only affects invoice-number extraction.
    """
    document_kind = 'multi'
    invoice_candidates = extract_invoice_numbers_from_text(text, document_kind, multi_hop)
    amount_candidates = extract_payment_amounts_from_text(text, document_kind)
    return verify_candidates(
        invoice_candidates,
        amount_candidates,
        invoice_verifier,
        invoice_and_amount_verifier,
    )
def process_image_based(
    pdf: pdfplumber.PDF,
    invoice_verifier: InvoiceVerifier | None,
    invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None,
    multi_hop: bool
) -> ProcessedPDFResult:
    """Extract invoice numbers and payment amounts from rendered pages of *pdf* and verify them.

    *multi_hop* only affects invoice-number extraction. Both extractors use
    their default rendering resolution.
    """
    invoice_candidates = extract_invoice_numbers_from_image(pdf, multi_hop)
    amount_candidates = extract_payment_amounts_from_image(pdf)
    return verify_candidates(
        invoice_candidates,
        amount_candidates,
        invoice_verifier,
        invoice_and_amount_verifier,
    )
def verify_candidates(
    invoice_numbers: list[InvoiceNumbers],
    payment_amounts: list[PaymentAmount],
    invoice_verifier: InvoiceVerifier | None,
    invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None
) -> ProcessedPDFResult:
    """Run the optional verifiers over extracted candidates.

    Args:
        invoice_numbers: Candidate invoice-number lists.
        payment_amounts: Candidate payment amounts.
        invoice_verifier: Optional callable applied to each invoice candidate;
            truthy results are kept as the verified invoices.
        invoice_and_amount_verifier: Optional callable taking an invoice list
            and an amount. Only consulted when exactly one verified invoice
            list exists.

    Returns:
        ``((verified_invoices, verified_amounts), (invoice_numbers, payment_amounts))``.
        ``verified_amounts`` holds at most one entry: the first amount the
        pair verifier accepts.
    """
    verified_invoices = []
    verified_amounts = []

    if invoice_verifier:
        for candidate in invoice_numbers:
            # Call the verifier exactly once per candidate: it may be costly,
            # and a second call is not guaranteed to return the same value.
            verification = invoice_verifier(candidate)
            if verification:
                verified_invoices.append(verification)

    if invoice_and_amount_verifier and len(verified_invoices) == 1:
        for amount in payment_amounts:
            if invoice_and_amount_verifier(verified_invoices[0], amount):
                verified_amounts = [amount]
                break

    return (verified_invoices, verified_amounts), (invoice_numbers, payment_amounts)
def has_single_verified_pair(result: ProcessedPDFResult) -> bool:
    """Return True when the verified candidate holds exactly one invoice list and one amount."""
    verified_candidate, _unverified_candidate = result
    if len(verified_candidate[0]) != 1:
        return False
    return len(verified_candidate[1]) == 1
def accumulate_candidates(
    result: ProcessedPDFResult,
    all_verified_invoices: list[InvoiceNumbers],
    all_verified_amounts: list[PaymentAmount],
    all_unverified_invoices: list[InvoiceNumbers],
    all_unverified_amounts: list[PaymentAmount]
) -> None:
    """Merge the candidates of *result* into the four accumulator lists, in place.

    List items (invoice-number lists) count as duplicates when they contain
    the same set of elements, regardless of order. Any other item (payment
    amounts) is a duplicate when it is already present by equality.
    """
    verified, unverified = result

    def already_present(candidate, known: list) -> bool:
        # Invoice-number lists are compared as sets, so order is ignored.
        if isinstance(candidate, list):
            return any(set(candidate) == set(entry) for entry in known)
        return candidate in known

    def merge_into(target: list, incoming: list) -> None:
        # Append one at a time so duplicates within `incoming` itself are
        # also filtered against what was just added.
        for candidate in incoming:
            if already_present(candidate, target):
                continue
            target.append(candidate)

    merge_into(all_verified_invoices, verified[0])
    merge_into(all_verified_amounts, verified[1])
    merge_into(all_unverified_invoices, unverified[0])
    merge_into(all_unverified_amounts, unverified[1])