Spaces:

Saltech
/

remittance-processing

Sleeping

App Files Files Community

Alejandro-STC committed on Oct 9, 2024

Commit

8de3fa1

verified ·

1 Parent(s): 6f94dc6

Add verification functionality and improve UI

Browse files

Files changed (1) hide show

app.py +98 -33

app.py CHANGED Viewed

@@ -25,6 +25,12 @@ import pdfplumber
 pdf_examples_dir = './pdfexamples/'
 model = dspy.OpenAI(
     model='gpt-3.5-turbo-0125',
     api_key=os.getenv('OPENAI_PROJECT_KEY'),
@@ -147,7 +153,7 @@ def get_PDF_examples(directory):
   example_pdf_files = []
   for filename in os.listdir(directory):
       if filename.endswith('.pdf'):
-          example_pdf_files.append(os.path.join(directory, filename))
   return example_pdf_files
@@ -155,7 +161,7 @@ def get_PDF_examples(directory):
 class FindInvoiceNumberColumns(dspy.Signature):
   """Given an input remittance letter, return a list of column header names that may contain invoice numbers."""
   content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
-  column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "\
                                                 "invoice numbers")
 class InvoiceColumnHeaders(dspy.Module):
@@ -177,7 +183,7 @@ class InvoiceColumnHeaders(dspy.Module):
     return Prediction(column_header_names=unique_headers)
 class FindInvoiceList(dspy.Signature):
-  """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
   """that belong to that column."""
   content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
   invoice_column_header = dspy.InputField(desc="invoice column header name")
@@ -214,7 +220,7 @@ class InvoiceList(dspy.Module):
 class FindTotalAmountColumns(dspy.Signature):
   """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
   content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
-  total_column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "\
                                                 "the remittance letter total payment amount")
 class TotalAmountColumnHeaders(dspy.Module):
@@ -227,7 +233,7 @@ class TotalAmountColumnHeaders(dspy.Module):
     return prediction
 class FindTotalAmount(dspy.Signature):
-  """Given an input remittance letter and a column header name output the total payment amount """\
   """that belongs to that column."""
   content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
   total_amount_column_header = dspy.InputField(desc="total amount header name")
@@ -256,13 +262,13 @@ class RemittanceLetterTotalAmount(dspy.Module):
     return Prediction(candidate_total_amounts=potential_total_amounts)
-# Pipeline
-def poc_production_pipeline_without_verification(file_content):
   # Get invoice candidates
   invoice_list_baseline = InvoiceList()
   candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
-  candidate_invoices = [",".join(lst) for lst in candidate_invoices]
   # Get total amount candidates
   total_amount_baseline = RemittanceLetterTotalAmount()
@@ -273,35 +279,94 @@ def poc_production_pipeline_without_verification(file_content):
   # Only keep unique amounts
   candidate_total_amounts = list(set(candidate_total_amounts))
-  # For UI visualisation purposes, create a list of tuples where the second tuple value is empty
-  candidate_invoices_for_UI = []
-  candidate_total_amounts_for_UI = []
   for candidate in candidate_invoices:
-      candidate_invoices_for_UI.append((candidate,))
   for candidate in candidate_total_amounts:
-      candidate_total_amounts_for_UI.append((candidate,))
-  return candidate_invoices_for_UI, candidate_total_amounts_for_UI
-def poc_production_pipeline_without_verification_from_PDF(file_path):
-  file_content = extract_text_using_pdfplumber(file_path)
-  return poc_production_pipeline_without_verification(file_content)
-# Main app
-fake_PDF_examples = get_PDF_examples(pdf_examples_dir)
-remittance_letter_demo_without_verification_from_PDF = gr.Interface(
-  poc_production_pipeline_without_verification_from_PDF,
-  [PDF(label="Remittance advice", height=1000)],
-  [
-    gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved invoice proposals"], wrap=True),
-    gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved total amount proposals"], wrap=True)
-  ],
-  examples=fake_PDF_examples,
-  allow_flagging='never'
-)
-remittance_letter_demo_without_verification_from_PDF.launch()

 pdf_examples_dir = './pdfexamples/'
+# model = dspy.LM(
+#     model='gpt-3.5-turbo',
+#     api_key=os.getenv('OPENAI_PROJECT_KEY'),
+#     max_tokens=2000,
+#     temperature=0.01)
 model = dspy.OpenAI(
     model='gpt-3.5-turbo-0125',
     api_key=os.getenv('OPENAI_PROJECT_KEY'),
   example_pdf_files = []
   for filename in os.listdir(directory):
       if filename.endswith('.pdf'):
+          example_pdf_files.append([os.path.join(directory, filename), '', ''])
   return example_pdf_files
 class FindInvoiceNumberColumns(dspy.Signature):
   """Given an input remittance letter, return a list of column header names that may contain invoice numbers."""
   content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
+  column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "
                                                 "invoice numbers")
 class InvoiceColumnHeaders(dspy.Module):
     return Prediction(column_header_names=unique_headers)
 class FindInvoiceList(dspy.Signature):
+  """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """
   """that belong to that column."""
   content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
   invoice_column_header = dspy.InputField(desc="invoice column header name")
 class FindTotalAmountColumns(dspy.Signature):
   """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
   content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
+  total_column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "
                                                 "the remittance letter total payment amount")
 class TotalAmountColumnHeaders(dspy.Module):
     return prediction
 class FindTotalAmount(dspy.Signature):
+  """Given an input remittance letter and a column header name output the total payment amount """
   """that belongs to that column."""
   content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
   total_amount_column_header = dspy.InputField(desc="total amount header name")
     return Prediction(candidate_total_amounts=potential_total_amounts)
+# Pipeline with Verification
+def poc_production_pipeline_with_verification(file_content, verification_invoices, verification_total_amount):
+  """Collect invoice and total-amount candidates and report which ones match the verification inputs.
+
+  Args:
+    file_content: remittance letter content, passed to ``InvoiceList`` as ``file_content``.
+    verification_invoices: string handed to ``parse_CSV_string_to_unique`` to obtain the expected invoice numbers.
+    verification_total_amount: expected total, passed through ``format_text_decimal`` before it is compared.
+
+  Returns:
+    Four lists of 1-tuples, in this order: invoice candidates, total-amount candidates,
+    validated invoices, validated total amounts.
+  """
   # Get invoice candidates
   invoice_list_baseline = InvoiceList()
   candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
+  # Each candidate list is sorted before joining so the string comparison below ignores ordering
+  candidate_invoices = [','.join(sorted(lst)) for lst in candidate_invoices]
   # Get total amount candidates
   total_amount_baseline = RemittanceLetterTotalAmount()
   # Only keep unique amounts
   candidate_total_amounts = list(set(candidate_total_amounts))
+  # Verify invoices
+  verification_invoices_list = parse_CSV_string_to_unique(verification_invoices)
+  # Same sort-then-join normalisation as the candidates above
+  verification_invoices_list_sorted = ','.join(sorted(verification_invoices_list))
+  validated_invoices = []
   for candidate in candidate_invoices:
+      # A candidate is validated only when its whole joined list equals the expected one
+      if candidate == verification_invoices_list_sorted:
+          validated_invoices.append(candidate)
+  # Verify total amount
+  verification_total_amount_formatted = format_text_decimal(verification_total_amount)
+  validated_total_amount = []
   for candidate in candidate_total_amounts:
+      # Plain equality; assumes candidates are already in format_text_decimal's output format - TODO confirm
+      if candidate == verification_total_amount_formatted:
+          validated_total_amount.append(candidate)
+  # Prepare output for UI
+  # Every value is wrapped in a 1-tuple, presumably one row per value in the single-column tables - TODO confirm
+  candidate_invoices_for_UI = [(candidate,) for candidate in candidate_invoices]
+  candidate_total_amounts_for_UI = [(candidate,) for candidate in candidate_total_amounts]
+  validated_invoices_for_UI = [(validated,) for validated in validated_invoices]
+  validated_total_amount_for_UI = [(validated,) for validated in validated_total_amount]
+  return candidate_invoices_for_UI, candidate_total_amounts_for_UI, validated_invoices_for_UI, validated_total_amount_for_UI
+def poc_production_pipeline_with_verification_from_PDF(file_path, verification_invoices, verification_total_amount):
+  """Run the verification pipeline on the output of ``extract_text_using_pdfplumber(file_path)``.
+
+  The two verification arguments are forwarded unchanged, and the pipeline's
+  four-element result is returned as-is.
+  """
+  return poc_production_pipeline_with_verification(
+      extract_text_using_pdfplumber(file_path),
+      verification_invoices,
+      verification_total_amount,
+  )
+# Main app function
+def main():
+    """Build the Gradio Blocks UI for the remittance PDF processor and launch it.
+
+    The page places the PDF input next to the verification textboxes and the
+    four result tables. Both the submit button and the example PDFs run
+    ``poc_production_pipeline_with_verification_from_PDF``.
+    """
+    fake_PDF_examples = get_PDF_examples(pdf_examples_dir)
+
+    def run_example(pdf_path):
+        # gr.Examples below supplies only the PDF input, while the pipeline
+        # takes three arguments. Pass the same blank verification values that
+        # get_PDF_examples attaches to every example row, so that caching the
+        # examples does not fail on missing arguments.
+        return poc_production_pipeline_with_verification_from_PDF(pdf_path, '', '')
+
+    with gr.Blocks() as remittance_demo:
+        gr.Markdown("# Remittance PDF Processor")
+        gr.Markdown("Upload a PDF file to extract invoice numbers and payment amounts. Provide verification data if available for comparison.")
+        with gr.Row():
+            with gr.Column():
+                pdf_input = PDF(label="Remittance advice", height=900)
+            with gr.Column():
+                with gr.Accordion("Verification Inputs", open=False):
+                    verification_invoices = gr.Textbox(
+                        label="Verification Invoices (comma-separated)",
+                        placeholder="Enter invoice numbers here...",
+                    )
+                    verification_total_amount = gr.Textbox(
+                        label="Verification Total Amount",
+                        placeholder="Enter total amount here...",
+                    )
+                retrieved_invoices = gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved Invoice Proposals"], wrap=True)
+                retrieved_amounts = gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Retrieved Total Amount Proposals"], wrap=True)
+                validated_invoices = gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Validated Invoices"], wrap=True)
+                validated_total_amount = gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Validated Total Amount"], wrap=True)
+                submit_button = gr.Button("Process document")
+
+        # The button and the examples fill the same four tables.
+        result_tables = [retrieved_invoices, retrieved_amounts, validated_invoices, validated_total_amount]
+
+        submit_button.click(
+            poc_production_pipeline_with_verification_from_PDF,
+            inputs=[pdf_input, verification_invoices, verification_total_amount],
+            outputs=result_tables,
+        )
+        gr.Examples(
+            # Keep only the file path of each example row so that the examples
+            # list shows PDFs and nothing else.
+            examples=[[pdf[0]] for pdf in fake_PDF_examples],
+            inputs=[pdf_input],
+            outputs=result_tables,
+            fn=run_example,
+            cache_examples=True,
+        )
+
+    # Launch after the Blocks context has closed and the layout is complete.
+    remittance_demo.launch()
+# Run the main app if the file is executed directly
+if __name__ == "__main__":
+    main()