Spaces:

Saltech
/

remittance-processing

Sleeping

App Files Files Community

remittance-processing / app.py

Alejandro-STC

Update OpenAI key retrieval from environment variable

7698d94 verified over 1 year ago

raw

history blame

11.3 kB

	import os

	#DSPY
	import dspy
	from dspy import Prediction
	from dspy.evaluate import Evaluate
	from dspy import Prediction
	from dspy.teleprompt import BootstrapFewShot
	from dspy.teleprompt import BootstrapFewShotWithRandomSearch

	# Data handling
	# import pandas as pd

	# Calculations and formatting
	import re
	from decimal import Decimal

	# UI
	import gradio as gr
	from gradio_pdf import PDF

	# PDF handling
	import pdfplumber


	# Directory scanned at import time for the example PDFs offered in the UI.
	pdf_examples_dir = './pdfexamples/'

	# Language model handed to DSPy. The API key is read from the
	# OPENAI_PROJECT_KEY environment variable (os.getenv gives None when unset).
	model = dspy.OpenAI(
	    model='gpt-3.5-turbo-0125',
	    api_key=os.getenv('OPENAI_PROJECT_KEY'),
	    max_tokens=2000,
	    temperature=0.01)

	# Register the model as the LM in DSPy's global settings.
	dspy.settings.configure(lm=model)


	# Utils
	def parse_CSV_string(csv_string):
	    """Split a comma-separated string into a list of unique, normalised items.

	    Each item is stripped of surrounding whitespace and lower-cased. Items
	    that are empty after stripping are dropped. Duplicates are removed while
	    keeping first-seen order, so the result is identical on every run
	    (a plain ``set`` would reorder strings between interpreter runs).

	    Args:
	        csv_string: Text such as ``"Invoice No, Reference , invoice no"``.

	    Returns:
	        A list of distinct, non-empty, lower-cased items.
	    """
	    items = (item.strip().lower() for item in csv_string.split(','))
	    # dict.fromkeys de-duplicates while preserving insertion order.
	    return list(dict.fromkeys(item for item in items if item))

	def parse_list_of_CSV_strings(list_of_csv_strings):
	    """Parse every CSV string in the input, giving one parsed list per string."""
	    return [parse_CSV_string(entry) for entry in list_of_csv_strings]

	def parse_invoice_number(s):
	    """Extract the leading invoice-number token from ``s`` when one is present.

	    The pattern allows optional leading whitespace, then at least one
	    non-whitespace character followed by six digits (the format the original
	    code refers to as Siemens'). When nothing matches, ``s`` is returned
	    unchanged.
	    """
	    # The pattern is anchored with ^, so matching from the start is sufficient.
	    found = re.match(r'^\s*?([\S\d]+\d{6})', s)
	    if found is None:
	        return s
	    return found.group(1)

	def standardize_number(s):
	    """Rewrite a textual number so that its last ',' or '.' becomes the decimal point.

	    Everything before that last separator has commas, periods and whitespace
	    removed; everything after it is kept as-is. When the string contains no
	    separator at all, it is returned with those characters removed.
	    """
	    cut = max(s.rfind(','), s.rfind('.'))
	    if cut == -1:
	        # No ',' or '.' present: only strip separators/whitespace.
	        return re.sub(r'[.,\s]', '', s)

	    integer_part = re.sub(r'[.,\s]', '', s[:cut])
	    fractional_part = s[cut + 1:]
	    return f"{integer_part}.{fractional_part}"

	def remove_chars_after_last_digit(s):
	    """Drop the trailing run of non-digit characters that follows the final digit."""
	    trailing_non_digits = re.compile(r'(?<=\d)[^\d]*$')
	    return trailing_non_digits.sub('', s)

	def clean_text(s):
	    """Strip any non-numeric prefix (currency symbol, code, label) from ``s``.

	    The result starts at the first digit in ``s``, or at a minus sign that
	    sits directly in front of that digit, so negative amounts keep their sign.
	    A string that contains no digit is returned unchanged.

	    This replaces a regex substitution that could also fire inside the number
	    itself (``"1234.56"`` became ``"123456"``) and that removed only part of
	    a multi-character prefix (``"eur 1234.56"`` became ``"eu1234.56"``).

	    Args:
	        s: Text holding a number, optionally preceded by other characters.

	    Returns:
	        ``s`` from its first ``-?digit`` onwards, or ``s`` itself when it has
	        no digit.
	    """
	    number_start = re.search(r'-?\d', s)
	    if number_start is None:
	        return s
	    return s[number_start.start():]

	def format_text_decimal(text_decimal):
	    """Normalise a textual decimal by chaining this module's cleaning helpers.

	    Steps, in order: trim and lower-case, standardise the decimal separator,
	    drop anything after the last digit, then clean the leading part.
	    """
	    normalised = text_decimal.strip().lower()
	    normalised = standardize_number(normalised)
	    normalised = remove_chars_after_last_digit(normalised)
	    return clean_text(normalised)


	# PDF handling
	def extract_text_using_pdfplumber(file_path):
	    """Return the text of every page of the PDF at ``file_path`` as one string.

	    Duplicate characters are removed from each page before its text is
	    extracted. Pages are joined with a newline so the last line of one page
	    is not fused with the first line of the next.

	    Args:
	        file_path: Path of the PDF to read.

	    Returns:
	        The concatenated page texts; an empty string for a PDF with no pages.
	    """
	    # TODO: add check for text vs image PDFs
	    with pdfplumber.open(file_path) as pdf:
	        page_texts = [
	            # `or ''` keeps the join safe should extract_text() yield None
	            # for a page without extractable text.
	            page.dedupe_chars(tolerance=1).extract_text() or ''
	            for page in pdf.pages
	        ]
	    return '\n'.join(page_texts)

	def get_PDF_examples(directory):
	    """List the PDF files found directly inside ``directory``.

	    The extension check ignores case, so ``.PDF`` files are included, and
	    names are sorted so the examples appear in the same order on every run
	    (``os.listdir`` makes no ordering guarantee).

	    Args:
	        directory: Folder to scan (not recursive).

	    Returns:
	        Paths built with ``os.path.join(directory, name)``, sorted by file name.

	    Raises:
	        OSError: Propagated from ``os.listdir`` when the directory cannot be read.
	    """
	    return [
	        os.path.join(directory, filename)
	        for filename in sorted(os.listdir(directory))
	        if filename.lower().endswith('.pdf')
	    ]


	# Signatures and Models
	class FindInvoiceNumberColumns(dspy.Signature):
	    """Given an input remittance letter, return a list of column header names that may contain invoice numbers."""
	    # NOTE(review): the docstring above and the `desc` strings below are
	    # presumably sent to the LM as prompt text by DSPy; treat them as runtime
	    # strings, not documentation. Confirm against the DSPy docs.

	    # Input: full text of the remittance letter.
	    content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
	    # Output: a single CSV string of header names (split later with parse_CSV_string).
	    column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "\
	        "invoice numbers")

	class InvoiceColumnHeaders(dspy.Module):
	    """DSPy module that asks the LM which column headers may hold invoice numbers."""

	    def __init__(self):
	        super().__init__()
	        # Plain Predict is used (Ervin's suggestion); dspy.ChainOfThought was
	        # the alternative tried before.
	        self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns)

	    def forward(self, file_content):
	        """Return the predictor's result; ``column_header_names`` on it is a CSV string."""
	        # NOTE: a plain (or already parsed) list could be returned instead for
	        # consistency with the other modules, or a new Prediction that also
	        # carries file_content if the verification step needs more data.
	        return self.potential_invoice_column_headers(content=file_content)

	class FindInvoiceList(dspy.Signature):
	    """Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
	    """that belong to that column."""
	    # NOTE(review): the two adjacent string literals above form one docstring,
	    # which is presumably used by DSPy as prompt text. Confirm before editing.

	    # Input: full text of the remittance letter.
	    content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
	    # Input: one candidate header name whose column should be read.
	    invoice_column_header = dspy.InputField(desc="invoice column header name")
	    # Output: a single CSV string of invoice numbers.
	    candidate_invoice_numbers = dspy.OutputField(desc="comma-separated list of invoice numbers")

	class InvoiceList(dspy.Module):
	    """DSPy module producing one candidate invoice list per candidate column header."""

	    def __init__(self):
	        super().__init__()
	        # A compiled program could be loaded here instead of the baseline module.
	        self.find_invoice_headers = InvoiceColumnHeaders()
	        self.find_invoice_numbers = dspy.Predict(FindInvoiceList)

	    def forward(self, file_content):
	        """Predict headers, then query the invoice numbers under each header.

	        Returns a Prediction (needed by the Evaluate function later on) whose
	        ``candidate_invoice_numbers`` is a list of lists of invoice numbers.
	        """
	        # The header prediction carries a CSV string in "column_header_names".
	        header_prediction = self.find_invoice_headers(file_content=file_content)
	        headers = parse_CSV_string(header_prediction.column_header_names)

	        raw_csv_per_header = [
	            self.find_invoice_numbers(
	                content=file_content, invoice_column_header=header
	            ).candidate_invoice_numbers
	            for header in headers
	        ]

	        # TODO: remove duplicated lists
	        parsed_candidates = parse_list_of_CSV_strings(raw_csv_per_header)
	        return Prediction(candidate_invoice_numbers=parsed_candidates)

	class FindTotalAmountColumns(dspy.Signature):
	    """Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
	    # NOTE(review): the docstring and `desc` strings are presumably prompt text
	    # consumed by DSPy; treat them as runtime strings. Confirm before editing.

	    # Input: full text of the remittance letter.
	    content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
	    # Output: a single CSV string of header names (split later with parse_CSV_string).
	    total_column_header_names = dspy.OutputField(desc="comma-separated list of column header names that may contain "\
	        "the remittance letter total payment amount")

	class TotalAmountColumnHeaders(dspy.Module):
	    """DSPy module that asks the LM which column headers may hold the total amount."""

	    def __init__(self):
	        super().__init__()
	        self.potential_total_amount_column_headers = dspy.Predict(FindTotalAmountColumns)

	    def forward(self, file_content):
	        """Return the predictor's result; ``total_column_header_names`` on it is a CSV string."""
	        return self.potential_total_amount_column_headers(content=file_content)

	class FindTotalAmount(dspy.Signature):
	    """Given an input remittance letter and a column header name output the total payment amount """\
	    """that belongs to that column."""
	    # NOTE(review): the two adjacent string literals above form one docstring,
	    # which is presumably used by DSPy as prompt text. Confirm before editing.

	    # Input: full text of the remittance letter.
	    content = dspy.InputField(desc="remittance letter", format=lambda s:s) # s:s so it doesn't skip the new lines
	    # Input: one candidate header name whose column should be read.
	    total_amount_column_header = dspy.InputField(desc="total amount header name")
	    # Output: the amount as free text (formatted later by format_text_decimal).
	    total_amount = dspy.OutputField(desc="total payment amount")

	class RemittanceLetterTotalAmount(dspy.Module):
	    """DSPy module producing candidate total payment amounts for a remittance letter."""

	    def __init__(self):
	        super().__init__()
	        self.find_total_amount_header = TotalAmountColumnHeaders()
	        self.find_total_amount = dspy.Predict(FindTotalAmount)

	    def forward(self, file_content):
	        """Predict total-amount headers, then query the amount under each header.

	        The invoice list is deliberately not predicted here; the pipeline
	        function calls both modules itself.

	        Args:
	            file_content: Text of the remittance letter.

	        Returns:
	            A Prediction whose ``candidate_total_amounts`` is a list of distinct
	            raw amount strings, in the order they were first produced.
	        """
	        # The header prediction carries a CSV string in "total_column_header_names".
	        header_prediction = self.find_total_amount_header(file_content=file_content)
	        headers = parse_CSV_string(header_prediction.total_column_header_names)

	        raw_amounts = [
	            self.find_total_amount(
	                content=file_content, total_amount_column_header=header
	            ).total_amount
	            for header in headers
	        ]

	        # Remove duplicates while keeping first-seen order; list(set(...)) would
	        # return the candidates in a different order from run to run.
	        unique_amounts = list(dict.fromkeys(raw_amounts))
	        # NOTE: returning the last prediction directly would also work, but
	        # references to candidate_total_amounts would then have to change.
	        return Prediction(candidate_total_amounts=unique_amounts)


	# Pipeline
	def poc_production_pipeline_without_verification(file_content):
	    """Run both extraction modules on the text of a remittance letter.

	    Args:
	        file_content: Text of the remittance letter.

	    Returns:
	        A pair ``(invoice_rows, total_amount_rows)``. Each is a list of
	        one-element tuples (one table row per candidate) for the UI:
	        ``invoice_rows`` wraps each candidate invoice list, and
	        ``total_amount_rows`` wraps each formatted, de-duplicated amount.
	    """
	    # TODO: place this in a module - init allows to pass a compiled module and
	    # forward handles the data, so we can evaluate the pipeline (check if any
	    # tuple matches the verifier).

	    # Get invoice candidates
	    invoice_list_baseline = InvoiceList()
	    candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers

	    # Get total amount candidates
	    total_amount_baseline = RemittanceLetterTotalAmount()
	    raw_total_amounts = total_amount_baseline(file_content=file_content).candidate_total_amounts

	    # Format all decimals. Different raw strings can normalise to the same
	    # amount, so de-duplicate after formatting (first-seen order is kept).
	    candidate_total_amounts = list(dict.fromkeys(map(format_text_decimal, raw_total_amounts)))

	    # For UI visualisation purposes, wrap every candidate in a one-element
	    # tuple so that each one becomes a single-column table row.
	    candidate_invoices_for_UI = [(candidate,) for candidate in candidate_invoices]
	    candidate_total_amounts_for_UI = [(candidate,) for candidate in candidate_total_amounts]

	    return candidate_invoices_for_UI, candidate_total_amounts_for_UI

	def poc_production_pipeline_without_verification_from_PDF(file_path):
	    """Extract the text of the PDF at ``file_path`` and run the pipeline on it."""
	    return poc_production_pipeline_without_verification(
	        extract_text_using_pdfplumber(file_path)
	    )


	# Main app
	# Example PDFs offered in the UI, collected from pdf_examples_dir at import time.
	fake_PDF_examples = get_PDF_examples(pdf_examples_dir)

	# Gradio interface: one PDF input and two single-column tables as outputs,
	# matching the two lists returned by
	# poc_production_pipeline_without_verification_from_PDF.
	remittance_letter_demo_without_verification_from_PDF = gr.Interface(
	    poc_production_pipeline_without_verification_from_PDF,
	    [PDF(label="Remittance letter", height=1000)],
	    [
	        gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Candidate invoices"], wrap=True),
	        gr.Dataframe(col_count=(1, 'fixed'), label="", headers=["Candidate total amounts"], wrap=True)
	    ],
	    examples=fake_PDF_examples,
	    allow_flagging='never'
	)

	# The app is launched as soon as this module is executed.
	remittance_letter_demo_without_verification_from_PDF.launch()