Spaces:
Runtime error
Runtime error
Refactored pipeline, added doc
Browse files
app.py
CHANGED
|
@@ -36,6 +36,11 @@ dspy.settings.configure(lm=model)
|
|
| 36 |
|
| 37 |
# Utils
|
| 38 |
def parse_CSV_string(csv_string):
    """
    Parse a comma-separated string into a list of unique, normalized values.
    Each field is stripped of surrounding whitespace and lower-cased before
    duplicates are removed.
    Args:
        csv_string: comma-separated values (string)
    Returns:
        list of unique lower-cased values, in order of first appearance
        (list of strings)
    """
    normalized = (field.strip().lower() for field in csv_string.split(','))
    # dict.fromkeys keeps the first occurrence of every value in input order;
    # going through set() made the order depend on string hashing, which
    # changes between interpreter runs.
    return list(dict.fromkeys(normalized))
|
| 41 |
|
|
@@ -43,9 +48,47 @@ def parse_list_of_CSV_strings(list_of_csv_strings):
|
|
| 43 |
# Parses a list of CSV strings with invoice numbers into a list of lists
|
| 44 |
parsed_csv_list = []
|
| 45 |
for csv_string in list_of_csv_strings:
|
| 46 |
-
parsed_csv_list.append(
|
| 47 |
return parsed_csv_list
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
def parse_invoice_number(s):
|
| 50 |
# Return the invoice number in Siemens' format if found, otherwise just return the string
|
| 51 |
rp = r'^\s*?([\S\d]+\d{6})'
|
|
@@ -116,21 +159,22 @@ class FindInvoiceNumberColumns(dspy.Signature):
|
|
| 116 |
"invoice numbers")
|
| 117 |
|
| 118 |
class InvoiceColumnHeaders(dspy.Module):
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
super().__init__()
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns) # Ervin suggests Predict
|
| 124 |
|
| 125 |
def forward(self, file_content):
|
| 126 |
prediction = self.potential_invoice_column_headers(content=file_content)
|
| 127 |
-
#
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
# This creates a new Prediction object adding the File Content
|
| 132 |
-
# return Prediction(content=file_content, column_header_names=prediction.column_header_names, rationale=prediction.rationale)
|
| 133 |
-
# Creating a new Prediction object with extra data can be useful if we need more data for the verification
|
| 134 |
|
| 135 |
class FindInvoiceList(dspy.Signature):
|
| 136 |
"""Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
|
|
@@ -140,30 +184,32 @@ class FindInvoiceList(dspy.Signature):
|
|
| 140 |
candidate_invoice_numbers = dspy.OutputField(desc="comma-separated list of invoice numbers")
|
| 141 |
|
| 142 |
class InvoiceList(dspy.Module):
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
super().__init__()
|
| 145 |
-
self.
|
|
|
|
| 146 |
self.find_invoice_numbers = dspy.Predict(FindInvoiceList)
|
| 147 |
|
| 148 |
def forward(self, file_content):
|
| 149 |
-
# Predict column headers (returns a Prediction with a CSV string in "column_header_names")
|
| 150 |
predict_column_headers = self.find_invoice_headers(file_content=file_content)
|
| 151 |
-
|
| 152 |
-
potential_invoice_column_headers = parse_CSV_string(predict_column_headers.column_header_names)
|
| 153 |
-
|
| 154 |
-
potential_invoices = []
|
| 155 |
|
|
|
|
| 156 |
for header in potential_invoice_column_headers:
|
| 157 |
prediction = self.find_invoice_numbers(content=file_content, invoice_column_header=header)
|
| 158 |
-
|
| 159 |
-
|
| 160 |
# Remove duplicates
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
# return Prediction(candidate_invoice_numbers=candidates, column_header_names=col_names)
|
| 164 |
-
# return potential_invoices
|
| 165 |
-
# We need to return a Prediction for the Evaluate function later on
|
| 166 |
-
return Prediction(candidate_invoice_numbers=potential_invoices)
|
| 167 |
|
| 168 |
class FindTotalAmountColumns(dspy.Signature):
|
| 169 |
"""Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
|
|
@@ -203,7 +249,7 @@ class RemittanceLetterTotalAmount(dspy.Module):
|
|
| 203 |
# Predict column headers (returns a Prediction with a CSV string in "column_header_names")
|
| 204 |
predict_column_headers = self.find_total_amount_header(file_content=file_content)
|
| 205 |
# Parse CSV into a list
|
| 206 |
-
potential_total_amount_column_headers =
|
| 207 |
|
| 208 |
potential_total_amounts = []
|
| 209 |
|
|
@@ -225,6 +271,8 @@ def poc_production_pipeline_without_verification(file_content):
|
|
| 225 |
invoice_list_baseline = InvoiceList()
|
| 226 |
candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
|
| 227 |
|
|
|
|
|
|
|
| 228 |
# Get total amount candidates
|
| 229 |
total_amount_baseline = RemittanceLetterTotalAmount()
|
| 230 |
|
|
|
|
| 36 |
|
| 37 |
# Utils
|
| 38 |
def parse_CSV_string(csv_string):
    """
    Split a comma-separated string into its fields.
    Surrounding whitespace is trimmed from every field; order, letter case
    and duplicates are kept as they appear in the input.
    Args:
        csv_string: comma-separated values (string)
    Returns:
        list of trimmed fields (list of strings)
    """
    return [field.strip() for field in csv_string.split(',')]
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def parse_CSV_string_to_unique(csv_string):
    """
    Parse a comma-separated string into a list of unique, normalized values.
    Each field is stripped of surrounding whitespace and lower-cased before
    duplicates are removed.
    Args:
        csv_string: comma-separated values (string)
    Returns:
        list of unique lower-cased values, in order of first appearance
        (list of strings)
    """
    normalized = (field.strip().lower() for field in csv_string.split(','))
    # dict.fromkeys keeps the first occurrence of every value in input order;
    # going through set() made the order depend on string hashing, which
    # changes between interpreter runs.
    return list(dict.fromkeys(normalized))
|
| 46 |
|
|
|
|
| 48 |
# Parses a list of CSV strings with invoice numbers into a list of lists
|
| 49 |
parsed_csv_list = []
|
| 50 |
for csv_string in list_of_csv_strings:
|
| 51 |
+
parsed_csv_list.append(parse_CSV_string_to_unique(csv_string))
|
| 52 |
return parsed_csv_list
|
| 53 |
|
| 54 |
+
def parse_column_names(s):
    """
    Parse a comma-separated list of column names from a string.
    Removes the 'Column Header Names:' prefix (matched case-insensitively)
    before splitting the string.
    Args:
        s: raw response from the model, comma-separated list of column names (string)
    Returns:
        list of column names with surrounding whitespace removed; empty
        entries are dropped (list of strings)
    """
    prefix = 'Column Header Names:'
    # Strip first so the prefix is sliced off the same text it was matched
    # against; slicing the unstripped string cut at the wrong offset whenever
    # the response started with whitespace.
    s = s.strip()
    if s.lower().startswith(prefix.lower()):
        s = s[len(prefix):]
    # An empty name (blank response, trailing comma) is never a usable header.
    return [name for name in map(str.strip, s.split(',')) if name]
|
| 70 |
+
|
| 71 |
+
def remove_duplicate_lists(lists):
    """
    Drop lists that repeat the contents of an earlier list.
    Two lists count as duplicates when they hold the same items once sorted,
    so element order inside a list does not matter. The first list of each
    group is kept, unchanged and in its original position.
    Args:
        lists: a list of lists of strings
    Returns:
        a list of lists of strings, where each list is unique
    """
    fingerprints = set()
    kept = []
    for candidate in lists:
        # The sorted tuple is a hashable, order-insensitive key for the list.
        key = tuple(sorted(candidate))
        if key in fingerprints:
            continue
        fingerprints.add(key)
        kept.append(candidate)
    return kept
|
| 90 |
+
|
| 91 |
+
|
| 92 |
def parse_invoice_number(s):
|
| 93 |
# Return the invoice number in Siemens' format if found, otherwise just return the string
|
| 94 |
rp = r'^\s*?([\S\d]+\d{6})'
|
|
|
|
| 159 |
"invoice numbers")
|
| 160 |
|
| 161 |
class InvoiceColumnHeaders(dspy.Module):
    """
    Predict the column headers containing invoice numbers from the remittance letter.
    Attributes:
        response_parser: a function that takes a string and returns a list of strings.
        potential_invoice_column_headers: dspy.Predict built on FindInvoiceNumberColumns.
    """

    def __init__(self, response_parser=parse_CSV_string):
        super().__init__()
        self.response_parser = response_parser
        self.potential_invoice_column_headers = dspy.Predict(FindInvoiceNumberColumns)

    def forward(self, file_content):
        """
        Predict the invoice-number column headers for one remittance letter.
        Args:
            file_content: the remittance letter, passed to the predictor as `content`
        Returns:
            A Prediction whose `column_header_names` is the list of unique
            parsed headers, in the order the parser returned them.
        """
        prediction = self.potential_invoice_column_headers(content=file_content)
        parsed_headers = self.response_parser(prediction.column_header_names)
        # Remove duplicates while keeping first-seen order. set() ordered the
        # headers by string hash, so the result changed between interpreter
        # runs for the same model response.
        unique_headers = list(dict.fromkeys(parsed_headers))
        # Create a new Prediction object with the unique headers
        return Prediction(column_header_names=unique_headers)
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
class FindInvoiceList(dspy.Signature):
|
| 180 |
"""Given an input remittance letter and a column header name output a comma-separated list of all invoice numbers """\
|
|
|
|
| 184 |
candidate_invoice_numbers = dspy.OutputField(desc="comma-separated list of invoice numbers")
|
| 185 |
|
| 186 |
class InvoiceList(dspy.Module):
    """
    Collects candidate invoice-number lists from a remittance letter.
    The module first asks InvoiceColumnHeaders for the headers that may hold
    invoice numbers, then runs one FindInvoiceList prediction per header.
    Attributes:
        response_parser: A function that takes a string and returns a list of invoice numbers.
        find_invoice_headers: InvoiceColumnHeaders module using parse_column_names.
        find_invoice_numbers: dspy.Predict built on FindInvoiceList.
    Returns:
        A Prediction object with the following fields:
        candidate_invoice_numbers: A list of lists of invoice numbers.
    """

    def __init__(self, response_parser=parse_CSV_string_to_unique):
        super().__init__()
        self.response_parser = response_parser
        # A compiled program could be loaded here instead.
        self.find_invoice_headers = InvoiceColumnHeaders(response_parser=parse_column_names)
        self.find_invoice_numbers = dspy.Predict(FindInvoiceList)

    def forward(self, file_content):
        header_prediction = self.find_invoice_headers(file_content=file_content)
        # One prediction per candidate header, each parsed into its own list.
        per_header_invoices = [
            self.response_parser(
                self.find_invoice_numbers(
                    content=file_content,
                    invoice_column_header=header,
                ).candidate_invoice_numbers
            )
            for header in header_prediction.column_header_names
        ]
        # Different headers can yield the same invoices; keep each list once.
        return Prediction(
            candidate_invoice_numbers=remove_duplicate_lists(per_header_invoices)
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
class FindTotalAmountColumns(dspy.Signature):
|
| 215 |
"""Given an input remittance letter, return a list of column header names that may contain the total payment amount."""
|
|
|
|
| 249 |
# Predict column headers (returns a Prediction with a CSV string in "column_header_names")
|
| 250 |
predict_column_headers = self.find_total_amount_header(file_content=file_content)
|
| 251 |
# Parse CSV into a list
|
| 252 |
+
potential_total_amount_column_headers = parse_CSV_string_to_unique(predict_column_headers.total_column_header_names)
|
| 253 |
|
| 254 |
potential_total_amounts = []
|
| 255 |
|
|
|
|
| 271 |
invoice_list_baseline = InvoiceList()
|
| 272 |
candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
|
| 273 |
|
| 274 |
+
candidate_invoices = [",".join(lst) for lst in candidate_invoices]
|
| 275 |
+
|
| 276 |
# Get total amount candidates
|
| 277 |
total_amount_baseline = RemittanceLetterTotalAmount()
|
| 278 |
|