Spaces:
Runtime error
Runtime error
Update comments
Browse files
app.py
CHANGED
|
@@ -70,7 +70,7 @@ def parse_column_names(s):
|
|
| 70 |
|
| 71 |
def remove_duplicate_lists(lists):
|
| 72 |
"""
|
| 73 |
-
|
| 74 |
Args:
|
| 75 |
lists:
|
| 76 |
a list of lists of strings
|
|
@@ -90,7 +90,7 @@ def remove_duplicate_lists(lists):
|
|
| 90 |
|
| 91 |
|
| 92 |
def parse_invoice_number(s):
    """Extract a leading invoice number from ``s``.

    The pattern skips optional leading whitespace and captures one or more
    non-whitespace characters followed by six digits.

    Args:
        s: the string to search.

    Returns:
        The captured invoice number when the pattern matches at the start
        of ``s``; otherwise ``s`` itself, unchanged.
    """
    found = re.search(r'^\s*?([\S\d]+\d{6})', s)
    if found is None:
        return s
    return found.group(1)
|
|
@@ -134,11 +134,11 @@ def format_text_decimal(text_decimal):
|
|
| 134 |
|
| 135 |
# PDF handling
|
| 136 |
def extract_text_using_pdfplumber(file_path):
    """Extract the text of every page of a PDF with pdfplumber.

    Args:
        file_path: location of the PDF, passed straight to
            ``pdfplumber.open``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    # TODO: add check for text vs image PDF
    with pdfplumber.open(file_path) as pdf:
        page_texts = []
        for page in pdf.pages:
            # Remove duplicate characters from the page
            deduped_page = page.dedupe_chars(tolerance=1)
            # NOTE(review): a page without extractable text may yield None
            # rather than '' (presumably on older pdfplumber releases);
            # treat it as empty so the concatenation cannot raise TypeError.
            page_texts.append(deduped_page.extract_text() or '')
        return ''.join(page_texts)
|
|
@@ -236,16 +236,10 @@ class FindTotalAmount(dspy.Signature):
|
|
| 236 |
class RemittanceLetterTotalAmount(dspy.Module):
|
| 237 |
def __init__(self):
|
| 238 |
super().__init__()
|
| 239 |
-
# self.find_invoice_list = InvoiceList()
|
| 240 |
self.find_total_amount_header = TotalAmountColumnHeaders()
|
| 241 |
self.find_total_amount = dspy.Predict(FindTotalAmount)
|
| 242 |
|
| 243 |
def forward(self, file_content):
|
| 244 |
-
# Predict invoice list - we could do this here, but let's just call the 2 modules from a function instead
|
| 245 |
-
# if we called the invoice list prediction here, we should return an object with both the potential total amounts
|
| 246 |
-
# and the potential invoice lists
|
| 247 |
-
# predict_invoice_list = self.find_invoice_list(file_content=file_content)
|
| 248 |
-
|
| 249 |
# Predict column headers (returns a Prediction with a CSV string in "column_header_names")
|
| 250 |
predict_column_headers = self.find_total_amount_header(file_content=file_content)
|
| 251 |
# Parse CSV into a list
|
|
@@ -259,14 +253,11 @@ class RemittanceLetterTotalAmount(dspy.Module):
|
|
| 259 |
|
| 260 |
# Remove duplicates
|
| 261 |
potential_total_amounts = list(set(potential_total_amounts))
|
| 262 |
-
return Prediction(candidate_total_amounts=potential_total_amounts)
|
| 263 |
|
| 264 |
|
| 265 |
# Pipeline
|
| 266 |
def poc_production_pipeline_without_verification(file_content):
|
| 267 |
-
# TODO: place this in a module - init allows to pass a compiled module and forward handles the data:
|
| 268 |
-
# so we can evaluate the pipeline (check if any tuple matches the verifier)
|
| 269 |
-
|
| 270 |
# Get invoice candidates
|
| 271 |
invoice_list_baseline = InvoiceList()
|
| 272 |
candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
|
|
@@ -296,7 +287,6 @@ def poc_production_pipeline_without_verification(file_content):
|
|
| 296 |
|
| 297 |
def poc_production_pipeline_without_verification_from_PDF(file_path):
    """Run the proof-of-concept pipeline on a PDF file.

    Args:
        file_path: path of the PDF, handed to
            ``extract_text_using_pdfplumber``.

    Returns:
        Whatever ``poc_production_pipeline_without_verification`` returns
        for the extracted text.
    """
    return poc_production_pipeline_without_verification(
        extract_text_using_pdfplumber(file_path)
    )
|
| 301 |
|
| 302 |
|
|
|
|
| 70 |
|
| 71 |
def remove_duplicate_lists(lists):
|
| 72 |
"""
|
| 73 |
+
Remove duplicate lists from a list of lists.
|
| 74 |
Args:
|
| 75 |
lists:
|
| 76 |
a list of lists of strings
|
|
|
|
| 90 |
|
| 91 |
|
| 92 |
def parse_invoice_number(s):
    """Extract a leading invoice number from ``s``.

    The pattern skips optional leading whitespace and captures one or more
    non-whitespace characters followed by six digits.

    Args:
        s: the string to search.

    Returns:
        The captured invoice number when the pattern matches at the start
        of ``s``; otherwise ``s`` itself, unchanged.
    """
    found = re.search(r'^\s*?([\S\d]+\d{6})', s)
    if found is None:
        return s
    return found.group(1)
|
|
|
|
| 134 |
|
| 135 |
# PDF handling
|
| 136 |
def extract_text_using_pdfplumber(file_path):
    """Extract the text of every page of a PDF with pdfplumber.

    Args:
        file_path: location of the PDF, passed straight to
            ``pdfplumber.open``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    # TODO: add check for text vs image PDF
    with pdfplumber.open(file_path) as pdf:
        page_texts = []
        for page in pdf.pages:
            # Remove duplicate characters from the page
            deduped_page = page.dedupe_chars(tolerance=1)
            # NOTE(review): a page without extractable text may yield None
            # rather than '' (presumably on older pdfplumber releases);
            # treat it as empty so the concatenation cannot raise TypeError.
            page_texts.append(deduped_page.extract_text() or '')
        return ''.join(page_texts)
|
|
|
|
| 236 |
class RemittanceLetterTotalAmount(dspy.Module):
|
| 237 |
def __init__(self):
|
| 238 |
super().__init__()
|
|
|
|
| 239 |
self.find_total_amount_header = TotalAmountColumnHeaders()
|
| 240 |
self.find_total_amount = dspy.Predict(FindTotalAmount)
|
| 241 |
|
| 242 |
def forward(self, file_content):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
# Predict column headers (returns a Prediction with a CSV string in "column_header_names")
|
| 244 |
predict_column_headers = self.find_total_amount_header(file_content=file_content)
|
| 245 |
# Parse CSV into a list
|
|
|
|
| 253 |
|
| 254 |
# Remove duplicates
|
| 255 |
potential_total_amounts = list(set(potential_total_amounts))
|
| 256 |
+
return Prediction(candidate_total_amounts=potential_total_amounts)
|
| 257 |
|
| 258 |
|
| 259 |
# Pipeline
|
| 260 |
def poc_production_pipeline_without_verification(file_content):
|
|
|
|
|
|
|
|
|
|
| 261 |
# Get invoice candidates
|
| 262 |
invoice_list_baseline = InvoiceList()
|
| 263 |
candidate_invoices = invoice_list_baseline(file_content=file_content).candidate_invoice_numbers
|
|
|
|
| 287 |
|
| 288 |
def poc_production_pipeline_without_verification_from_PDF(file_path):
    """Run the proof-of-concept pipeline on a PDF file.

    Args:
        file_path: path of the PDF, handed to
            ``extract_text_using_pdfplumber``.

    Returns:
        Whatever ``poc_production_pipeline_without_verification`` returns
        for the extracted text.
    """
    return poc_production_pipeline_without_verification(
        extract_text_using_pdfplumber(file_path)
    )
|
| 291 |
|
| 292 |
|