sunnysharma20 committed on
Commit
ad07262
·
verified ·
1 Parent(s): f5bdeb7

Update backend.py

Browse files
Files changed (1) hide show
  1. backend.py +56 -20
backend.py CHANGED
@@ -43,11 +43,11 @@ class InvoicePipeline:
43
  def run(self) -> pd.DataFrame:
44
  # We have defined the way the data has to be returned
45
  df = pd.DataFrame({
46
- "Invoice ID": pd.Series(dtype="int"),
47
  "DESCRIPTION": pd.Series(dtype="str"),
48
  "Issue Data": pd.Series(dtype="str"),
49
  "UNIT PRICE": pd.Series(dtype="str"),
50
- "AMOUNT": pd.Series(dtype="int"),
51
  "Bill For": pd.Series(dtype="str"),
52
  "From": pd.Series(dtype="str"),
53
  "Terms": pd.Series(dtype="str")}
@@ -58,18 +58,43 @@ class InvoicePipeline:
58
  llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text) # Apply rate limit here
59
  if llm_resp: # Check for None response from rate limiter
60
  data = self._parse_response(llm_resp)
61
- df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
 
 
 
 
62
  else:
63
  print(f"Skipping file due to rate limit or API error: {path}")
64
 
65
  return df
66
 
67
- # The default template that the machine will take
68
  def _get_default_prompt_template(self) -> PromptTemplate:
69
- template = """Extract all the following values: Invoice ID, DESCRIPTION, Issue Data,UNIT PRICE, AMOUNT, Bill for, From and Terms for: {pages}
70
- Expected Outcome: remove any dollar symbols {{"Invoice ID":"12341234", "DESCRIPTION": "UNIT PRICE", "AMOUNT": "3", "Date": "2/1/2021", "AMOUNT": "100", "Bill For": "Dev", "From": "Coca Cola", "Terms" : "Net for 30 days"}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  """
72
-
73
  prompt_template = PromptTemplate(input_variables=["pages"], template=template)
74
  return prompt_template
75
 
@@ -86,16 +111,27 @@ class InvoicePipeline:
86
  return resp
87
 
88
  def _parse_response(self, response: str) -> Dict[str, str]:
89
- pattern = r'{(.+)}'
90
- re_match = re.search(pattern, response, re.DOTALL)
91
- if re_match:
92
- extracted_text = re_match.group(1)
93
- try:
94
- data = eval('{' + extracted_text + '}')
95
- return data
96
- except (SyntaxError, NameError) as e:
97
- print(f"Error parsing response: {e}")
98
- return {} # Return an empty dictionary to avoid crashing
99
-
100
- else:
101
- raise Exception("No match found.")
 
 
 
 
 
 
 
 
 
 
 
 
43
  def run(self) -> pd.DataFrame:
44
  # We have defined the way the data has to be returned
45
  df = pd.DataFrame({
46
+ "Invoice ID": pd.Series(dtype="str"), # Changed to string to accommodate the invoice number format
47
  "DESCRIPTION": pd.Series(dtype="str"),
48
  "Issue Data": pd.Series(dtype="str"),
49
  "UNIT PRICE": pd.Series(dtype="str"),
50
+ "AMOUNT": pd.Series(dtype="str"), # Changed to string to handle potential non-integer values
51
  "Bill For": pd.Series(dtype="str"),
52
  "From": pd.Series(dtype="str"),
53
  "Terms": pd.Series(dtype="str")}
 
58
  llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text) # Apply rate limit here
59
  if llm_resp: # Check for None response from rate limiter
60
  data = self._parse_response(llm_resp)
61
+ if data: # Only append if parsing was successful
62
+ df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
63
+ else:
64
+ print(f"Skipping file {path} due to parsing failure.")
65
+
66
  else:
67
  print(f"Skipping file due to rate limit or API error: {path}")
68
 
69
  return df
70
 
 
71
  def _get_default_prompt_template(self) -> PromptTemplate:
72
+ template = """You are an expert invoice data extractor. Analyze the following text and extract the specified fields. Return the results in a *structured, easily parseable format*.
73
+
74
+ Here are the extraction requirements:
75
+
76
+ 1. **Invoice ID:** The unique identifier for the invoice.
77
+ 2. **DESCRIPTION:** A brief description of the product or service provided.
78
+ 3. **Issue Data:** The date the invoice was issued.
79
+ 4. **UNIT PRICE:** The price per unit of the product or service.
80
+ 5. **AMOUNT:** The total amount due for the line item.
81
+ 6. **Bill For:** The entity or individual being billed.
82
+ 7. **From:** The name of the company issuing the invoice.
83
+ 8. **Terms:** The payment terms (e.g., "Net 30 days").
84
+
85
+ *Important Instructions*:
86
+ * Return a single line containing only the extracted values. Do *NOT* include any introductory text, conversational elements, or explanations.
87
+ * Enclose *each value* in double quotes. If a value is not found or is not applicable return "N/A".
88
+ * Do *NOT* include currency symbols (e.g., $, €, £).
89
+ * Separate each extracted value with a pipe symbol (`|`).
90
+ * The order of the extracted values *MUST* be: Invoice ID | DESCRIPTION | Issue Data | UNIT PRICE | AMOUNT | Bill For | From | Terms
91
+
92
+ Example:
93
+ "12345" | "Consulting Services" | "2023-11-15" | "100.00" | "1000.00" | "Acme Corp" | "XYZ Consulting" | "Net 30 days"
94
+
95
+ Here is the text to analyze:
96
+ {pages}
97
  """
 
98
  prompt_template = PromptTemplate(input_variables=["pages"], template=template)
99
  return prompt_template
100
 
 
111
  return resp
112
 
113
  def _parse_response(self, response: str) -> Dict[str, str]:
114
+ """Parses the LLM response using regular expressions."""
115
+ try:
116
+ # Split the response by the pipe symbol
117
+ values = response.strip().split("|")
118
+ if len(values) != 8: # Ensure we have all expected values
119
+ print(f"Warning: Unexpected number of values in response: {len(values)}. Response: {response}")
120
+ return {} # Return empty dictionary
121
+
122
+ # Assign values to keys, handling potential errors
123
+ data = {
124
+ "Invoice ID": values[0].strip().replace('"', ''),
125
+ "DESCRIPTION": values[1].strip().replace('"', ''),
126
+ "Issue Data": values[2].strip().replace('"', ''),
127
+ "UNIT PRICE": values[3].strip().replace('"', ''),
128
+ "AMOUNT": values[4].strip().replace('"', ''),
129
+ "Bill For": values[5].strip().replace('"', ''),
130
+ "From": values[6].strip().replace('"', ''),
131
+ "Terms": values[7].strip().replace('"', '')
132
+ }
133
+ return data
134
+
135
+ except Exception as e:
136
+ print(f"Error parsing LLM response: {e}. Response: {response}")
137
+ return {} # Return an empty dictionary on parsing failure