Spaces:

sunnysharma20
/

PDFReader

Build error

App Files Files Community

sunnysharma20 committed on Mar 8, 2025

Commit

ad07262

verified ·

1 Parent(s): f5bdeb7

Update backend.py

Browse files

Files changed (1) hide show

backend.py +56 -20

backend.py CHANGED Viewed

@@ -43,11 +43,11 @@ class InvoicePipeline:
     def run(self) -> pd.DataFrame:
         # We have defined the way the data has to be returned
         df = pd.DataFrame({
-            "Invoice ID": pd.Series(dtype="int"),
             "DESCRIPTION": pd.Series(dtype="str"),
             "Issue Data": pd.Series(dtype="str"),
             "UNIT PRICE": pd.Series(dtype="str"),
-            "AMOUNT": pd.Series(dtype="int"),
             "Bill For": pd.Series(dtype="str"),
             "From": pd.Series(dtype="str"),
             "Terms": pd.Series(dtype="str")}
@@ -58,18 +58,43 @@ class InvoicePipeline:
             llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text)  # Apply rate limit here
             if llm_resp:  # Check for None response from rate limiter
                 data = self._parse_response(llm_resp)
-                df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
             else:
                 print(f"Skipping file due to rate limit or API error: {path}")
         return df
-    # The default template that the machine will take
     def _get_default_prompt_template(self) -> PromptTemplate:
-        template = """Extract all the following values: Invoice ID, DESCRIPTION, Issue Data,UNIT PRICE, AMOUNT, Bill for, From and Terms for: {pages}
-        Expected Outcome: remove any dollar symbols {{"Invoice ID":"12341234", "DESCRIPTION": "UNIT PRICE", "AMOUNT": "3", "Date": "2/1/2021", "AMOUNT": "100", "Bill For": "Dev", "From": "Coca Cola", "Terms" : "Net for 30 days"}}
         """
         prompt_template = PromptTemplate(input_variables=["pages"], template=template)
         return prompt_template
@@ -86,16 +111,27 @@ class InvoicePipeline:
         return resp
     def _parse_response(self, response: str) -> Dict[str, str]:
-        pattern = r'{(.+)}'
-        re_match = re.search(pattern, response, re.DOTALL)
-        if re_match:
-            extracted_text = re_match.group(1)
-            try:
-                data = eval('{' + extracted_text + '}')
-                return data
-            except (SyntaxError, NameError) as e:
-                print(f"Error parsing response: {e}")
-                return {} # Return an empty dictionary to avoid crashing
-        else:
-            raise Exception("No match found.")

     def run(self) -> pd.DataFrame:
         # We have defined the way the data has to be returned
         df = pd.DataFrame({
+            "Invoice ID": pd.Series(dtype="str"),  # Changed to string to accommodate the invoice number format
             "DESCRIPTION": pd.Series(dtype="str"),
             "Issue Data": pd.Series(dtype="str"),
             "UNIT PRICE": pd.Series(dtype="str"),
+            "AMOUNT": pd.Series(dtype="str"),  # Changed to string to handle potential non-integer values
             "Bill For": pd.Series(dtype="str"),
             "From": pd.Series(dtype="str"),
             "Terms": pd.Series(dtype="str")}
             llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text)  # Apply rate limit here
             if llm_resp:  # Check for None response from rate limiter
                 data = self._parse_response(llm_resp)
+                if data:  # Only append if parsing was successful
+                    df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
+                else:
+                    print(f"Skipping file {path} due to parsing failure.")
             else:
                 print(f"Skipping file due to rate limit or API error: {path}")
         return df
     def _get_default_prompt_template(self) -> PromptTemplate:
+        template = """You are an expert invoice data extractor.  Analyze the following text and extract the specified fields. Return the results in a *structured, easily parseable format*.
+        Here are the extraction requirements:
+        1.  **Invoice ID:** The unique identifier for the invoice.
+        2.  **DESCRIPTION:** A brief description of the product or service provided.
+        3.  **Issue Data:** The date the invoice was issued.
+        4.  **UNIT PRICE:** The price per unit of the product or service.
+        5.  **AMOUNT:** The total amount due for the line item.
+        6.  **Bill For:** The entity or individual being billed.
+        7.  **From:** The name of the company issuing the invoice.
+        8.  **Terms:** The payment terms (e.g., "Net 30 days").
+        *Important Instructions*:
+        *   Return a single line containing only the extracted values.  Do *NOT* include any introductory text, conversational elements, or explanations.
+        *   Enclose *each value* in double quotes. If a value is not found or is not applicable return "N/A".
+        *   Do *NOT* include currency symbols (e.g., $, €, £).
+        *   Separate each extracted value with a pipe symbol (`|`).
+        *   The order of the extracted values *MUST* be: Invoice ID | DESCRIPTION | Issue Data | UNIT PRICE | AMOUNT | Bill For | From | Terms
+        Example:
+        "12345" | "Consulting Services" | "2023-11-15" | "100.00" | "1000.00" | "Acme Corp" | "XYZ Consulting" | "Net 30 days"
+        Here is the text to analyze:
+        {pages}
         """
         prompt_template = PromptTemplate(input_variables=["pages"], template=template)
         return prompt_template
         return resp
     def _parse_response(self, response: str) -> Dict[str, str]:
+        """Parses the pipe-delimited LLM response into a dictionary of invoice fields."""
+        try:
+            # Split the response by the pipe symbol
+            values = response.strip().split("|")
+            if len(values) != 8:  # Ensure we have all expected values
+                print(f"Warning: Unexpected number of values in response: {len(values)}.  Response: {response}")
+                return {}  # Return empty dictionary
+            # Assign values to keys, handling potential errors
+            data = {
+                "Invoice ID": values[0].strip().replace('"', ''),
+                "DESCRIPTION": values[1].strip().replace('"', ''),
+                "Issue Data": values[2].strip().replace('"', ''),
+                "UNIT PRICE": values[3].strip().replace('"', ''),
+                "AMOUNT": values[4].strip().replace('"', ''),
+                "Bill For": values[5].strip().replace('"', ''),
+                "From": values[6].strip().replace('"', ''),
+                "Terms": values[7].strip().replace('"', '')
+            }
+            return data
+        except Exception as e:
+            print(f"Error parsing LLM response: {e}. Response: {response}")
+            return {}  # Return an empty dictionary on parsing failure