File size: 6,647 Bytes
73067b0
13d2e83
 
b15a58b
 
73067b0
026503c
9a52f43
 
b15a58b
026503c
fc022bb
73067b0
 
026503c
 
8f342b7
0517407
2e4e294
73067b0
0517407
73067b0
026503c
73067b0
2b56873
026503c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73067b0
8f342b7
73067b0
 
ad07262
026503c
 
 
ad07262
026503c
 
 
8f342b7
0517407
07153bb
026503c
3cc2cc4
9a52f43
3cc2cc4
 
026503c
 
 
ad07262
 
 
 
 
026503c
 
fa91831
 
4bf7d86
 
ad07262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4bf7d86
026503c
4bf7d86
319607a
73067b0
026503c
9a52f43
 
 
 
 
 
 
05f88c9
026503c
 
05f88c9
026503c
650bf53
ad07262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3cc2cc4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import re
from typing import Dict, List, Optional

import pandas as pd
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from pdfminer.high_level import extract_text  # Changed from pypdf to pdfminer
from ratelimit import limits, sleep_and_retry


# Optional inline API key. Leave it empty to rely on an OPENAI_API_KEY that is
# already present in the environment; avoid committing a real key to version
# control.
api_key = ""

# Export only a non-empty key: assigning "" unconditionally would overwrite a
# valid OPENAI_API_KEY that the environment already provides.
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key


class InvoicePipeline:
    """Extract structured invoice fields from PDF files with an OpenAI LLM.

    For every input path the pipeline extracts the PDF text with pdfminer,
    sends it to the LLM inside a fixed prompt, parses the pipe-separated
    answer, and collects one row per successfully parsed invoice into a
    pandas DataFrame.
    """

    # Rate limit applied to LLM calls by the ``ratelimit`` decorators below.
    # The decorator arguments are evaluated once, when the class body runs, so
    # these class constants (not the instance attributes set in ``__init__``)
    # are the values to edit when the account's request limit differs.
    CALLS_PER_MINUTE = 60
    PERIOD_SECONDS = 60

    # Output columns, in the exact order the prompt asks the LLM to emit them.
    _FIELDS = (
        "Invoice ID",
        "DESCRIPTION",
        "Issue Data",
        "UNIT PRICE",
        "AMOUNT",
        "Bill For",
        "From",
        "Terms",
    )

    def __init__(self, paths):
        """Store the input paths and set up the LLM client and the prompt.

        Args:
            paths: Iterable of PDF file paths. A single path given as a
                ``str`` or ``os.PathLike`` is wrapped in a one-element list
                instead of being iterated character by character.
        """
        if isinstance(paths, (str, os.PathLike)):
            paths = [paths]
        # Files to process, in order.
        self._paths = paths
        # LLM client (GPT); rate limiting is applied at call time, not here.
        self._llm = OpenAI()
        # Prompt that wraps the raw invoice text.
        self._prompt_template = self._get_default_prompt_template()

        # Kept for backward compatibility with code that reads these
        # attributes. They mirror the class constants and are informational
        # only: changing them on an instance does not alter the rate limit.
        self.calls_per_minute = self.CALLS_PER_MINUTE
        self.one_minute = self.PERIOD_SECONDS

    # ``limits`` raises once the call budget for the period is used up;
    # ``sleep_and_retry`` catches that, sleeps until the period ends and
    # retries, so callers block instead of seeing a rate-limit error.
    @sleep_and_retry
    @limits(calls=CALLS_PER_MINUTE, period=PERIOD_SECONDS)  # Calls/minute
    def _extract_data_from_llm_with_rate_limit(self, raw_data: str) -> Optional[str]:
        """Query the LLM under the rate limit.

        Args:
            raw_data: Raw invoice text to insert into the prompt.

        Returns:
            The LLM's text answer, or ``None`` when the call raised (the error
            is printed rather than propagated so one bad file does not abort
            the whole run).
        """
        try:
            return self._extract_data_from_llm(raw_data)
        except Exception as e:
            print(f"Error during OpenAI API call: {e}")
            return None

    def run(self) -> pd.DataFrame:
        """Process every configured PDF and return the extracted fields.

        Files whose text extraction, LLM call or response parsing fails are
        reported on stdout and skipped.

        Returns:
            A DataFrame with one row per successfully parsed invoice and the
            columns listed in ``_FIELDS`` (all string-typed). It is empty, but
            still has those columns, when no file could be processed.
        """
        # Rows are collected in a list and turned into a DataFrame once;
        # concatenating a one-row frame per file copies the data every time.
        rows = []

        for path in self._paths:
            raw_text = self._get_raw_text_from_pdf(path)
            print(f"Extracted Text: {raw_text}")  # Added line for debugging
            if not raw_text:
                print(f"Skipping file {path} due to empty extraction")
                continue

            llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text)
            if not llm_resp:  # None (API error) or an empty answer
                print(f"Skipping file due to rate limit or API error: {path}")
                continue

            data = self._parse_response(llm_resp)
            if not data:  # Only keep rows whose parsing succeeded
                print(f"Skipping file {path} due to parsing failure.")
                continue

            rows.append(data)

        return pd.DataFrame(rows, columns=list(self._FIELDS)).astype("str")

    def _get_default_prompt_template(self) -> PromptTemplate:
        """Build the extraction prompt; ``{pages}`` receives the invoice text."""
        template = """You are an expert invoice data extractor.  Analyze the following text and extract the specified fields. Return the results in a *structured, easily parseable format*.

        Here are the extraction requirements:

        1.  **Invoice ID:** The unique identifier for the invoice.
        2.  **DESCRIPTION:** A brief description of the product or service provided.
        3.  **Issue Data:** The date the invoice was issued.
        4.  **UNIT PRICE:** The price per unit of the product or service.
        5.  **AMOUNT:** The total amount due for the line item.
        6.  **Bill For:** The entity or individual being billed.
        7.  **From:** The name of the company issuing the invoice.
        8.  **Terms:** The payment terms (e.g., "Net 30 days").

        *Important Instructions*:
        *   Return a single line containing only the extracted values.  Do *NOT* include any introductory text, conversational elements, or explanations.
        *   Enclose *each value* in double quotes. If a value is not found or is not applicable return "N/A".
        *   Do *NOT* include currency symbols (e.g., $, €, £).
        *   Separate each extracted value with a pipe symbol (`|`).
        *   The order of the extracted values *MUST* be: Invoice ID | DESCRIPTION | Issue Data | UNIT PRICE | AMOUNT | Bill For | From | Terms

        Example:
        "12345" | "Consulting Services" | "2023-11-15" | "100.00" | "1000.00" | "Acme Corp" | "XYZ Consulting" | "Net 30 days"

        Here is the text to analyze:
        {pages}
        """
        prompt_template = PromptTemplate(input_variables=["pages"], template=template)
        return prompt_template

    def _get_raw_text_from_pdf(self, path: str) -> str:
        """Extract the text of the PDF at *path* using pdfminer.

        Returns:
            The extracted text, or an empty string when extraction raised
            (the error is printed so the caller can skip the file).
        """
        try:
            return extract_text(path)
        except Exception as e:
            print(f"Error extracting text from PDF using pdfminer: {e}")
            return ""  # Return empty string on failure

    def _extract_data_from_llm(self, raw_data: str) -> str:
        """Send *raw_data* to the LLM inside the prompt; no rate limit or error handling.

        Uses ``invoke`` because calling the LLM object directly is deprecated
        in LangChain.
        """
        return self._llm.invoke(self._prompt_template.format(pages=raw_data))

    @staticmethod
    def _split_outside_quotes(text: str) -> List[str]:
        """Split *text* on ``|`` characters that are not inside double quotes."""
        fields = []
        current = []
        in_quotes = False
        for ch in text:
            if ch == '"':
                in_quotes = not in_quotes
            if ch == "|" and not in_quotes:
                fields.append("".join(current))
                current = []
            else:
                current.append(ch)
        fields.append("".join(current))
        return fields

    def _parse_response(self, response: str) -> Dict[str, str]:
        """Parse the pipe-separated LLM answer into a field dictionary.

        Pipes inside double-quoted values are treated as part of the value.
        If that quote-aware split does not produce the expected number of
        fields (for example because of an unbalanced quote), a plain split on
        every pipe is tried, so any answer the plain split can parse is still
        accepted.

        Args:
            response: Raw text returned by the LLM.

        Returns:
            A dict mapping each name in ``_FIELDS`` to its value with
            surrounding whitespace and all double quotes removed, or an empty
            dict when the response is not a string or does not contain exactly
            one value per field.
        """
        if not isinstance(response, str):
            print(f"Error parsing LLM response: expected a string. Response: {response}")
            return {}

        text = response.strip()
        expected = len(self._FIELDS)

        values = self._split_outside_quotes(text)
        if len(values) != expected:
            # Unbalanced quotes can hide real separators; fall back to
            # splitting on every pipe.
            values = text.split("|")

        if len(values) != expected:  # Ensure we have all expected values
            print(f"Warning: Unexpected number of values in response: {len(values)}.  Response: {response}")
            return {}

        return {
            field: value.strip().replace('"', '')
            for field, value in zip(self._FIELDS, values)
        }