"""Extract structured invoice fields from PDF files with an OpenAI LLM.

Each PDF is converted to text with pdfminer, sent to the LLM together with a
fixed extraction prompt, and the pipe-separated answer is parsed into one row
of a pandas DataFrame.
"""

import os
import re
from typing import Dict, List, Optional

import pandas as pd
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from pdfminer.high_level import extract_text
from ratelimit import limits, sleep_and_retry

# Replace with your actual API key, or leave empty and export OPENAI_API_KEY
# in the environment instead.
api_key = ""
# Only override the environment when a key was actually provided; writing an
# empty string here would clobber a correctly exported OPENAI_API_KEY.
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key


class InvoicePipeline:
    """Run PDF -> text -> LLM -> parsed row for a collection of invoice files."""

    # Output columns, in the exact order the prompt asks the LLM to emit them.
    _COLUMNS = (
        "Invoice ID",
        "DESCRIPTION",
        "Issue Data",
        "UNIT PRICE",
        "AMOUNT",
        "Bill For",
        "From",
        "Terms",
    )

    # Rate-limit configuration. The decorator below is evaluated once, at class
    # definition time, so these must be class-level constants. Adjust them to
    # match your OpenAI plan's requests-per-minute limit.
    _CALLS_PER_MINUTE = 60
    _ONE_MINUTE = 60  # seconds

    def __init__(self, paths):
        """Store the input paths and set up the LLM client and prompt.

        Args:
            paths: Iterable of PDF file paths to process.
        """
        self._paths = paths
        self._llm = OpenAI()
        self._prompt_template = self._get_default_prompt_template()

        # Kept as instance attributes for backward compatibility. They mirror
        # the class constants that actually drive the rate limiter.
        self.calls_per_minute = self._CALLS_PER_MINUTE
        self.one_minute = self._ONE_MINUTE

    @sleep_and_retry
    @limits(calls=_CALLS_PER_MINUTE, period=_ONE_MINUTE)
    def _extract_data_from_llm_with_rate_limit(self, raw_data: str) -> Optional[str]:
        """Query the LLM under the rate limit.

        Args:
            raw_data: Text extracted from one invoice.

        Returns:
            The raw LLM response, or None if the API call raised.
        """
        try:
            return self._extract_data_from_llm(raw_data)
        except Exception as e:  # best effort: one failing call must not stop the batch
            print(f"Error during OpenAI API call: {e}")
            return None

    def run(self) -> pd.DataFrame:
        """Process every configured path and return the extracted rows.

        Files whose text extraction, LLM call, or response parsing fails are
        reported on stdout and skipped.

        Returns:
            A DataFrame with one row per successfully parsed invoice and the
            columns listed in ``_COLUMNS``. It is empty, with string-typed
            columns, when nothing could be parsed.
        """
        # Collect rows and build the frame once: calling pd.concat per file
        # copies the whole frame on every iteration.
        rows: List[Dict[str, str]] = []

        for path in self._paths:
            raw_text = self._get_raw_text_from_pdf(path)
            print(f"Extracted Text: {raw_text}")  # debugging aid
            if not raw_text:
                print(f"Skipping file {path} due to empty extraction")
                continue

            llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text)
            if not llm_resp:
                print(f"Skipping file due to rate limit or API error: {path}")
                continue

            data = self._parse_response(llm_resp)
            if data:
                rows.append(data)
            else:
                print(f"Skipping file {path} due to parsing failure.")

        if not rows:
            # All values are kept as strings (invoice numbers and amounts are
            # not guaranteed to be numeric).
            return pd.DataFrame(
                {column: pd.Series(dtype="str") for column in self._COLUMNS}
            )
        return pd.DataFrame(rows, columns=list(self._COLUMNS))

    def _get_default_prompt_template(self) -> PromptTemplate:
        """Build the extraction prompt; ``{pages}`` receives the invoice text."""
        template = """You are an expert invoice data extractor. Analyze the following text and extract the specified fields. Return the results in a *structured, easily parseable format*.

Here are the extraction requirements:

1. **Invoice ID:** The unique identifier for the invoice.
2. **DESCRIPTION:** A brief description of the product or service provided.
3. **Issue Data:** The date the invoice was issued.
4. **UNIT PRICE:** The price per unit of the product or service.
5. **AMOUNT:** The total amount due for the line item.
6. **Bill For:** The entity or individual being billed.
7. **From:** The name of the company issuing the invoice.
8. **Terms:** The payment terms (e.g., "Net 30 days").

*Important Instructions*:

* Return a single line containing only the extracted values. Do *NOT* include any introductory text, conversational elements, or explanations.
* Enclose *each value* in double quotes. If a value is not found or is not applicable return "N/A".
* Do *NOT* include currency symbols (e.g., $, €, £).
* Separate each extracted value with a pipe symbol (`|`).
* The order of the extracted values *MUST* be: Invoice ID | DESCRIPTION | Issue Data | UNIT PRICE | AMOUNT | Bill For | From | Terms

Example:
"12345" | "Consulting Services" | "2023-11-15" | "100.00" | "1000.00" | "Acme Corp" | "XYZ Consulting" | "Net 30 days"

Here is the text to analyze:
{pages}
"""
        return PromptTemplate(input_variables=["pages"], template=template)

    def _get_raw_text_from_pdf(self, path: str) -> str:
        """Extract the text of a PDF using pdfminer.

        Args:
            path: Path of the PDF file.

        Returns:
            The extracted text, or an empty string if extraction raised.
        """
        try:
            return extract_text(path)
        except Exception as e:  # best effort: report and let the caller skip the file
            print(f"Error extracting text from PDF using pdfminer: {e}")
            return ""

    def _extract_data_from_llm(self, raw_data: str) -> str:
        """Send the formatted prompt to the LLM without rate limiting.

        Args:
            raw_data: Text extracted from one invoice.

        Returns:
            The raw LLM response.
        """
        prompt = self._prompt_template.format(pages=raw_data)
        # ``invoke`` is the supported entry point; calling the LLM object
        # directly is deprecated in LangChain.
        return self._llm.invoke(prompt)

    @staticmethod
    def _split_outside_quotes(text: str, delimiter: str = "|") -> List[str]:
        """Split ``text`` on ``delimiter``, ignoring delimiters inside double quotes."""
        fields: List[str] = []
        current: List[str] = []
        in_quotes = False
        for char in text:
            if char == '"':
                in_quotes = not in_quotes
            if char == delimiter and not in_quotes:
                fields.append("".join(current))
                current = []
            else:
                current.append(char)
        fields.append("".join(current))
        return fields

    def _parse_response(self, response: str) -> Dict[str, str]:
        """Parse the pipe-separated LLM response into a column -> value dict.

        The response is split on ``|`` while respecting double-quoted values,
        so a quoted value that itself contains a pipe stays in one field. If
        that does not yield the expected number of fields (for example because
        of unbalanced quotes), a plain split on every ``|`` is tried instead.

        Args:
            response: Raw text returned by the LLM.

        Returns:
            A dict keyed by the names in ``_COLUMNS`` with surrounding
            whitespace and all double quotes removed from each value, or an
            empty dict if the response cannot be parsed.
        """
        if not isinstance(response, str):
            print(
                "Error parsing LLM response: expected str, got "
                f"{type(response).__name__}. Response: {response}"
            )
            return {}

        text = response.strip()
        expected = len(self._COLUMNS)

        values = self._split_outside_quotes(text)
        if len(values) != expected:
            # Fall back to the naive split so every response that parsed
            # before the quote-aware splitter existed still parses the same.
            values = text.split("|")

        if len(values) != expected:
            print(f"Warning: Unexpected number of values in response: {len(values)}. Response: {response}")
            return {}

        cleaned = [value.strip().replace('"', '') for value in values]
        return dict(zip(self._COLUMNS, cleaned))