Spaces:
Build error
Build error
| import os | |
| import re | |
| import pandas as pd | |
| from typing import List, Dict | |
| from langchain.prompts import PromptTemplate | |
| from langchain_openai import OpenAI | |
| from ratelimit import limits, sleep_and_retry | |
| from pdfminer.high_level import extract_text # Changed from pypdf to pdfminer | |
# Read the OpenAI API key from the environment instead of hard-coding it.
# The previous code assigned an empty placeholder to OPENAI_API_KEY
# unconditionally, which wiped out any real key already configured (for
# example as a deployment secret). `setdefault` only fills in the empty
# placeholder when the variable is not set at all.
api_key = os.environ.get("OPENAI_API_KEY", "")
os.environ.setdefault("OPENAI_API_KEY", api_key)
class InvoicePipeline:
    """Extract structured invoice fields from PDF files using an OpenAI LLM.

    Each PDF is converted to text with pdfminer, the text is sent to the LLM
    with a fixed prompt, and the pipe-separated answer is parsed into one row
    of the DataFrame returned by ``run``.
    """

    # Output columns, in the exact order the prompt tells the LLM to emit the
    # values. NOTE(review): "Issue Data" looks like a typo for "Issue Date";
    # it is kept unchanged because it is the column name callers receive.
    _COLUMNS = (
        "Invoice ID",
        "DESCRIPTION",
        "Issue Data",
        "UNIT PRICE",
        "AMOUNT",
        "Bill For",
        "From",
        "Terms",
    )

    def __init__(self, paths):
        """Set up the LLM client, the prompt and the rate limiter.

        Args:
            paths: Iterable of PDF file paths to process in ``run``.
        """
        self._paths = paths
        self._llm = OpenAI()
        self._prompt_template = self._get_default_prompt_template()
        # Rate limiting configuration (adjust to your OpenAI plan's RPM limit).
        self.calls_per_minute = 60
        self.one_minute = 60
        # Build the rate-limited LLM call once, from the values above. The
        # limiter is fixed at construction time: changing `calls_per_minute`
        # afterwards does not reconfigure it. `sleep_and_retry` waits until
        # the window resets instead of raising when the limit is hit.
        self._rate_limited_extract = sleep_and_retry(
            limits(calls=self.calls_per_minute, period=self.one_minute)(
                self._extract_data_from_llm
            )
        )

    def _extract_data_from_llm_with_rate_limit(self, raw_data: str) -> "str | None":
        """Query the LLM through the rate limiter.

        Args:
            raw_data: Text extracted from one invoice PDF.

        Returns:
            The raw LLM answer, or ``None`` when the call failed. Failures
            are printed rather than raised so that one bad file does not
            abort the whole batch.
        """
        try:
            return self._rate_limited_extract(raw_data)
        except Exception as e:
            print(f"Error during OpenAI API call: {e}")
            return None

    def run(self) -> pd.DataFrame:
        """Process every configured PDF and collect the extracted fields.

        Files whose text extraction, LLM call or response parsing fails are
        reported on stdout and skipped.

        Returns:
            A DataFrame with one row per successfully parsed invoice and the
            columns listed in ``_COLUMNS``; empty (but with those columns)
            when nothing could be parsed.
        """
        rows = []
        for path in self._paths:
            raw_text = self._get_raw_text_from_pdf(path)
            print(f"Extracted Text: {raw_text}")  # Debugging aid
            # Whitespace-only text (e.g. a lone form feed from a scanned PDF)
            # carries no data, so do not spend an API call on it.
            if not raw_text or not raw_text.strip():
                print(f"Skipping file {path} due to empty extraction")
                continue

            llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text)
            if not llm_resp:
                print(f"Skipping file due to rate limit or API error: {path}")
                continue

            data = self._parse_response(llm_resp)
            if not data:
                print(f"Skipping file {path} due to parsing failure.")
                continue
            rows.append(data)

        if not rows:
            # Keep the typed, empty frame the pipeline has always returned.
            return pd.DataFrame(
                {column: pd.Series(dtype="str") for column in self._COLUMNS}
            )
        # Build the frame once instead of calling pd.concat per file, which
        # copies all previous rows on every iteration.
        return pd.DataFrame(rows, columns=list(self._COLUMNS))

    def _get_default_prompt_template(self) -> "PromptTemplate":
        """Build the extraction prompt; ``{pages}`` receives the invoice text."""
        template = "\n".join((
            "You are an expert invoice data extractor. Analyze the following text and extract the specified fields. Return the results in a *structured, easily parseable format*.",
            "Here are the extraction requirements:",
            "1. **Invoice ID:** The unique identifier for the invoice.",
            "2. **DESCRIPTION:** A brief description of the product or service provided.",
            "3. **Issue Data:** The date the invoice was issued.",
            "4. **UNIT PRICE:** The price per unit of the product or service.",
            "5. **AMOUNT:** The total amount due for the line item.",
            "6. **Bill For:** The entity or individual being billed.",
            "7. **From:** The name of the company issuing the invoice.",
            '8. **Terms:** The payment terms (e.g., "Net 30 days").',
            "*Important Instructions*:",
            "* Return a single line containing only the extracted values. Do *NOT* include any introductory text, conversational elements, or explanations.",
            '* Enclose *each value* in double quotes. If a value is not found or is not applicable return "N/A".',
            "* Do *NOT* include currency symbols (e.g., $, €, £).",
            "* Separate each extracted value with a pipe symbol (`|`).",
            "* The order of the extracted values *MUST* be: Invoice ID | DESCRIPTION | Issue Data | UNIT PRICE | AMOUNT | Bill For | From | Terms",
            "Example:",
            '"12345" | "Consulting Services" | "2023-11-15" | "100.00" | "1000.00" | "Acme Corp" | "XYZ Consulting" | "Net 30 days"',
            "Here is the text to analyze:",
            "{pages}",
            "",
        ))
        return PromptTemplate(input_variables=["pages"], template=template)

    def _get_raw_text_from_pdf(self, path: str) -> str:
        """Extract the text of a PDF with pdfminer.

        Returns:
            The extracted text, or an empty string when extraction fails
            (the error is printed).
        """
        try:
            return extract_text(path)
        except Exception as e:
            print(f"Error extracting text from PDF using pdfminer: {e}")
            return ""

    def _extract_data_from_llm(self, raw_data: str) -> str:
        """Send the formatted prompt to the LLM without any rate limiting."""
        prompt = self._prompt_template.format(pages=raw_data)
        # `invoke` is the supported entry point; calling the LLM object
        # directly is deprecated in LangChain.
        return self._llm.invoke(prompt)

    def _parse_response(self, response: str) -> Dict[str, str]:
        """Parse the pipe-separated LLM answer into a column -> value dict.

        Double-quoted values are extracted first, so a pipe inside a quoted
        value does not break parsing. If that does not yield exactly one
        value per column, the answer is split on ``|`` instead.

        Args:
            response: Raw text returned by the LLM.

        Returns:
            A dict keyed by the names in ``_COLUMNS``, or an empty dict when
            the response cannot be parsed.
        """
        if not isinstance(response, str):
            print(f"Error parsing LLM response: expected a string. Response: {response}")
            return {}

        text = response.strip()
        expected = len(self._COLUMNS)

        values = re.findall(r'"([^"]*)"', text)
        if len(values) != expected:
            # Fall back to a plain split for answers that are not quoted.
            values = text.split("|")
        if len(values) != expected:
            print(f"Warning: Unexpected number of values in response: {len(values)}. Response: {response}")
            return {}

        cleaned = [value.strip().replace('"', '') for value in values]
        return dict(zip(self._COLUMNS, cleaned))