# NOTE: This listing was recovered from a hosted-app file viewer. The viewer's
# status banner ("Build error"), file size (6,647 bytes), per-line commit
# hashes and line-number gutter were page chrome, not part of the program.
import os
import re
import pandas as pd
from typing import List, Dict
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from ratelimit import limits, sleep_and_retry
from pdfminer.high_level import extract_text # Changed from pypdf to pdfminer
# API key configuration.
# A key already present in the environment (for example a deployment secret)
# takes precedence. The placeholder below is only a fallback for local runs and
# is never exported when empty, so it cannot clobber a real key.
_FALLBACK_API_KEY = ""  # Replace with your actual API key for local use only
api_key = os.environ.get("OPENAI_API_KEY") or _FALLBACK_API_KEY
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key
class InvoicePipeline:
    """Extract structured invoice fields from PDF files with an OpenAI LLM.

    For every path given to the constructor, :meth:`run` extracts the PDF text
    with pdfminer, sends it to the LLM using a pipe-delimited prompt, parses
    the single-line answer and collects one row per successfully parsed file.
    Files that fail at any stage are reported on stdout and skipped.
    """

    # Rate-limit budget for LLM calls. The limiter decorators are evaluated
    # once, when the class body runs, so these constants (not the instance
    # attributes of the same meaning) are what actually throttles the calls.
    CALLS_PER_MINUTE = 60  # Adjust to your OpenAI plan's requests-per-minute
    ONE_MINUTE = 60  # Length of the rate-limit window, in seconds

    # Output columns, in the exact order the prompt asks the LLM to emit them.
    # "Issue Data" is kept as spelled: it is the column name callers receive
    # and the label used in the prompt.
    _COLUMNS = (
        "Invoice ID",
        "DESCRIPTION",
        "Issue Data",
        "UNIT PRICE",
        "AMOUNT",
        "Bill For",
        "From",
        "Terms",
    )

    def __init__(self, paths):
        """Store the PDF paths and set up the LLM client and prompt.

        Args:
            paths: Iterable of PDF file paths to process in :meth:`run`.
        """
        self._paths = paths
        # Reads OPENAI_API_KEY from the environment.
        self._llm = OpenAI()
        self._prompt_template = self._get_default_prompt_template()
        # Kept for backward compatibility; they mirror the class constants
        # that the rate limiter really uses.
        self.calls_per_minute = self.CALLS_PER_MINUTE
        self.one_minute = self.ONE_MINUTE

    @sleep_and_retry
    @limits(calls=CALLS_PER_MINUTE, period=ONE_MINUTE)
    def _extract_data_from_llm_with_rate_limit(self, raw_data: str) -> "str | None":
        """Query the LLM under the rate limit, swallowing API errors.

        Args:
            raw_data: Text extracted from one invoice PDF.

        Returns:
            The raw LLM completion, or ``None`` if the call raised.
        """
        try:
            return self._extract_data_from_llm(raw_data)
        except Exception as e:  # Best effort: one bad call must not stop the batch
            print(f"Error during OpenAI API call: {e}")
            return None

    def run(self) -> pd.DataFrame:
        """Process every configured PDF and return the extracted rows.

        Returns:
            A DataFrame with one string-typed column per entry in
            ``_COLUMNS`` and one row per file that was extracted, answered by
            the LLM and parsed successfully. Empty if no file succeeded.
        """
        # Rows are collected first and the frame is built once; concatenating
        # inside the loop copies the whole frame on every iteration.
        rows: List[Dict[str, str]] = []
        for path in self._paths:
            raw_text = self._get_raw_text_from_pdf(path)
            print(f"Extracted Text: {raw_text}")  # Debugging aid
            if not raw_text:
                print(f"Skipping file {path} due to empty extraction")
                continue

            llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text)
            if not llm_resp:
                print(f"Skipping file due to rate limit or API error: {path}")
                continue

            data = self._parse_response(llm_resp)
            if not data:
                print(f"Skipping file {path} due to parsing failure.")
                continue

            rows.append(data)

        return pd.DataFrame(rows, columns=list(self._COLUMNS)).astype("str")

    def _get_default_prompt_template(self) -> PromptTemplate:
        """Build the extraction prompt; ``{pages}`` receives the invoice text.

        The template text is written flush-left on purpose so that no source
        indentation leaks into the prompt sent to the LLM.
        """
        template = """You are an expert invoice data extractor. Analyze the following text and extract the specified fields. Return the results in a *structured, easily parseable format*.
Here are the extraction requirements:
1. **Invoice ID:** The unique identifier for the invoice.
2. **DESCRIPTION:** A brief description of the product or service provided.
3. **Issue Data:** The date the invoice was issued.
4. **UNIT PRICE:** The price per unit of the product or service.
5. **AMOUNT:** The total amount due for the line item.
6. **Bill For:** The entity or individual being billed.
7. **From:** The name of the company issuing the invoice.
8. **Terms:** The payment terms (e.g., "Net 30 days").
*Important Instructions*:
* Return a single line containing only the extracted values. Do *NOT* include any introductory text, conversational elements, or explanations.
* Enclose *each value* in double quotes. If a value is not found or is not applicable return "N/A".
* Do *NOT* include currency symbols (e.g., $, €, £).
* Separate each extracted value with a pipe symbol (`|`).
* The order of the extracted values *MUST* be: Invoice ID | DESCRIPTION | Issue Data | UNIT PRICE | AMOUNT | Bill For | From | Terms
Example:
"12345" | "Consulting Services" | "2023-11-15" | "100.00" | "1000.00" | "Acme Corp" | "XYZ Consulting" | "Net 30 days"
Here is the text to analyze:
{pages}
"""
        return PromptTemplate(input_variables=["pages"], template=template)

    def _get_raw_text_from_pdf(self, path: str) -> str:
        """Return the text of the PDF at *path*, or ``""`` if extraction fails.

        Args:
            path: Location of the PDF file, passed to pdfminer's
                ``extract_text``.
        """
        try:
            return extract_text(path)
        except Exception as e:  # Best effort: report and let the caller skip the file
            print(f"Error extracting text from PDF using pdfminer: {e}")
            return ""

    def _extract_data_from_llm(self, raw_data: str) -> str:
        """Format the prompt with *raw_data* and return the LLM completion.

        Unlike the rate-limited wrapper, this neither throttles nor catches
        exceptions raised by the API call.
        """
        prompt = self._prompt_template.format(pages=raw_data)
        # `invoke` replaces the deprecated direct call on the LLM object.
        return self._llm.invoke(prompt)

    @staticmethod
    def _split_outside_quotes(text: str, sep: str = "|") -> List[str]:
        """Split *text* on *sep*, ignoring separators inside double quotes."""
        parts: List[str] = []
        current: List[str] = []
        in_quotes = False
        for ch in text:
            if ch == '"':
                in_quotes = not in_quotes
            if ch == sep and not in_quotes:
                parts.append("".join(current))
                current = []
            else:
                current.append(ch)
        parts.append("".join(current))
        return parts

    def _parse_response(self, response: str) -> Dict[str, str]:
        """Parse the pipe-delimited LLM answer into a column -> value mapping.

        The response is first split on every ``|``. If that does not give
        exactly one value per column, the split is retried while ignoring
        pipes inside double-quoted values, so a field such as ``"A|B"`` no
        longer makes the whole response unparseable. Each value is stripped of
        surrounding whitespace and of all double quotes.

        Args:
            response: Raw completion text returned by the LLM.

        Returns:
            A dict keyed by ``_COLUMNS``, or ``{}`` if the response does not
            contain the expected number of values or cannot be processed.
        """
        try:
            text = response.strip()
            expected = len(self._COLUMNS)

            values = text.split("|")
            if len(values) != expected:
                # Retry treating quoted pipes as data rather than delimiters.
                values = self._split_outside_quotes(text)
            if len(values) != expected:
                print(f"Warning: Unexpected number of values in response: {len(values)}. Response: {response}")
                return {}

            return {
                column: value.strip().replace('"', '')
                for column, value in zip(self._COLUMNS, values)
            }
        except Exception as e:  # e.g. a non-string response
            print(f"Error parsing LLM response: {e}. Response: {response}")
            return {}