# Source: Hugging Face Space "sunnysharma20/PDFReader"
# Space status when captured: Build error
# File size: 6,647 bytes

import os
import re
from typing import Dict, List, Optional

import pandas as pd
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from pdfminer.high_level import extract_text  # Changed from pypdf to pdfminer
from ratelimit import limits, sleep_and_retry


# Replace with your actual API key, or leave it empty to rely on an
# OPENAI_API_KEY that is already present in the environment.
# Avoid committing a real key to source control.
api_key = ""

# Export the key only when one was actually provided: writing "" here would
# clobber a valid OPENAI_API_KEY that the environment already supplies.
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key


class InvoicePipeline:
    """Extract invoice fields from PDF files into a pandas DataFrame.

    For every path the pipeline extracts the PDF text with pdfminer, sends it
    to the OpenAI LLM with a fixed extraction prompt, parses the pipe-separated
    answer and collects one row per successfully parsed file.
    """

    # Output columns, in the order the prompt asks the model to emit them.
    _COLUMNS = (
        "Invoice ID",
        "DESCRIPTION",
        "Issue Data",
        "UNIT PRICE",
        "AMOUNT",
        "Bill For",
        "From",
        "Terms",
    )

    # Strict form of the expected answer: one double-quoted value per column,
    # separated by pipes. Matching the quotes (instead of splitting on every
    # "|") keeps values that themselves contain a pipe intact.
    _QUOTED_ROW = re.compile(r"\s*\|\s*".join([r'"([^"]*)"'] * len(_COLUMNS)))

    def __init__(self, paths, calls_per_minute: int = 60, period_seconds: int = 60):
        """Create a pipeline for the given PDF paths.

        Args:
            paths: Iterable of PDF file paths; a single path is also accepted.
            calls_per_minute: Maximum number of LLM calls allowed per period.
                Adjust to your OpenAI plan's request limit.
            period_seconds: Length of the rate-limit window in seconds.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(paths, (str, os.PathLike)):
            paths = [paths]
        # This is your file path
        self._paths = paths
        # This is your LLM (GPT)
        self._llm = OpenAI()
        # This is prompt
        self._prompt_template = self._get_default_prompt_template()

        # Rate limiting configuration (adjust based on your OpenAI account limits)
        self.calls_per_minute = calls_per_minute
        self.one_minute = period_seconds

        # Build the limiter here so the configured values are actually used.
        # It is created once; changing the attributes above afterwards does
        # not reconfigure it. The limit applies to this instance only.
        self._limited_llm_call = sleep_and_retry(
            limits(calls=self.calls_per_minute, period=self.one_minute)(
                self._extract_data_from_llm
            )
        )

    def _extract_data_from_llm_with_rate_limit(self, raw_data: str) -> Optional[str]:
        """Query the LLM through the rate limiter.

        Args:
            raw_data: Text extracted from one PDF.

        Returns:
            The LLM response, or None when the call raised an exception.
        """
        try:
            return self._limited_llm_call(raw_data)
        except Exception as e:
            # Best-effort boundary: one failing call must not abort the whole
            # batch, so the error is reported and the file is skipped by run().
            print(f"Error during OpenAI API call: {e}")
            return None

    # This function will help in extracting and run the code, and will produce a dataframe for us
    def run(self) -> pd.DataFrame:
        """Process every configured PDF and return the extracted rows.

        Files with no extractable text, a failed LLM call or an unparseable
        response are skipped with a printed message.

        Returns:
            A DataFrame with one string-typed column per extracted field and
            one row per successfully processed file (possibly empty).
        """
        rows = []
        for path in self._paths:
            raw_text = self._get_raw_text_from_pdf(path)
            print(f"Extracted Text: {raw_text}")  # Added line for debugging
            if not raw_text:
                print(f"Skipping file {path} due to empty extraction")
                continue

            llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text)
            if not llm_resp:  # None (or empty) response from the LLM call
                print(f"Skipping file due to rate limit or API error: {path}")
                continue

            data = self._parse_response(llm_resp)
            if not data:  # Only keep rows that parsed successfully
                print(f"Skipping file {path} due to parsing failure.")
                continue
            rows.append(data)

        # Build the frame once instead of concatenating inside the loop.
        return pd.DataFrame(rows, columns=list(self._COLUMNS)).astype("str")

    def _get_default_prompt_template(self) -> "PromptTemplate":
        """Return the prompt that asks for a single quoted, pipe-separated row."""
        template = """You are an expert invoice data extractor.  Analyze the following text and extract the specified fields. Return the results in a *structured, easily parseable format*.

        Here are the extraction requirements:

        1.  **Invoice ID:** The unique identifier for the invoice.
        2.  **DESCRIPTION:** A brief description of the product or service provided.
        3.  **Issue Data:** The date the invoice was issued.
        4.  **UNIT PRICE:** The price per unit of the product or service.
        5.  **AMOUNT:** The total amount due for the line item.
        6.  **Bill For:** The entity or individual being billed.
        7.  **From:** The name of the company issuing the invoice.
        8.  **Terms:** The payment terms (e.g., "Net 30 days").

        *Important Instructions*:
        *   Return a single line containing only the extracted values.  Do *NOT* include any introductory text, conversational elements, or explanations.
        *   Enclose *each value* in double quotes. If a value is not found or is not applicable return "N/A".
        *   Do *NOT* include currency symbols (e.g., $, €, £).
        *   Separate each extracted value with a pipe symbol (`|`).
        *   The order of the extracted values *MUST* be: Invoice ID | DESCRIPTION | Issue Data | UNIT PRICE | AMOUNT | Bill For | From | Terms

        Example:
        "12345" | "Consulting Services" | "2023-11-15" | "100.00" | "1000.00" | "Acme Corp" | "XYZ Consulting" | "Net 30 days"

        Here is the text to analyze:
        {pages}
        """
        prompt_template = PromptTemplate(input_variables=["pages"], template=template)
        return prompt_template

    # We will try to extract the text from the PDF to a normal variable.
    def _get_raw_text_from_pdf(self, path: str) -> str:
        """Extract the text of a PDF using pdfminer.

        Args:
            path: Path of the PDF file.

        Returns:
            The extracted text, or an empty string when extraction fails.
        """
        try:
            return extract_text(path)
        except Exception as e:
            # Best-effort: report the failure and let run() skip this file.
            print(f"Error extracting text from PDF using pdfminer: {e}")
            return ""

    def _extract_data_from_llm(self, raw_data: str) -> str:
        """Send the formatted prompt to the LLM without rate limiting.

        Args:
            raw_data: Text extracted from one PDF.

        Returns:
            The raw LLM response.
        """
        prompt = self._prompt_template.format(pages=raw_data)
        # invoke() replaces the deprecated direct call on the LLM object.
        return self._llm.invoke(prompt)

    def _parse_response(self, response: str) -> Dict[str, str]:
        """Parse the pipe-separated LLM response into a field dictionary.

        The strict quoted format is tried first with a regular expression so
        that a pipe inside a quoted value does not split the value. Responses
        that do not match fall back to a plain split on the pipe symbol.

        Args:
            response: Raw LLM response.

        Returns:
            A mapping from column name to extracted value, or an empty dict
            when the response does not contain exactly one value per column.
        """
        if not isinstance(response, str):
            print(f"Error parsing LLM response: expected str, got {type(response).__name__}. Response: {response}")
            return {}

        text = response.strip()
        quoted = self._QUOTED_ROW.fullmatch(text)
        if quoted:
            values = list(quoted.groups())
        else:
            # Lenient fallback: split on every pipe and drop the quotes.
            values = [part.strip().replace('"', "") for part in text.split("|")]

        if len(values) != len(self._COLUMNS):  # Ensure we have all expected values
            print(f"Warning: Unexpected number of values in response: {len(values)}.  Response: {response}")
            return {}

        return {column: value.strip() for column, value in zip(self._COLUMNS, values)}