Spaces:
Build error
Build error
Update backend.py
Browse files- backend.py +49 -23
backend.py
CHANGED
|
@@ -4,41 +4,63 @@ import pandas as pd
|
|
| 4 |
from pypdf import PdfReader
|
| 5 |
from typing import List, Dict
|
| 6 |
from langchain.prompts import PromptTemplate
|
| 7 |
-
# from langchain_google_genai import GoogleGenerativeAI
|
| 8 |
from langchain_openai import OpenAI
|
|
|
|
| 9 |
|
| 10 |
-
|
| 11 |
api_key = "<REDACTED — a real OpenAI API key was committed here; it must be revoked and loaded from the environment instead>"
|
| 12 |
|
| 13 |
os.environ["OPENAI_API_KEY"] = api_key
|
|
|
|
|
|
|
| 14 |
class InvoicePipeline:
|
| 15 |
|
| 16 |
def __init__(self, paths):
|
| 17 |
# This is your file path
|
| 18 |
self._paths = paths
|
| 19 |
# This is your LLM (GPT)
|
| 20 |
-
self._llm = OpenAI()
|
| 21 |
# This is prompt
|
| 22 |
self._prompt_template = self._get_default_prompt_template()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# This function will help in extracting and run the code, and will produce a dataframe for us
|
| 24 |
def run(self) -> pd.DataFrame:
|
| 25 |
# We have defined the way the data has to be returned
|
| 26 |
df = pd.DataFrame({
|
| 27 |
-
"Invoice ID": pd.Series(dtype
|
| 28 |
-
"DESCRIPTION": pd.Series(dtype
|
| 29 |
-
"Issue Data": pd.Series(dtype
|
| 30 |
-
"UNIT PRICE": pd.Series(dtype
|
| 31 |
-
"AMOUNT": pd.Series(dtype
|
| 32 |
-
"Bill For": pd.Series(dtype
|
| 33 |
-
"From": pd.Series(dtype
|
| 34 |
-
"Terms": pd.Series(dtype
|
| 35 |
)
|
| 36 |
|
| 37 |
for path in self._paths:
|
| 38 |
-
raw_text = self._get_raw_text_from_pdf(path)
|
| 39 |
-
llm_resp = self.
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
return df
|
| 44 |
|
|
@@ -48,28 +70,32 @@ class InvoicePipeline:
|
|
| 48 |
Expected Outcome: remove any dollar symbols {{"Invoice ID":"12341234", "DESCRIPTION": "UNIT PRICE", "AMOUNT": "3", "Date": "2/1/2021", "AMOUNT": "100", "Bill For": "Dev", "From": "Coca Cola", "Terms" : "Net for 30 days"}}
|
| 49 |
"""
|
| 50 |
|
| 51 |
-
prompt_template = PromptTemplate(input_variables
|
| 52 |
return prompt_template
|
| 53 |
|
| 54 |
-
|
| 55 |
# We will try to extract the text from the PDF to a normal variable.
|
| 56 |
-
def _get_raw_text_from_pdf(self, path:str) -> str:
|
| 57 |
text = ""
|
| 58 |
pdf_reader = PdfReader(path)
|
| 59 |
for page in pdf_reader.pages:
|
| 60 |
text += page.extract_text()
|
| 61 |
return text
|
| 62 |
|
| 63 |
-
def _extract_data_from_llm(self, raw_data:str) -> str:
|
| 64 |
-
resp = self._llm(self._prompt_template.format(pages
|
| 65 |
return resp
|
| 66 |
-
|
| 67 |
def _parse_response(self, response: str) -> Dict[str, str]:
|
| 68 |
pattern = r'{(.+)}'
|
| 69 |
re_match = re.search(pattern, response, re.DOTALL)
|
| 70 |
if re_match:
|
| 71 |
extracted_text = re_match.group(1)
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
else:
|
| 75 |
raise Exception("No match found.")
|
|
|
|
# Standard library
import ast
import json
import os
import re
from typing import List, Dict

# Third-party
import pandas as pd
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from pypdf import PdfReader
from ratelimit import limits, sleep_and_retry
|
| 9 |
|
| 10 |
+
# Replace with your actual API key
|
| 11 |
api_key = "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"
|
| 12 |
|
| 13 |
os.environ["OPENAI_API_KEY"] = api_key
|
| 14 |
+
|
| 15 |
+
|
| 16 |
class InvoicePipeline:
|
| 17 |
|
| 18 |
def __init__(self, paths):
|
| 19 |
# This is your file path
|
| 20 |
self._paths = paths
|
| 21 |
# This is your LLM (GPT)
|
| 22 |
+
self._llm = OpenAI() # Initialize OpenAI here, no rate limiting yet.
|
| 23 |
# This is prompt
|
| 24 |
self._prompt_template = self._get_default_prompt_template()
|
| 25 |
+
|
| 26 |
+
# Rate Limiting Configuration (adjust based on your OpenAI account limits)
|
| 27 |
+
self.calls_per_minute = 60 # Example: Adjust based on your plan's RPM limit
|
| 28 |
+
self.one_minute = 60
|
| 29 |
+
|
| 30 |
+
# Apply rate limiting to the LLM call
|
| 31 |
+
@sleep_and_retry
|
| 32 |
+
@limits(calls=60, period=60) # Calls/minute
|
| 33 |
+
def _extract_data_from_llm_with_rate_limit(self, raw_data: str) -> str:
|
| 34 |
+
"""Extracts data from the LLM with rate limiting."""
|
| 35 |
+
try:
|
| 36 |
+
resp = self._llm(self._prompt_template.format(pages=raw_data))
|
| 37 |
+
return resp
|
| 38 |
+
except Exception as e:
|
| 39 |
+
print(f"Error during OpenAI API call: {e}")
|
| 40 |
+
return None
|
| 41 |
+
|
| 42 |
# This function will help in extracting and run the code, and will produce a dataframe for us
|
| 43 |
def run(self) -> pd.DataFrame:
|
| 44 |
# We have defined the way the data has to be returned
|
| 45 |
df = pd.DataFrame({
|
| 46 |
+
"Invoice ID": pd.Series(dtype="int"),
|
| 47 |
+
"DESCRIPTION": pd.Series(dtype="str"),
|
| 48 |
+
"Issue Data": pd.Series(dtype="str"),
|
| 49 |
+
"UNIT PRICE": pd.Series(dtype="str"),
|
| 50 |
+
"AMOUNT": pd.Series(dtype="int"),
|
| 51 |
+
"Bill For": pd.Series(dtype="str"),
|
| 52 |
+
"From": pd.Series(dtype="str"),
|
| 53 |
+
"Terms": pd.Series(dtype="str")}
|
| 54 |
)
|
| 55 |
|
| 56 |
for path in self._paths:
|
| 57 |
+
raw_text = self._get_raw_text_from_pdf(path) # This function needs to be created
|
| 58 |
+
llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text) # Apply rate limit here
|
| 59 |
+
if llm_resp: # Check for None response from rate limiter
|
| 60 |
+
data = self._parse_response(llm_resp)
|
| 61 |
+
df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
|
| 62 |
+
else:
|
| 63 |
+
print(f"Skipping file due to rate limit or API error: {path}")
|
| 64 |
|
| 65 |
return df
|
| 66 |
|
|
|
|
| 70 |
Expected Outcome: remove any dollar symbols {{"Invoice ID":"12341234", "DESCRIPTION": "UNIT PRICE", "AMOUNT": "3", "Date": "2/1/2021", "AMOUNT": "100", "Bill For": "Dev", "From": "Coca Cola", "Terms" : "Net for 30 days"}}
|
| 71 |
"""
|
| 72 |
|
| 73 |
+
prompt_template = PromptTemplate(input_variables=["pages"], template=template)
|
| 74 |
return prompt_template
|
| 75 |
|
|
|
|
| 76 |
# We will try to extract the text from the PDF to a normal variable.
|
| 77 |
+
def _get_raw_text_from_pdf(self, path: str) -> str:
|
| 78 |
text = ""
|
| 79 |
pdf_reader = PdfReader(path)
|
| 80 |
for page in pdf_reader.pages:
|
| 81 |
text += page.extract_text()
|
| 82 |
return text
|
| 83 |
|
| 84 |
+
def _extract_data_from_llm(self, raw_data: str) -> str:
|
| 85 |
+
resp = self._llm(self._prompt_template.format(pages=raw_data))
|
| 86 |
return resp
|
| 87 |
+
|
| 88 |
def _parse_response(self, response: str) -> Dict[str, str]:
|
| 89 |
pattern = r'{(.+)}'
|
| 90 |
re_match = re.search(pattern, response, re.DOTALL)
|
| 91 |
if re_match:
|
| 92 |
extracted_text = re_match.group(1)
|
| 93 |
+
try:
|
| 94 |
+
data = eval('{' + extracted_text + '}')
|
| 95 |
+
return data
|
| 96 |
+
except (SyntaxError, NameError) as e:
|
| 97 |
+
print(f"Error parsing response: {e}")
|
| 98 |
+
return {} # Return an empty dictionary to avoid crashing
|
| 99 |
+
|
| 100 |
else:
|
| 101 |
raise Exception("No match found.")
|