sunnysharma20 committed on
Commit
026503c
·
verified ·
1 Parent(s): 55c9914

Update backend.py

Browse files
Files changed (1) hide show
  1. backend.py +49 -23
backend.py CHANGED
@@ -4,41 +4,63 @@ import pandas as pd
4
  from pypdf import PdfReader
5
  from typing import List, Dict
6
  from langchain.prompts import PromptTemplate
7
- # from langchain_google_genai import GoogleGenerativeAI
8
  from langchain_openai import OpenAI
 
9
 
10
-
11
  api_key = "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"
12
 
13
  os.environ["OPENAI_API_KEY"] = api_key
 
 
14
  class InvoicePipeline:
15
 
16
  def __init__(self, paths):
17
  # This is your file path
18
  self._paths = paths
19
  # This is your LLM (GPT)
20
- self._llm = OpenAI()
21
  # This is prompt
22
  self._prompt_template = self._get_default_prompt_template()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  # This function will help in extracting and run the code, and will produce a dataframe for us
24
  def run(self) -> pd.DataFrame:
25
  # We have defined the way the data has to be returned
26
  df = pd.DataFrame({
27
- "Invoice ID": pd.Series(dtype = "int"),
28
- "DESCRIPTION": pd.Series(dtype = "str"),
29
- "Issue Data": pd.Series(dtype = "str"),
30
- "UNIT PRICE": pd.Series(dtype = "str"),
31
- "AMOUNT": pd.Series(dtype = "int"),
32
- "Bill For": pd.Series(dtype = "str"),
33
- "From": pd.Series(dtype ="str"),
34
- "Terms": pd.Series(dtype = "str")}
35
  )
36
 
37
  for path in self._paths:
38
- raw_text = self._get_raw_text_from_pdf(path) # This function needs to be created
39
- llm_resp = self._extract_data_from_llm(raw_text) #
40
- data = self._parse_response(llm_resp)
41
- df = pd.concat([df, pd.DataFrame([data])], ignore_index = True)
 
 
 
42
 
43
  return df
44
 
@@ -48,28 +70,32 @@ class InvoicePipeline:
48
  Expected Outcome: remove any dollar symbols {{"Invoice ID":"12341234", "DESCRIPTION": "UNIT PRICE", "AMOUNT": "3", "Date": "2/1/2021", "AMOUNT": "100", "Bill For": "Dev", "From": "Coca Cola", "Terms" : "Net for 30 days"}}
49
  """
50
 
51
- prompt_template = PromptTemplate(input_variables = ["pages"], template = template)
52
  return prompt_template
53
 
54
-
55
  # We will try to extract the text from the PDF to a normal variable.
56
- def _get_raw_text_from_pdf(self, path:str) -> str:
57
  text = ""
58
  pdf_reader = PdfReader(path)
59
  for page in pdf_reader.pages:
60
  text += page.extract_text()
61
  return text
62
 
63
- def _extract_data_from_llm(self, raw_data:str) -> str:
64
- resp = self._llm(self._prompt_template.format(pages = raw_data))
65
  return resp
66
-
67
  def _parse_response(self, response: str) -> Dict[str, str]:
68
  pattern = r'{(.+)}'
69
  re_match = re.search(pattern, response, re.DOTALL)
70
  if re_match:
71
  extracted_text = re_match.group(1)
72
- data = eval('{' + extracted_text + '}')
73
- return data
 
 
 
 
 
74
  else:
75
  raise Exception("No match found.")
 
4
  from pypdf import PdfReader
5
  from typing import List, Dict
6
  from langchain.prompts import PromptTemplate
 
7
  from langchain_openai import OpenAI
8
+ from ratelimit import limits, sleep_and_retry
9
 
10
+ # Replace with your actual API key
11
  api_key = "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"
12
 
13
  os.environ["OPENAI_API_KEY"] = api_key
14
+
15
+
16
  class InvoicePipeline:
17
 
18
  def __init__(self, paths):
19
  # This is your file path
20
  self._paths = paths
21
  # This is your LLM (GPT)
22
+ self._llm = OpenAI() # Initialize OpenAI here, no rate limiting yet.
23
  # This is prompt
24
  self._prompt_template = self._get_default_prompt_template()
25
+
26
+ # Rate Limiting Configuration (adjust based on your OpenAI account limits)
27
+ self.calls_per_minute = 60 # Example: Adjust based on your plan's RPM limit
28
+ self.one_minute = 60
29
+
30
+ # Apply rate limiting to the LLM call
31
+ @sleep_and_retry
32
+ @limits(calls=60, period=60) # Calls/minute
33
+ def _extract_data_from_llm_with_rate_limit(self, raw_data: str) -> str:
34
+ """Extracts data from the LLM with rate limiting."""
35
+ try:
36
+ resp = self._llm(self._prompt_template.format(pages=raw_data))
37
+ return resp
38
+ except Exception as e:
39
+ print(f"Error during OpenAI API call: {e}")
40
+ return None
41
+
42
  # This function will help in extracting and run the code, and will produce a dataframe for us
43
  def run(self) -> pd.DataFrame:
44
  # We have defined the way the data has to be returned
45
  df = pd.DataFrame({
46
+ "Invoice ID": pd.Series(dtype="int"),
47
+ "DESCRIPTION": pd.Series(dtype="str"),
48
+ "Issue Data": pd.Series(dtype="str"),
49
+ "UNIT PRICE": pd.Series(dtype="str"),
50
+ "AMOUNT": pd.Series(dtype="int"),
51
+ "Bill For": pd.Series(dtype="str"),
52
+ "From": pd.Series(dtype="str"),
53
+ "Terms": pd.Series(dtype="str")}
54
  )
55
 
56
  for path in self._paths:
57
+ raw_text = self._get_raw_text_from_pdf(path) # This function needs to be created
58
+ llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text) # Apply rate limit here
59
+ if llm_resp: # Check for None response from rate limiter
60
+ data = self._parse_response(llm_resp)
61
+ df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
62
+ else:
63
+ print(f"Skipping file due to rate limit or API error: {path}")
64
 
65
  return df
66
 
 
70
  Expected Outcome: remove any dollar symbols {{"Invoice ID":"12341234", "DESCRIPTION": "UNIT PRICE", "AMOUNT": "3", "Date": "2/1/2021", "AMOUNT": "100", "Bill For": "Dev", "From": "Coca Cola", "Terms" : "Net for 30 days"}}
71
  """
72
 
73
+ prompt_template = PromptTemplate(input_variables=["pages"], template=template)
74
  return prompt_template
75
 
 
76
  # We will try to extract the text from the PDF to a normal variable.
77
+ def _get_raw_text_from_pdf(self, path: str) -> str:
78
  text = ""
79
  pdf_reader = PdfReader(path)
80
  for page in pdf_reader.pages:
81
  text += page.extract_text()
82
  return text
83
 
84
+ def _extract_data_from_llm(self, raw_data: str) -> str:
85
+ resp = self._llm(self._prompt_template.format(pages=raw_data))
86
  return resp
87
+
88
  def _parse_response(self, response: str) -> Dict[str, str]:
89
  pattern = r'{(.+)}'
90
  re_match = re.search(pattern, response, re.DOTALL)
91
  if re_match:
92
  extracted_text = re_match.group(1)
93
+ try:
94
+ data = eval('{' + extracted_text + '}')
95
+ return data
96
+ except (SyntaxError, NameError) as e:
97
+ print(f"Error parsing response: {e}")
98
+ return {} # Return an empty dictionary to avoid crashing
99
+
100
  else:
101
  raise Exception("No match found.")