Spaces:
Build error
Build error
Update backend.py
Browse files- backend.py +49 -23
backend.py
CHANGED
|
@@ -4,41 +4,63 @@ import pandas as pd
|
|
| 4 |
from pypdf import PdfReader
|
| 5 |
from typing import List, Dict
|
| 6 |
from langchain.prompts import PromptTemplate
|
| 7 |
-
# from langchain_google_genai import GoogleGenerativeAI
|
| 8 |
from langchain_openai import OpenAI
|
|
|
|
| 9 |
|
| 10 |
-
|
| 11 |
api_key = "<REDACTED — a real OpenAI API key was committed here; it must be revoked and loaded from the environment instead>"
|
| 12 |
|
| 13 |
os.environ["OPENAI_API_KEY"] = api_key
|
|
|
|
|
|
|
| 14 |
class InvoicePipeline:
|
| 15 |
|
| 16 |
def __init__(self, paths):
|
| 17 |
# This is your file path
|
| 18 |
self._paths = paths
|
| 19 |
# This is your LLM (GPT)
|
| 20 |
-
self._llm = OpenAI()
|
| 21 |
# This is prompt
|
| 22 |
self._prompt_template = self._get_default_prompt_template()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# This function will help in extracting and run the code, and will produce a dataframe for us
|
| 24 |
def run(self) -> pd.DataFrame:
|
| 25 |
# We have defined the way the data has to be returned
|
| 26 |
df = pd.DataFrame({
|
| 27 |
-
"Invoice ID": pd.Series(dtype
|
| 28 |
-
"DESCRIPTION": pd.Series(dtype
|
| 29 |
-
"Issue Data": pd.Series(dtype
|
| 30 |
-
"UNIT PRICE": pd.Series(dtype
|
| 31 |
-
"AMOUNT": pd.Series(dtype
|
| 32 |
-
"Bill For": pd.Series(dtype
|
| 33 |
-
"From": pd.Series(dtype
|
| 34 |
-
"Terms": pd.Series(dtype
|
| 35 |
)
|
| 36 |
|
| 37 |
for path in self._paths:
|
| 38 |
-
raw_text = self._get_raw_text_from_pdf(path)
|
| 39 |
-
llm_resp = self.
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
return df
|
| 44 |
|
|
@@ -48,28 +70,32 @@ class InvoicePipeline:
|
|
| 48 |
Expected Outcome: remove any dollar symbols {{"Invoice ID":"12341234", "DESCRIPTION": "UNIT PRICE", "AMOUNT": "3", "Date": "2/1/2021", "AMOUNT": "100", "Bill For": "Dev", "From": "Coca Cola", "Terms" : "Net for 30 days"}}
|
| 49 |
"""
|
| 50 |
|
| 51 |
-
prompt_template = PromptTemplate(input_variables
|
| 52 |
return prompt_template
|
| 53 |
|
| 54 |
-
|
| 55 |
# We will try to extract the text from the PDF to a normal variable.
|
| 56 |
-
def _get_raw_text_from_pdf(self, path:str) -> str:
|
| 57 |
text = ""
|
| 58 |
pdf_reader = PdfReader(path)
|
| 59 |
for page in pdf_reader.pages:
|
| 60 |
text += page.extract_text()
|
| 61 |
return text
|
| 62 |
|
| 63 |
-
def _extract_data_from_llm(self, raw_data:str) -> str:
|
| 64 |
-
resp = self._llm(self._prompt_template.format(pages
|
| 65 |
return resp
|
| 66 |
-
|
| 67 |
def _parse_response(self, response: str) -> Dict[str, str]:
|
| 68 |
pattern = r'{(.+)}'
|
| 69 |
re_match = re.search(pattern, response, re.DOTALL)
|
| 70 |
if re_match:
|
| 71 |
extracted_text = re_match.group(1)
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
else:
|
| 75 |
raise Exception("No match found.")
|
|
|
|
# Standard library
import ast
import json
import os
import re
from typing import List, Dict

# Third-party
import pandas as pd
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from pypdf import PdfReader
from ratelimit import limits, sleep_and_retry
|
| 9 |
|
| 10 |
+
# Replace with your actual API key
|
| 11 |
api_key = "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"
|
| 12 |
|
| 13 |
os.environ["OPENAI_API_KEY"] = api_key
|
| 14 |
+
|
| 15 |
+
|
| 16 |
class InvoicePipeline:
|
| 17 |
|
| 18 |
def __init__(self, paths):
|
| 19 |
# This is your file path
|
| 20 |
self._paths = paths
|
| 21 |
# This is your LLM (GPT)
|
| 22 |
+
self._llm = OpenAI() # Initialize OpenAI here, no rate limiting yet.
|
| 23 |
# This is prompt
|
| 24 |
self._prompt_template = self._get_default_prompt_template()
|
| 25 |
+
|
| 26 |
+
# Rate Limiting Configuration (adjust based on your OpenAI account limits)
|
| 27 |
+
self.calls_per_minute = 60 # Example: Adjust based on your plan's RPM limit
|
| 28 |
+
self.one_minute = 60
|
| 29 |
+
|
| 30 |
+
# Apply rate limiting to the LLM call
|
| 31 |
+
@sleep_and_retry
|
| 32 |
+
@limits(calls=60, period=60) # Calls/minute
|
| 33 |
+
def _extract_data_from_llm_with_rate_limit(self, raw_data: str) -> str:
|
| 34 |
+
"""Extracts data from the LLM with rate limiting."""
|
| 35 |
+
try:
|
| 36 |
+
resp = self._llm(self._prompt_template.format(pages=raw_data))
|
| 37 |
+
return resp
|
| 38 |
+
except Exception as e:
|
| 39 |
+
print(f"Error during OpenAI API call: {e}")
|
| 40 |
+
return None
|
| 41 |
+
|
| 42 |
# This function will help in extracting and run the code, and will produce a dataframe for us
|
| 43 |
def run(self) -> pd.DataFrame:
|
| 44 |
# We have defined the way the data has to be returned
|
| 45 |
df = pd.DataFrame({
|
| 46 |
+
"Invoice ID": pd.Series(dtype="int"),
|
| 47 |
+
"DESCRIPTION": pd.Series(dtype="str"),
|
| 48 |
+
"Issue Data": pd.Series(dtype="str"),
|
| 49 |
+
"UNIT PRICE": pd.Series(dtype="str"),
|
| 50 |
+
"AMOUNT": pd.Series(dtype="int"),
|
| 51 |
+
"Bill For": pd.Series(dtype="str"),
|
| 52 |
+
"From": pd.Series(dtype="str"),
|
| 53 |
+
"Terms": pd.Series(dtype="str")}
|
| 54 |
)
|
| 55 |
|
| 56 |
for path in self._paths:
|
| 57 |
+
raw_text = self._get_raw_text_from_pdf(path) # This function needs to be created
|
| 58 |
+
llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text) # Apply rate limit here
|
| 59 |
+
if llm_resp: # Check for None response from rate limiter
|
| 60 |
+
data = self._parse_response(llm_resp)
|
| 61 |
+
df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
|
| 62 |
+
else:
|
| 63 |
+
print(f"Skipping file due to rate limit or API error: {path}")
|
| 64 |
|
| 65 |
return df
|
| 66 |
|
|
|
|
| 70 |
Expected Outcome: remove any dollar symbols {{"Invoice ID":"12341234", "DESCRIPTION": "UNIT PRICE", "AMOUNT": "3", "Date": "2/1/2021", "AMOUNT": "100", "Bill For": "Dev", "From": "Coca Cola", "Terms" : "Net for 30 days"}}
|
| 71 |
"""
|
| 72 |
|
| 73 |
+
prompt_template = PromptTemplate(input_variables=["pages"], template=template)
|
| 74 |
return prompt_template
|
| 75 |
|
|
|
|
| 76 |
# We will try to extract the text from the PDF to a normal variable.
|
| 77 |
+
def _get_raw_text_from_pdf(self, path: str) -> str:
|
| 78 |
text = ""
|
| 79 |
pdf_reader = PdfReader(path)
|
| 80 |
for page in pdf_reader.pages:
|
| 81 |
text += page.extract_text()
|
| 82 |
return text
|
| 83 |
|
| 84 |
+
def _extract_data_from_llm(self, raw_data: str) -> str:
|
| 85 |
+
resp = self._llm(self._prompt_template.format(pages=raw_data))
|
| 86 |
return resp
|
| 87 |
+
|
| 88 |
def _parse_response(self, response: str) -> Dict[str, str]:
|
| 89 |
pattern = r'{(.+)}'
|
| 90 |
re_match = re.search(pattern, response, re.DOTALL)
|
| 91 |
if re_match:
|
| 92 |
extracted_text = re_match.group(1)
|
| 93 |
+
try:
|
| 94 |
+
data = eval('{' + extracted_text + '}')
|
| 95 |
+
return data
|
| 96 |
+
except (SyntaxError, NameError) as e:
|
| 97 |
+
print(f"Error parsing response: {e}")
|
| 98 |
+
return {} # Return an empty dictionary to avoid crashing
|
| 99 |
+
|
| 100 |
else:
|
| 101 |
raise Exception("No match found.")
|