Commit · 4ec3e55
Parent(s): 417a9e0
Upload 5 files

Files changed:
- ContractGenerator.py +43 -0
- contract_missing_clausses.py +89 -0
- extract_date.py +90 -0
- invoice_extractor.py +341 -0
- pdftojson.py +33 -10
ContractGenerator.py
ADDED
@@ -0,0 +1,43 @@
import openai


class ContractGenerator:
    """
    A class for generating contract forms based on user instructions using the OpenAI GPT-3.5 model.
    """

    def __init__(self, api_key: str):
        """
        Initialize the ContractGenerator.

        Args:
            api_key (str): Your OpenAI API key.
        """
        openai.api_key = api_key

    def generate_contract(self, instructions: str) -> str:
        """
        Generate a contract form based on user instructions.

        Args:
            instructions (str): User-provided instructions for the contract form.

        Returns:
            str: The generated contract text.
        """
        # Define a prompt
        prompt = f"Your task is to generate a contract form based on user instructions. ***Instructions: {instructions}***"

        try:
            # Generate text using the GPT-3.5 model
            response = openai.Completion.create(
                engine="text-davinci-003",
                prompt=prompt,
                max_tokens=500  # You can adjust the length of the generated text
            )

            # Return the generated text
            return response.choices[0].text

        except openai.error.OpenAIError as e:
            print(f"Error generating the contract: {str(e)}")
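
A minimal usage sketch (the key and instructions are hypothetical; note that openai.Completion and the text-davinci-003 engine used above belong to the legacy pre-1.0 openai SDK):

generator = ContractGenerator(api_key="sk-...")  # hypothetical API key
contract = generator.generate_contract("Draft a simple non-disclosure agreement between two parties.")
print(contract)
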
contract_missing_clausses.py
ADDED
@@ -0,0 +1,89 @@
import openai
import pdfplumber
import logging

# Configure logging
logging.basicConfig(
    filename='contract_missing_clausses.log',  # You can adjust the log file name here
    filemode='a',
    format='[%(asctime)s] [%(levelname)s] [%(filename)s] [%(lineno)s:%(funcName)s()] %(message)s',
    datefmt='%Y-%b-%d %H:%M:%S'
)
LOGGER = logging.getLogger(__name__)

log_level_env = 'INFO'  # You can adjust the log level here
log_level_dict = {
    'DEBUG': logging.DEBUG,
    'INFO': logging.INFO,
    'WARNING': logging.WARNING,
    'ERROR': logging.ERROR,
    'CRITICAL': logging.CRITICAL
}
if log_level_env in log_level_dict:
    log_level = log_level_dict[log_level_env]
else:
    log_level = log_level_dict['INFO']
LOGGER.setLevel(log_level)


class ContractMissingClauses:
    """
    Class for identifying missing clauses, sub-clauses, and terms in a contract.
    """

    def __init__(self, open_api_key):
        """
        Initialize the ContractMissingClauses class and set up the OpenAI API client.
        """
        # Initialize the OpenAI API client
        openai.api_key = open_api_key

    def get_missing_clauses(self, contract: str):
        """
        Generate and return missing clauses, sub-clauses, and terms in the given contract.

        Args:
            contract (str): The text of the contract.
        """
        try:
            LOGGER.info("Analyzing contract and extracting missing clauses...")
            # Generate text using the OpenAI GPT-3 model
            response = openai.Completion.create(
                engine="text-davinci-003",  # You can specify different engines
                prompt=f"Identify missing clauses, sub-clauses and terms from the given contract ***{contract}*** and return only the missing clauses, sub-clauses and terms, separately.",
                temperature=0,
                max_tokens=500,  # The maximum number of tokens in the generated text
            )

            # Return the generated text
            return response.choices[0].text

        except Exception as e:
            # If an error occurs during clause extraction, log the error
            LOGGER.error(f"Error occurred while extracting missing clauses: {str(e)}")

    def iterate_each_page(self, pdf_file):
        """
        Iterate through each page of a PDF contract, extract text, and call get_missing_clauses for each page.
        """
        try:
            LOGGER.info("Analyzing contract and extracting pdf page...")

            # Initialize pdfplumber
            pdf = pdfplumber.open(pdf_file.name)

            # Iterate through each page and extract text
            for page in pdf.pages:
                contract = page.extract_text()
                self.get_missing_clauses(contract)

        except Exception as e:
            # If an error occurs during page extraction, log the error
            LOGGER.error(f"Error occurred while extracting pdf page: {str(e)}")
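
A minimal usage sketch (hypothetical key and path; iterate_each_page opens the PDF through the file object's .name attribute, so any object with a .name pointing at a PDF works):

checker = ContractMissingClauses(open_api_key="sk-...")  # hypothetical API key
with open("contract.pdf", "rb") as pdf_file:  # hypothetical path
    checker.iterate_each_page(pdf_file)  # per-page results are discarded by this method; see get_missing_clauses for the raw text
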
extract_date.py
ADDED
@@ -0,0 +1,90 @@
from PyPDF2 import PdfReader
import openai
import logging

# Configure logging
logging.basicConfig(
    filename='extract_date.log',  # You can adjust the log file name here
    filemode='a',
    format='[%(asctime)s] [%(levelname)s] [%(filename)s] [%(lineno)s:%(funcName)s()] %(message)s',
    datefmt='%Y-%b-%d %H:%M:%S'
)
LOGGER = logging.getLogger(__name__)

log_level_env = 'INFO'  # You can adjust the log level here
log_level_dict = {
    'DEBUG': logging.DEBUG,
    'INFO': logging.INFO,
    'WARNING': logging.WARNING,
    'ERROR': logging.ERROR,
    'CRITICAL': logging.CRITICAL
}
if log_level_env in log_level_dict:
    log_level = log_level_dict[log_level_env]
else:
    log_level = log_level_dict['INFO']
LOGGER.setLevel(log_level)


class ExtractDateAndDuration:

    def __init__(self, api_key):
        """
        Initialize the ExtractDateAndDuration class.
        """
        openai.api_key = api_key

    def get_date_and_duration(self, contract_text: str) -> str:
        """
        Extract dates and durations from the provided contract text.

        Args:
            contract_text (str): The text of the contract to analyze.

        Returns:
            str: Extracted dates and durations.
        """
        try:
            response = openai.Completion.create(
                engine="text-davinci-003",
                prompt=f"""Your task is to identify dates and durations mentioned in the contract and extract each date and duration as a key-value pair.
                ```contract: {contract_text}```
                """,
                max_tokens=300,
                temperature=0
            )
            extracted_date_duration = response.choices[0].text.strip()
            return extracted_date_duration

        except Exception as e:
            LOGGER.error(f"An error occurred during text analysis: {str(e)}")
            return ""

    def iterate_each_page(self, pdf_file):
        """
        Extract text from each page of a PDF document and process it.

        Args:
            pdf_file: The uploaded PDF file (its .name attribute is used as the path).

        Returns:
            str: Extracted dates and durations from all PDF pages.
        """
        try:
            # Open the multi-page PDF using PdfReader
            pdf = PdfReader(pdf_file.name)

            extracted_date_duration = ""

            # Extract text from each page and pass it to get_date_and_duration
            for page_number in range(len(pdf.pages)):
                # Extract text from the page
                page = pdf.pages[page_number]
                text = page.extract_text()

                # Accumulate the dates and durations extracted from this page
                extracted_date_duration += self.get_date_and_duration(text)

            return extracted_date_duration

        except Exception as e:
            LOGGER.error(f"An error occurred while processing the PDF document: {str(e)}")
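
A minimal usage sketch (hypothetical key and path, same file-object convention as the other modules):

extractor = ExtractDateAndDuration(api_key="sk-...")  # hypothetical API key
with open("contract.pdf", "rb") as pdf_file:  # hypothetical path
    print(extractor.iterate_each_page(pdf_file))
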
invoice_extractor.py
ADDED
@@ -0,0 +1,341 @@
import os
import logging
import torch
from PIL import ImageDraw
from docquery import pipeline
from docquery.document import load_document
from pdf2image import convert_from_path

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Initialize the logger
logging.basicConfig(filename="invoice_extraction.log", level=logging.DEBUG)  # Create a log file

# Checkpoints for the available models
CHECKPOINTS = {
    "LayoutLMv1 for Invoices 🧾": "impira/layoutlm-invoices",
}
PIPELINES = {}


class InvoiceKeyValuePair:
    """
    This class provides a utility to extract key-value pairs from invoices using LayoutLM.
    """

    def __init__(self):
        self.fields = {
            "Vendor Name": ["Vendor Name - Logo?", "Vendor Name - Address?"],
            "Vendor Address": ["Vendor Address?"],
            "Customer Name": ["Customer Name?"],
            "Customer Address": ["Customer Address?"],
            "Invoice Number": ["Invoice Number?"],
            "Invoice Date": ["Invoice Date?"],
            "Due Date": ["Due Date?"],
            "Subtotal": ["Subtotal?"],
            "Total Tax": ["Total Tax?"],
            "Invoice Total": ["Invoice Total?"],
            "Amount Due": ["Amount Due?"],
            "Payment Terms": ["Payment Terms?"],
            "Remit To Name": ["Remit To Name?"],
            "Remit To Address": ["Remit To Address?"],
        }
        self.model = list(CHECKPOINTS.keys())[0]

    def ensure_list(self, x):
        try:
            # Log the function entry
            logging.info(f'Entering ensure_list with x={x}')

            # Check if 'x' is already a list
            if isinstance(x, list):
                return x
            else:
                # If 'x' is not a list, wrap it in a list and return
                return [x]
        except Exception:
            # Log exceptions
            logging.error("An error occurred:", exc_info=True)
            return []

    def construct_pipeline(self, task, model):
        try:
            # Log the function entry
            logging.info(f'Entering construct_pipeline with task={task} and model={model}')

            # Global dictionary that caches pipelines by model checkpoint name
            global PIPELINES

            # If a pipeline for the specified model already exists, return the cached pipeline
            if model in PIPELINES:
                return PIPELINES[model]

            # Determine the device to use for inference (GPU if available, else CPU)
            device = "cuda" if torch.cuda.is_available() else "cpu"

            # Create the pipeline using the specified task and model checkpoint
            ret = pipeline(task=task, model=CHECKPOINTS[model], device=device)

            # Cache the created pipeline for future use
            PIPELINES[model] = ret

            # Return the constructed pipeline
            return ret
        except Exception:
            # Log exceptions
            logging.error("An error occurred:", exc_info=True)
            return None

    def run_pipeline(self, model, question, document, top_k):
        try:
            # Log the function entry
            logging.info(f'Entering run_pipeline with model={model}, question={question}, and document={document}')

            # Get or create a pipeline for the specified model
            pipe = self.construct_pipeline("document-question-answering", model)

            # Perform question answering on the document, passing the
            # question, document context, and top_k to the pipeline
            return pipe(question=question, **document.context, top_k=top_k)
        except Exception:
            # Log exceptions
            logging.error("An error occurred:", exc_info=True)
            return None

    def lift_word_boxes(self, document, page):
        try:
            # Log the function entry
            logging.info(f'Entering lift_word_boxes with document={document} and page={page}')

            # Extract the word boxes for the specified page from the document's context
            return document.context["image"][page][1]
        except Exception:
            # Log exceptions
            logging.error("An error occurred:", exc_info=True)
            return []

    def expand_bbox(self, word_boxes):
        try:
            # Log the function entry
            logging.info(f'Entering expand_bbox with word_boxes={word_boxes}')

            # Check if the input list of word boxes is empty
            if len(word_boxes) == 0:
                return None

            # Extract the coordinates of the word boxes
            min_x, min_y, max_x, max_y = zip(*[x[1] for x in word_boxes])

            # Calculate the overall minimum and maximum coordinates
            min_x, min_y, max_x, max_y = [min(min_x), min(min_y), max(max_x), max(max_y)]

            # Return the expanded bounding box as [min_x, min_y, max_x, max_y]
            return [min_x, min_y, max_x, max_y]
        except Exception:
            # Log exceptions
            logging.error("An error occurred:", exc_info=True)
            return None

    def normalize_bbox(self, box, width, height, padding=0.005):
        try:
            # Log the function entry
            logging.info(f'Entering normalize_bbox with box={box}, width={width}, height={height}, and padding={padding}')

            # Convert the bounding box coordinates from thousandths to fractions
            min_x, min_y, max_x, max_y = [c / 1000 for c in box]

            # Apply padding if specified (as a fraction of image dimensions)
            if padding != 0:
                min_x = max(0, min_x - padding)
                min_y = max(0, min_y - padding)
                max_x = min(max_x + padding, 1)
                max_y = min(max_y + padding, 1)

            # Scale the normalized coordinates to match the image dimensions
            return [min_x * width, min_y * height, max_x * width, max_y * height]
        except Exception:
            # Log exceptions
            logging.error("An error occurred:", exc_info=True)
            return None

    def annotate_page(self, prediction, pages, document):
        try:
            # Log the function entry
            logging.info(f'Entering annotate_page with prediction={prediction}, pages={pages}, and document={document}')

            # Check if a prediction exists and contains word_ids
            if prediction is not None and "word_ids" in prediction:
                # Get the image of the page where the prediction was made
                image = pages[prediction["page"]]

                # Create a drawing object for the image
                draw = ImageDraw.Draw(image, "RGBA")

                # Extract word boxes for the page
                word_boxes = self.lift_word_boxes(document, prediction["page"])

                # Expand and normalize the bounding box of the predicted words
                x1, y1, x2, y2 = self.normalize_bbox(
                    self.expand_bbox([word_boxes[i] for i in prediction["word_ids"]]),
                    image.width,
                    image.height,
                )

                # Draw a semi-transparent green rectangle around the predicted words
                draw.rectangle(((x1, y1), (x2, y2)), fill=(0, 255, 0, int(0.4 * 255)))
        except Exception:
            # Log exceptions
            logging.error("An error occurred:", exc_info=True)

    def process_fields(self, document, fields, model=list(CHECKPOINTS.keys())[0]):
        try:
            # Log the function entry
            logging.info(f'Entering process_fields with document={document}, fields={fields}, and model={model}')

            # Convert preview pages of the document to RGB format
            pages = [x.copy().convert("RGB") for x in document.preview]

            # Initialize containers for the results
            ret = {}
            table = []

            # Iterate through the fields and their associated questions
            for field_name, questions in fields.items():
                # Extract answers for each question and keep those above the score threshold
                answers = [
                    a
                    for q in questions
                    for a in self.ensure_list(self.run_pipeline(model, q, document, top_k=1))
                    if a and a.get("score", 1) > 0.5
                ]

                # Sort answers by score (higher score first)
                answers.sort(key=lambda x: -x.get("score", 0) if x else 0)

                # Get the top answer (if any)
                top = answers[0] if len(answers) > 0 else None

                # Annotate the page with the top answer's bounding box
                self.annotate_page(top, pages, document)

                # Store the top answer for the field and add it to the table
                ret[field_name] = top
                table.append([field_name, top.get("answer") if top is not None else None])

            # Return the table of key-value pairs
            return table
        except Exception:
            # Log exceptions
            logging.error("An error occurred:", exc_info=True)
            return []

    def process_document(self, document, fields, model, error=None):
        try:
            # Log the function entry
            logging.info(f'Entering process_document with document={document}, fields={fields}, model={model}, and error={error}')

            # Check that the document loaded and no error occurred during loading
            if document is not None and error is None:
                # Process the fields in the document using the specified model
                table = self.process_fields(document, fields, model)
                return table
        except Exception:
            # Log exceptions
            logging.error("An error occurred:", exc_info=True)
            return []

    def process_path(self, path, fields, model):
        try:
            # Log the function entry
            logging.info(f'Entering process_path with path={path}, fields={fields}, and model={model}')

            # Initialize error and document variables
            error = None
            document = None

            # Check if a file path is provided
            if path:
                try:
                    # Load the document from the specified file path
                    document = load_document(path)
                except Exception as e:
                    # Handle exceptions and store the error message
                    logging.error("An error occurred:", exc_info=True)
                    error = str(e)

            # Process the loaded document and extract key-value pairs
            return self.process_document(document, fields, model, error)
        except Exception:
            # Log exceptions
            logging.error("An error occurred:", exc_info=True)
            return []

    def pdf_to_image(self, file_path):
        try:
            # Log the function entry
            logging.info(f'Entering pdf_to_image with file_path={file_path}')

            # Convert the PDF to a list of image objects (one per page)
            images = convert_from_path(file_path)

            # Save each page image to disk
            for i, image in enumerate(images):
                image_path = f'page_{i + 1}.png'
                image.save(image_path)

            # Return the path of the first page image (only the first page is processed)
            return 'page_1.png'
        except Exception:
            # Log exceptions
            logging.error("An error occurred:", exc_info=True)
            return []

    def process_upload(self, file):
        try:
            # Log the function entry
            logging.info(f'Entering process_upload with file={file}')

            # Get the model and fields from the instance
            model = self.model
            fields = self.fields

            # Convert the uploaded PDF file to an image file
            image = self.pdf_to_image(file)

            # Use the generated image file as the file path for processing
            file = image

            # Process the document (image) and extract key-value pairs
            return self.process_path(file if file else None, fields, model)
        except Exception:
            # Log exceptions
            logging.error("An error occurred:", exc_info=True)
            return []

    def extract_key_value_pair(self, invoice_file):
        try:
            # Log the function entry
            logging.info(f'Entering extract_key_value_pair with invoice_file={invoice_file}')

            # Process the uploaded invoice PDF file and extract key-value pairs
            data = self.process_upload(invoice_file.name)

            # Join the extracted key-value pairs into a printable string
            return '\n'.join(f'{key}: {value}' for key, value in data)

        except Exception:
            # Log exceptions
            logging.error("An error occurred:", exc_info=True)
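
A minimal usage sketch (hypothetical file path; assumes docquery, pdf2image with its poppler dependency, and the impira/layoutlm-invoices checkpoint are installed and downloadable):

extractor = InvoiceKeyValuePair()
with open("invoice.pdf", "rb") as invoice_file:  # hypothetical path
    print(extractor.extract_key_value_pair(invoice_file))
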
pdftojson.py
CHANGED
@@ -1,16 +1,40 @@
 import os
 import PyPDF2
+import logging
 from langchain import PromptTemplate, LLMChain
 from langchain.llms import OpenAI
 
+# Configure logging
+logging.basicConfig(
+    filename='pdftojson.log',  # You can adjust the log file name here
+    filemode='a',
+    format='[%(asctime)s] [%(levelname)s] [%(filename)s] [%(lineno)s:%(funcName)s()] %(message)s',
+    datefmt='%Y-%b-%d %H:%M:%S'
+)
+LOGGER = logging.getLogger(__name__)
+
+log_level_env = 'INFO'  # You can adjust the log level here
+log_level_dict = {
+    'DEBUG': logging.DEBUG,
+    'INFO': logging.INFO,
+    'WARNING': logging.WARNING,
+    'ERROR': logging.ERROR,
+    'CRITICAL': logging.CRITICAL
+}
+if log_level_env in log_level_dict:
+    log_level = log_level_dict[log_level_env]
+else:
+    log_level = log_level_dict['INFO']
+LOGGER.setLevel(log_level)
+
 class PdftoJson:
 
-    def __init__(self):
+    def __init__(self, openai_api_key: str):
         """
         Initialize the PdftoJson class with OpenAI API key.
         """
-
-
+        OPENAI_API_KEY = openai_api_key
+        os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
 
     def _get_json(self, input_text: str) -> str:
         """
@@ -23,6 +47,7 @@ class PdftoJson:
         str: JSON result containing topics and content.
         """
         try:
+            LOGGER.info("Generating JSON result by analyzing input text...")
 
             # Initialize the OpenAI language model with specified settings
             llm = OpenAI(temperature=0, max_tokens=1000)
@@ -42,10 +67,11 @@
             text = input_text
             json_result = llm_chain.run(text)
 
+            LOGGER.info("Generated JSON result successfully.")
             return json_result
 
         except Exception as e:
-
+            LOGGER.error(f"Error occurred while generating JSON result: {str(e)}")
 
 
     def extract_text_from_pdf(self, pdf_path: str):
@@ -56,6 +82,7 @@
             pdf_path (str): Path to the PDF file.
         """
         try:
+            LOGGER.info("Extracting text from PDF, generating JSON result, and saving to a file...")
 
             # Open the PDF file in binary read mode
             with open(pdf_path.name, "rb") as pdf_file:
@@ -71,13 +98,9 @@
             # Generate JSON result for the extracted text
             json_result = self._get_json(text)
 
-            # # Clear Extra Spaces
-            # clear_json_result = self._remove_empty_lines(json_result)
-
-            # # Save the JSON result to a file
-            # self._save_json(clear_json_result)
+            LOGGER.info("Extraction, JSON generation, and saving completed.")
             return json_result
 
         except Exception as e:
-
+            LOGGER.error(f"Error occurred during extraction and processing: {str(e)}")
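
A minimal usage sketch (hypothetical key and path; extract_text_from_pdf reads the path via the file object's .name attribute, matching the other modules, and the langchain OpenAI wrapper picks up the key from the OPENAI_API_KEY environment variable set in __init__):

converter = PdftoJson(openai_api_key="sk-...")  # hypothetical API key
with open("document.pdf", "rb") as pdf_file:  # hypothetical path
    print(converter.extract_text_from_pdf(pdf_file))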