Spaces: Build error
Update utils/document_processor.py
utils/document_processor.py CHANGED (+105 -7)
@@ -5,10 +5,22 @@ from pdf2image import convert_from_bytes
 import pytesseract
 from PIL import Image
 from typing import Tuple, List, Dict
+import json
+import os
+
 
 class DocumentProcessor:
-    def
-        """
+    def __init__(self, ontology_path: str = "data/legal_ontology.json"):
+        """
+        Initialize Document Processor.
+
+        Args:
+            ontology_path (str): Path to the legal ontology JSON file.
+        """
+        self.ontology = self._load_ontology(ontology_path)
+
+    def process_document(self, file) -> Tuple[str, List[Dict], Dict]:
+        """Process a document, extract text, chunks, and metadata."""
         file_type = file.name.split(".")[-1].lower()
         if file_type == "pdf":
             text = self._process_pdf(file)

@@ -18,9 +30,10 @@
             text = self._process_text(file)
         else:
             raise ValueError(f"Unsupported file type: {file_type}")
-
+
         chunks = self._chunk_text(text)
-
+        metadata = self._extract_metadata(text, file.name)
+        return text, chunks, metadata
 
     def _process_pdf(self, file) -> str:
         """Extract text from a PDF, including OCR for scanned PDFs."""

@@ -28,10 +41,9 @@
         text = ""
         for page in reader.pages:
             page_text = page.extract_text()
-            if page_text:
+            if page_text.strip():
                 text += page_text
             else:
-                st.warning("Detected a scanned PDF. Performing OCR...")
                 pdf_bytes = file.read()
                 text += self._perform_ocr(pdf_bytes)
         return text

@@ -56,10 +68,96 @@
             detected_encoding = chardet.detect(raw_data)["encoding"]
             return raw_data.decode(detected_encoding)
         except Exception as e:
-
+            print(f"Error processing text file: {e}")
             return ""
 
     def _chunk_text(self, text: str, chunk_size: int = 500) -> List[Dict]:
         """Split text into smaller chunks for vectorization."""
         return [{"chunk_id": idx, "text": text[i:i + chunk_size]}
                 for idx, i in enumerate(range(0, len(text), chunk_size))]
+
+    def _extract_metadata(self, text: str, file_name: str) -> Dict:
+        """
+        Extract metadata such as document type, jurisdiction, and key parties.
+
+        Args:
+            text (str): Extracted document text.
+            file_name (str): Original file name.
+
+        Returns:
+            Dict: Extracted metadata.
+        """
+        metadata = {
+            "title": file_name,
+            "type": self._infer_document_type(text),
+            "jurisdiction": self._infer_jurisdiction(text),
+            "key_parties": self._extract_key_parties(text),
+            "effective_dates": self._extract_dates(text),
+            "ontology_links": self._link_to_ontology(text)
+        }
+        return metadata
+
+    def _infer_document_type(self, text: str) -> str:
+        """Infer the type of the document based on keywords."""
+        document_types = {
+            "judgement": ["court", "judge", "judgment", "verdict"],
+            "contract": ["agreement", "contract", "clause", "terms"],
+            "mou": ["memorandum of understanding", "mou", "collaboration"],
+            "will": ["testament", "executor", "bequest", "inheritance"]
+        }
+        for doc_type, keywords in document_types.items():
+            if any(keyword.lower() in text.lower() for keyword in keywords):
+                return doc_type
+        return "unknown"
+
+    def _infer_jurisdiction(self, text: str) -> str:
+        """Infer the jurisdiction based on keywords in the text."""
+        jurisdictions = {
+            "US": ["united states", "california", "federal law"],
+            "UK": ["united kingdom", "england", "scotland", "british law"],
+            "UAE": ["united arab emirates", "dubai", "abu dhabi"],
+            "India": ["india", "indian law", "supreme court"]
+        }
+        for jurisdiction, keywords in jurisdictions.items():
+            if any(keyword.lower() in text.lower() for keyword in keywords):
+                return jurisdiction
+        return "unknown"
+
+    def _extract_key_parties(self, text: str) -> List[str]:
+        """Extract key parties involved in the document."""
+        # Simplified logic for extracting parties; regex or NLP can enhance this.
+        lines = text.splitlines()
+        parties = [line.strip() for line in lines if "party" in line.lower()]
+        return parties[:5]  # Limit to 5 parties for simplicity
+
+    def _extract_dates(self, text: str) -> List[str]:
+        """Extract dates from the text."""
+        # Simplified example using date patterns
+        import re
+        date_pattern = r"\b(?:\d{1,2}/\d{1,2}/\d{2,4}|\d{1,2} \w+ \d{4})\b"
+        return re.findall(date_pattern, text)
+
+    def _link_to_ontology(self, text: str) -> List[Dict]:
+        """
+        Link document content to legal ontology for context and relevance.
+
+        Args:
+            text (str): Extracted document text.
+
+        Returns:
+            List[Dict]: Relevant ontology concepts and links.
+        """
+        relevant_ontology = []
+        for concept in self.ontology:
+            if concept["keyword"].lower() in text.lower():
+                relevant_ontology.append({"concept": concept["name"], "description": concept["description"]})
+        return relevant_ontology
+
+    def _load_ontology(self, path: str) -> List[Dict]:
+        """Load the legal ontology from a JSON file."""
+        if os.path.exists(path):
+            with open(path, "r") as f:
+                return json.load(f)
+        else:
+            print("Ontology file not found. Using an empty ontology.")
+            return []