Legal_AI_Agent

Build error

App Files Community

cryogenic22 committed on Dec 10, 2024

Commit

62d7d31

verified ·

1 Parent(s): a685f5a

Update utils/document_processor.py

Browse files

Files changed (1) hide show

utils/document_processor.py +15 -21

utils/document_processor.py CHANGED Viewed

@@ -7,6 +7,7 @@ from PIL import Image
 from typing import Tuple, List, Dict
 import json
 import os
 class DocumentProcessor:
@@ -15,7 +16,7 @@ class DocumentProcessor:
         Initialize Document Processor.
         Args:
-            ontology_path (str): Path to the legal ontology JSON file.
         """
         self.ontology = self._load_ontology(ontology_path)
@@ -78,7 +79,7 @@ class DocumentProcessor:
     def _extract_metadata(self, text: str, file_name: str) -> Dict:
         """
-        Extract metadata such as document type, jurisdiction, and key parties.
         Args:
             text (str): Extracted document text.
@@ -112,28 +113,21 @@ class DocumentProcessor:
     def _infer_jurisdiction(self, text: str) -> str:
         """Infer the jurisdiction based on keywords in the text."""
-        jurisdictions = {
-            "US": ["united states", "california", "federal law"],
-            "UK": ["united kingdom", "england", "scotland", "british law"],
-            "UAE": ["united arab emirates", "dubai", "abu dhabi"],
-            "India": ["india", "indian law", "supreme court"]
-        }
-        for jurisdiction, keywords in jurisdictions.items():
-            if any(keyword.lower() in text.lower() for keyword in keywords):
-                return jurisdiction
         return "unknown"
     def _extract_key_parties(self, text: str) -> List[str]:
         """Extract key parties involved in the document."""
-        # Simplified logic for extracting parties; regex or NLP can enhance this.
         lines = text.splitlines()
         parties = [line.strip() for line in lines if "party" in line.lower()]
-        return parties[:5]  # Limit to 5 parties for simplicity
     def _extract_dates(self, text: str) -> List[str]:
         """Extract dates from the text."""
-        # Simplified example using date patterns
-        import re
         date_pattern = r"\b(?:\d{1,2}/\d{1,2}/\d{2,4}|\d{1,2} \w+ \d{4})\b"
         return re.findall(date_pattern, text)
@@ -148,16 +142,16 @@ class DocumentProcessor:
             List[Dict]: Relevant ontology concepts and links.
         """
         relevant_ontology = []
-        for concept in self.ontology:
-            if concept["keyword"].lower() in text.lower():
-                relevant_ontology.append({"concept": concept["name"], "description": concept["description"]})
         return relevant_ontology
-    def _load_ontology(self, path: str) -> List[Dict]:
-        """Load the legal ontology from a JSON file."""
         if os.path.exists(path):
             with open(path, "r") as f:
                 return json.load(f)
         else:
             print("Ontology file not found. Using an empty ontology.")
-            return []

 from typing import Tuple, List, Dict
 import json
 import os
+import re
 class DocumentProcessor:
         Initialize Document Processor.
         Args:
+            ontology_path (str): Path to the legal ontology JSON-LD file.
         """
         self.ontology = self._load_ontology(ontology_path)
     def _extract_metadata(self, text: str, file_name: str) -> Dict:
         """
+        Extract metadata such as document type, jurisdiction, and key legal concepts.
         Args:
             text (str): Extracted document text.
     def _infer_jurisdiction(self, text: str) -> str:
         """Infer the jurisdiction based on keywords in the text."""
+        jurisdictions = {entry["@id"]: entry["rdfs:label"]
+                         for entry in self.ontology["@graph"] if entry["@type"] == "vocab:Jurisdiction"}
+        for jurisdiction_id, label in jurisdictions.items():
+            if label.lower() in text.lower():
+                return label
         return "unknown"
     def _extract_key_parties(self, text: str) -> List[str]:
         """Extract key parties involved in the document."""
         lines = text.splitlines()
         parties = [line.strip() for line in lines if "party" in line.lower()]
+        return parties[:5]
     def _extract_dates(self, text: str) -> List[str]:
         """Extract dates from the text."""
         date_pattern = r"\b(?:\d{1,2}/\d{1,2}/\d{2,4}|\d{1,2} \w+ \d{4})\b"
         return re.findall(date_pattern, text)
             List[Dict]: Relevant ontology concepts and links.
         """
         relevant_ontology = []
+        for concept in self.ontology["@graph"]:
+            if "rdfs:label" in concept and concept["rdfs:label"].lower() in text.lower():
+                relevant_ontology.append({"concept": concept["rdfs:label"], "description": concept.get("rdfs:comment", "")})
         return relevant_ontology
+    def _load_ontology(self, path: str) -> Dict:
+        """Load the legal ontology from a JSON-LD file."""
         if os.path.exists(path):
             with open(path, "r") as f:
                 return json.load(f)
         else:
             print("Ontology file not found. Using an empty ontology.")
+            return {"@graph": []}