cryogenic22 committed on
Commit
62d7d31
·
verified ·
1 Parent(s): a685f5a

Update utils/document_processor.py

Browse files
Files changed (1) hide show
  1. utils/document_processor.py +15 -21
utils/document_processor.py CHANGED
@@ -7,6 +7,7 @@ from PIL import Image
7
  from typing import Tuple, List, Dict
8
  import json
9
  import os
 
10
 
11
 
12
  class DocumentProcessor:
@@ -15,7 +16,7 @@ class DocumentProcessor:
15
  Initialize Document Processor.
16
 
17
  Args:
18
- ontology_path (str): Path to the legal ontology JSON file.
19
  """
20
  self.ontology = self._load_ontology(ontology_path)
21
 
@@ -78,7 +79,7 @@ class DocumentProcessor:
78
 
79
  def _extract_metadata(self, text: str, file_name: str) -> Dict:
80
  """
81
- Extract metadata such as document type, jurisdiction, and key parties.
82
 
83
  Args:
84
  text (str): Extracted document text.
@@ -112,28 +113,21 @@ class DocumentProcessor:
112
 
113
  def _infer_jurisdiction(self, text: str) -> str:
114
  """Infer the jurisdiction based on keywords in the text."""
115
- jurisdictions = {
116
- "US": ["united states", "california", "federal law"],
117
- "UK": ["united kingdom", "england", "scotland", "british law"],
118
- "UAE": ["united arab emirates", "dubai", "abu dhabi"],
119
- "India": ["india", "indian law", "supreme court"]
120
- }
121
- for jurisdiction, keywords in jurisdictions.items():
122
- if any(keyword.lower() in text.lower() for keyword in keywords):
123
- return jurisdiction
124
  return "unknown"
125
 
126
  def _extract_key_parties(self, text: str) -> List[str]:
127
  """Extract key parties involved in the document."""
128
- # Simplified logic for extracting parties; regex or NLP can enhance this.
129
  lines = text.splitlines()
130
  parties = [line.strip() for line in lines if "party" in line.lower()]
131
- return parties[:5] # Limit to 5 parties for simplicity
132
 
133
  def _extract_dates(self, text: str) -> List[str]:
134
  """Extract dates from the text."""
135
- # Simplified example using date patterns
136
- import re
137
  date_pattern = r"\b(?:\d{1,2}/\d{1,2}/\d{2,4}|\d{1,2} \w+ \d{4})\b"
138
  return re.findall(date_pattern, text)
139
 
@@ -148,16 +142,16 @@ class DocumentProcessor:
148
  List[Dict]: Relevant ontology concepts and links.
149
  """
150
  relevant_ontology = []
151
- for concept in self.ontology:
152
- if concept["keyword"].lower() in text.lower():
153
- relevant_ontology.append({"concept": concept["name"], "description": concept["description"]})
154
  return relevant_ontology
155
 
156
- def _load_ontology(self, path: str) -> List[Dict]:
157
- """Load the legal ontology from a JSON file."""
158
  if os.path.exists(path):
159
  with open(path, "r") as f:
160
  return json.load(f)
161
  else:
162
  print("Ontology file not found. Using an empty ontology.")
163
- return []
 
7
  from typing import Tuple, List, Dict
8
  import json
9
  import os
10
+ import re
11
 
12
 
13
  class DocumentProcessor:
 
16
  Initialize Document Processor.
17
 
18
  Args:
19
+ ontology_path (str): Path to the legal ontology JSON-LD file.
20
  """
21
  self.ontology = self._load_ontology(ontology_path)
22
 
 
79
 
80
  def _extract_metadata(self, text: str, file_name: str) -> Dict:
81
  """
82
+ Extract metadata such as document type, jurisdiction, and key legal concepts.
83
 
84
  Args:
85
  text (str): Extracted document text.
 
113
 
114
  def _infer_jurisdiction(self, text: str) -> str:
115
  """Infer the jurisdiction based on keywords in the text."""
116
+ jurisdictions = {entry["@id"]: entry["rdfs:label"]
117
+ for entry in self.ontology["@graph"] if entry["@type"] == "vocab:Jurisdiction"}
118
+ for jurisdiction_id, label in jurisdictions.items():
119
+ if label.lower() in text.lower():
120
+ return label
 
 
 
 
121
  return "unknown"
122
 
123
  def _extract_key_parties(self, text: str) -> List[str]:
124
  """Extract key parties involved in the document."""
 
125
  lines = text.splitlines()
126
  parties = [line.strip() for line in lines if "party" in line.lower()]
127
+ return parties[:5]
128
 
129
  def _extract_dates(self, text: str) -> List[str]:
130
  """Extract dates from the text."""
 
 
131
  date_pattern = r"\b(?:\d{1,2}/\d{1,2}/\d{2,4}|\d{1,2} \w+ \d{4})\b"
132
  return re.findall(date_pattern, text)
133
 
 
142
  List[Dict]: Relevant ontology concepts and links.
143
  """
144
  relevant_ontology = []
145
+ for concept in self.ontology["@graph"]:
146
+ if "rdfs:label" in concept and concept["rdfs:label"].lower() in text.lower():
147
+ relevant_ontology.append({"concept": concept["rdfs:label"], "description": concept.get("rdfs:comment", "")})
148
  return relevant_ontology
149
 
150
+ def _load_ontology(self, path: str) -> Dict:
151
+ """Load the legal ontology from a JSON-LD file."""
152
  if os.path.exists(path):
153
  with open(path, "r") as f:
154
  return json.load(f)
155
  else:
156
  print("Ontology file not found. Using an empty ontology.")
157
+ return {"@graph": []}