cryogenic22 committed on
Commit
827fd16
·
verified ·
1 Parent(s): a027a75

Update utils/document_processor.py

Browse files
Files changed (1) hide show
  1. utils/document_processor.py +2 -21
utils/document_processor.py CHANGED
@@ -30,9 +30,7 @@ class DocumentProcessor:
30
  Returns:
31
  Tuple[str, List[Dict], Dict]: Extracted text, text chunks, and metadata.
32
  """
33
- # Process the document to extract text
34
  text, chunks = self.process_document(file)
35
- # Extract metadata using ontology and document content
36
  metadata = self._extract_metadata(text, file.name)
37
  return text, chunks, metadata
38
 
@@ -101,16 +99,7 @@ class DocumentProcessor:
101
  for idx, i in enumerate(range(0, len(text), chunk_size))]
102
 
103
  def _extract_metadata(self, text: str, file_name: str) -> Dict:
104
- """
105
- Extract metadata such as document type, jurisdiction, and key legal concepts.
106
-
107
- Args:
108
- text (str): Extracted document text.
109
- file_name (str): Original file name.
110
-
111
- Returns:
112
- Dict: Extracted metadata.
113
- """
114
  metadata = {
115
  "title": file_name,
116
  "type": self._infer_document_type(text),
@@ -155,15 +144,7 @@ class DocumentProcessor:
155
  return re.findall(date_pattern, text)
156
 
157
  def _link_to_ontology(self, text: str) -> List[Dict]:
158
- """
159
- Link document content to legal ontology for context and relevance.
160
-
161
- Args:
162
- text (str): Extracted document text.
163
-
164
- Returns:
165
- List[Dict]: Relevant ontology concepts and links.
166
- """
167
  relevant_ontology = []
168
  for concept in self.ontology["@graph"]:
169
  if "rdfs:label" in concept and concept["rdfs:label"].lower() in text.lower():
 
30
  Returns:
31
  Tuple[str, List[Dict], Dict]: Extracted text, text chunks, and metadata.
32
  """
 
33
  text, chunks = self.process_document(file)
 
34
  metadata = self._extract_metadata(text, file.name)
35
  return text, chunks, metadata
36
 
 
99
  for idx, i in enumerate(range(0, len(text), chunk_size))]
100
 
101
  def _extract_metadata(self, text: str, file_name: str) -> Dict:
102
+ """Extract metadata such as document type, jurisdiction, and key legal concepts."""
 
 
 
 
 
 
 
 
 
103
  metadata = {
104
  "title": file_name,
105
  "type": self._infer_document_type(text),
 
144
  return re.findall(date_pattern, text)
145
 
146
  def _link_to_ontology(self, text: str) -> List[Dict]:
147
+ """Link document content to legal ontology for context and relevance."""
 
 
 
 
 
 
 
 
148
  relevant_ontology = []
149
  for concept in self.ontology["@graph"]:
150
  if "rdfs:label" in concept and concept["rdfs:label"].lower() in text.lower():