Spaces:
Build error
Build error
Update utils/document_processor.py
Browse files- utils/document_processor.py +2 -21
utils/document_processor.py
CHANGED
|
@@ -30,9 +30,7 @@ class DocumentProcessor:
|
|
| 30 |
Returns:
|
| 31 |
Tuple[str, List[Dict], Dict]: Extracted text, text chunks, and metadata.
|
| 32 |
"""
|
| 33 |
-
# Process the document to extract text
|
| 34 |
text, chunks = self.process_document(file)
|
| 35 |
-
# Extract metadata using ontology and document content
|
| 36 |
metadata = self._extract_metadata(text, file.name)
|
| 37 |
return text, chunks, metadata
|
| 38 |
|
|
@@ -101,16 +99,7 @@ class DocumentProcessor:
|
|
| 101 |
for idx, i in enumerate(range(0, len(text), chunk_size))]
|
| 102 |
|
| 103 |
def _extract_metadata(self, text: str, file_name: str) -> Dict:
|
| 104 |
-
"""
|
| 105 |
-
Extract metadata such as document type, jurisdiction, and key legal concepts.
|
| 106 |
-
|
| 107 |
-
Args:
|
| 108 |
-
text (str): Extracted document text.
|
| 109 |
-
file_name (str): Original file name.
|
| 110 |
-
|
| 111 |
-
Returns:
|
| 112 |
-
Dict: Extracted metadata.
|
| 113 |
-
"""
|
| 114 |
metadata = {
|
| 115 |
"title": file_name,
|
| 116 |
"type": self._infer_document_type(text),
|
|
@@ -155,15 +144,7 @@ class DocumentProcessor:
|
|
| 155 |
return re.findall(date_pattern, text)
|
| 156 |
|
| 157 |
def _link_to_ontology(self, text: str) -> List[Dict]:
|
| 158 |
-
"""
|
| 159 |
-
Link document content to legal ontology for context and relevance.
|
| 160 |
-
|
| 161 |
-
Args:
|
| 162 |
-
text (str): Extracted document text.
|
| 163 |
-
|
| 164 |
-
Returns:
|
| 165 |
-
List[Dict]: Relevant ontology concepts and links.
|
| 166 |
-
"""
|
| 167 |
relevant_ontology = []
|
| 168 |
for concept in self.ontology["@graph"]:
|
| 169 |
if "rdfs:label" in concept and concept["rdfs:label"].lower() in text.lower():
|
|
|
|
| 30 |
Returns:
|
| 31 |
Tuple[str, List[Dict], Dict]: Extracted text, text chunks, and metadata.
|
| 32 |
"""
|
|
|
|
| 33 |
text, chunks = self.process_document(file)
|
|
|
|
| 34 |
metadata = self._extract_metadata(text, file.name)
|
| 35 |
return text, chunks, metadata
|
| 36 |
|
|
|
|
| 99 |
for idx, i in enumerate(range(0, len(text), chunk_size))]
|
| 100 |
|
| 101 |
def _extract_metadata(self, text: str, file_name: str) -> Dict:
|
| 102 |
+
"""Extract metadata such as document type, jurisdiction, and key legal concepts."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
metadata = {
|
| 104 |
"title": file_name,
|
| 105 |
"type": self._infer_document_type(text),
|
|
|
|
| 144 |
return re.findall(date_pattern, text)
|
| 145 |
|
| 146 |
def _link_to_ontology(self, text: str) -> List[Dict]:
|
| 147 |
+
"""Link document content to legal ontology for context and relevance."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
relevant_ontology = []
|
| 149 |
for concept in self.ontology["@graph"]:
|
| 150 |
if "rdfs:label" in concept and concept["rdfs:label"].lower() in text.lower():
|