Legal_AI_Agent

Build error

cryogenic22 committed on Dec 10, 2024

Commit

b898c42

verified ·

1 Parent(s): 2664f83

Update utils/document_processor.py

Files changed (1) hide show

utils/document_processor.py CHANGED Viewed

@@ -20,8 +20,32 @@ class DocumentProcessor:
         """
         self.ontology = self._load_ontology(ontology_path)
-    def process_document(self, file) -> Tuple[str, List[Dict], Dict]:
-        """Process a document, extract text, chunks, and metadata."""
         file_type = file.name.split(".")[-1].lower()
         if file_type == "pdf":
             text = self._process_pdf(file)
@@ -33,8 +57,7 @@ class DocumentProcessor:
             raise ValueError(f"Unsupported file type: {file_type}")
         chunks = self._chunk_text(text)
-        metadata = self._extract_metadata(text, file.name)
-        return text, chunks, metadata
     def _process_pdf(self, file) -> str:
         """Extract text from a PDF, including OCR for scanned PDFs."""

         """
         self.ontology = self._load_ontology(ontology_path)
+    def process_and_tag_document(self, file) -> Tuple[str, List[Dict], Dict]:
+        """
+        Process a document, extract text, chunks, and metadata.
+        Args:
+            file: Uploaded document file.
+        Returns:
+            Tuple[str, List[Dict], Dict]: Extracted text, text chunks, and metadata.
+        """
+        # Process the document to extract text
+        text, chunks = self.process_document(file)
+        # Extract metadata using ontology and document content
+        metadata = self._extract_metadata(text, file.name)
+        return text, chunks, metadata
+    def process_document(self, file) -> Tuple[str, List[Dict]]:
+        """
+        Process a document and return its text and chunks.
+        Args:
+            file: Uploaded document file.
+        Returns:
+            Tuple[str, List[Dict]]: Extracted text and text chunks.
+        """
         file_type = file.name.split(".")[-1].lower()
         if file_type == "pdf":
             text = self._process_pdf(file)
             raise ValueError(f"Unsupported file type: {file_type}")
         chunks = self._chunk_text(text)
+        return text, chunks
     def _process_pdf(self, file) -> str:
         """Extract text from a PDF, including OCR for scanned PDFs."""