Legal_AI_Agent

Build error

cryogenic22 committed on Dec 10, 2024

Commit

69d7210

verified ·

1 Parent(s): 475500c

Update utils/document_processor.py

Files changed (1) hide show

utils/document_processor.py CHANGED Viewed

@@ -17,11 +17,9 @@ from pathlib import Path
 import streamlit as st
 import shutil
 class DocumentProcessor:
     def __init__(self, base_path: str = None):
         """Initialize Document Processor with proper data directory handling."""
-        # Set up base paths
         self.base_path = self._setup_data_directories(base_path)
         self.ontology_path = os.path.join(self.base_path, "ontology", "legal_ontology.json")
@@ -133,6 +131,13 @@ class DocumentProcessor:
             st.error(f"Error in document processing pipeline: {str(e)}")
             raise
     def process_document(self, file_path: str) -> Tuple[str, List[Dict]]:
         """Process a document based on its type."""
         file_type = Path(file_path).suffix.lower()
@@ -147,6 +152,9 @@ class DocumentProcessor:
         chunks = self._create_chunks(text)
         return text, chunks
     def _process_pdf(self, file_path: str) -> str:
         """Extract text from PDF, using OCR if necessary."""
         try:

 import streamlit as st
 import shutil
 class DocumentProcessor:
     def __init__(self, base_path: str = None):
         """Initialize Document Processor with proper data directory handling."""
         self.base_path = self._setup_data_directories(base_path)
         self.ontology_path = os.path.join(self.base_path, "ontology", "legal_ontology.json")
             st.error(f"Error in document processing pipeline: {str(e)}")
             raise
+    def _tokenize_text(self, text: str) -> List[str]:
+        """Tokenize text into sentences using NLTK."""
+        try:
+            return sent_tokenize(text)
+        except Exception:
+            return [sentence.strip() for sentence in text.split('.') if sentence.strip()]
     def process_document(self, file_path: str) -> Tuple[str, List[Dict]]:
         """Process a document based on its type."""
         file_type = Path(file_path).suffix.lower()
         chunks = self._create_chunks(text)
         return text, chunks
+    # ... (other methods remain unchanged)
     def _process_pdf(self, file_path: str) -> str:
         """Extract text from PDF, using OCR if necessary."""
         try: