Spaces:
Build error
Build error
Update utils/document_processor.py
Browse files- utils/document_processor.py +10 -2
utils/document_processor.py
CHANGED
|
@@ -17,11 +17,9 @@ from pathlib import Path
|
|
| 17 |
import streamlit as st
|
| 18 |
import shutil
|
| 19 |
|
| 20 |
-
|
| 21 |
class DocumentProcessor:
|
| 22 |
def __init__(self, base_path: str = None):
|
| 23 |
"""Initialize Document Processor with proper data directory handling."""
|
| 24 |
-
# Set up base paths
|
| 25 |
self.base_path = self._setup_data_directories(base_path)
|
| 26 |
self.ontology_path = os.path.join(self.base_path, "ontology", "legal_ontology.json")
|
| 27 |
|
|
@@ -133,6 +131,13 @@ class DocumentProcessor:
|
|
| 133 |
st.error(f"Error in document processing pipeline: {str(e)}")
|
| 134 |
raise
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
def process_document(self, file_path: str) -> Tuple[str, List[Dict]]:
|
| 137 |
"""Process a document based on its type."""
|
| 138 |
file_type = Path(file_path).suffix.lower()
|
|
@@ -147,6 +152,9 @@ class DocumentProcessor:
|
|
| 147 |
chunks = self._create_chunks(text)
|
| 148 |
return text, chunks
|
| 149 |
|
|
|
|
|
|
|
|
|
|
| 150 |
def _process_pdf(self, file_path: str) -> str:
|
| 151 |
"""Extract text from PDF, using OCR if necessary."""
|
| 152 |
try:
|
|
|
|
| 17 |
import streamlit as st
|
| 18 |
import shutil
|
| 19 |
|
|
|
|
| 20 |
class DocumentProcessor:
|
| 21 |
def __init__(self, base_path: str = None):
|
| 22 |
"""Initialize Document Processor with proper data directory handling."""
|
|
|
|
| 23 |
self.base_path = self._setup_data_directories(base_path)
|
| 24 |
self.ontology_path = os.path.join(self.base_path, "ontology", "legal_ontology.json")
|
| 25 |
|
|
|
|
| 131 |
st.error(f"Error in document processing pipeline: {str(e)}")
|
| 132 |
raise
|
| 133 |
|
| 134 |
+
def _tokenize_text(self, text: str) -> List[str]:
|
| 135 |
+
"""Tokenize text into sentences using NLTK."""
|
| 136 |
+
try:
|
| 137 |
+
return sent_tokenize(text)
|
| 138 |
+
except Exception:
|
| 139 |
+
return [sentence.strip() for sentence in text.split('.') if sentence.strip()]
|
| 140 |
+
|
| 141 |
def process_document(self, file_path: str) -> Tuple[str, List[Dict]]:
|
| 142 |
"""Process a document based on its type."""
|
| 143 |
file_type = Path(file_path).suffix.lower()
|
|
|
|
| 152 |
chunks = self._create_chunks(text)
|
| 153 |
return text, chunks
|
| 154 |
|
| 155 |
+
# ... (other methods remain unchanged)
|
| 156 |
+
|
| 157 |
+
|
| 158 |
def _process_pdf(self, file_path: str) -> str:
|
| 159 |
"""Extract text from PDF, using OCR if necessary."""
|
| 160 |
try:
|