cryogenic22 committed on
Commit
69d7210
·
verified ·
1 Parent(s): 475500c

Update utils/document_processor.py

Browse files
Files changed (1) hide show
  1. utils/document_processor.py +10 -2
utils/document_processor.py CHANGED
@@ -17,11 +17,9 @@ from pathlib import Path
17
  import streamlit as st
18
  import shutil
19
 
20
-
21
  class DocumentProcessor:
22
  def __init__(self, base_path: str = None):
23
  """Initialize Document Processor with proper data directory handling."""
24
- # Set up base paths
25
  self.base_path = self._setup_data_directories(base_path)
26
  self.ontology_path = os.path.join(self.base_path, "ontology", "legal_ontology.json")
27
 
@@ -133,6 +131,13 @@ class DocumentProcessor:
133
  st.error(f"Error in document processing pipeline: {str(e)}")
134
  raise
135
 
 
 
 
 
 
 
 
136
  def process_document(self, file_path: str) -> Tuple[str, List[Dict]]:
137
  """Process a document based on its type."""
138
  file_type = Path(file_path).suffix.lower()
@@ -147,6 +152,9 @@ class DocumentProcessor:
147
  chunks = self._create_chunks(text)
148
  return text, chunks
149
 
 
 
 
150
  def _process_pdf(self, file_path: str) -> str:
151
  """Extract text from PDF, using OCR if necessary."""
152
  try:
 
17
  import streamlit as st
18
  import shutil
19
 
 
20
  class DocumentProcessor:
21
  def __init__(self, base_path: str = None):
22
  """Initialize Document Processor with proper data directory handling."""
 
23
  self.base_path = self._setup_data_directories(base_path)
24
  self.ontology_path = os.path.join(self.base_path, "ontology", "legal_ontology.json")
25
 
 
131
  st.error(f"Error in document processing pipeline: {str(e)}")
132
  raise
133
 
134
+ def _tokenize_text(self, text: str) -> List[str]:
135
+ """Tokenize text into sentences using NLTK."""
136
+ try:
137
+ return sent_tokenize(text)
138
+ except Exception:
139
+ return [sentence.strip() for sentence in text.split('.') if sentence.strip()]
140
+
141
  def process_document(self, file_path: str) -> Tuple[str, List[Dict]]:
142
  """Process a document based on its type."""
143
  file_type = Path(file_path).suffix.lower()
 
152
  chunks = self._create_chunks(text)
153
  return text, chunks
154
 
155
+ # ... (other methods remain unchanged)
156
+
157
+
158
  def _process_pdf(self, file_path: str) -> str:
159
  """Extract text from PDF, using OCR if necessary."""
160
  try: