cryogenic22 committed on
Commit
dc3b7e9
·
verified ·
1 Parent(s): 55f1461

Update utils/document_processor.py

Browse files
Files changed (1) hide show
  1. utils/document_processor.py +359 -136
utils/document_processor.py CHANGED
@@ -13,24 +13,98 @@ import spacy
13
  import nltk
14
  from nltk.tokenize import sent_tokenize
15
  from nltk.corpus import stopwords
 
 
16
 
17
  class DocumentProcessor:
18
- def __init__(self, ontology_path: str = "data/legal_ontology.json"):
19
- """Initialize Document Processor with enhanced NLP capabilities."""
20
- self.ontology = self._load_ontology(ontology_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  # Initialize NLP components
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  try:
24
  self.nlp = spacy.load("en_core_web_sm")
25
  except OSError:
26
  spacy.cli.download("en_core_web_sm")
27
  self.nlp = spacy.load("en_core_web_sm")
28
 
29
- # Initialize NLTK components
30
  try:
31
  nltk.data.find('tokenizers/punkt')
32
  except LookupError:
33
  nltk.download('punkt')
 
 
 
34
  nltk.download('stopwords')
35
 
36
  self.stop_words = set(stopwords.words('english'))
@@ -38,168 +112,209 @@ class DocumentProcessor:
38
  def process_and_tag_document(self, file) -> Tuple[str, List[Dict], Dict]:
39
  """Process document with enhanced metadata extraction and chunking."""
40
  try:
 
 
 
 
 
 
 
 
 
 
 
 
41
  # Extract text and perform initial processing
42
- text, chunks = self.process_document(file)
43
 
44
  # Extract and enrich metadata
45
  metadata = self._extract_metadata(text, file.name)
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- # Enhance chunks with NLP analysis
48
- enhanced_chunks = self._enhance_chunks(chunks, metadata)
 
 
 
 
49
 
50
- return text, enhanced_chunks, metadata
51
  except Exception as e:
52
  print(f"Error processing document: {e}")
53
  raise
54
 
55
- def _enhance_chunks(self, chunks: List[Dict], metadata: Dict) -> List[Dict]:
56
- """Enhance chunks with NLP analysis and metadata."""
57
- enhanced_chunks = []
58
- for chunk in chunks:
59
- # Process chunk with spaCy
60
- doc = self.nlp(chunk['text'])
61
-
62
- # Extract key entities
63
- entities = [(ent.text, ent.label_) for ent in doc.ents]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- # Extract key phrases
66
- noun_phrases = [chunk.text for chunk in doc.noun_chunks]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- # Link to ontology concepts
69
- ontology_links = self._link_to_ontology(chunk['text'])
 
70
 
71
- # Add enhancements to chunk
72
- enhanced_chunks.append({
73
- **chunk,
74
- 'entities': entities,
75
- 'key_phrases': noun_phrases,
76
- 'ontology_links': ontology_links,
77
- 'metadata': metadata
78
- })
79
-
80
- return enhanced_chunks
81
 
82
- def _chunk_text(self, text: str, chunk_size: int = 500) -> List[Dict]:
83
- """Improved text chunking with sentence boundary preservation."""
84
  # Split into sentences
85
  sentences = sent_tokenize(text)
86
 
87
  chunks = []
88
  current_chunk = []
89
  current_length = 0
 
90
 
91
  for sentence in sentences:
92
  sentence_length = len(sentence)
93
 
94
  if current_length + sentence_length > chunk_size and current_chunk:
95
- # Store current chunk
96
  chunk_text = ' '.join(current_chunk)
97
- chunks.append({
98
- 'chunk_id': len(chunks),
99
- 'text': chunk_text,
100
- 'start_idx': text.index(current_chunk[0]),
101
- 'end_idx': text.index(current_chunk[-1]) + len(current_chunk[-1])
102
- })
103
  current_chunk = []
104
  current_length = 0
105
 
106
  current_chunk.append(sentence)
107
  current_length += sentence_length
108
 
109
- # Add final chunk
110
  if current_chunk:
111
  chunk_text = ' '.join(current_chunk)
112
- chunks.append({
113
- 'chunk_id': len(chunks),
114
- 'text': chunk_text,
115
- 'start_idx': text.index(current_chunk[0]),
116
- 'end_idx': text.index(current_chunk[-1]) + len(current_chunk[-1])
117
- })
118
 
119
  return chunks
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  def _extract_metadata(self, text: str, file_name: str) -> Dict:
122
- """Enhanced metadata extraction with NLP support."""
123
- # Process text with spaCy
124
  doc = self.nlp(text[:10000]) # Process first 10k chars for efficiency
125
 
126
  metadata = {
127
- 'title': file_name,
128
- 'type': self._infer_document_type(text, doc),
129
- 'jurisdiction': self._infer_jurisdiction(text, doc),
130
  'processed_at': datetime.now().isoformat(),
131
- 'key_entities': self._extract_key_entities(doc),
132
- 'dates': self._extract_dates(text),
 
 
 
133
  'citations': self._extract_citations(text),
134
- 'ontology_links': self._link_to_ontology(text),
135
- 'language_stats': self._get_language_stats(doc)
 
136
  }
137
 
138
  return metadata
139
 
140
- def _infer_document_type(self, text: str, doc: spacy.tokens.Doc) -> str:
141
- """Improved document type inference using NLP and patterns."""
142
- # Define document type patterns with weights
 
 
 
 
 
 
 
 
 
143
  type_patterns = {
144
- 'judgment': {
145
- 'keywords': ['court', 'judge', 'judgment', 'verdict', 'ruling'],
146
- 'weight': 1.5
147
- },
148
- 'contract': {
149
- 'keywords': ['agreement', 'contract', 'party', 'clause', 'terms'],
150
- 'weight': 1.2
151
- },
152
- 'legislation': {
153
- 'keywords': ['act', 'statute', 'regulation', 'law', 'provision'],
154
- 'weight': 1.3
155
- },
156
- 'memo': {
157
- 'keywords': ['memorandum', 'memo', 'note', 'circular'],
158
- 'weight': 1.0
159
- }
160
  }
161
 
162
- # Calculate scores for each type
163
- scores = {}
164
  text_lower = text.lower()
 
 
165
 
166
- for doc_type, pattern in type_patterns.items():
167
- score = 0
168
- for keyword in pattern['keywords']:
169
- count = text_lower.count(keyword)
170
- score += count * pattern['weight']
171
- scores[doc_type] = score
172
 
173
- # Get type with highest score
174
- if scores:
175
- max_score = max(scores.values())
176
- if max_score > 0:
177
- return max(scores.items(), key=lambda x: x[1])[0]
178
-
179
- return 'unknown'
180
-
181
- def _extract_key_entities(self, doc: spacy.tokens.Doc) -> Dict[str, List[str]]:
182
- """Extract and categorize key entities from text."""
183
- entities = {
184
- 'PERSON': set(),
185
- 'ORG': set(),
186
- 'GPE': set(),
187
- 'LAW': set(),
188
- 'DATE': set()
189
- }
190
-
191
- for ent in doc.ents:
192
- if ent.label_ in entities:
193
- entities[ent.label_].add(ent.text)
194
-
195
- return {k: list(v) for k, v in entities.items()}
196
 
197
  def _extract_citations(self, text: str) -> List[Dict]:
198
- """Extract legal citations using regex patterns."""
199
  citation_patterns = [
200
  r'\[\d{4}\]\s+\w+\s+\d+', # [2021] EWHC 123
201
  r'\d+\s+U\.S\.\s+\d+', # 123 U.S. 456
202
- r'\(\d{4}\)\s+\d+\s+\w+\s+\d+', # (2021) 12 ABC 345
203
  ]
204
 
205
  citations = []
@@ -214,8 +329,23 @@ class DocumentProcessor:
214
 
215
  return citations
216
 
217
- def _get_language_stats(self, doc: spacy.tokens.Doc) -> Dict:
218
- """Get language statistics for the document."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  return {
220
  'sentence_count': len(list(doc.sents)),
221
  'word_count': len([token for token in doc if not token.is_space]),
@@ -223,16 +353,27 @@ class DocumentProcessor:
223
  for sent in doc.sents) / len(list(doc.sents)) if doc.sents else 0
224
  }
225
 
 
 
 
 
 
 
 
 
 
 
 
226
  def _link_to_ontology(self, text: str) -> List[Dict]:
227
- """Enhanced ontology linking with context."""
228
  relevant_concepts = []
229
  text_lower = text.lower()
230
 
231
- for concept in self.ontology['@graph']:
232
- if 'rdfs:label' not in concept:
233
  continue
234
 
235
- label = concept['rdfs:label'].lower()
236
  if label in text_lower:
237
  # Get surrounding context
238
  start_idx = text_lower.index(label)
@@ -244,29 +385,111 @@ class DocumentProcessor:
244
  'type': concept.get('@type', 'Unknown'),
245
  'description': concept.get('rdfs:comment', ''),
246
  'context': text[context_start:context_end].strip(),
247
- 'confidence': self._calculate_concept_confidence(text, concept)
248
  })
249
 
250
  return relevant_concepts
251
 
252
- def _calculate_concept_confidence(self, text: str, concept: Dict) -> float:
253
- """Calculate confidence score for ontology concept match."""
254
- confidence = 0.0
255
-
256
- # Check for exact label match
257
- if concept.get('rdfs:label', '').lower() in text.lower():
258
- confidence += 0.6
259
-
260
- # Check for related terms
261
- if 'related_terms' in concept:
262
- related_matches = sum(1 for term in concept['related_terms']
263
- if term.lower() in text.lower())
264
- confidence += 0.2 * (related_matches / len(concept['related_terms']))
265
-
266
- # Check for context terms
267
- if 'context_terms' in concept:
268
- context_matches = sum(1 for term in concept['context_terms']
269
- if term.lower() in text.lower())
270
- confidence += 0.2 * (context_matches / len(concept['context_terms']))
271
-
272
- return min(1.0, confidence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  import nltk
14
  from nltk.tokenize import sent_tokenize
15
  from nltk.corpus import stopwords
16
+ from pathlib import Path
17
+ import shutil
18
 
19
  class DocumentProcessor:
20
+ def __init__(self, base_path: str = None):
21
+ """Initialize Document Processor with proper data directory handling.
22
+
23
+ Args:
24
+ base_path: Optional base path override. If None, will use appropriate
25
+ path based on environment (local vs HF Spaces)
26
+ """
27
+ # Set up base paths
28
+ self.base_path = self._setup_data_directories(base_path)
29
+ self.ontology_path = os.path.join(self.base_path, "legal_ontology.json")
30
+
31
+ # Ensure ontology exists
32
+ self._ensure_ontology_exists()
33
+
34
+ # Load ontology
35
+ self.ontology = self._load_ontology()
36
 
37
  # Initialize NLP components
38
+ self._setup_nlp()
39
+
40
+ # Create processing directories
41
+ self.processed_path = os.path.join(self.base_path, "processed")
42
+ self.temp_path = os.path.join(self.base_path, "temp")
43
+ os.makedirs(self.processed_path, exist_ok=True)
44
+ os.makedirs(self.temp_path, exist_ok=True)
45
+
46
+ def _setup_data_directories(self, base_path: Optional[str] = None) -> str:
47
+ """Set up data directories with HF Spaces compatibility."""
48
+ if base_path:
49
+ data_path = base_path
50
+ else:
51
+ # Check if running in Hugging Face Spaces
52
+ if os.environ.get('SPACE_ID'):
53
+ # Use the persistent storage in HF Spaces
54
+ data_path = "/data"
55
+ else:
56
+ # Local development path
57
+ data_path = os.path.join(os.getcwd(), "data")
58
+
59
+ # Create necessary subdirectories
60
+ subdirs = ["ontology", "processed", "temp", "indexes"]
61
+ for subdir in subdirs:
62
+ os.makedirs(os.path.join(data_path, subdir), exist_ok=True)
63
+
64
+ return data_path
65
+
66
+ def _ensure_ontology_exists(self):
67
+ """Ensure the legal ontology file exists, create if not."""
68
+ if not os.path.exists(self.ontology_path):
69
+ default_ontology = {
70
+ "@graph": [
71
+ {
72
+ "@id": "concept:Contract",
73
+ "@type": "vocab:LegalConcept",
74
+ "rdfs:label": "Contract",
75
+ "rdfs:comment": "A legally binding agreement between parties",
76
+ "vocab:relatedConcepts": ["Offer", "Acceptance", "Consideration"]
77
+ },
78
+ {
79
+ "@id": "concept:Judgment",
80
+ "@type": "vocab:LegalConcept",
81
+ "rdfs:label": "Judgment",
82
+ "rdfs:comment": "A court's final determination of the rights and obligations",
83
+ "vocab:relatedConcepts": ["Court Order", "Decision", "Ruling"]
84
+ }
85
+ ]
86
+ }
87
+
88
+ with open(self.ontology_path, 'w') as f:
89
+ json.dump(default_ontology, f, indent=2)
90
+
91
+ def _setup_nlp(self):
92
+ """Initialize NLP components with error handling."""
93
+ # Setup spaCy
94
  try:
95
  self.nlp = spacy.load("en_core_web_sm")
96
  except OSError:
97
  spacy.cli.download("en_core_web_sm")
98
  self.nlp = spacy.load("en_core_web_sm")
99
 
100
+ # Setup NLTK
101
  try:
102
  nltk.data.find('tokenizers/punkt')
103
  except LookupError:
104
  nltk.download('punkt')
105
+ try:
106
+ nltk.data.find('corpora/stopwords')
107
+ except LookupError:
108
  nltk.download('stopwords')
109
 
110
  self.stop_words = set(stopwords.words('english'))
 
112
  def process_and_tag_document(self, file) -> Tuple[str, List[Dict], Dict]:
113
  """Process document with enhanced metadata extraction and chunking."""
114
  try:
115
+ # Generate unique document ID
116
+ doc_id = datetime.now().strftime('%Y%m%d_%H%M%S')
117
+
118
+ # Create document directory
119
+ doc_dir = os.path.join(self.processed_path, doc_id)
120
+ os.makedirs(doc_dir, exist_ok=True)
121
+
122
+ # Save original file
123
+ original_path = os.path.join(doc_dir, "original" + Path(file.name).suffix)
124
+ with open(original_path, 'wb') as f:
125
+ f.write(file.getvalue())
126
+
127
  # Extract text and perform initial processing
128
+ text, chunks = self.process_document(original_path)
129
 
130
  # Extract and enrich metadata
131
  metadata = self._extract_metadata(text, file.name)
132
+ metadata['doc_id'] = doc_id
133
+ metadata['original_path'] = original_path
134
+
135
+ # Save processed text
136
+ text_path = os.path.join(doc_dir, "processed.txt")
137
+ with open(text_path, 'w', encoding='utf-8') as f:
138
+ f.write(text)
139
+
140
+ # Save chunks
141
+ chunks_path = os.path.join(doc_dir, "chunks.json")
142
+ with open(chunks_path, 'w') as f:
143
+ json.dump(chunks, f, indent=2)
144
 
145
+ # Save metadata
146
+ metadata_path = os.path.join(doc_dir, "metadata.json")
147
+ with open(metadata_path, 'w') as f:
148
+ json.dump(metadata, f, indent=2)
149
+
150
+ return text, chunks, metadata
151
 
 
152
  except Exception as e:
153
  print(f"Error processing document: {e}")
154
  raise
155
 
156
+ def process_document(self, file_path: str) -> Tuple[str, List[Dict]]:
157
+ """Process a document and return its text and chunks."""
158
+ file_type = Path(file_path).suffix.lower()
159
+
160
+ if file_type == '.pdf':
161
+ text = self._process_pdf(file_path)
162
+ elif file_type == '.docx':
163
+ text = self._process_docx(file_path)
164
+ elif file_type in ['.txt', '.csv']:
165
+ text = self._process_text(file_path)
166
+ else:
167
+ raise ValueError(f"Unsupported file type: {file_type}")
168
+
169
+ # Create chunks with enhanced metadata
170
+ chunks = self._create_enhanced_chunks(text)
171
+ return text, chunks
172
+
173
+ def _process_pdf(self, file_path: str) -> str:
174
+ """Extract text from PDF with OCR fallback."""
175
+ reader = pypdf.PdfReader(file_path)
176
+ text = ""
177
+
178
+ for page_num, page in enumerate(reader.pages, 1):
179
+ page_text = page.extract_text()
180
 
181
+ if page_text.strip():
182
+ text += f"\n--- Page {page_num} ---\n{page_text}"
183
+ else:
184
+ # Perform OCR if text extraction fails
185
+ images = convert_from_bytes(open(file_path, 'rb').read())
186
+ page_text = pytesseract.image_to_string(images[page_num - 1])
187
+ text += f"\n--- Page {page_num} (OCR) ---\n{page_text}"
188
+
189
+ return text
190
+
191
+ def _process_docx(self, file_path: str) -> str:
192
+ """Process DOCX files with metadata."""
193
+ doc = docx.Document(file_path)
194
+ text = ""
195
+
196
+ # Process document sections
197
+ for para in doc.paragraphs:
198
+ if para.text.strip():
199
+ text += para.text + "\n"
200
+
201
+ return text
202
+
203
+ def _process_text(self, file_path: str) -> str:
204
+ """Process text files with encoding detection."""
205
+ try:
206
+ with open(file_path, 'rb') as f:
207
+ raw_data = f.read()
208
 
209
+ # Detect encoding
210
+ result = chardet.detect(raw_data)
211
+ encoding = result['encoding'] if result['confidence'] > 0.7 else 'utf-8'
212
 
213
+ # Decode text
214
+ return raw_data.decode(encoding)
215
+ except Exception as e:
216
+ print(f"Error processing text file: {e}")
217
+ return ""
 
 
 
 
 
218
 
219
+ def _create_enhanced_chunks(self, text: str) -> List[Dict]:
220
+ """Create enhanced chunks with NLP analysis."""
221
  # Split into sentences
222
  sentences = sent_tokenize(text)
223
 
224
  chunks = []
225
  current_chunk = []
226
  current_length = 0
227
+ chunk_size = 500 # Approximate target chunk size
228
 
229
  for sentence in sentences:
230
  sentence_length = len(sentence)
231
 
232
  if current_length + sentence_length > chunk_size and current_chunk:
233
+ # Process current chunk
234
  chunk_text = ' '.join(current_chunk)
235
+ chunks.append(self._process_chunk(chunk_text, len(chunks)))
 
 
 
 
 
236
  current_chunk = []
237
  current_length = 0
238
 
239
  current_chunk.append(sentence)
240
  current_length += sentence_length
241
 
242
+ # Process final chunk
243
  if current_chunk:
244
  chunk_text = ' '.join(current_chunk)
245
+ chunks.append(self._process_chunk(chunk_text, len(chunks)))
 
 
 
 
 
246
 
247
  return chunks
248
 
249
+ def _process_chunk(self, text: str, chunk_id: int) -> Dict:
250
+ """Process a single chunk with NLP analysis."""
251
+ doc = self.nlp(text)
252
+
253
+ return {
254
+ 'chunk_id': chunk_id,
255
+ 'text': text,
256
+ 'entities': [(ent.text, ent.label_) for ent in doc.ents],
257
+ 'noun_phrases': [chunk.text for chunk in doc.noun_chunks],
258
+ 'word_count': len([token for token in doc if not token.is_space]),
259
+ 'sentence_count': len(list(doc.sents)),
260
+ 'ontology_links': self._link_to_ontology(text)
261
+ }
262
+
263
  def _extract_metadata(self, text: str, file_name: str) -> Dict:
264
+ """Extract enhanced metadata from document."""
 
265
  doc = self.nlp(text[:10000]) # Process first 10k chars for efficiency
266
 
267
  metadata = {
268
+ 'filename': file_name,
269
+ 'file_type': Path(file_name).suffix.lower(),
 
270
  'processed_at': datetime.now().isoformat(),
271
+ 'word_count': len([token for token in doc if not token.is_space]),
272
+ 'sentence_count': len(list(doc.sents)),
273
+ 'entities': self._extract_entities(doc),
274
+ 'document_type': self._infer_document_type(text),
275
+ 'language_stats': self._get_language_stats(doc),
276
  'citations': self._extract_citations(text),
277
+ 'dates': self._extract_dates(text),
278
+ 'key_phrases': [chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) > 1][:10],
279
+ 'ontology_concepts': self._link_to_ontology(text)
280
  }
281
 
282
  return metadata
283
 
284
+ def _extract_entities(self, doc) -> Dict[str, List[str]]:
285
+ """Extract named entities from text."""
286
+ entities = {}
287
+ for ent in doc.ents:
288
+ if ent.label_ not in entities:
289
+ entities[ent.label_] = []
290
+ if ent.text not in entities[ent.label_]:
291
+ entities[ent.label_].append(ent.text)
292
+ return entities
293
+
294
+ def _infer_document_type(self, text: str) -> str:
295
+ """Infer document type using rule-based classification."""
296
  type_patterns = {
297
+ 'contract': ['agreement', 'parties', 'obligations', 'terms and conditions'],
298
+ 'judgment': ['court', 'judge', 'ruling', 'ordered', 'judgment'],
299
+ 'legislation': ['act', 'statute', 'regulation', 'amended', 'parliament'],
300
+ 'memo': ['memorandum', 'memo', 'note', 'meeting minutes']
 
 
 
 
 
 
 
 
 
 
 
 
301
  }
302
 
 
 
303
  text_lower = text.lower()
304
+ scores = {doc_type: sum(1 for pattern in patterns if pattern in text_lower)
305
+ for doc_type, patterns in type_patterns.items()}
306
 
307
+ if not scores or max(scores.values()) == 0:
308
+ return 'unknown'
 
 
 
 
309
 
310
+ return max(scores.items(), key=lambda x: x[1])[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
  def _extract_citations(self, text: str) -> List[Dict]:
313
+ """Extract legal citations."""
314
  citation_patterns = [
315
  r'\[\d{4}\]\s+\w+\s+\d+', # [2021] EWHC 123
316
  r'\d+\s+U\.S\.\s+\d+', # 123 U.S. 456
317
+ r'\(\d{4}\)\s+\d+\s+\w+\s+\d+' # (2021) 12 ABC 345
318
  ]
319
 
320
  citations = []
 
329
 
330
  return citations
331
 
332
+ def _extract_dates(self, text: str) -> List[str]:
333
+ """Extract dates from text."""
334
+ date_patterns = [
335
+ r'\d{1,2}/\d{1,2}/\d{2,4}',
336
+ r'\d{1,2}-\d{1,2}-\d{2,4}',
337
+ r'\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}'
338
+ ]
339
+
340
+ dates = []
341
+ for pattern in date_patterns:
342
+ matches = re.finditer(pattern, text, re.IGNORECASE)
343
+ dates.extend(match.group() for match in matches)
344
+
345
+ return dates
346
+
347
+ def _get_language_stats(self, doc) -> Dict:
348
+ """Get language statistics from document."""
349
  return {
350
  'sentence_count': len(list(doc.sents)),
351
  'word_count': len([token for token in doc if not token.is_space]),
 
353
  for sent in doc.sents) / len(list(doc.sents)) if doc.sents else 0
354
  }
355
 
356
+ def _load_ontology(self) -> Dict:
357
+ """Load legal ontology from file."""
358
+ try:
359
+ if os.path.exists(self.ontology_path):
360
+ with open(self.ontology_path, 'r') as f:
361
+ return json.load(f)
362
+ return {"@graph": []}
363
+ except Exception as e:
364
+ print(f"Error loading ontology: {e}")
365
+ return {"@graph": []}
366
+
367
  def _link_to_ontology(self, text: str) -> List[Dict]:
368
+ """Link text to ontology concepts."""
369
  relevant_concepts = []
370
  text_lower = text.lower()
371
 
372
+ for concept in self.ontology.get("@graph", []):
373
+ if "rdfs:label" not in concept:
374
  continue
375
 
376
+ label = concept["rdfs:label"].lower()
377
  if label in text_lower:
378
  # Get surrounding context
379
  start_idx = text_lower.index(label)
 
385
  'type': concept.get('@type', 'Unknown'),
386
  'description': concept.get('rdfs:comment', ''),
387
  'context': text[context_start:context_end].strip(),
388
+ 'location': {'start': start_idx, 'end': start_idx + len(label)}
389
  })
390
 
391
  return relevant_concepts
392
 
393
+ def cleanup(self):
394
+ """Clean up temporary files."""
395
+ try:
396
+ shutil.rmtree(self.temp_path)
397
+ os.makedirs(self.temp_path, exist_ok=True)
398
+ except Exception as e:
399
+ print(f"Error cleaning up temporary files: {e}")
400
+
401
+ def get_document_path(self, doc_id: str) -> Optional[str]:
402
+ """Get the path to a processed document."""
403
+ doc_dir = os.path.join(self.processed_path, doc_id)
404
+ if not os.path.exists(doc_dir):
405
+ return None
406
+ return doc_dir
407
+
408
+ def get_document_metadata(self, doc_id: str) -> Optional[Dict]:
409
+ """Get metadata for a processed document."""
410
+ doc_dir = self.get_document_path(doc_id)
411
+ if not doc_dir:
412
+ return None
413
+
414
+ metadata_path = os.path.join(doc_dir, "metadata.json")
415
+ try:
416
+ with open(metadata_path, 'r') as f:
417
+ return json.load(f)
418
+ except Exception as e:
419
+ print(f"Error loading metadata for document {doc_id}: {e}")
420
+ return None
421
+
422
+ def get_document_chunks(self, doc_id: str) -> Optional[List[Dict]]:
423
+ """Get chunks for a processed document."""
424
+ doc_dir = self.get_document_path(doc_id)
425
+ if not doc_dir:
426
+ return None
427
+
428
+ chunks_path = os.path.join(doc_dir, "chunks.json")
429
+ try:
430
+ with open(chunks_path, 'r') as f:
431
+ return json.load(f)
432
+ except Exception as e:
433
+ print(f"Error loading chunks for document {doc_id}: {e}")
434
+ return None
435
+
436
+ def reprocess_document(self, doc_id: str) -> Optional[Tuple[str, List[Dict], Dict]]:
437
+ """Reprocess an existing document."""
438
+ doc_dir = self.get_document_path(doc_id)
439
+ if not doc_dir:
440
+ return None
441
+
442
+ original_path = os.path.join(doc_dir, "original" + Path(doc_dir).suffix)
443
+ if not os.path.exists(original_path):
444
+ return None
445
+
446
+ try:
447
+ # Process the original file again
448
+ with open(original_path, 'rb') as f:
449
+ text, chunks = self.process_document(original_path)
450
+
451
+ # Update metadata
452
+ metadata = self._extract_metadata(text, os.path.basename(original_path))
453
+ metadata['doc_id'] = doc_id
454
+ metadata['original_path'] = original_path
455
+ metadata['reprocessed_at'] = datetime.now().isoformat()
456
+
457
+ # Save updated files
458
+ text_path = os.path.join(doc_dir, "processed.txt")
459
+ with open(text_path, 'w', encoding='utf-8') as f:
460
+ f.write(text)
461
+
462
+ chunks_path = os.path.join(doc_dir, "chunks.json")
463
+ with open(chunks_path, 'w') as f:
464
+ json.dump(chunks, f, indent=2)
465
+
466
+ metadata_path = os.path.join(doc_dir, "metadata.json")
467
+ with open(metadata_path, 'w') as f:
468
+ json.dump(metadata, f, indent=2)
469
+
470
+ return text, chunks, metadata
471
+
472
+ except Exception as e:
473
+ print(f"Error reprocessing document {doc_id}: {e}")
474
+ return None
475
+
476
+ def delete_document(self, doc_id: str) -> bool:
477
+ """Delete a processed document and its files."""
478
+ doc_dir = self.get_document_path(doc_id)
479
+ if not doc_dir:
480
+ return False
481
+
482
+ try:
483
+ shutil.rmtree(doc_dir)
484
+ return True
485
+ except Exception as e:
486
+ print(f"Error deleting document {doc_id}: {e}")
487
+ return False
488
+
489
+ def __enter__(self):
490
+ """Context manager entry."""
491
+ return self
492
+
493
+ def __exit__(self, exc_type, exc_val, exc_tb):
494
+ """Context manager exit with cleanup."""
495
+ self.cleanup()