cryogenic22 committed on
Commit
13fe8cc
·
verified ·
1 Parent(s): c5654cc

Update utils/document_processor.py

Browse files
Files changed (1) hide show
  1. utils/document_processor.py +79 -349
utils/document_processor.py CHANGED
@@ -17,22 +17,23 @@ from pathlib import Path
17
  import streamlit as st
18
  import shutil
19
 
 
20
  class DocumentProcessor:
21
  def __init__(self, base_path: str = None):
22
  """Initialize Document Processor with proper data directory handling."""
23
  # Set up base paths
24
  self.base_path = self._setup_data_directories(base_path)
25
- self.ontology_path = os.path.join(self.base_path, "legal_ontology.json")
26
-
27
  # Initialize NLP components
28
  self._initialize_nlp()
29
-
30
  # Ensure ontology exists
31
  self._ensure_ontology_exists()
32
-
33
  # Load ontology
34
  self.ontology = self._load_ontology()
35
-
36
  # Create processing directories
37
  self.processed_path = os.path.join(self.base_path, "processed")
38
  self.temp_path = os.path.join(self.base_path, "temp")
@@ -41,62 +42,36 @@ class DocumentProcessor:
41
 
42
  def _setup_data_directories(self, base_path: Optional[str] = None) -> str:
43
  """Set up data directories with error handling."""
44
- if base_path:
45
- data_path = base_path
46
- else:
47
- # Check if running in Hugging Face Spaces
48
- if os.environ.get('SPACE_ID'):
49
- data_path = "/data"
50
- else:
51
- data_path = os.path.join(os.getcwd(), "data")
52
-
53
- # Create necessary subdirectories
54
  subdirs = ["ontology", "processed", "temp", "indexes"]
55
  for subdir in subdirs:
56
  os.makedirs(os.path.join(data_path, subdir), exist_ok=True)
57
-
58
  return data_path
59
 
60
  def _initialize_nlp(self):
61
- """Initialize NLP components with comprehensive error handling."""
62
  try:
63
- # Initialize spaCy
64
  try:
65
  self.nlp = spacy.load("en_core_web_sm")
66
  except OSError:
67
  st.info("Downloading spaCy model...")
68
  os.system("python -m spacy download en_core_web_sm")
69
  self.nlp = spacy.load("en_core_web_sm")
70
-
71
- # Initialize NLTK components
72
  nltk_data_dir = os.path.join(self.base_path, "nltk_data")
73
  os.makedirs(nltk_data_dir, exist_ok=True)
74
-
75
- # Add custom NLTK data path
76
  nltk.data.path.append(nltk_data_dir)
77
-
78
- # Ensure all required NLTK resources are available
79
- required_resources = [
80
- 'punkt',
81
- 'averaged_perceptron_tagger',
82
- 'maxent_ne_chunker',
83
- 'words',
84
- 'stopwords'
85
- ]
86
-
87
  for resource in required_resources:
88
  try:
89
  nltk.download(resource, download_dir=nltk_data_dir, quiet=True)
90
  except Exception as e:
91
  st.warning(f"Could not download {resource}: {str(e)}")
92
-
93
- # Initialize stopwords
94
- try:
95
- self.stop_words = set(nltk.corpus.stopwords.words('english'))
96
- except Exception as e:
97
- st.warning(f"Could not load stopwords, using empty set: {str(e)}")
98
- self.stop_words = set()
99
-
100
  except Exception as e:
101
  st.error(f"Error initializing NLP components: {str(e)}")
102
  raise
@@ -122,75 +97,38 @@ class DocumentProcessor:
122
  }
123
  ]
124
  }
125
-
126
  with open(self.ontology_path, 'w') as f:
127
  json.dump(default_ontology, f, indent=2)
128
 
129
  def _load_ontology(self) -> Dict:
130
  """Load legal ontology with error handling."""
131
  try:
132
- if os.path.exists(self.ontology_path):
133
- with open(self.ontology_path, 'r') as f:
134
- return json.load(f)
135
- return {"@graph": []}
136
  except Exception as e:
137
  st.error(f"Error loading ontology: {str(e)}")
138
  return {"@graph": []}
139
 
140
  def process_and_tag_document(self, file) -> Tuple[str, List[Dict], Dict]:
141
- """Process document with enhanced metadata extraction and chunking."""
142
  try:
143
- # Generate unique document ID
144
  doc_id = datetime.now().strftime('%Y%m%d_%H%M%S')
145
-
146
- # Create document directory
147
  doc_dir = os.path.join(self.processed_path, doc_id)
148
  os.makedirs(doc_dir, exist_ok=True)
149
-
150
- # Save original file
151
  original_path = os.path.join(doc_dir, "original" + Path(file.name).suffix)
152
  with open(original_path, 'wb') as f:
153
  f.write(file.getvalue())
154
-
155
- # Extract text and perform initial processing
156
- text = ""
157
- try:
158
- text, chunks = self.process_document(original_path)
159
- except Exception as e:
160
- st.error(f"Error processing document content: {str(e)}")
161
- raise
162
-
163
- # Extract and enrich metadata
164
- try:
165
- metadata = self._extract_metadata(text, file.name)
166
- metadata['doc_id'] = doc_id
167
- metadata['original_path'] = original_path
168
- except Exception as e:
169
- st.error(f"Error extracting metadata: {str(e)}")
170
- raise
171
-
172
- # Save processed content
173
- try:
174
- # Save processed text
175
- text_path = os.path.join(doc_dir, "processed.txt")
176
- with open(text_path, 'w', encoding='utf-8') as f:
177
- f.write(text)
178
-
179
- # Save chunks
180
- chunks_path = os.path.join(doc_dir, "chunks.json")
181
- with open(chunks_path, 'w') as f:
182
- json.dump(chunks, f, indent=2)
183
-
184
- # Save metadata
185
- metadata_path = os.path.join(doc_dir, "metadata.json")
186
- with open(metadata_path, 'w') as f:
187
- json.dump(metadata, f, indent=2)
188
- except Exception as e:
189
- st.error(f"Error saving processed content: {str(e)}")
190
- raise
191
-
192
  return text, chunks, metadata
193
-
194
  except Exception as e:
195
  st.error(f"Error in document processing pipeline: {str(e)}")
196
  raise
@@ -198,7 +136,6 @@ class DocumentProcessor:
198
  def process_document(self, file_path: str) -> Tuple[str, List[Dict]]:
199
  """Process a document based on its type."""
200
  file_type = Path(file_path).suffix.lower()
201
-
202
  if file_type == '.pdf':
203
  text = self._process_pdf(file_path)
204
  elif file_type == '.docx':
@@ -207,307 +144,100 @@ class DocumentProcessor:
207
  text = self._process_text(file_path)
208
  else:
209
  raise ValueError(f"Unsupported file type: {file_type}")
210
-
211
- # Create chunks with enhanced metadata
212
  chunks = self._create_chunks(text)
213
  return text, chunks
214
 
215
  def _process_pdf(self, file_path: str) -> str:
216
- """Extract text from PDF with OCR fallback."""
217
  try:
218
  reader = pypdf.PdfReader(file_path)
219
  text = ""
220
-
221
  for page_num, page in enumerate(reader.pages, 1):
222
  page_text = page.extract_text()
223
-
224
- if page_text.strip():
225
- text += f"\n--- Page {page_num} ---\n{page_text}"
226
- else:
227
- # Perform OCR if text extraction fails
228
- st.info(f"Performing OCR for page {page_num}...")
229
- with open(file_path, 'rb') as pdf_file:
230
- images = convert_from_bytes(pdf_file.read())
231
- page_text = pytesseract.image_to_string(images[page_num - 1])
232
- text += f"\n--- Page {page_num} (OCR) ---\n{page_text}"
233
-
234
  return text
235
-
236
  except Exception as e:
237
  st.error(f"Error processing PDF: {str(e)}")
238
  raise
239
 
240
  def _process_docx(self, file_path: str) -> str:
241
- """Process DOCX files with metadata."""
242
  try:
243
  doc = docx.Document(file_path)
244
- text = ""
245
-
246
- for para in doc.paragraphs:
247
- if para.text.strip():
248
- text += para.text + "\n"
249
-
250
- return text
251
-
252
  except Exception as e:
253
  st.error(f"Error processing DOCX: {str(e)}")
254
  raise
255
 
256
  def _process_text(self, file_path: str) -> str:
257
- """Process text files with encoding detection."""
258
  try:
259
  with open(file_path, 'rb') as f:
260
  raw_data = f.read()
261
-
262
- # Detect encoding
263
- result = chardet.detect(raw_data)
264
- encoding = result['encoding'] if result['confidence'] > 0.7 else 'utf-8'
265
-
266
- # Decode text
267
  return raw_data.decode(encoding)
268
-
269
  except Exception as e:
270
  st.error(f"Error processing text file: {str(e)}")
271
  raise
272
 
273
  def _create_chunks(self, text: str) -> List[Dict]:
274
- """Create enhanced chunks with NLP analysis."""
275
- try:
276
- # Split into sentences
277
- sentences = self._tokenize_text(text)
278
-
279
- chunks = []
280
- current_chunk = []
281
- current_length = 0
282
- chunk_size = 500 # Target chunk size
283
-
284
- for sentence in sentences:
285
- sentence_length = len(sentence)
286
-
287
- if current_length + sentence_length > chunk_size and current_chunk:
288
- # Process current chunk
289
- chunk_text = ' '.join(current_chunk)
290
- chunks.append(self._process_chunk(chunk_text, len(chunks)))
291
- current_chunk = []
292
- current_length = 0
293
-
294
- current_chunk.append(sentence)
295
- current_length += sentence_length
296
-
297
- # Process final chunk
298
- if current_chunk:
299
- chunk_text = ' '.join(current_chunk)
300
- chunks.append(self._process_chunk(chunk_text, len(chunks)))
301
-
302
- return chunks
303
-
304
- except Exception as e:
305
- st.error(f"Error creating chunks: {str(e)}")
306
- raise
307
-
308
- def _tokenize_text(self, text: str) -> List[str]:
309
- """Tokenize text with fallback options."""
310
- try:
311
- return sent_tokenize(text)
312
- except Exception:
313
- # Fallback to basic splitting
314
- return [s.strip() for s in text.split('.') if s.strip()]
315
 
316
  def _process_chunk(self, text: str, chunk_id: int) -> Dict:
317
- """Process a single chunk with NLP analysis."""
318
- try:
319
- doc = self.nlp(text)
320
-
321
- return {
322
- 'chunk_id': chunk_id,
323
- 'text': text,
324
- 'entities': [(ent.text, ent.label_) for ent in doc.ents],
325
- 'noun_phrases': [chunk.text for chunk in doc.noun_chunks],
326
- 'word_count': len([token for token in doc if not token.is_space]),
327
- 'sentence_count': len(list(doc.sents)),
328
- 'ontology_links': self._link_to_ontology(text)
329
- }
330
-
331
- except Exception as e:
332
- st.error(f"Error processing chunk: {str(e)}")
333
- raise
334
-
335
- def _extract_metadata(self, text: str, file_name: str) -> Dict:
336
- """Extract enhanced metadata from document."""
337
- try:
338
- doc = self.nlp(text[:10000]) # Process first 10k chars for efficiency
339
-
340
- metadata = {
341
- 'filename': file_name,
342
- 'file_type': Path(file_name).suffix.lower(),
343
- 'processed_at': datetime.now().isoformat(),
344
- 'word_count': len([token for token in doc if not token.is_space]),
345
- 'sentence_count': len(list(doc.sents)),
346
- 'entities': self._extract_entities(doc),
347
- 'document_type': self._infer_document_type(text),
348
- 'language_stats': self._get_language_stats(doc),
349
- 'citations': self._extract_citations(text),
350
- 'dates': self._extract_dates(text),
351
- 'key_phrases': [chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) > 1][:10],
352
- 'ontology_concepts': self._link_to_ontology(text)
353
- }
354
-
355
- return metadata
356
-
357
- except Exception as e:
358
- st.error(f"Error extracting metadata: {str(e)}")
359
- raise
360
-
361
- def _extract_entities(self, doc) -> Dict[str, List[str]]:
362
- """Extract named entities with deduplication."""
363
- entities = {}
364
- seen = set()
365
-
366
- for ent in doc.ents:
367
- if ent.text not in seen:
368
- if ent.label_ not in entities:
369
- entities[ent.label_] = []
370
- entities[ent.label_].append(ent.text)
371
- seen.add(ent.text)
372
-
373
- return entities
374
-
375
- def _infer_document_type(self, text: str) -> str:
376
- """Infer document type using rule-based classification."""
377
- type_patterns = {
378
- 'contract': ['agreement', 'parties', 'obligations', 'terms and conditions'],
379
- 'judgment': ['court', 'judge', 'ruling', 'ordered', 'judgment'],
380
- 'legislation': ['act', 'statute', 'regulation', 'amended', 'parliament'],
381
- 'memo': ['memorandum', 'memo', 'note', 'meeting minutes']
382
  }
383
-
384
- text_lower = text.lower()
385
- scores = {doc_type: sum(1 for pattern in patterns if pattern in text_lower)
386
- for doc_type, patterns in type_patterns.items()}
387
-
388
- if not scores or max(scores.values()) == 0:
389
- return 'unknown'
390
-
391
- return max(scores.items(), key=lambda x: x[1])[0]
392
-
393
- def _extract_citations(self, text: str) -> List[Dict]:
394
- """Extract legal citations."""
395
- citation_patterns = [
396
- r'\[\d{4}\]\s+\w+\s+\d+', # [2021] EWHC 123
397
- r'\d+\s+U\.S\.\s+\d+', # 123 U.S. 456
398
- r'\(\d{4}\)\s+\d+\s+\w+\s+\d+' # (2021) 12 ABC 345
399
- ]
400
-
401
- citations = []
402
- for pattern in citation_patterns:
403
- matches = re.finditer(pattern, text)
404
- for match in matches:
405
- citations.append({
406
- 'citation': match.group(),
407
- 'start_idx': match.start(),
408
- 'end_idx': match.end()
409
- })
410
-
411
- return citations
412
 
413
- def _extract_dates(self, text: str) -> List[str]:
414
- """Extract dates with multiple formats."""
415
- date_patterns = [
416
- r'\d{1,2}/\d{1,2}/\d{2,4}',
417
- r'\d{1,2}-\d{1,2}-\d{2,4}',
418
- r'\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}'
419
- ]
420
-
421
- dates = []
422
- for pattern in date_patterns:
423
- matches = re.finditer(pattern, text, re.IGNORECASE)
424
- dates.extend(match.group() for match in matches)
425
-
426
- return dates
427
-
428
- def _get_language_stats(self, doc) -> Dict:
429
- """Get detailed language statistics."""
430
  return {
431
- 'sentence_count': len(list(doc.sents)),
432
- 'word_count': len([token for token in doc if not token.is_space]),
433
- 'avg_sentence_length': sum(len([token for token in sent if not token.is_space])
434
- for sent in doc.sents) / len(list(doc.sents)) if doc.sents else 0,
435
- 'unique_words': len(set(token.text.lower() for token in doc if not token.is_space))
436
  }
437
 
438
- def _link_to_ontology(self, text: str) -> List[Dict]:
439
- """Link text to ontology concepts."""
440
- relevant_concepts = []
441
- text_lower = text.lower()
442
-
443
- for concept in self.ontology.get("@graph", []):
444
- if "rdfs:label" not in concept:
445
- continue
446
-
447
- label = concept["rdfs:label"].lower()
448
- if label in text_lower:
449
- # Get surrounding context
450
- start_idx = text_lower.index(label)
451
- context_start = max(0, start_idx - 100)
452
- context_end = min(len(text), start_idx + len(label) + 100)
453
-
454
- relevant_concepts.append({
455
- 'concept': concept['rdfs:label'],
456
- 'type': concept.get('@type', 'Unknown'),
457
- 'description': concept.get('rdfs:comment', ''),
458
- 'context': text[context_start:context_end].strip(),
459
- 'location': {'start': start_idx, 'end': start_idx + len(label)}
460
- })
461
-
462
- return relevant_concepts
463
-
464
- def get_document_path(self, doc_id: str) -> Optional[str]:
465
- """Get the path to a processed document."""
466
- doc_dir = os.path.join(self.processed_path, doc_id)
467
- if not os.path.exists(doc_dir):
468
- return None
469
- return doc_dir
470
-
471
- def get_document_metadata(self, doc_id: str) -> Optional[Dict]:
472
- """Get metadata for a processed document."""
473
- doc_dir = self.get_document_path(doc_id)
474
- if not doc_dir:
475
- return None
476
-
477
- metadata_path = os.path.join(doc_dir, "metadata.json")
478
- try:
479
- with open(metadata_path, 'r') as f:
480
- return json.load(f)
481
- except Exception as e:
482
- st.error(f"Error loading metadata for document {doc_id}: {str(e)}")
483
- return None
484
-
485
- def get_document_chunks(self, doc_id: str) -> Optional[List[Dict]]:
486
- """Get chunks for a processed document."""
487
- doc_dir = self.get_document_path(doc_id)
488
- if not doc_dir:
489
- return None
490
-
491
- chunks_path = os.path.join(doc_dir, "chunks.json")
492
- try:
493
- with open(chunks_path, 'r') as f:
494
- return json.load(f)
495
- except Exception as e:
496
- st.error(f"Error loading chunks for document {doc_id}: {str(e)}")
497
- return None
498
 
499
  def cleanup(self):
500
  """Clean up temporary files."""
501
- try:
502
- shutil.rmtree(self.temp_path)
503
- os.makedirs(self.temp_path, exist_ok=True)
504
- except Exception as e:
505
- st.warning(f"Error cleaning up temporary files: {str(e)}")
506
 
507
  def __enter__(self):
508
- """Context manager entry."""
509
  return self
510
 
511
  def __exit__(self, exc_type, exc_val, exc_tb):
512
- """Context manager exit with cleanup."""
513
- self.cleanup()
 
17
  import streamlit as st
18
  import shutil
19
 
20
+
21
  class DocumentProcessor:
22
  def __init__(self, base_path: str = None):
23
  """Initialize Document Processor with proper data directory handling."""
24
  # Set up base paths
25
  self.base_path = self._setup_data_directories(base_path)
26
+ self.ontology_path = os.path.join(self.base_path, "ontology", "legal_ontology.json")
27
+
28
  # Initialize NLP components
29
  self._initialize_nlp()
30
+
31
  # Ensure ontology exists
32
  self._ensure_ontology_exists()
33
+
34
  # Load ontology
35
  self.ontology = self._load_ontology()
36
+
37
  # Create processing directories
38
  self.processed_path = os.path.join(self.base_path, "processed")
39
  self.temp_path = os.path.join(self.base_path, "temp")
 
42
 
43
  def _setup_data_directories(self, base_path: Optional[str] = None) -> str:
44
  """Set up data directories with error handling."""
45
+ data_path = base_path or os.path.join(os.getcwd(), "data")
 
 
 
 
 
 
 
 
 
46
  subdirs = ["ontology", "processed", "temp", "indexes"]
47
  for subdir in subdirs:
48
  os.makedirs(os.path.join(data_path, subdir), exist_ok=True)
 
49
  return data_path
50
 
51
  def _initialize_nlp(self):
52
+ """Initialize NLP components."""
53
  try:
54
+ # Load spaCy model
55
  try:
56
  self.nlp = spacy.load("en_core_web_sm")
57
  except OSError:
58
  st.info("Downloading spaCy model...")
59
  os.system("python -m spacy download en_core_web_sm")
60
  self.nlp = spacy.load("en_core_web_sm")
61
+
62
+ # Initialize NLTK
63
  nltk_data_dir = os.path.join(self.base_path, "nltk_data")
64
  os.makedirs(nltk_data_dir, exist_ok=True)
 
 
65
  nltk.data.path.append(nltk_data_dir)
66
+
67
+ required_resources = ['punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words', 'stopwords']
 
 
 
 
 
 
 
 
68
  for resource in required_resources:
69
  try:
70
  nltk.download(resource, download_dir=nltk_data_dir, quiet=True)
71
  except Exception as e:
72
  st.warning(f"Could not download {resource}: {str(e)}")
73
+
74
+ self.stop_words = set(nltk.corpus.stopwords.words('english'))
 
 
 
 
 
 
75
  except Exception as e:
76
  st.error(f"Error initializing NLP components: {str(e)}")
77
  raise
 
97
  }
98
  ]
99
  }
 
100
  with open(self.ontology_path, 'w') as f:
101
  json.dump(default_ontology, f, indent=2)
102
 
103
  def _load_ontology(self) -> Dict:
104
  """Load legal ontology with error handling."""
105
  try:
106
+ with open(self.ontology_path, 'r') as f:
107
+ return json.load(f)
 
 
108
  except Exception as e:
109
  st.error(f"Error loading ontology: {str(e)}")
110
  return {"@graph": []}
111
 
112
  def process_and_tag_document(self, file) -> Tuple[str, List[Dict], Dict]:
113
+ """Process document and generate metadata."""
114
  try:
 
115
  doc_id = datetime.now().strftime('%Y%m%d_%H%M%S')
 
 
116
  doc_dir = os.path.join(self.processed_path, doc_id)
117
  os.makedirs(doc_dir, exist_ok=True)
118
+
 
119
  original_path = os.path.join(doc_dir, "original" + Path(file.name).suffix)
120
  with open(original_path, 'wb') as f:
121
  f.write(file.getvalue())
122
+
123
+ # Extract text and process document
124
+ text, chunks = self.process_document(original_path)
125
+ metadata = self._extract_metadata(text, file.name)
126
+ metadata.update({"doc_id": doc_id, "original_path": original_path})
127
+
128
+ # Save processed data
129
+ self._save_processed_data(doc_dir, text, chunks, metadata)
130
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  return text, chunks, metadata
 
132
  except Exception as e:
133
  st.error(f"Error in document processing pipeline: {str(e)}")
134
  raise
 
136
  def process_document(self, file_path: str) -> Tuple[str, List[Dict]]:
137
  """Process a document based on its type."""
138
  file_type = Path(file_path).suffix.lower()
 
139
  if file_type == '.pdf':
140
  text = self._process_pdf(file_path)
141
  elif file_type == '.docx':
 
144
  text = self._process_text(file_path)
145
  else:
146
  raise ValueError(f"Unsupported file type: {file_type}")
 
 
147
  chunks = self._create_chunks(text)
148
  return text, chunks
149
 
150
  def _process_pdf(self, file_path: str) -> str:
151
+ """Extract text from PDF, using OCR if necessary."""
152
  try:
153
  reader = pypdf.PdfReader(file_path)
154
  text = ""
 
155
  for page_num, page in enumerate(reader.pages, 1):
156
  page_text = page.extract_text()
157
+ if not page_text.strip():
158
+ st.info(f"Performing OCR on page {page_num}...")
159
+ images = convert_from_bytes(open(file_path, 'rb').read())
160
+ page_text = pytesseract.image_to_string(images[page_num - 1])
161
+ text += f"\n--- Page {page_num} ---\n{page_text}"
 
 
 
 
 
 
162
  return text
 
163
  except Exception as e:
164
  st.error(f"Error processing PDF: {str(e)}")
165
  raise
166
 
167
  def _process_docx(self, file_path: str) -> str:
168
+ """Extract text from DOCX files."""
169
  try:
170
  doc = docx.Document(file_path)
171
+ return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
 
 
 
 
 
 
 
172
  except Exception as e:
173
  st.error(f"Error processing DOCX: {str(e)}")
174
  raise
175
 
176
  def _process_text(self, file_path: str) -> str:
177
+ """Process plain text files."""
178
  try:
179
  with open(file_path, 'rb') as f:
180
  raw_data = f.read()
181
+ encoding = chardet.detect(raw_data).get('encoding', 'utf-8')
 
 
 
 
 
182
  return raw_data.decode(encoding)
 
183
  except Exception as e:
184
  st.error(f"Error processing text file: {str(e)}")
185
  raise
186
 
187
  def _create_chunks(self, text: str) -> List[Dict]:
188
+ """Chunk text for further processing."""
189
+ sentences = self._tokenize_text(text)
190
+ chunk_size = 500
191
+ chunks = []
192
+ current_chunk, current_length = [], 0
193
+ for sentence in sentences:
194
+ if current_length + len(sentence) > chunk_size and current_chunk:
195
+ chunks.append(self._process_chunk(' '.join(current_chunk), len(chunks)))
196
+ current_chunk, current_length = [], 0
197
+ current_chunk.append(sentence)
198
+ current_length += len(sentence)
199
+ if current_chunk:
200
+ chunks.append(self._process_chunk(' '.join(current_chunk), len(chunks)))
201
+ return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
  def _process_chunk(self, text: str, chunk_id: int) -> Dict:
204
+ """Process individual chunks with NLP and ontology linking."""
205
+ doc = self.nlp(text)
206
+ return {
207
+ 'chunk_id': chunk_id,
208
+ 'text': text,
209
+ 'entities': [(ent.text, ent.label_) for ent in doc.ents],
210
+ 'noun_phrases': [np.text for np in doc.noun_chunks],
211
+ 'ontology_links': self._link_to_ontology(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
+ def _extract_metadata(self, text: str, file_name: str) -> Dict:
215
+ """Extract metadata from text."""
216
+ doc = self.nlp(text[:10000])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  return {
218
+ 'filename': file_name,
219
+ 'file_type': Path(file_name).suffix.lower(),
220
+ 'processed_at': datetime.now().isoformat(),
221
+ 'entities': self._extract_entities(doc),
222
+ 'document_type': self._infer_document_type(text)
223
  }
224
 
225
+ def _save_processed_data(self, doc_dir: str, text: str, chunks: List[Dict], metadata: Dict):
226
+ """Save processed data to disk."""
227
+ with open(os.path.join(doc_dir, "processed.txt"), 'w', encoding='utf-8') as f:
228
+ f.write(text)
229
+ with open(os.path.join(doc_dir, "chunks.json"), 'w') as f:
230
+ json.dump(chunks, f, indent=2)
231
+ with open(os.path.join(doc_dir, "metadata.json"), 'w') as f:
232
+ json.dump(metadata, f, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
  def cleanup(self):
235
  """Clean up temporary files."""
236
+ shutil.rmtree(self.temp_path, ignore_errors=True)
237
+ os.makedirs(self.temp_path, exist_ok=True)
 
 
 
238
 
239
  def __enter__(self):
 
240
  return self
241
 
242
  def __exit__(self, exc_type, exc_val, exc_tb):
243
+ self.cleanup()