cryogenic22 committed on
Commit
b898c42
·
verified ·
1 Parent(s): 2664f83

Update utils/document_processor.py

Browse files
Files changed (1) hide show
  1. utils/document_processor.py +27 -4
utils/document_processor.py CHANGED
@@ -20,8 +20,32 @@ class DocumentProcessor:
20
  """
21
  self.ontology = self._load_ontology(ontology_path)
22
 
23
- def process_document(self, file) -> Tuple[str, List[Dict], Dict]:
24
- """Process a document, extract text, chunks, and metadata."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  file_type = file.name.split(".")[-1].lower()
26
  if file_type == "pdf":
27
  text = self._process_pdf(file)
@@ -33,8 +57,7 @@ class DocumentProcessor:
33
  raise ValueError(f"Unsupported file type: {file_type}")
34
 
35
  chunks = self._chunk_text(text)
36
- metadata = self._extract_metadata(text, file.name)
37
- return text, chunks, metadata
38
 
39
  def _process_pdf(self, file) -> str:
40
  """Extract text from a PDF, including OCR for scanned PDFs."""
 
20
  """
21
  self.ontology = self._load_ontology(ontology_path)
22
 
23
+ def process_and_tag_document(self, file) -> Tuple[str, List[Dict], Dict]:
24
+ """
25
+ Process a document, extract text, chunks, and metadata.
26
+
27
+ Args:
28
+ file: Uploaded document file.
29
+
30
+ Returns:
31
+ Tuple[str, List[Dict], Dict]: Extracted text, text chunks, and metadata.
32
+ """
33
+ # Process the document to extract text
34
+ text, chunks = self.process_document(file)
35
+ # Extract metadata using ontology and document content
36
+ metadata = self._extract_metadata(text, file.name)
37
+ return text, chunks, metadata
38
+
39
+ def process_document(self, file) -> Tuple[str, List[Dict]]:
40
+ """
41
+ Process a document and return its text and chunks.
42
+
43
+ Args:
44
+ file: Uploaded document file.
45
+
46
+ Returns:
47
+ Tuple[str, List[Dict]]: Extracted text and text chunks.
48
+ """
49
  file_type = file.name.split(".")[-1].lower()
50
  if file_type == "pdf":
51
  text = self._process_pdf(file)
 
57
  raise ValueError(f"Unsupported file type: {file_type}")
58
 
59
  chunks = self._chunk_text(text)
60
+ return text, chunks
 
61
 
62
  def _process_pdf(self, file) -> str:
63
  """Extract text from a PDF, including OCR for scanned PDFs."""