Spaces:
Build error
Build error
Update utils/document_processor.py
Browse files
- utils/document_processor.py +27 -4
utils/document_processor.py
CHANGED
|
@@ -20,8 +20,32 @@ class DocumentProcessor:
|
|
| 20 |
"""
|
| 21 |
self.ontology = self._load_ontology(ontology_path)
|
| 22 |
|
| 23 |
-
def
|
| 24 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
file_type = file.name.split(".")[-1].lower()
|
| 26 |
if file_type == "pdf":
|
| 27 |
text = self._process_pdf(file)
|
|
@@ -33,8 +57,7 @@ class DocumentProcessor:
|
|
| 33 |
raise ValueError(f"Unsupported file type: {file_type}")
|
| 34 |
|
| 35 |
chunks = self._chunk_text(text)
|
| 36 |
-
|
| 37 |
-
return text, chunks, metadata
|
| 38 |
|
| 39 |
def _process_pdf(self, file) -> str:
|
| 40 |
"""Extract text from a PDF, including OCR for scanned PDFs."""
|
|
|
|
| 20 |
"""
|
| 21 |
self.ontology = self._load_ontology(ontology_path)
|
| 22 |
|
| 23 |
+
def process_and_tag_document(self, file) -> Tuple[str, List[Dict], Dict]:
|
| 24 |
+
"""
|
| 25 |
+
Process a document, extract text, chunks, and metadata.
|
| 26 |
+
|
| 27 |
+
Args:
|
| 28 |
+
file: Uploaded document file.
|
| 29 |
+
|
| 30 |
+
Returns:
|
| 31 |
+
Tuple[str, List[Dict], Dict]: Extracted text, text chunks, and metadata.
|
| 32 |
+
"""
|
| 33 |
+
# Process the document to extract text
|
| 34 |
+
text, chunks = self.process_document(file)
|
| 35 |
+
# Extract metadata using ontology and document content
|
| 36 |
+
metadata = self._extract_metadata(text, file.name)
|
| 37 |
+
return text, chunks, metadata
|
| 38 |
+
|
| 39 |
+
def process_document(self, file) -> Tuple[str, List[Dict]]:
|
| 40 |
+
"""
|
| 41 |
+
Process a document and return its text and chunks.
|
| 42 |
+
|
| 43 |
+
Args:
|
| 44 |
+
file: Uploaded document file.
|
| 45 |
+
|
| 46 |
+
Returns:
|
| 47 |
+
Tuple[str, List[Dict]]: Extracted text and text chunks.
|
| 48 |
+
"""
|
| 49 |
file_type = file.name.split(".")[-1].lower()
|
| 50 |
if file_type == "pdf":
|
| 51 |
text = self._process_pdf(file)
|
|
|
|
| 57 |
raise ValueError(f"Unsupported file type: {file_type}")
|
| 58 |
|
| 59 |
chunks = self._chunk_text(text)
|
| 60 |
+
return text, chunks
|
|
|
|
| 61 |
|
| 62 |
def _process_pdf(self, file) -> str:
|
| 63 |
"""Extract text from a PDF, including OCR for scanned PDFs."""
|