Spaces:
Sleeping
Sleeping
Update utils/ingestion.py
Browse files- utils/ingestion.py +55 -15
utils/ingestion.py
CHANGED
|
@@ -3,6 +3,7 @@ import time
|
|
| 3 |
import os
|
| 4 |
from pathlib import Path
|
| 5 |
from typing import Dict, Any, List
|
|
|
|
| 6 |
|
| 7 |
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
| 8 |
from docling.datamodel.base_models import InputFormat
|
|
@@ -15,7 +16,10 @@ from docling.datamodel.pipeline_options import (
|
|
| 15 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 16 |
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
|
| 17 |
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
class DocumentProcessor:
|
|
@@ -33,13 +37,12 @@ class DocumentProcessor:
|
|
| 33 |
pipeline_options.table_structure_options.do_cell_matching = True
|
| 34 |
pipeline_options.ocr_options.lang = ["en"]
|
| 35 |
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
|
| 36 |
-
|
| 37 |
-
# ✅ Automatically handle CPU fallback
|
| 38 |
try:
|
| 39 |
pipeline_options.accelerator_options = AcceleratorOptions(
|
| 40 |
num_threads=8, device=AcceleratorDevice.MPS
|
| 41 |
)
|
| 42 |
-
except Exception
|
| 43 |
print("⚠️ MPS is not available. Falling back to CPU.")
|
| 44 |
pipeline_options.accelerator_options = AcceleratorOptions(
|
| 45 |
num_threads=8, device=AcceleratorDevice.CPU
|
|
@@ -79,21 +82,59 @@ class DocumentProcessor:
|
|
| 79 |
|
| 80 |
return metadata
|
| 81 |
|
| 82 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
"""Process document and create searchable index with metadata"""
|
| 84 |
-
print(f"📄 Processing document: {
|
| 85 |
start_time = time.time()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
|
|
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
|
|
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
processed_chunks.append(metadata)
|
| 97 |
|
| 98 |
print("✅ Chunking completed. Creating vector database...")
|
| 99 |
collection = self.client.get_or_create_collection(name="document_chunks")
|
|
@@ -114,7 +155,6 @@ class DocumentProcessor:
|
|
| 114 |
embeddings.append(embedding)
|
| 115 |
metadata_list.append({
|
| 116 |
"headings": json.dumps(chunk.get('headings', [])),
|
| 117 |
-
"page": chunk.get('page_info', None),
|
| 118 |
"content_type": chunk.get('content_type', None)
|
| 119 |
})
|
| 120 |
ids.append(str(idx))
|
|
|
|
| 3 |
import os
|
| 4 |
from pathlib import Path
|
| 5 |
from typing import Dict, Any, List
|
| 6 |
+
import chromadb
|
| 7 |
|
| 8 |
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
| 9 |
from docling.datamodel.base_models import InputFormat
|
|
|
|
| 16 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 17 |
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
|
| 18 |
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
|
| 19 |
+
|
| 20 |
+
from docx import Document # DOCX support
|
| 21 |
+
from pptx import Presentation # PPTX support
|
| 22 |
+
from bs4 import BeautifulSoup # HTML support
|
| 23 |
|
| 24 |
|
| 25 |
class DocumentProcessor:
|
|
|
|
| 37 |
pipeline_options.table_structure_options.do_cell_matching = True
|
| 38 |
pipeline_options.ocr_options.lang = ["en"]
|
| 39 |
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
|
| 40 |
+
|
|
|
|
| 41 |
try:
|
| 42 |
pipeline_options.accelerator_options = AcceleratorOptions(
|
| 43 |
num_threads=8, device=AcceleratorDevice.MPS
|
| 44 |
)
|
| 45 |
+
except Exception:
|
| 46 |
print("⚠️ MPS is not available. Falling back to CPU.")
|
| 47 |
pipeline_options.accelerator_options = AcceleratorOptions(
|
| 48 |
num_threads=8, device=AcceleratorDevice.CPU
|
|
|
|
| 82 |
|
| 83 |
return metadata
|
| 84 |
|
| 85 |
+
def extract_text_from_docx(self, docx_path: str) -> List[str]:
    """Return the whitespace-trimmed, non-empty paragraph texts of a DOCX file.

    Args:
        docx_path: Filesystem path to the .docx document.

    Returns:
        One string per paragraph that contains visible text, in document order.
    """
    paragraphs = Document(docx_path).paragraphs
    texts: List[str] = []
    for paragraph in paragraphs:
        # Skip paragraphs that are empty or whitespace-only.
        trimmed = paragraph.text.strip()
        if trimmed:
            texts.append(trimmed)
    return texts
|
| 89 |
+
|
| 90 |
+
def extract_text_from_pptx(self, pptx_path: str) -> List[str]:
    """Return one space-joined text string per non-empty slide of a PPTX file.

    Args:
        pptx_path: Filesystem path to the .pptx presentation.

    Returns:
        One string per slide that contains any visible text, in slide order.
    """
    presentation = Presentation(pptx_path)
    collected: List[str] = []
    for slide in presentation.slides:
        # Not every shape carries text (pictures, charts) — keep only those that do.
        fragments = [shape.text for shape in slide.shapes if hasattr(shape, "text")]
        joined = " ".join(fragments)
        if joined.strip():
            collected.append(joined.strip())
    return collected
|
| 99 |
+
|
| 100 |
+
def extract_text_from_html(self, html_path: str) -> List[str]:
    """Return the visible text fragments of an HTML file, whitespace-trimmed.

    Args:
        html_path: Filesystem path to the .html document, assumed UTF-8 encoded.

    Returns:
        The non-empty text nodes of the parsed document, in document order.
    """
    with open(html_path, "r", encoding="utf-8") as handle:
        parsed = BeautifulSoup(handle, "html.parser")
    texts: List[str] = []
    # stripped_strings yields the document's text nodes with surrounding
    # whitespace removed; the extra strip/check mirrors the original guard.
    for fragment in parsed.stripped_strings:
        trimmed = fragment.strip()
        if trimmed:
            texts.append(trimmed)
    return texts
|
| 105 |
+
|
| 106 |
+
def process_document(self, file_path: str):
|
| 107 |
"""Process document and create searchable index with metadata"""
|
| 108 |
+
print(f"📄 Processing document: {file_path}")
|
| 109 |
start_time = time.time()
|
| 110 |
+
file_ext = Path(file_path).suffix.lower()
|
| 111 |
+
|
| 112 |
+
if file_ext == ".pdf":
|
| 113 |
+
result = self.converter.convert(file_path)
|
| 114 |
+
doc = result.document
|
| 115 |
+
chunker = HybridChunker(tokenizer="jinaai/jina-embeddings-v3")
|
| 116 |
+
chunks = list(chunker.chunk(doc))
|
| 117 |
+
|
| 118 |
+
processed_chunks = []
|
| 119 |
+
for chunk in chunks:
|
| 120 |
+
metadata = self.extract_chunk_metadata(chunk)
|
| 121 |
+
processed_chunks.append(metadata)
|
| 122 |
+
|
| 123 |
+
elif file_ext == ".docx":
|
| 124 |
+
texts = self.extract_text_from_docx(file_path)
|
| 125 |
+
processed_chunks = [{"text": text, "headings": [], "content_type": "DOCX"} for text in texts]
|
| 126 |
|
| 127 |
+
elif file_ext == ".pptx":
|
| 128 |
+
texts = self.extract_text_from_pptx(file_path)
|
| 129 |
+
processed_chunks = [{"text": text, "headings": [], "content_type": "PPTX"} for text in texts]
|
| 130 |
|
| 131 |
+
elif file_ext == ".html":
|
| 132 |
+
texts = self.extract_text_from_html(file_path)
|
| 133 |
+
processed_chunks = [{"text": text, "headings": [], "content_type": "HTML"} for text in texts]
|
| 134 |
|
| 135 |
+
else:
|
| 136 |
+
print(f"❌ Unsupported file format: {file_ext}")
|
| 137 |
+
return None
|
|
|
|
| 138 |
|
| 139 |
print("✅ Chunking completed. Creating vector database...")
|
| 140 |
collection = self.client.get_or_create_collection(name="document_chunks")
|
|
|
|
| 155 |
embeddings.append(embedding)
|
| 156 |
metadata_list.append({
|
| 157 |
"headings": json.dumps(chunk.get('headings', [])),
|
|
|
|
| 158 |
"content_type": chunk.get('content_type', None)
|
| 159 |
})
|
| 160 |
ids.append(str(idx))
|