"""
MediGuard AI — Indexing Service
Orchestrates: PDF parse → chunk → embed → index into OpenSearch.
"""
from __future__ import annotations
import logging
import uuid
from datetime import UTC, datetime
from src.services.indexing.text_chunker import MedicalChunk
logger = logging.getLogger(__name__)
class IndexingService:
"""Coordinates chunking → embedding → OpenSearch indexing."""
def __init__(self, chunker, embedding_service, opensearch_client):
self.chunker = chunker
self.embedding_service = embedding_service
self.opensearch_client = opensearch_client
def index_text(
self,
text: str,
*,
document_id: str = "",
title: str = "",
source_file: str = "",
) -> int:
"""Chunk, embed, and index a single document's text. Returns count of indexed chunks."""
if not document_id:
document_id = str(uuid.uuid4())
chunks = self.chunker.chunk_text(
text,
document_id=document_id,
title=title,
source_file=source_file,
)
if not chunks:
logger.warning("No chunks generated for document '%s'", title)
return 0
# Embed all chunks
texts = [c.text for c in chunks]
embeddings = self.embedding_service.embed_documents(texts)
# Prepare OpenSearch documents
now = datetime.now(UTC).isoformat()
docs: list[dict] = []
for chunk, emb in zip(chunks, embeddings):
doc = chunk.to_dict()
doc["_id"] = f"{document_id}_{chunk.chunk_index}"
doc["embedding"] = emb
doc["indexed_at"] = now
docs.append(doc)
indexed = self.opensearch_client.bulk_index(docs)
logger.info(
"Indexed %d chunks for '%s' (document_id=%s)",
indexed,
title,
document_id,
)
return indexed
def index_chunks(self, chunks: list[MedicalChunk]) -> int:
"""Embed and index pre-built chunks."""
if not chunks:
return 0
texts = [c.text for c in chunks]
embeddings = self.embedding_service.embed_documents(texts)
now = datetime.now(UTC).isoformat()
docs: list[dict] = []
for chunk, emb in zip(chunks, embeddings):
doc = chunk.to_dict()
doc["_id"] = f"{chunk.document_id}_{chunk.chunk_index}"
doc["embedding"] = emb
doc["indexed_at"] = now
docs.append(doc)
return self.opensearch_client.bulk_index(docs)