File size: 2,616 Bytes
1e732dd
 
 
 
 
 
 
 
 
 
696f787
1e732dd
696f787
1e732dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
696f787
 
1e732dd
 
 
 
 
 
 
 
 
 
9659593
 
 
1e732dd
 
 
696f787
1e732dd
 
 
 
 
696f787
 
1e732dd
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
MediGuard AI — Indexing Service

Orchestrates: PDF parse → chunk → embed → index into OpenSearch.
"""

from __future__ import annotations

import logging
import uuid
from datetime import UTC, datetime

from src.services.indexing.text_chunker import MedicalChunk

logger = logging.getLogger(__name__)


class IndexingService:
    """Coordinates chunking → embedding → OpenSearch indexing."""

    def __init__(self, chunker, embedding_service, opensearch_client):
        self.chunker = chunker
        self.embedding_service = embedding_service
        self.opensearch_client = opensearch_client

    def index_text(
        self,
        text: str,
        *,
        document_id: str = "",
        title: str = "",
        source_file: str = "",
    ) -> int:
        """Chunk, embed, and index a single document's text. Returns count of indexed chunks."""
        if not document_id:
            document_id = str(uuid.uuid4())

        chunks = self.chunker.chunk_text(
            text,
            document_id=document_id,
            title=title,
            source_file=source_file,
        )
        if not chunks:
            logger.warning("No chunks generated for document '%s'", title)
            return 0

        # Embed all chunks
        texts = [c.text for c in chunks]
        embeddings = self.embedding_service.embed_documents(texts)

        # Prepare OpenSearch documents
        now = datetime.now(UTC).isoformat()
        docs: list[dict] = []
        for chunk, emb in zip(chunks, embeddings):
            doc = chunk.to_dict()
            doc["_id"] = f"{document_id}_{chunk.chunk_index}"
            doc["embedding"] = emb
            doc["indexed_at"] = now
            docs.append(doc)

        indexed = self.opensearch_client.bulk_index(docs)
        logger.info(
            "Indexed %d chunks for '%s' (document_id=%s)",
            indexed,
            title,
            document_id,
        )
        return indexed

    def index_chunks(self, chunks: list[MedicalChunk]) -> int:
        """Embed and index pre-built chunks."""
        if not chunks:
            return 0
        texts = [c.text for c in chunks]
        embeddings = self.embedding_service.embed_documents(texts)
        now = datetime.now(UTC).isoformat()
        docs: list[dict] = []
        for chunk, emb in zip(chunks, embeddings):
            doc = chunk.to_dict()
            doc["_id"] = f"{chunk.document_id}_{chunk.chunk_index}"
            doc["embedding"] = emb
            doc["indexed_at"] = now
            docs.append(doc)
        return self.opensearch_client.bulk_index(docs)