File size: 4,603 Bytes
1bc3f18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from stores.llm.LLMProviderFactory import LLMProviderFactory
from stores.vector_store.Qdrant import QdrantStore

from ingestion.loaders.File_loader import load_file
from ingestion.chunkers.recursive_chunker import recursive_chunk
from ingestion.pdf_outline import extract_pdf_outline, build_page_bookmark_map , recursive_chunk_with_pages
from ingestion.loaders.pdf_loader import load_pdf_with_pages

from config import get_settings

import os
from qdrant_client import QdrantClient , models

class IndexingController:
    """Chunk files, embed the chunks and store them in a Qdrant collection.

    Construction builds the embedding provider, opens a Qdrant client
    according to ``QDRANT_TYPE``, creates the configured collection when it
    does not exist yet, and requests keyword payload indexes for a fixed set
    of metadata fields.
    """

    # Payload fields for which a keyword payload index is requested in __init__.
    _KEYWORD_INDEX_FIELDS = (
        "metadata.username",
        "metadata.source",
        "metadata.course",
        "metadata.bookmark_path",
    )

    def __init__(self):
        """Set up the embedder, the Qdrant client and the target collection.

        Raises:
            ValueError: If ``QDRANT_TYPE`` is not "cloud", "docker" or "local".
        """
        config = get_settings()
        self.factory = LLMProviderFactory(config)
        self.embedder = self.factory.create(config.EMBEDDING_BACKEND)
        self.embedder.set_embedding_model(
            config.EMBEDDING_MODEL_ID, config.EMBEDDING_MODEL_SIZE
        )
        self.vector_store_client = self._create_client(config)

        collection = config.QDRANT_COLLECTION

        # Create the collection only when it is missing.
        if not self.vector_store_client.collection_exists(collection_name=collection):
            self.vector_store_client.create_collection(
                collection_name=collection,
                vectors_config=models.VectorParams(
                    size=config.EMBEDDING_MODEL_SIZE,
                    distance=models.Distance.COSINE,
                ),
            )

        # Index creation is requested on every start-up, whether or not the
        # collection was just created.
        for field in self._KEYWORD_INDEX_FIELDS:
            self.vector_store_client.create_payload_index(
                collection_name=collection,
                field_name=field,
                field_schema=models.KeywordIndexParams(
                    type=models.KeywordIndexType.KEYWORD
                ),
            )

        self.vector_store = QdrantStore(
            self.vector_store_client, collection, config.EMBEDDING_MODEL_SIZE
        )

    @staticmethod
    def _create_client(config):
        """Return a ``QdrantClient`` matching ``config.QDRANT_TYPE``.

        Raises:
            ValueError: If ``config.QDRANT_TYPE`` is not a supported value.
                Without this, the client attribute would stay unset and the
                failure would surface later as an ``AttributeError``.
        """
        qdrant_type = config.QDRANT_TYPE
        if qdrant_type == "cloud":
            # NOTE(review): the cloud branch reads QDRANT_DOCKER_URL, same as
            # the docker branch - confirm this is the intended setting.
            return QdrantClient(
                url=config.QDRANT_DOCKER_URL,
                api_key=config.QDRANT_API_KEY,
                timeout=120,
            )
        if qdrant_type == "docker":
            return QdrantClient(url=config.QDRANT_DOCKER_URL, timeout=120)
        if qdrant_type == "local":
            return QdrantClient(path="data/qdrant", prefer_grpc=False, timeout=120)
        raise ValueError(
            f"Unsupported QDRANT_TYPE {qdrant_type!r}; "
            "expected 'cloud', 'docker' or 'local'."
        )

    def embed_chunks(self, chunks):
        """Return the embedder's batch embeddings for the given chunk texts."""
        return self.embedder.embed_text_batch(chunks)

    @staticmethod
    def _load_chunks(file_path):
        """Load ``file_path`` and split it into ``{"text", "page"}`` chunk dicts.

        Returns:
            A ``(chunks, bookmark_map)`` tuple. ``bookmark_map`` is built from
            the PDF outline for ``.pdf`` files and is an empty dict otherwise.
        """
        ext = os.path.splitext(file_path)[1].lower()

        if ext == ".pdf":
            outline, total_pages = extract_pdf_outline(file_path)
            bookmark_map = build_page_bookmark_map(outline, total_pages)
            pages = load_pdf_with_pages(file_path)
            return recursive_chunk_with_pages(pages), bookmark_map

        text = load_file(file_path)
        if isinstance(text, list):
            # The loader may return a list of documents; merge their contents.
            text = " ".join([doc.page_content for doc in text])
        # Non-PDF chunks carry no page information.
        chunks = [{"text": piece, "page": None} for piece in recursive_chunk(text)]
        return chunks, {}

    @staticmethod
    def _build_payloads(chunks, embeddings, bookmark_map, original_filename,
                        username, course):
        """Pair each non-``None`` embedding with the payload stored beside it.

        Chunks whose embedding is ``None`` are skipped. ``chunk_index`` keeps
        the chunk's position in ``chunks``, so indexes may have gaps.

        Returns:
            A ``(valid_embs, valid_payloads)`` tuple of equally long lists.
        """
        valid_embs = []
        valid_payloads = []
        total_chunks = len(chunks)

        for idx, (chunk_obj, emb) in enumerate(zip(chunks, embeddings)):
            if emb is None:
                continue

            page = chunk_obj["page"]
            bookmark_path = bookmark_map.get(page, [])

            valid_embs.append(emb)
            valid_payloads.append({
                "content": chunk_obj["text"],
                "metadata": {
                    "source": original_filename,
                    "chunk_index": idx,
                    "total_chunks": total_chunks,
                    "username": username,
                    "course": course,
                    "page": page,
                    "bookmark_path": bookmark_path,
                },
            })
            print(f"[DEBUG] Prepared payload for chunk {idx}: page={page}, bookmark_path={bookmark_path}")

        return valid_embs, valid_payloads

    def process_file(self, file_path, original_filename, username=None, course=None):
        """Chunk, embed and store one file.

        Args:
            file_path: Path of the file to read; its extension selects the
                PDF or the generic loading path.
            original_filename: Name recorded as ``metadata.source``.
            username: Value recorded as ``metadata.username``.
            course: Value recorded as ``metadata.course``.

        Returns:
            A dict with ``num_chunks``, the ``chunks`` list and the raw
            ``embeddings`` (including any ``None`` entries).
        """
        file_name = os.path.basename(file_path)
        chunks, bookmark_map = self._load_chunks(file_path)

        # Nothing to embed for an empty file, so skip the embedder call.
        embeddings = self.embed_chunks([c["text"] for c in chunks]) if chunks else []

        valid_embs, valid_payloads = self._build_payloads(
            chunks, embeddings, bookmark_map, original_filename, username, course
        )

        # Skip the store round-trip when there is nothing to write.
        if valid_embs:
            self.vector_store.upsert_embeddings(
                self.vector_store_client,
                get_settings().QDRANT_COLLECTION,
                valid_embs,
                valid_payloads,
            )
        print(f"[INFO] Stored {len(valid_embs)} embeddings for file '{file_name}'.")

        return {
            "num_chunks": len(chunks),
            "chunks": chunks,
            "embeddings": embeddings,
        }