import os

from qdrant_client import QdrantClient, models

from stores.llm.LLMProviderFactory import LLMProviderFactory
from stores.vector_store.Qdrant import QdrantStore
from ingestion.loaders.File_loader import load_file
from ingestion.chunkers.recursive_chunker import recursive_chunk
from ingestion.pdf_outline import (
    extract_pdf_outline,
    build_page_bookmark_map,
    recursive_chunk_with_pages,
)
from ingestion.loaders.pdf_loader import load_pdf_with_pages
from config import get_settings


class IndexingController:
    """Chunk files, embed the chunks and store them in a Qdrant collection.

    On construction the controller reads the application settings, creates
    the embedding provider, opens a Qdrant client for the configured
    deployment type, and makes sure the target collection and its keyword
    payload indexes exist.
    """

    # Payload fields that get a keyword index on the collection.
    _KEYWORD_INDEX_FIELDS = (
        "metadata.username",
        "metadata.source",
        "metadata.course",
        "metadata.bookmark_path",
    )

    def __init__(self):
        """Build the embedder, the Qdrant client and the vector store wrapper.

        Raises:
            ValueError: If ``QDRANT_TYPE`` in the settings is not one of
                ``"cloud"``, ``"docker"`` or ``"local"``.
        """
        config = get_settings()

        self.factory = LLMProviderFactory(config)
        self.embedder = self.factory.create(config.EMBEDDING_BACKEND)
        self.embedder.set_embedding_model(
            config.EMBEDDING_MODEL_ID, config.EMBEDDING_MODEL_SIZE
        )

        self.vector_store_client = self._build_client(config)
        self._ensure_collection(config)

        self.vector_store = QdrantStore(
            self.vector_store_client,
            config.QDRANT_COLLECTION,
            config.EMBEDDING_MODEL_SIZE,
        )

    @staticmethod
    def _build_client(config):
        """Return a ``QdrantClient`` for the deployment named by ``QDRANT_TYPE``."""
        qdrant_type = config.QDRANT_TYPE

        if qdrant_type == "cloud":
            # NOTE(review): the cloud branch reads QDRANT_DOCKER_URL, same as
            # the docker branch. Kept as in the original; confirm this is the
            # intended setting for cloud deployments.
            return QdrantClient(
                url=config.QDRANT_DOCKER_URL,
                api_key=config.QDRANT_API_KEY,
                timeout=120,
            )
        if qdrant_type == "docker":
            return QdrantClient(url=config.QDRANT_DOCKER_URL, timeout=120)
        if qdrant_type == "local":
            return QdrantClient(
                path="data/qdrant", prefer_grpc=False, timeout=120
            )

        # Fail here with a clear message instead of leaving the client unset
        # and hitting an AttributeError on first use.
        raise ValueError(
            f"Unsupported QDRANT_TYPE {qdrant_type!r}; "
            "expected 'cloud', 'docker' or 'local'."
        )

    def _ensure_collection(self, config):
        """Create the collection if missing and request its keyword indexes."""
        client = self.vector_store_client
        collection = config.QDRANT_COLLECTION

        if not client.collection_exists(collection_name=collection):
            # Create the collection if it doesn't exist yet.
            client.create_collection(
                collection_name=collection,
                vectors_config=models.VectorParams(
                    size=config.EMBEDDING_MODEL_SIZE,
                    distance=models.Distance.COSINE,
                ),
            )

        # NOTE(review): index creation is requested on every start-up so that
        # pre-existing collections are covered too; this presumes re-creating
        # an existing index is harmless in Qdrant -- confirm.
        for field in self._KEYWORD_INDEX_FIELDS:
            client.create_payload_index(
                collection_name=collection,
                field_name=field,
                field_schema=models.KeywordIndexParams(
                    type=models.KeywordIndexType.KEYWORD
                ),
            )

    def embed_chunks(self, chunks):
        """Return the embedder's batch embeddings for the given chunk texts."""
        return self.embedder.embed_text_batch(chunks)

    def process_file(self, file_path, original_filename, username=None, course=None):
        """Chunk a file, embed the chunks and upsert them into the collection.

        PDFs are loaded page by page and each chunk is tagged with its page
        number and the bookmark path looked up for that page. Any other file
        goes through ``load_file`` and ``recursive_chunk`` and gets
        ``page=None`` and an empty bookmark path.

        Args:
            file_path: Path of the file to read.
            original_filename: Name stored as ``metadata.source`` in each
                payload.
            username: Optional value stored as ``metadata.username``.
            course: Optional value stored as ``metadata.course``.

        Returns:
            dict with ``num_chunks`` (number of chunks produced), ``chunks``
            (list of ``{"text", "page"}`` items) and ``embeddings`` (the raw
            embedder output, including any ``None`` entries).
        """
        file_name = os.path.basename(file_path)
        ext = os.path.splitext(file_path)[1].lower()
        bookmark_map = {}

        if ext == ".pdf":
            outline, total_pages = extract_pdf_outline(file_path)
            bookmark_map = build_page_bookmark_map(outline, total_pages)
            pages = load_pdf_with_pages(file_path)
            chunks = recursive_chunk_with_pages(pages)
        else:
            text = load_file(file_path)
            if isinstance(text, list):
                # The loader returned document objects; flatten to one string.
                text = " ".join(doc.page_content for doc in text)
            chunks_text = recursive_chunk(text)
            chunks = [{"text": c, "page": None} for c in chunks_text]

        embeddings = self.embed_chunks([c["text"] for c in chunks])

        valid_embs = []
        valid_payloads = []
        total_chunks = len(chunks)
        for idx, (chunk_obj, emb) in enumerate(zip(chunks, embeddings)):
            # Chunks whose embedding came back as None are not stored;
            # chunk_index still reflects the position in the full chunk list.
            if emb is None:
                continue

            page = chunk_obj["page"]
            bookmark_path = bookmark_map.get(page, [])
            valid_embs.append(emb)
            valid_payloads.append(
                {
                    "content": chunk_obj["text"],
                    "metadata": {
                        "source": original_filename,
                        "chunk_index": idx,
                        "total_chunks": total_chunks,
                        "username": username,
                        "course": course,
                        "page": page,
                        "bookmark_path": bookmark_path,
                    },
                }
            )
            print(f"[DEBUG] Prepared payload for chunk {idx}: page={page}, bookmark_path={bookmark_path}")

        # Nothing to write for an empty file or when every embedding failed,
        # so skip the store call rather than send an empty upsert.
        if valid_embs:
            self.vector_store.upsert_embeddings(
                self.vector_store_client,
                get_settings().QDRANT_COLLECTION,
                valid_embs,
                valid_payloads,
            )

        print(f"[INFO] Stored {len(valid_embs)} embeddings for file '{file_name}'.")

        return {
            "num_chunks": len(chunks),
            "chunks": chunks,
            "embeddings": embeddings,
        }