EXAM_RAG_API / indexing /indexingController.py
MinaNasser's picture
1st
1bc3f18
from stores.llm.LLMProviderFactory import LLMProviderFactory
from stores.vector_store.Qdrant import QdrantStore
from ingestion.loaders.File_loader import load_file
from ingestion.chunkers.recursive_chunker import recursive_chunk
from ingestion.pdf_outline import extract_pdf_outline, build_page_bookmark_map , recursive_chunk_with_pages
from ingestion.loaders.pdf_loader import load_pdf_with_pages
from config import get_settings
import os
from qdrant_client import QdrantClient , models
class IndexingController:
    """Chunk files, embed the chunks and store them in a Qdrant collection.

    On construction the controller builds an embedding provider through
    ``LLMProviderFactory``, opens a ``QdrantClient`` according to
    ``QDRANT_TYPE``, makes sure the configured collection and its keyword
    payload indexes exist, and wraps the client in a ``QdrantStore``.
    """

    # Payload fields that receive a keyword index.
    _KEYWORD_INDEX_FIELDS = (
        "metadata.username",
        "metadata.source",
        "metadata.course",
        "metadata.bookmark_path",
    )

    def __init__(self):
        """Wire up the embedder, the Qdrant client and the vector store.

        Raises:
            ValueError: if ``QDRANT_TYPE`` is not ``"cloud"``, ``"docker"``
                or ``"local"``.
        """
        config = get_settings()

        self.factory = LLMProviderFactory(config)
        self.embedder = self.factory.create(config.EMBEDDING_BACKEND)
        self.embedder.set_embedding_model(
            config.EMBEDDING_MODEL_ID, config.EMBEDDING_MODEL_SIZE
        )

        self.vector_store_client = self._create_client(config)
        self._ensure_collection(config)

        self.vector_store = QdrantStore(
            self.vector_store_client,
            config.QDRANT_COLLECTION,
            config.EMBEDDING_MODEL_SIZE,
        )

    @staticmethod
    def _create_client(config):
        """Return a ``QdrantClient`` for the deployment named by ``config.QDRANT_TYPE``."""
        qdrant_type = config.QDRANT_TYPE
        if qdrant_type == "cloud":
            # NOTE(review): the cloud branch reads QDRANT_DOCKER_URL, exactly as
            # the original code did — confirm this is the intended setting.
            return QdrantClient(
                url=config.QDRANT_DOCKER_URL,
                api_key=config.QDRANT_API_KEY,
                timeout=120,
            )
        if qdrant_type == "docker":
            return QdrantClient(url=config.QDRANT_DOCKER_URL, timeout=120)
        if qdrant_type == "local":
            return QdrantClient(path="data/qdrant", prefer_grpc=False, timeout=120)
        # Previously an unknown value left the client unset and failed later
        # with an AttributeError; fail here with an actionable message instead.
        raise ValueError(
            f"Unsupported QDRANT_TYPE {qdrant_type!r}; "
            "expected 'cloud', 'docker' or 'local'."
        )

    def _ensure_collection(self, config):
        """Create the configured collection if missing and request its payload indexes."""
        client = self.vector_store_client
        collection = config.QDRANT_COLLECTION

        if not client.collection_exists(collection_name=collection):
            # Create the collection if it does not exist yet.
            client.create_collection(
                collection_name=collection,
                vectors_config=models.VectorParams(
                    size=config.EMBEDDING_MODEL_SIZE,
                    distance=models.Distance.COSINE,
                ),
            )

        # NOTE(review): the original nesting of this loop was lost together
        # with the file's indentation. It is run unconditionally here so that
        # pre-existing collections also get their indexes — confirm that
        # repeating create_payload_index is acceptable for your deployment.
        for field in self._KEYWORD_INDEX_FIELDS:
            client.create_payload_index(
                collection_name=collection,
                field_name=field,
                field_schema=models.KeywordIndexParams(
                    type=models.KeywordIndexType.KEYWORD
                ),
            )

    def embed_chunks(self, chunks):
        """Return the embedder's ``embed_text_batch`` result for ``chunks``."""
        return self.embedder.embed_text_batch(chunks)

    @staticmethod
    def _build_payloads(
        chunks, embeddings, bookmark_map, original_filename, username=None, course=None
    ):
        """Pair every non-``None`` embedding with the payload of its chunk.

        Args:
            chunks: dicts with ``"text"`` and ``"page"`` keys.
            embeddings: one entry per chunk; ``None`` entries are skipped.
            bookmark_map: maps a page to its bookmark path; pages without an
                entry get an empty list.
            original_filename: stored as ``metadata.source``.
            username: stored as ``metadata.username``.
            course: stored as ``metadata.course``.

        Returns:
            ``(kept_embeddings, payloads)``, two lists of equal length.
            ``chunk_index`` keeps the position in ``chunks``, so skipped
            chunks leave gaps in the numbering.
        """
        total_chunks = len(chunks)
        kept_embeddings = []
        payloads = []
        for idx, (chunk, emb) in enumerate(zip(chunks, embeddings)):
            if emb is None:
                continue
            page = chunk["page"]
            kept_embeddings.append(emb)
            payloads.append(
                {
                    "content": chunk["text"],
                    "metadata": {
                        "source": original_filename,
                        "chunk_index": idx,
                        "total_chunks": total_chunks,
                        "username": username,
                        "course": course,
                        "page": page,
                        "bookmark_path": bookmark_map.get(page, []),
                    },
                }
            )
        return kept_embeddings, payloads

    def process_file(self, file_path, original_filename, username=None, course=None):
        """Chunk, embed and store one file.

        PDFs (by extension) are loaded page by page and their outline is used
        to attach a bookmark path to each chunk; every other file is loaded
        with ``load_file`` and chunked as one text with ``page`` set to
        ``None``.

        Args:
            file_path: path of the file to read.
            original_filename: name recorded as the chunks' ``source``.
            username: optional owner recorded in the chunk metadata.
            course: optional course recorded in the chunk metadata.

        Returns:
            A dict with ``"num_chunks"``, ``"chunks"`` and ``"embeddings"``
            (the embedder's output, or ``[]`` when there were no chunks).
        """
        file_name = os.path.basename(file_path)
        ext = os.path.splitext(file_path)[1].lower()

        bookmark_map = {}
        if ext == ".pdf":
            outline, total_pages = extract_pdf_outline(file_path)
            bookmark_map = build_page_bookmark_map(outline, total_pages)
            pages = load_pdf_with_pages(file_path)
            chunks = recursive_chunk_with_pages(pages)
        else:
            text = load_file(file_path)
            if isinstance(text, list):
                # A list result is treated as documents exposing page_content.
                text = " ".join(doc.page_content for doc in text)
            chunks = [{"text": c, "page": None} for c in recursive_chunk(text)]

        # Do not call the embedding backend with an empty batch.
        embeddings = self.embed_chunks([c["text"] for c in chunks]) if chunks else []

        valid_embs, valid_payloads = self._build_payloads(
            chunks, embeddings, bookmark_map, original_filename, username, course
        )
        for payload in valid_payloads:
            meta = payload["metadata"]
            print(
                f"[DEBUG] Prepared payload for chunk {meta['chunk_index']}: "
                f"page={meta['page']}, bookmark_path={meta['bookmark_path']}"
            )

        # Skip the store round-trip when there is nothing to write.
        if valid_embs:
            self.vector_store.upsert_embeddings(
                self.vector_store_client,
                get_settings().QDRANT_COLLECTION,
                valid_embs,
                valid_payloads,
            )
        print(f"[INFO] Stored {len(valid_embs)} embeddings for file '{file_name}'.")

        return {
            "num_chunks": len(chunks),
            "chunks": chunks,
            "embeddings": embeddings,
        }