Spaces:
Paused
Paused
| from stores.llm.LLMProviderFactory import LLMProviderFactory | |
| from stores.vector_store.Qdrant import QdrantStore | |
| from ingestion.loaders.File_loader import load_file | |
| from ingestion.chunkers.recursive_chunker import recursive_chunk | |
| from ingestion.pdf_outline import extract_pdf_outline, build_page_bookmark_map , recursive_chunk_with_pages | |
| from ingestion.loaders.pdf_loader import load_pdf_with_pages | |
| from config import get_settings | |
| import os | |
| from qdrant_client import QdrantClient , models | |
class IndexingController:
    """Load a file, split it into chunks, embed them and store them in Qdrant.

    On construction the controller builds the embedding backend selected by
    the application settings, connects to Qdrant, makes sure the target
    collection exists and wraps the client in a ``QdrantStore``.
    """

    # Payload fields for which a keyword index is requested on the collection.
    _KEYWORD_FIELDS = (
        "metadata.username",
        "metadata.source",
        "metadata.course",
        "metadata.bookmark_path",
    )

    # Timeout (seconds) passed to every QdrantClient this class creates.
    _CLIENT_TIMEOUT = 120

    def __init__(self):
        """Set up the embedder, the Qdrant client/collection and the store.

        Raises:
            ValueError: if ``QDRANT_TYPE`` in the settings is not one of
                ``"cloud"``, ``"docker"`` or ``"local"``.
        """
        config = get_settings()

        self.factory = LLMProviderFactory(config)
        self.embedder = self.factory.create(config.EMBEDDING_BACKEND)
        self.embedder.set_embedding_model(
            config.EMBEDDING_MODEL_ID, config.EMBEDDING_MODEL_SIZE
        )

        self.vector_store_client = self._create_client(config)
        self._ensure_collection(config)
        self.vector_store = QdrantStore(
            self.vector_store_client,
            config.QDRANT_COLLECTION,
            config.EMBEDDING_MODEL_SIZE,
        )

    @classmethod
    def _create_client(cls, config):
        """Return a QdrantClient for the deployment named by ``QDRANT_TYPE``."""
        qdrant_type = config.QDRANT_TYPE
        if qdrant_type == "cloud":
            # NOTE(review): the cloud branch reads QDRANT_DOCKER_URL, same as
            # the docker branch -- confirm there is no dedicated cloud URL.
            return QdrantClient(
                url=config.QDRANT_DOCKER_URL,
                api_key=config.QDRANT_API_KEY,
                timeout=cls._CLIENT_TIMEOUT,
            )
        if qdrant_type == "docker":
            return QdrantClient(
                url=config.QDRANT_DOCKER_URL, timeout=cls._CLIENT_TIMEOUT
            )
        if qdrant_type == "local":
            return QdrantClient(
                path="data/qdrant",
                prefer_grpc=False,
                timeout=cls._CLIENT_TIMEOUT,
            )
        # Fail here with a clear message instead of leaving the client
        # attribute unset and crashing later with an AttributeError.
        raise ValueError(
            f"Unsupported QDRANT_TYPE {qdrant_type!r}; "
            "expected 'cloud', 'docker' or 'local'."
        )

    def _ensure_collection(self, config):
        """Create the collection if it is missing and request keyword indexes."""
        client = self.vector_store_client
        collection = config.QDRANT_COLLECTION

        if not client.collection_exists(collection_name=collection):
            client.create_collection(
                collection_name=collection,
                vectors_config=models.VectorParams(
                    size=config.EMBEDDING_MODEL_SIZE,
                    distance=models.Distance.COSINE,
                ),
            )

        # NOTE(review): indexes are requested on every start-up so that a
        # collection created elsewhere also gets them -- confirm this matches
        # the intended nesting of the original code.
        for field in self._KEYWORD_FIELDS:
            client.create_payload_index(
                collection_name=collection,
                field_name=field,
                field_schema=models.KeywordIndexParams(
                    type=models.KeywordIndexType.KEYWORD
                ),
            )

    def embed_chunks(self, chunks):
        """Embed a batch of text chunks with the configured embedder.

        Args:
            chunks: sequence of strings to embed.

        Returns:
            Whatever ``self.embedder.embed_text_batch`` returns for the batch,
            or an empty list when ``chunks`` is empty (the backend is not
            called in that case).
        """
        if not chunks:
            return []
        return self.embedder.embed_text_batch(chunks)

    @staticmethod
    def _load_chunks(file_path):
        """Return ``(chunks, bookmark_map)`` for the file at ``file_path``.

        Each chunk is a dict with ``"text"`` and ``"page"`` keys. For non-PDF
        files ``page`` is ``None`` and the bookmark map is empty.
        """
        extension = os.path.splitext(file_path)[1].lower()

        if extension == ".pdf":
            outline, total_pages = extract_pdf_outline(file_path)
            bookmark_map = build_page_bookmark_map(outline, total_pages)
            pages = load_pdf_with_pages(file_path)
            return recursive_chunk_with_pages(pages), bookmark_map

        text = load_file(file_path)
        if isinstance(text, list):
            # The loader returned a list of document objects; flatten it
            # into a single string before chunking.
            text = " ".join(doc.page_content for doc in text)
        chunks = [{"text": piece, "page": None} for piece in recursive_chunk(text)]
        return chunks, {}

    def process_file(self, file_path, original_filename, username=None, course=None):
        """Chunk, embed and store one file in the vector store.

        Args:
            file_path: path of the file on disk; its extension decides whether
                the PDF pipeline or the generic loader is used.
            original_filename: name recorded as ``metadata.source`` on every
                stored chunk.
            username: optional value stored as ``metadata.username``.
            course: optional value stored as ``metadata.course``.

        Returns:
            dict with ``"num_chunks"``, ``"chunks"`` (the chunk dicts) and
            ``"embeddings"`` (the embedder output, one entry per chunk).
            Chunks whose embedding is ``None`` are returned but not stored.
        """
        file_name = os.path.basename(file_path)
        chunks, bookmark_map = self._load_chunks(file_path)

        if not chunks:
            # Nothing to embed or store; avoid sending empty batches to the
            # embedding backend and to Qdrant.
            print(f"[WARN] No chunks produced for file '{file_name}'; nothing stored.")
            return {"num_chunks": 0, "chunks": chunks, "embeddings": []}

        embeddings = self.embed_chunks([chunk["text"] for chunk in chunks])
        total_chunks = len(chunks)

        valid_embs = []
        valid_payloads = []
        for idx, (chunk_obj, emb) in enumerate(zip(chunks, embeddings)):
            if emb is None:
                # The embedder produced nothing for this chunk; skip it but
                # keep idx aligned with the chunk's position in the file.
                continue
            page = chunk_obj["page"]
            bookmark_path = bookmark_map.get(page, [])
            valid_embs.append(emb)
            valid_payloads.append(
                {
                    "content": chunk_obj["text"],
                    "metadata": {
                        "source": original_filename,
                        "chunk_index": idx,
                        "total_chunks": total_chunks,
                        "username": username,
                        "course": course,
                        "page": page,
                        "bookmark_path": bookmark_path,
                    },
                }
            )
            print(f"[DEBUG] Prepared payload for chunk {idx}: page={page}, bookmark_path={bookmark_path}")

        if valid_embs:
            self.vector_store.upsert_embeddings(
                self.vector_store_client,
                get_settings().QDRANT_COLLECTION,
                valid_embs,
                valid_payloads,
            )
        print(f"[INFO] Stored {len(valid_embs)} embeddings for file '{file_name}'.")

        return {
            "num_chunks": total_chunks,
            "chunks": chunks,
            "embeddings": embeddings,
        }