import logging
import os
import uuid

from dotenv import load_dotenv
from fastembed import SparseTextEmbedding, TextEmbedding
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance,
    PointStruct,
    SparseVectorParams,
    VectorParams,
)

from src.ingestion import ingestion_and_chunking

load_dotenv()

logger = logging.getLogger(__name__)

qdrant_api_key = os.getenv("QDRANT_API_KEY")
qdrant_url = os.getenv("QDRANT_URL")

# Embedding models used for the two named vectors of every point.
_DENSE_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
_SPARSE_MODEL_NAME = "Qdrant/bm25"
# Size of the "dense" vector the collection is created with; it has to match
# the output dimension of _DENSE_MODEL_NAME.
_DENSE_VECTOR_SIZE = 384


def _ensure_collection(client, collection_name):
    """Create the collection if it is missing and index the ``user_id`` field."""
    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config={
                "dense": VectorParams(
                    size=_DENSE_VECTOR_SIZE,
                    distance=Distance.COSINE,
                ),
            },
            sparse_vectors_config={
                "sparse": SparseVectorParams(),
            },
        )

    # Best-effort: the upload must still go through when the index cannot be
    # created, but the failure is logged instead of being silently discarded.
    try:
        client.create_payload_index(
            collection_name=collection_name,
            field_name="user_id",
            field_schema="keyword",
        )
    except Exception:
        logger.warning(
            "Could not create 'user_id' payload index on collection %r",
            collection_name,
            exc_info=True,
        )


def upload_file(file_path: str, user_id: str, collection_name="pdf_rag"):
    """Chunk a file, embed the chunks and upsert them into Qdrant.

    The file is split by ``ingestion_and_chunking``; every chunk is embedded
    with a dense and a sparse model and stored as one point carrying both
    vectors (named ``"dense"`` and ``"sparse"``). All points of one call share
    a freshly generated ``file_id``; each point gets its own random UUID.

    Args:
        file_path: Path handed to ``ingestion_and_chunking``.
        user_id: Stored in each point's payload under ``"user_id"``.
        collection_name: Target collection; created on first use.

    Returns:
        None.

    Raises:
        Whatever the Qdrant client, the embedding models or
        ``ingestion_and_chunking`` raise. Only a failure to create the
        ``user_id`` payload index is tolerated (and logged).
    """
    client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
    _ensure_collection(client, collection_name)

    docs = ingestion_and_chunking(file_path)
    if not docs:
        # Nothing to embed or store: skip loading the models and avoid
        # sending an empty upsert request.
        logger.warning("No chunks produced for %r; nothing uploaded", file_path)
        return

    dense_model = TextEmbedding(model_name=_DENSE_MODEL_NAME)
    sparse_model = SparseTextEmbedding(model_name=_SPARSE_MODEL_NAME)

    texts = [doc.page_content for doc in docs]
    dense_vectors = dense_model.embed(texts)
    sparse_vectors = sparse_model.embed(texts)

    # One id per uploaded file so all of its chunks can be found together.
    file_id = str(uuid.uuid4())

    points = [
        PointStruct(
            id=str(uuid.uuid4()),
            vector={
                "dense": dense_vec.tolist(),
                "sparse": {
                    "indices": sparse_emb.indices.tolist(),
                    "values": sparse_emb.values.tolist(),
                },
            },
            payload={
                "user_id": user_id,
                "file_id": file_id,
                "text": doc.page_content,
                "source": doc.metadata.get("source"),
                "pages": doc.metadata.get("pages"),
                "section": doc.metadata.get("section"),
            },
        )
        for doc, dense_vec, sparse_emb in zip(docs, dense_vectors, sparse_vectors)
    ]

    client.upsert(collection_name=collection_name, points=points)