File size: 2,373 Bytes
9cc7f8d 77d7fca 9cc7f8d 77d7fca 9cc7f8d 77d7fca 9cc7f8d 77d7fca 9cc7f8d 77d7fca 9cc7f8d bb05158 9cc7f8d bb05158 77d7fca bb05158 77d7fca bb05158 77d7fca bb05158 9cc7f8d bb05158 77d7fca bb05158 9cc7f8d bb05158 77d7fca bb05158 77d7fca bb05158 77d7fca bb05158 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | from src.ingestion import ingestion_and_chunking
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, SparseVectorParams, PointStruct
from fastembed import TextEmbedding, SparseTextEmbedding
import uuid
from dotenv import load_dotenv
import os
# Run python-dotenv's loader before touching the environment so that any
# values it provides are visible to the lookups below.
load_dotenv()

# Qdrant connection settings, read once at import time. Each one is None
# when the corresponding environment variable is not set.
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_api_key = os.environ.get("QDRANT_API_KEY")
def _ensure_collection(client, collection_name):
    """Create the hybrid collection if it is missing and index ``user_id``."""
    import logging

    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config={
                # The size has to agree with the width of the vectors
                # produced by the dense embedding model used in upload_file.
                "dense": VectorParams(size=384, distance=Distance.COSINE),
            },
            sparse_vectors_config={
                "sparse": SparseVectorParams(),
            },
        )

    # Index creation stays best-effort: a failure here must not abort the
    # upload. Unlike a bare ``pass``, the failure is logged so that a
    # misconfigured collection or connectivity problem remains visible.
    try:
        client.create_payload_index(
            collection_name=collection_name,
            field_name="user_id",
            field_schema="keyword",
        )
    except Exception:
        logging.getLogger(__name__).warning(
            "Could not create 'user_id' payload index on collection %r; "
            "continuing with upload",
            collection_name,
            exc_info=True,
        )


def upload_file(file_path: str, user_id: str, collection_name: str = "pdf_rag") -> None:
    """Chunk a file, embed every chunk and store the result in Qdrant.

    The file is split by ``ingestion_and_chunking``; each chunk is embedded
    with a dense and a sparse model and written as one point carrying both
    named vectors ("dense" and "sparse"). All points created by one call
    share a freshly generated ``file_id`` in their payload.

    Args:
        file_path: Path handed to ``ingestion_and_chunking``.
        user_id: Stored in each point's payload under ``"user_id"``.
        collection_name: Target collection; it is created when it does not
            exist yet.

    Returns:
        None. When chunking yields no chunks, nothing is upserted.
    """
    import logging

    client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
    # NOTE(review): both embedding models are constructed on every call;
    # consider sharing instances if this function is called frequently.
    dense_model = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
    sparse_model = SparseTextEmbedding(model_name="Qdrant/bm25")

    _ensure_collection(client, collection_name)

    docs = ingestion_and_chunking(file_path)
    texts = [doc.page_content for doc in docs]

    # One id per uploaded file, shared by all of its chunks.
    file_id = str(uuid.uuid4())

    points = [
        PointStruct(
            id=str(uuid.uuid4()),
            vector={
                "dense": dense_vec.tolist(),
                "sparse": {
                    "indices": sparse_emb.indices.tolist(),
                    "values": sparse_emb.values.tolist(),
                },
            },
            payload={
                "user_id": user_id,
                "file_id": file_id,
                "text": doc.page_content,
                "source": doc.metadata.get("source"),
                "pages": doc.metadata.get("pages"),
                "section": doc.metadata.get("section"),
            },
        )
        for doc, dense_vec, sparse_emb in zip(
            docs, dense_model.embed(texts), sparse_model.embed(texts)
        )
    ]

    if not points:
        # Nothing was produced by chunking; do not send an empty upsert.
        logging.getLogger(__name__).warning(
            "No chunks produced for %r; nothing uploaded", file_path
        )
        return

    client.upsert(collection_name=collection_name, points=points)
|