File size: 2,373 Bytes
9cc7f8d
 
 
77d7fca
9cc7f8d
 
 
77d7fca
9cc7f8d
77d7fca
9cc7f8d
 
 
77d7fca
 
9cc7f8d
 
77d7fca
9cc7f8d
 
 
 
 
 
 
 
 
 
 
 
bb05158
 
9cc7f8d
bb05158
 
77d7fca
bb05158
 
77d7fca
bb05158
 
77d7fca
bb05158
 
 
 
 
9cc7f8d
bb05158
77d7fca
bb05158
 
 
 
 
 
 
 
 
 
 
 
 
9cc7f8d
bb05158
77d7fca
bb05158
77d7fca
bb05158
 
 
 
 
 
 
 
77d7fca
bb05158
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import logging
import os
import uuid

from dotenv import load_dotenv
from fastembed import TextEmbedding, SparseTextEmbedding
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, SparseVectorParams, PointStruct

from src.ingestion import ingestion_and_chunking

# Run python-dotenv before the settings below are read, so values supplied
# through a .env file are visible to os.getenv.
load_dotenv()

# Qdrant connection settings used by upload_file. os.getenv returns None when
# a variable is not set, so either value may be None at this point.
qdrant_api_key = os.getenv("QDRANT_API_KEY")
qdrant_url = os.getenv("QDRANT_URL")


def upload_file(file_path: str, user_id: str, collection_name="pdf_rag"):
    """Chunk a file, embed every chunk, and upsert the chunks into Qdrant.

    The file is split with ``ingestion_and_chunking``. Each chunk is embedded
    with a dense model (``sentence-transformers/all-MiniLM-L6-v2``) and a
    sparse model (``Qdrant/bm25``) and stored as a single point holding both
    named vectors (``"dense"`` and ``"sparse"``). The collection is created
    on first use, and a keyword payload index on ``user_id`` is requested on
    every call (best effort).

    Args:
        file_path: Path passed to ``ingestion_and_chunking``.
        user_id: Value stored under ``"user_id"`` in every point's payload.
        collection_name: Name of the target Qdrant collection.

    Returns:
        None. If the file yields no chunks, nothing is sent to Qdrant.
    """
    logger = logging.getLogger(__name__)

    # Parse the file first: a bad path or unreadable document should fail
    # before any model is loaded or any remote state is touched.
    docs = list(ingestion_and_chunking(file_path))
    if not docs:
        # Nothing to embed; avoid creating a collection and sending an
        # upsert request with an empty point list.
        logger.warning("No chunks produced for %s; nothing uploaded", file_path)
        return

    client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)

    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config={
                # size must match the output dimension of the dense model
                # instantiated below.
                "dense": VectorParams(size=384, distance=Distance.COSINE)
            },
            sparse_vectors_config={
                "sparse": SparseVectorParams()
            },
        )

    # Best effort: a failure here must not block the upload, but it should
    # be visible in the logs instead of being silently discarded.
    try:
        client.create_payload_index(
            collection_name=collection_name,
            field_name="user_id",
            field_schema="keyword",
        )
    except Exception:
        logger.warning(
            "Could not create 'user_id' payload index on collection %r",
            collection_name,
            exc_info=True,
        )

    dense_model = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
    sparse_model = SparseTextEmbedding(model_name="Qdrant/bm25")

    texts = [doc.page_content for doc in docs]

    # One id shared by every chunk of this upload, so all chunks of the same
    # file carry the same "file_id" payload value.
    file_id = str(uuid.uuid4())

    points = []
    # Walk chunks and their two embeddings in lock-step.
    for doc, dense_emb, sparse_emb in zip(
        docs, dense_model.embed(texts), sparse_model.embed(texts)
    ):
        points.append(
            PointStruct(
                id=str(uuid.uuid4()),
                vector={
                    "dense": dense_emb.tolist(),
                    "sparse": {
                        "indices": sparse_emb.indices.tolist(),
                        "values": sparse_emb.values.tolist(),
                    },
                },
                payload={
                    "user_id": user_id,
                    "file_id": file_id,
                    "text": doc.page_content,
                    "source": doc.metadata.get("source"),
                    "pages": doc.metadata.get("pages"),
                    "section": doc.metadata.get("section"),
                },
            )
        )

    client.upsert(collection_name=collection_name, points=points)