File size: 2,002 Bytes
20a8e92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# embedding_engine.py

import uuid
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance, VectorParams
from sentence_transformers import SentenceTransformer
from app.core.config import QDRANT_URL, QDRANT_API_KEY

# Name of the Qdrant collection used by this module, and the number of points
# sent per upsert request.
COLLECTION_NAME = "smartnotes"
BATCH_SIZE = 100

# Sentence-transformers model that encodes text chunks; loaded once at import.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Shared Qdrant client, configured from the application settings.
qdrant = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    check_compatibility=False,
)


def ensure_collection():
    """Make sure the Qdrant collection and its ``doc_id`` payload index exist.

    Creates ``COLLECTION_NAME`` with cosine-distance vectors when it is
    missing, then requests a keyword payload index on the ``doc_id`` field.
    The index request is issued on every call, whether or not the collection
    was just created.
    """
    # Ask the server about this one collection instead of downloading the
    # full collection list and scanning it client-side.
    if not qdrant.collection_exists(collection_name=COLLECTION_NAME):
        qdrant.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=VectorParams(
                # Read the dimension from the loaded model so the collection
                # cannot drift out of sync with the embedder (384 for
                # "all-MiniLM-L6-v2").
                size=embedder.get_sentence_embedding_dimension(),
                distance=Distance.COSINE,
            ),
        )

    # Kept outside the branch above so that collections which already existed
    # also get the index requested.
    qdrant.create_payload_index(
        collection_name=COLLECTION_NAME,
        field_name="doc_id",
        field_schema="keyword",
    )



def embed_and_store(text_chunks, doc_id):
    """Embed chunks and store them in Qdrant efficiently.

    Each chunk is encoded with the module-level ``embedder`` and uploaded as
    one point with a random UUID id and a payload holding the ``doc_id`` and
    the chunk's own text. Points are upserted ``BATCH_SIZE`` at a time.

    Args:
        text_chunks: Sized sequence of texts to embed (``len()`` is called on
            it); one point is stored per chunk.
        doc_id: Value written to the ``doc_id`` payload key of every point.

    Returns:
        None. Progress is reported via ``print``.
    """
    ensure_collection()
    print(f"🔹 Embedding {len(text_chunks)} chunks...")

    # Generate embeddings
    vectors = embedder.encode(text_chunks, show_progress_bar=True).tolist()

    # Prepare points: pair every chunk with its vector, each under a fresh id.
    points = [
        models.PointStruct(
            id=str(uuid.uuid4()),
            vector=vector,
            payload={"doc_id": doc_id, "text": chunk},
        )
        for chunk, vector in zip(text_chunks, vectors)
    ]

    # Ceiling division: the previous `len(points) // BATCH_SIZE + 1` reported
    # one batch too many whenever the point count was an exact multiple of
    # BATCH_SIZE (e.g. "1/2" for exactly 100 points).
    total_batches = (len(points) + BATCH_SIZE - 1) // BATCH_SIZE

    # ✅ Upsert in small batches to avoid timeouts
    print("🔹 Uploading to Qdrant in batches...")
    for batch_number, start in enumerate(range(0, len(points), BATCH_SIZE), start=1):
        batch = points[start:start + BATCH_SIZE]
        qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
        print(f"   → Uploaded batch {batch_number}/{total_batches}")

    print("✅ All embeddings stored successfully!")