# # embedding_engine.py
# import uuid
# from qdrant_client import QdrantClient, models
# from qdrant_client.http.models import Distance, VectorParams
# from sentence_transformers import SentenceTransformer
# from app.core.config import QDRANT_URL, QDRANT_API_KEY
# embedder = SentenceTransformer("all-MiniLM-L6-v2")
# qdrant = QdrantClient(
# url=QDRANT_URL,
# api_key=QDRANT_API_KEY,
# check_compatibility=False
# )
# COLLECTION_NAME = "smartnotes"
# BATCH_SIZE = 100
# def ensure_collection():
# collections = qdrant.get_collections().collections
# if COLLECTION_NAME not in [c.name for c in collections]:
# qdrant.create_collection(
# collection_name=COLLECTION_NAME,
# vectors_config=VectorParams(
# size=384,
# distance=Distance.COSINE
# ),
# )
# # ✅ Add this part
# qdrant.create_payload_index(
# collection_name=COLLECTION_NAME,
# field_name="doc_id",
# field_schema="keyword"
# )
# def embed_and_store(text_chunks, doc_id):
# """Embed chunks and store them in Qdrant efficiently."""
# ensure_collection()
# print(f"🔹 Embedding {len(text_chunks)} chunks...")
# # Generate embeddings
# vectors = embedder.encode(text_chunks, show_progress_bar=True).tolist()
# # Prepare points
# points = [
# models.PointStruct(
# id=str(uuid.uuid4()),
# vector=vectors[i],
# payload={"doc_id": doc_id, "text": text_chunks[i]},
# )
# for i in range(len(vectors))
# ]
# # ✅ Upsert in small batches to avoid timeouts
# print("🔹 Uploading to Qdrant in batches...")
# for i in range(0, len(points), BATCH_SIZE):
# batch = points[i:i + BATCH_SIZE]
# qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
# print(f" ✔ Uploaded batch {i // BATCH_SIZE + 1}/{len(points) // BATCH_SIZE + 1}")
# print("✅ All embeddings stored successfully!")
# embedding_engine.py
import uuid
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance, VectorParams
from sentence_transformers import SentenceTransformer
from app.core.config import QDRANT_URL, QDRANT_API_KEY
# from config import QDRANT_URL, QDRANT_API_KEY
# embedder = SentenceTransformer("all-MiniLM-L6-v2")
# embedder.save("models/all-MiniLM-L6-v2")
# Name of the Qdrant collection used by this module, and the number of
# points sent per upsert request.
COLLECTION_NAME = "smartnotes"
BATCH_SIZE = 100

# Model location handed to SentenceTransformer (a relative path).
MODEL_PATH = "app/core/models/all-MiniLM-L6-v2"

# Built once at import time and shared by every function in this module.
embedder = SentenceTransformer(MODEL_PATH)

# Shared Qdrant client, configured from the application settings.
qdrant = QdrantClient(
    api_key=QDRANT_API_KEY,
    url=QDRANT_URL,
    check_compatibility=False,
)
def ensure_collection():
    """Make sure the notes collection and its ``doc_id`` index exist.

    Creates ``COLLECTION_NAME`` with 384-dimensional cosine-distance vectors
    when the server does not list it yet, then requests a keyword payload
    index on the ``doc_id`` field.

    Returns:
        None.
    """
    existing_names = {c.name for c in qdrant.get_collections().collections}
    if COLLECTION_NAME not in existing_names:
        qdrant.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=VectorParams(
                # Presumably the embedding width of the model loaded by this
                # module -- confirm if the model is ever swapped.
                size=384,
                distance=Distance.COSINE,
            ),
        )

    # Requested on every call, not only right after creation, so that a
    # collection created before this index was introduced still gets it.
    # NOTE(review): relies on the server treating a repeated index request
    # for the same field as a no-op -- confirm for the deployed version.
    qdrant.create_payload_index(
        collection_name=COLLECTION_NAME,
        field_name="doc_id",
        field_schema="keyword",
    )
def embed_and_store(text_chunks, doc_id):
    """Embed chunks and store them in Qdrant efficiently.

    Args:
        text_chunks: Sized sequence of strings to embed. Each string is also
            stored in its point's payload under ``"text"``.
        doc_id: Value stored under ``"doc_id"`` in every point's payload.

    Returns:
        None. Progress is reported with ``print``.
    """
    ensure_collection()
    print(f"🔹 Embedding {len(text_chunks)} chunks...")

    # Generate embeddings as plain Python lists.
    vectors = embedder.encode(text_chunks, show_progress_bar=True).tolist()

    # One point per chunk, each with a fresh random UUID as its id.
    points = [
        models.PointStruct(
            id=str(uuid.uuid4()),
            vector=vector,
            payload={"doc_id": doc_id, "text": chunk},
        )
        for vector, chunk in zip(vectors, text_chunks)
    ]

    # Upsert in small batches to avoid timeouts.
    print("🔹 Uploading to Qdrant in batches...")
    # Ceiling division: a plain `// BATCH_SIZE + 1` reports one batch too
    # many whenever the point count is an exact multiple of BATCH_SIZE.
    total_batches = (len(points) + BATCH_SIZE - 1) // BATCH_SIZE
    batch_starts = range(0, len(points), BATCH_SIZE)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = points[start:start + BATCH_SIZE]
        qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
        print(f" ✔ Uploaded batch {batch_number}/{total_batches}")
    print("✅ All embeddings stored successfully!")