Spaces:

pluto90
/

Smart-Notes-backend

Running

App Files Files Community

pluto90 committed on Apr 7

Commit

d6dd5a9

verified ·

1 Parent(s): 5e47fb2

Update app/core/embedding_engine.py

Browse files

Files changed (1) hide show

app/core/embedding_engine.py +151 -65

app/core/embedding_engine.py CHANGED Viewed

@@ -1,66 +1,152 @@
-# embedding_engine.py
-import uuid
-from qdrant_client import QdrantClient, models
-from qdrant_client.http.models import Distance, VectorParams
-from sentence_transformers import SentenceTransformer
-from app.core.config import QDRANT_URL, QDRANT_API_KEY
-embedder = SentenceTransformer("all-MiniLM-L6-v2")
-qdrant = QdrantClient(
-    url=QDRANT_URL,
-    api_key=QDRANT_API_KEY,
-    check_compatibility=False
-    )
-COLLECTION_NAME = "smartnotes"
-BATCH_SIZE = 100
-def ensure_collection():
-    collections = qdrant.get_collections().collections
-    if COLLECTION_NAME not in [c.name for c in collections]:
-        qdrant.create_collection(
-            collection_name=COLLECTION_NAME,
-            vectors_config=VectorParams(
-                size=384,
-                distance=Distance.COSINE
-            ),
-        )
-            # ✅ Add this part
-    qdrant.create_payload_index(
-        collection_name=COLLECTION_NAME,
-        field_name="doc_id",
-        field_schema="keyword"
-    )
-def embed_and_store(text_chunks, doc_id):
-    """Embed chunks and store them in Qdrant efficiently."""
-    ensure_collection()
-    print(f"🔹 Embedding {len(text_chunks)} chunks...")
-    # Generate embeddings
-    vectors = embedder.encode(text_chunks, show_progress_bar=True).tolist()
-    # Prepare points
-    points = [
-        models.PointStruct(
-            id=str(uuid.uuid4()),
-            vector=vectors[i],
-            payload={"doc_id": doc_id, "text": text_chunks[i]},
-        )
-        for i in range(len(vectors))
-    ]
-    # ✅ Upsert in small batches to avoid timeouts
-    print("🔹 Uploading to Qdrant in batches...")
-    for i in range(0, len(points), BATCH_SIZE):
-        batch = points[i:i + BATCH_SIZE]
-        qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
-        print(f"   → Uploaded batch {i // BATCH_SIZE + 1}/{len(points) // BATCH_SIZE + 1}")
     print("✅ All embeddings stored successfully!")

+# # embedding_engine.py
+# import uuid
+# from qdrant_client import QdrantClient, models
+# from qdrant_client.http.models import Distance, VectorParams
+# from sentence_transformers import SentenceTransformer
+# from app.core.config import QDRANT_URL, QDRANT_API_KEY
+# embedder = SentenceTransformer("all-MiniLM-L6-v2")
+# qdrant = QdrantClient(
+#     url=QDRANT_URL,
+#     api_key=QDRANT_API_KEY,
+#     check_compatibility=False
+#     )
+# COLLECTION_NAME = "smartnotes"
+# BATCH_SIZE = 100
+# def ensure_collection():
+#     collections = qdrant.get_collections().collections
+#     if COLLECTION_NAME not in [c.name for c in collections]:
+#         qdrant.create_collection(
+#             collection_name=COLLECTION_NAME,
+#             vectors_config=VectorParams(
+#                 size=384,
+#                 distance=Distance.COSINE
+#             ),
+#         )
+#             # ✅ Add this part
+#     qdrant.create_payload_index(
+#         collection_name=COLLECTION_NAME,
+#         field_name="doc_id",
+#         field_schema="keyword"
+#     )
+# def embed_and_store(text_chunks, doc_id):
+#     """Embed chunks and store them in Qdrant efficiently."""
+#     ensure_collection()
+#     print(f"🔹 Embedding {len(text_chunks)} chunks...")
+#     # Generate embeddings
+#     vectors = embedder.encode(text_chunks, show_progress_bar=True).tolist()
+#     # Prepare points
+#     points = [
+#         models.PointStruct(
+#             id=str(uuid.uuid4()),
+#             vector=vectors[i],
+#             payload={"doc_id": doc_id, "text": text_chunks[i]},
+#         )
+#         for i in range(len(vectors))
+#     ]
+#     # ✅ Upsert in small batches to avoid timeouts
+#     print("🔹 Uploading to Qdrant in batches...")
+#     for i in range(0, len(points), BATCH_SIZE):
+#         batch = points[i:i + BATCH_SIZE]
+#         qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
+#         print(f"   → Uploaded batch {i // BATCH_SIZE + 1}/{len(points) // BATCH_SIZE + 1}")
+#     print("✅ All embeddings stored successfully!")
+# embedding_engine.py
+import uuid
+from qdrant_client import QdrantClient, models
+from qdrant_client.http.models import Distance, VectorParams
+from sentence_transformers import SentenceTransformer
+from app.core.config import QDRANT_URL, QDRANT_API_KEY
+# from config import QDRANT_URL, QDRANT_API_KEY
+# embedder = SentenceTransformer("all-MiniLM-L6-v2")
+# embedder.save("models/all-MiniLM-L6-v2")
+# Location passed to SentenceTransformer below.
+# NOTE(review): presumably a local directory holding a saved copy of
+# all-MiniLM-L6-v2 — confirm it is present wherever this module is imported.
+MODEL_PATH = "models/all-MiniLM-L6-v2"
+# Shared embedder instance, constructed once when this module is imported.
+embedder = SentenceTransformer(MODEL_PATH)
+# Shared Qdrant client, configured from the URL and API key imported from
+# app.core.config; the client/server compatibility check is disabled.
+qdrant = QdrantClient(
+    url=QDRANT_URL,
+    api_key=QDRANT_API_KEY,
+    check_compatibility=False
+    )
+# Name of the Qdrant collection used by the functions in this module.
+COLLECTION_NAME = "smartnotes"
+# Number of points sent per upsert request in embed_and_store.
+BATCH_SIZE = 100
+def ensure_collection():
+    """Create the collection if it is missing, then request the doc_id index.
+
+    The collection is created with 384-dimensional cosine-distance vectors
+    only when no collection named COLLECTION_NAME is listed by the server.
+    The payload-index request for "doc_id" is issued on every call,
+    regardless of whether the collection was just created.
+    """
+    existing_names = {entry.name for entry in qdrant.get_collections().collections}
+    if COLLECTION_NAME not in existing_names:
+        vector_settings = VectorParams(size=384, distance=Distance.COSINE)
+        qdrant.create_collection(
+            collection_name=COLLECTION_NAME,
+            vectors_config=vector_settings,
+        )
+    # ✅ Keyword index on the doc_id payload field
+    qdrant.create_payload_index(
+        collection_name=COLLECTION_NAME,
+        field_name="doc_id",
+        field_schema="keyword",
+    )
+def embed_and_store(text_chunks, doc_id):
+    """Embed text chunks and upsert them into Qdrant in batches.
+
+    Args:
+        text_chunks: Sized sequence of strings to embed; each chunk becomes
+            one point whose payload carries the chunk under "text".
+        doc_id: Value stored under "doc_id" in every point's payload.
+
+    Returns:
+        None. Progress is reported with print().
+    """
+    ensure_collection()
+    print(f"🔹 Embedding {len(text_chunks)} chunks...")
+    # Generate embeddings
+    vectors = embedder.encode(text_chunks, show_progress_bar=True).tolist()
+    # Prepare points: one per chunk, each with a fresh random UUID as its id
+    points = [
+        models.PointStruct(
+            id=str(uuid.uuid4()),
+            vector=vector,
+            payload={"doc_id": doc_id, "text": chunk},
+        )
+        for vector, chunk in zip(vectors, text_chunks)
+    ]
+    # ✅ Upsert in small batches to avoid timeouts
+    print("🔹 Uploading to Qdrant in batches...")
+    # Ceiling division: `len(points) // BATCH_SIZE + 1` reported one batch too
+    # many whenever the point count was an exact multiple of BATCH_SIZE.
+    total_batches = (len(points) + BATCH_SIZE - 1) // BATCH_SIZE
+    for batch_no, start in enumerate(range(0, len(points), BATCH_SIZE), start=1):
+        batch = points[start:start + BATCH_SIZE]
+        qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
+        print(f"   → Uploaded batch {batch_no}/{total_batches}")
     print("✅ All embeddings stored successfully!")