Spaces:

pluto90
/

Smart-Notes-backend

Running

App Files Files Community

Smart-Notes-backend / app /core /embedding_engine.py

pluto90

Update app/core/embedding_engine.py

1f0931f verified about 2 months ago

raw

history blame

4.1 kB

	# embedding_engine.py
	import uuid, time
	from pathlib import Path
	from qdrant_client import QdrantClient, models
	from qdrant_client.http.models import Distance, VectorParams
	from sentence_transformers import SentenceTransformer
	from app.core.config import QDRANT_URL, QDRANT_API_KEY

	# MODEL_PATH = "app/core/models/bge-base-en-v1.5"
	# embedder = SentenceTransformer(MODEL_PATH)


	# Anchor the model directory to the folder containing this file instead of
	# the process working directory, so the lookup does not depend on where the
	# application is launched from.
	BASE_DIR = Path(__file__).resolve().parent  # → app/core/
	MODEL_PATH = BASE_DIR.joinpath("models", "bge-base-en-v1.5")

	# Startup diagnostics: show where the model is expected and whether it is there.
	print(f"📁 Model path: {MODEL_PATH}")
	print(f"📁 Model exists: {MODEL_PATH.exists()}")

	# Fail fast at import time when the model folder is missing.
	if not MODEL_PATH.exists():
	    raise RuntimeError(
	        f"BGE model not found at {MODEL_PATH}. "
	        "Ensure the model folder is committed to the repo under app/core/models/bge-base-en-v1.5/"
	    )

	# The Path is converted to str before being handed to SentenceTransformer.
	embedder = SentenceTransformer(str(MODEL_PATH))
	print("✅ Embedder loaded successfully")



	# Module-wide Qdrant client, configured from the URL and API key exported by
	# app.core.config.
	# NOTE(review): check_compatibility=False presumably disables the client/server
	# version compatibility check — confirm against the qdrant-client docs.
	qdrant = QdrantClient(
	url=QDRANT_URL,
	api_key=QDRANT_API_KEY,
	check_compatibility=False
	)

	# Collection that ensure_collection() creates and embed_and_store() writes to.
	COLLECTION_NAME = "smartnotes"
	# Number of points sent per upsert call in embed_and_store().
	BATCH_SIZE = 5 # ✅ reduced for free tier


	# Make sure the COLLECTION_NAME collection and its "doc_id" payload index exist.
	# Takes no arguments and returns nothing; all work is done through the
	# module-level `qdrant` client.
	# NOTE(review): indentation was lost in this copy of the file, so it is not
	# possible to tell whether create_payload_index runs only when the collection
	# is newly created or on every call — confirm against the original source.
	def ensure_collection():
	# Fetch the collections currently known to the server.
	collections = qdrant.get_collections().collections
	# Create the collection only when no existing collection has this name.
	if COLLECTION_NAME not in [c.name for c in collections]:
	qdrant.create_collection(
	collection_name=COLLECTION_NAME,
	# 768-dimensional vectors compared with cosine distance.
	vectors_config=VectorParams(size=768, distance=Distance.COSINE),
	)
	# Keyword index on the "doc_id" payload field.
	qdrant.create_payload_index(
	collection_name=COLLECTION_NAME,
	field_name="doc_id",
	field_schema="keyword"
	)


	def embed_and_store(text_chunks, doc_id):
	    """Embed text chunks and upsert them into the Qdrant collection.

	    Each chunk becomes one point with a random UUID id and a payload holding
	    ``doc_id``, the chunk ``text``, its position ``chunk_id`` and its
	    character ``length``. Points are uploaded in batches of ``BATCH_SIZE``;
	    each batch is attempted up to four times with exponential backoff
	    (1s, 2s, 4s between attempts).

	    Args:
	        text_chunks: Sequence of chunk strings to embed and store.
	        doc_id: Identifier written into every point's payload.

	    Raises:
	        RuntimeError: If one or more batches still fail after all attempts.
	            Batches that were uploaded before or after the failing ones are
	            not rolled back.
	    """
	    max_attempts = 4
	    throttle_seconds = 0.6

	    print(f"📊 Final chunks being embedded: {len(text_chunks)}")
	    ensure_collection()

	    vectors = embed_documents(text_chunks)

	    points = [
	        models.PointStruct(
	            id=str(uuid.uuid4()),
	            vector=vector,
	            payload={
	                "doc_id": doc_id,
	                "text": chunk,
	                "chunk_id": chunk_id,
	                "length": len(chunk),
	            },
	        )
	        for chunk_id, (vector, chunk) in enumerate(zip(vectors, text_chunks))
	    ]

	    failed_batches = []

	    for start in range(0, len(points), BATCH_SIZE):
	        batch = points[start:start + BATCH_SIZE]
	        batch_num = start // BATCH_SIZE + 1

	        for attempt in range(1, max_attempts + 1):
	            try:
	                qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
	            except Exception as e:
	                # Broad on purpose: any client or network error is retried.
	                if attempt == max_attempts:
	                    # No retry follows, so do not waste time backing off.
	                    print(f" ⚠️ Batch {batch_num} attempt {attempt} failed: {e} | no attempts left")
	                    continue
	                wait = 2 ** (attempt - 1)  # 1s, 2s, 4s
	                print(f" ⚠️ Batch {batch_num} attempt {attempt} failed: {e} | retrying in {wait}s")
	                time.sleep(wait)
	            else:
	                print(f" → Batch {batch_num} uploaded")
	                break
	        else:
	            # Loop finished without a successful upsert.
	            failed_batches.append(batch_num)
	            print(f" ❌ Batch {batch_num} permanently failed")

	        # Throttle between batches only; nothing follows the last one.
	        if start + BATCH_SIZE < len(points):
	            time.sleep(throttle_seconds)

	    if failed_batches:
	        # Raise so the caller knows the document was only partially stored.
	        raise RuntimeError(f"Failed to upload batches: {failed_batches}")

	    print(f"✅ All batches uploaded for doc_id={doc_id}")


	def embed_documents(texts, batch_size=32):
	    """Embed document chunks and return their vectors as plain lists.

	    Every text is prefixed with ``"Represent this sentence: "`` and encoded
	    with ``normalize_embeddings=True``, ``batch_size`` texts per
	    ``embedder.encode`` call.

	    NOTE(review): the bge-*-v1.5 model card appears to recommend no
	    instruction for passages — confirm. The prefix is left unchanged here
	    because altering it would make new vectors inconsistent with documents
	    that are already stored.

	    Args:
	        texts: Iterable of chunk strings.
	        batch_size: Number of texts passed to each ``encode`` call.

	    Returns:
	        A list with one embedding (list of floats) per input text, in input
	        order; an empty list when ``texts`` is empty.

	    Raises:
	        ValueError: If ``batch_size`` is smaller than 1.
	    """
	    if batch_size < 1:
	        raise ValueError("batch_size must be a positive integer")

	    prefixed = [f"Represent this sentence: {t}" for t in texts]
	    vectors = []
	    for start in range(0, len(prefixed), batch_size):
	        encoded = embedder.encode(
	            prefixed[start:start + batch_size],
	            normalize_embeddings=True,
	            show_progress_bar=False,
	        )
	        # Convert the array returned by encode into plain Python lists.
	        vectors.extend(encoded.tolist())
	    return vectors


	def embed_query(text):
	    """Return the normalized embedding of a search query as a plain list.

	    The query is encoded with a ``"query: "`` prefix and the result is
	    converted with ``.tolist()`` so callers never receive an array object.

	    NOTE(review): the bge-*-v1.5 model card appears to recommend a different
	    query instruction than ``"query: "`` — confirm before changing it, since
	    the prefix affects how queries match already-stored vectors.
	    """
	    prompt = f"query: {text}"
	    query_vector = embedder.encode(prompt, normalize_embeddings=True)
	    return query_vector.tolist()