Spaces:

siddhm11
/

prompt-engine

Running

prompt-engine / backend /migrate_embeddings.py

siddhm11

feat: update backend with complete prompt engine v4.0 - Enhanced main.py with request logging, CORS config, embedding preload - Updated config.py with tier-based model routing, rate limits, Stripe stubs - Expanded prompts.py with streaming, voice-enhance, feedback, history, usage endpoints - Added llm_service.py with Groq client pool and key rotation - Updated memory_service.py with passive learning, feedback analysis - Added migrate_embeddings.py for multilingual model migration - Updated schemas.py, auth.py, saved_prompts.py, requirements.txt - Removed prompt_engineering_skeleton directory

257fe44 3 months ago

raw

history blame contribute delete

8.97 kB

	"""
	One-time migration script: Re-embed all Qdrant vectors using the new multilingual model.

	Old model: all-MiniLM-L6-v2 (English only)
	New model: paraphrase-multilingual-MiniLM-L12-v2 (50+ languages)

	Both models produce 384-dim vectors, but they live in DIFFERENT vector spaces,
	so every existing vector must be re-computed with the new model.

	Usage (from project root):
	python -m backend.migrate_embeddings

	What this does:
	1. Connects to MongoDB + Qdrant using .env credentials
	2. Deletes & recreates both Qdrant collections (prompt_memory, saved_prompt_vectors)
	3. Loads the new multilingual embedding model
	4. Re-embeds all prompt_logs docs → prompt_memory collection
	5. Re-embeds all saved_prompts docs → saved_prompt_vectors collection
	"""

	import sys
	import uuid
	import time

	from pymongo import MongoClient
	from qdrant_client import QdrantClient
	from qdrant_client.models import VectorParams, Distance, PointStruct
	from sentence_transformers import SentenceTransformer

	from .core.config import settings


	# ─── CONFIG ────────────────────────────────────────────────────────────────────
	# Qdrant collection that receives the re-embedded prompt_logs documents
	# (the original note gives the expected value as "prompt_memory").
	PROMPT_MEMORY_COLLECTION = settings.COLLECTION_NAME
	# Qdrant collection that receives the re-embedded saved_prompts documents.
	SAVED_PROMPTS_COLLECTION = "saved_prompt_vectors"
	# Vector dimension used when (re)creating both collections.
	VECTOR_SIZE = 384
	# Embedding model to load; settings should already point at the multilingual model.
	NEW_MODEL_NAME = settings.EMBEDDING_MODEL_NAME


	def _create_collection(qdrant: QdrantClient, name: str) -> None:
	    """Drop collection ``name`` if present, then recreate it empty.

	    The new collection stores ``VECTOR_SIZE``-dimensional vectors compared
	    with cosine distance, and gets a keyword payload index on ``user_id``.

	    Args:
	        qdrant: Client for the target Qdrant instance.
	        name: Name of the collection to reset.

	    Raises:
	        Exception: Anything raised by ``create_collection``. Deleting the old
	            collection and creating the payload index are best-effort and
	            only produce a warning on failure.
	    """
	    # Best-effort delete. The failure is reported instead of being hidden so
	    # that an error from the create call below can be traced back to it.
	    try:
	        deleted = qdrant.delete_collection(name)
	    except Exception as exc:
	        print(f" ⚠️ Could not delete collection '{name}' (it may not exist): {exc}")
	    else:
	        if deleted:
	            print(f" 🗑️ Deleted old collection: '{name}'")

	    qdrant.create_collection(
	        collection_name=name,
	        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
	    )
	    print(f" ✅ Created collection: '{name}'")

	    # Best-effort payload index on user_id; the collection is still usable
	    # without it, so a failure is surfaced as a warning only.
	    try:
	        qdrant.create_payload_index(
	            collection_name=name,
	            field_name="user_id",
	            field_schema="keyword",
	        )
	    except Exception as exc:
	        print(f" ⚠️ Could not create 'user_id' payload index on '{name}': {exc}")


	def _flush_points(qdrant, collection, points):
	    """Upsert ``points`` into ``collection`` and return how many were written.

	    A failed upsert is reported and returns 0, so the caller can count the
	    whole batch as skipped instead of as embedded.
	    """
	    if not points:
	        return 0
	    try:
	        qdrant.upsert(collection_name=collection, points=points)
	    except Exception as exc:
	        print(f" ⚠️ Failed to upsert {len(points)} points into '{collection}': {exc}")
	        return 0
	    return len(points)


	def _reembed(qdrant, docs, total, collection, build_point, label, batch_size=50):
	    """Turn each document into a point and upsert the points in batches.

	    Args:
	        qdrant: Qdrant client used for the upserts.
	        docs: Iterable of MongoDB documents (dicts).
	        total: Document count, used only for progress output.
	        collection: Target Qdrant collection name.
	        build_point: Callable ``doc -> PointStruct | None``; ``None`` means
	            the document lacks required fields and is skipped.
	        label: Human-readable document kind used in warning messages.
	        batch_size: Number of points sent per upsert call.

	    Returns:
	        Tuple ``(embedded, skipped)``. ``embedded`` counts only points whose
	        upsert succeeded; ``skipped`` counts incomplete documents, embedding
	        failures, and points belonging to a failed batch.
	    """
	    embedded = 0
	    skipped = 0
	    pending = []

	    for seen, doc in enumerate(docs, start=1):
	        try:
	            point = build_point(doc)
	        except Exception as exc:
	            print(f" ⚠️ Failed to embed {label} (id={doc.get('_id')}): {exc}")
	            skipped += 1
	            continue

	        if point is None:
	            skipped += 1
	            continue

	        pending.append(point)
	        if len(pending) >= batch_size:
	            written = _flush_points(qdrant, collection, pending)
	            embedded += written
	            skipped += len(pending) - written
	            # Always start a fresh batch, even after a failure, so that one
	            # bad batch is not resent on every following document.
	            pending = []
	            print(f" ... processed {seen}/{total}")

	    # Flush whatever is left over after the last full batch.
	    written = _flush_points(qdrant, collection, pending)
	    embedded += written
	    skipped += len(pending) - written
	    return embedded, skipped


	def _migrate(mongo_client):
	    """Run the migration using an already constructed MongoDB client."""
	    # ── 1. Verify MongoDB ─────────────────────────────────────────────────
	    try:
	        mongo_client.admin.command("ping")
	    except Exception as e:
	        print(f"❌ MongoDB connection failed: {e}")
	        sys.exit(1)

	    db = mongo_client["prompt_engine_db"]
	    prompt_logs_col = db["prompt_logs"]
	    saved_prompts_col = db["saved_prompts"]

	    prompt_logs_count = prompt_logs_col.count_documents({})
	    saved_prompts_count = saved_prompts_col.count_documents({})
	    print(f" ✅ MongoDB connected — {prompt_logs_count} prompt logs, {saved_prompts_count} saved prompts")

	    # ── 2. Connect to Qdrant ──────────────────────────────────────────────
	    print("\n📦 Connecting to Qdrant...")
	    qdrant_url = settings.QDRANT_URL
	    qdrant_api_key = settings.QDRANT_API_KEY

	    if not qdrant_url or qdrant_url == ":memory:":
	        print("❌ QDRANT_URL not set or is :memory: — cannot migrate a persistent instance.")
	        sys.exit(1)

	    qdrant = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
	    try:
	        # Issue a real request so a bad URL or API key is caught here,
	        # before any collection is deleted.
	        qdrant.get_collections()
	    except Exception as exc:
	        print(f"❌ Qdrant connection failed: {exc}")
	        sys.exit(1)
	    print(f" ✅ Qdrant connected ({qdrant_url})")

	    # ── 3. Load the new embedding model ───────────────────────────────────
	    # Done BEFORE the collections are dropped: if the model cannot be loaded
	    # the existing vectors must stay untouched.
	    print(f"\n⏳ Loading embedding model: {NEW_MODEL_NAME}")
	    start_load = time.time()
	    try:
	        model = SentenceTransformer(NEW_MODEL_NAME, backend="onnx")
	        print(f" ✅ Model loaded (ONNX backend) in {time.time() - start_load:.1f}s")
	    except Exception as exc:
	        print(f" ⚠️ ONNX backend unavailable ({exc}); falling back to default backend")
	        model = SentenceTransformer(NEW_MODEL_NAME)
	        print(f" ✅ Model loaded (default backend) in {time.time() - start_load:.1f}s")

	    dim = model.get_sentence_embedding_dimension()
	    if dim is not None and dim != VECTOR_SIZE:
	        print(f"❌ Model produces {dim}-dim vectors but collections use {VECTOR_SIZE} — aborting before any collection is touched.")
	        sys.exit(1)

	    def embed(text: str):
	        return model.encode(text, convert_to_numpy=True).tolist()

	    def prompt_log_point(doc):
	        """Build the prompt_memory point for one prompt_logs document."""
	        original = doc.get("original", "")
	        user_id = doc.get("user_id", "")
	        if not original or not user_id:
	            return None
	        return PointStruct(
	            id=uuid.uuid4().int % (2**63),
	            vector=embed(original),
	            payload={
	                "user_id": user_id,
	                "original_prompt": original,
	                "refined_prompt": doc.get("enhanced", "") or "",
	            },
	        )

	    def saved_prompt_point(doc):
	        """Build the saved_prompt_vectors point for one saved_prompts document."""
	        content = doc.get("content", "")
	        user_id = doc.get("user_id", "")
	        if not content or not user_id:
	            return None
	        mongo_id = str(doc["_id"])
	        # NOTE(review): built-in hash() of a str is salted per process unless
	        # PYTHONHASHSEED is fixed, so these IDs differ from run to run. Kept
	        # as-is because the app-side ID scheme is not visible here — confirm
	        # how the server derives point IDs before changing this.
	        point_id = abs(hash(mongo_id)) % (2**63)
	        return PointStruct(
	            id=point_id,
	            vector=embed(content),
	            payload={
	                "user_id": user_id,
	                "mongo_id": mongo_id,
	                "content": content,
	                "title": doc.get("title", "") or "",
	                "tags": doc.get("tags", []) or [],
	            },
	        )

	    # ── 4. Recreate collections ───────────────────────────────────────────
	    print("\n🔨 Recreating Qdrant collections...")
	    _create_collection(qdrant, PROMPT_MEMORY_COLLECTION)
	    _create_collection(qdrant, SAVED_PROMPTS_COLLECTION)

	    # ── 5. Re-embed prompt_logs → prompt_memory ───────────────────────────
	    print(f"\n📝 Re-embedding {prompt_logs_count} prompt logs → '{PROMPT_MEMORY_COLLECTION}'...")
	    success_logs, skipped_logs = _reembed(
	        qdrant,
	        prompt_logs_col.find({}),
	        prompt_logs_count,
	        PROMPT_MEMORY_COLLECTION,
	        prompt_log_point,
	        "prompt log",
	    )
	    print(f" ✅ Done — {success_logs} embedded, {skipped_logs} skipped")

	    # ── 6. Re-embed saved_prompts → saved_prompt_vectors ──────────────────
	    print(f"\n📝 Re-embedding {saved_prompts_count} saved prompts → '{SAVED_PROMPTS_COLLECTION}'...")
	    success_saved, skipped_saved = _reembed(
	        qdrant,
	        saved_prompts_col.find({}),
	        saved_prompts_count,
	        SAVED_PROMPTS_COLLECTION,
	        saved_prompt_point,
	        "saved prompt",
	    )
	    print(f" ✅ Done — {success_saved} embedded, {skipped_saved} skipped")

	    # ── 7. Summary ────────────────────────────────────────────────────────
	    print("\n" + "=" * 60)
	    print("✅ MIGRATION COMPLETE")
	    print(f" Model: {NEW_MODEL_NAME}")
	    print(f" prompt_memory: {success_logs} vectors ({skipped_logs} skipped)")
	    print(f" saved_prompts: {success_saved} vectors ({skipped_saved} skipped)")
	    print("=" * 60)
	    print("\nYou can now restart the server:")
	    print(" python -m uvicorn backend.main:app --reload")


	def main():
	    """Re-embed all stored prompts into freshly recreated Qdrant collections.

	    Stages, in order:
	      1. Check settings and connect to MongoDB and Qdrant.
	      2. Load the embedding model and compare its output dimension with
	         ``VECTOR_SIZE``.
	      3. Delete and recreate both Qdrant collections.
	      4. Re-embed ``prompt_logs`` and ``saved_prompts`` documents and upsert
	         them in batches.

	    The model is loaded before the collections are dropped so that a model
	    load failure cannot leave the collections wiped and empty.

	    Exits with status 1 when ``MONGO_URI`` or ``QDRANT_URL`` is unusable, a
	    connection check fails, or the model dimension does not match.
	    """
	    print("=" * 60)
	    print("🔄 Embedding Migration Script")
	    print(f" New model: {NEW_MODEL_NAME}")
	    print("=" * 60)

	    mongo_uri = settings.MONGO_URI
	    if not mongo_uri:
	        print("❌ MONGO_URI not set in .env — cannot migrate.")
	        sys.exit(1)

	    print("\n📦 Connecting to MongoDB...")
	    mongo_client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
	    try:
	        _migrate(mongo_client)
	    finally:
	        # Release the MongoDB connection on every exit path, including sys.exit.
	        mongo_client.close()


	# Run the migration only when this file is executed as a script or via
	# `python -m`, not when it is imported.
	if __name__ == "__main__":
	    main()