Spaces:
Running
Running
| """ | |
| Sync Qdrant with MongoDB - removes listings from Qdrant that no longer exist in MongoDB. | |
| Usage: python scripts/sync_qdrant_with_mongo.py | |
| """ | |
| import asyncio | |
| import os | |
| from dotenv import load_dotenv | |
| from motor.motor_asyncio import AsyncIOMotorClient | |
| from qdrant_client import QdrantClient | |
| from qdrant_client.models import PointIdsList | |
# Let python-dotenv populate the process environment before any setting
# below is read from it.
load_dotenv()

# --- MongoDB settings -------------------------------------------------------
# MONGO_URI takes precedence; DATABASE_URL is consulted only when MONGO_URI is
# unset or empty.
MONGO_URI = os.environ.get("MONGO_URI") or os.environ.get("DATABASE_URL")
DB_NAME = os.environ.get("MONGODB_DATABASE", "lojiz")

# --- Qdrant settings --------------------------------------------------------
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
# Name of the Qdrant collection that is scanned and pruned by this script.
COLLECTION_NAME = "listings"
async def sync_qdrant_with_mongo():
    """Remove points from Qdrant whose listing no longer exists in MongoDB.

    Workflow:
      1. Collect the ``_id`` of every document in the MongoDB ``listings``
         collection, as strings.
      2. Scroll through every point of the Qdrant collection
         ``COLLECTION_NAME`` (100 per page), reading only the ``mongo_id``
         and ``title`` payload fields.
      3. Flag a point as stale when its ``mongo_id`` payload value (or, if
         that is missing/empty, its own point id) is not among the MongoDB ids.
      4. Print the stale points, ask for confirmation on stdin, and delete
         them from Qdrant only when the answer is ``y``.

    Both clients are closed on every exit path, including when a step raises.

    Returns:
        None. All results are reported on stdout.
    """
    # NOTE(review): the leading symbols in the messages below look like
    # mis-encoded emoji; they are intentionally left untouched here.
    print("π Connecting to MongoDB...")
    mongo_client = AsyncIOMotorClient(MONGO_URI)
    qdrant = None
    try:
        db = mongo_client[DB_NAME]

        print("π Connecting to Qdrant...")
        qdrant = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

        # Step 1: Get all listing IDs from MongoDB (projection keeps only _id).
        print("\nπ Fetching MongoDB listings...")
        mongo_ids = {
            str(doc["_id"])
            async for doc in db.listings.find({}, {"_id": 1})
        }
        print(f" Found {len(mongo_ids)} listings in MongoDB")

        # Step 2: Get all points from Qdrant, page by page.
        print("\nπ Fetching Qdrant points...")
        # A list (not a set) keeps scroll order for stable output and does not
        # require payload values to be hashable.
        qdrant_points = []  # (point_id, mongo_id, title)
        offset = None
        while True:
            points, next_offset = qdrant.scroll(
                collection_name=COLLECTION_NAME,
                limit=100,
                offset=offset,
                with_payload=["mongo_id", "title"],
                with_vectors=False,
            )
            for point in points:
                # A point stored without payload may come back with None.
                payload = point.payload or {}
                # Prefer the payload's mongo_id; fall back to the point id.
                # Normalised to str because mongo_ids holds strings only.
                mongo_id = str(payload.get("mongo_id") or point.id)
                title = payload.get("title", "Unknown")
                qdrant_points.append((point.id, mongo_id, title))
            # An empty page or a missing next offset both mean "end of data".
            if not points or next_offset is None:
                break
            offset = next_offset
        print(f" Found {len(qdrant_points)} points in Qdrant")

        # Step 3: Find points in Qdrant whose listing is gone from MongoDB.
        stale_points = [
            (point_id, title)
            for point_id, mongo_id, title in qdrant_points
            if mongo_id not in mongo_ids
        ]
        print(f"\nποΈ Found {len(stale_points)} stale points to delete:")
        for point_id, title in stale_points:
            print(f" - {title} (ID: {point_id})")

        if not stale_points:
            print("\nβ Qdrant is already in sync with MongoDB!")
            return

        # Step 4: Delete stale points from Qdrant, but only after an explicit
        # "y" from the operator (surrounding whitespace is ignored).
        confirm = input(f"\nβ οΈ Delete {len(stale_points)} stale points from Qdrant? (y/n): ")
        if confirm.strip().lower() != 'y':
            print("β Cancelled")
            return

        point_ids = [point_id for point_id, _ in stale_points]
        qdrant.delete(
            collection_name=COLLECTION_NAME,
            points_selector=PointIdsList(points=point_ids),
        )
        print(f"\nπ Deleted {len(stale_points)} stale points from Qdrant!")
        print("β Qdrant is now synced with MongoDB")
    finally:
        # Release connections on success, cancellation, and failure alike.
        if qdrant is not None:
            qdrant.close()
        mongo_client.close()
if __name__ == "__main__":
    # Script entry point: build the sync coroutine and drive it to completion
    # on a fresh event loop.
    sync_job = sync_qdrant_with_mongo()
    asyncio.run(sync_job)