Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Audit the reviews collection and report every class of bad data. | |
| Checks: | |
| 1. target_id stored as ObjectId instead of string | |
| 2. target_type="user" reviews missing review_source field | |
| 3. target_type="listing" reviews missing listing_owner_id | |
| (breaks the profile page query that finds short-stay reviews) | |
| 4. Reviews pointing at non-existent users or listings | |
| 5. User rating/reviews_count out of sync with actual review docs | |
| Run: | |
| python scripts/audit_reviews.py # report only | |
| python scripts/audit_reviews.py --fix # auto-fix what it safely can | |
| """ | |
| import asyncio | |
| import sys | |
| import os | |
| from datetime import datetime | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from motor.motor_asyncio import AsyncIOMotorClient | |
| from bson import ObjectId | |
| from app.config import settings | |
# True when the script was started with "--fix" on the command line.
# The check functions below read this flag to decide whether to write
# corrections to the database or only print a report.
FIX = "--fix" in sys.argv
async def get_db():
    """Connect to MongoDB and return the configured database handle.

    A ``ping`` command is awaited before returning, so an unreachable server
    fails here (server selection is capped at 5000 ms) rather than at the
    first real query.
    """
    mongo = AsyncIOMotorClient(
        settings.MONGODB_URL,
        serverSelectionTimeoutMS=5000,
    )
    await mongo.admin.command("ping")
    database_name = settings.MONGODB_DATABASE
    return mongo[database_name]
| # ----------------------------------------------------------------------------- | |
async def check_objectid_target_ids(db):
    """CHECK 1 - find reviews whose target_id is a BSON ObjectId, not a string.

    Every offending review is printed. When the module-level ``FIX`` flag is
    set, the field is rewritten in place as its string form.

    Returns:
        The string form of each offending target_id, in query order, or an
        empty list when no review is affected.
    """
    print("\nββ CHECK 1: ObjectId target_id βββββββββββββββββββββββββββββ")
    cursor = db.reviews.find({"target_id": {"$type": "objectId"}})
    offenders = await cursor.to_list(length=None)

    if len(offenders) == 0:
        print(" β All target_id fields are strings.")
        return []

    print(f" β {len(offenders)} review(s) with ObjectId target_id:")
    as_strings = []
    for review in offenders:
        target_str = str(review["target_id"])
        print(f" _id={review['_id']} target_type={review.get('target_type')} "
              f"target_id={target_str} reviewer={review.get('reviewer_id')}")
        if FIX:
            await db.reviews.update_one(
                {"_id": review["_id"]},
                {"$set": {"target_id": target_str}},
            )
            print(f" β fixed: target_id β '{target_str}'")
        as_strings.append(target_str)
    return as_strings
async def check_missing_review_source(db):
    """CHECK 2 - find target_type='user' reviews that lack a review_source field.

    Each unlabelled review is printed. When the module-level ``FIX`` flag is
    set, each one is labelled "viewing" if its (reviewer_id, target_id) pair
    also occurs in the ``person_reviews`` collection, and "personal" otherwise.
    """
    print("\nββ CHECK 2: Missing review_source on user reviews ββββββββββ")
    query = {"target_type": "user", "review_source": {"$exists": False}}
    unlabelled = await db.reviews.find(query).to_list(length=None)

    if not unlabelled:
        print(" β All user reviews have review_source.")
        return

    print(f" β {len(unlabelled)} user review(s) missing review_source:")
    for review in unlabelled:
        snippet = str(review.get('review_text', ''))[:60]
        print(f" _id={review['_id']} reviewer={review.get('reviewer_id')} "
              f"target={review.get('target_id')} rating={review.get('rating')} "
              f"text=\"{snippet}\"")

    if not FIX:
        return

    # Old unlabelled docs carry no hint of their own, so the source is
    # inferred by cross-referencing person_reviews: a matching
    # (reviewer, reviewed) pair there marks the review as a viewing review.
    known_pairs = set()
    async for person_review in db["person_reviews"].find({}):
        reviewer = str(person_review.get("reviewer_id", ""))
        reviewed = str(
            person_review.get("reviewed_id") or person_review.get("target_id", "")
        )
        if reviewer and reviewed:
            known_pairs.add((reviewer, reviewed))

    for review in unlabelled:
        pair = (str(review.get("reviewer_id", "")), str(review.get("target_id", "")))
        if pair in known_pairs:
            source = "viewing"
        else:
            source = "personal"
        await db.reviews.update_one(
            {"_id": review["_id"]},
            {"$set": {"review_source": source}},
        )
        print(f" β fixed _id={review['_id']}: review_source={source}")
async def check_listing_reviews_missing_owner(db):
    """CHECK 3 - find target_type='listing' reviews missing listing_owner_id.

    For each such review the referenced listing is looked up by ``_id``.
    When it exists, the review is printed with the listing's title and owner.
    When the module-level ``FIX`` flag is set and the listing has an owner,
    ``listing_owner_id`` and ``listing_title`` are written onto the review.
    Reviews whose listing cannot be found are reported as orphans and counted
    as skipped.
    """
    print("\nββ CHECK 3: Listing reviews missing listing_owner_id βββββββ")
    docs = await db.reviews.find({
        "target_type": "listing",
        "listing_owner_id": {"$exists": False},
    }).to_list(length=None)
    if not docs:
        print(" β All listing reviews have listing_owner_id.")
        return

    print(f" β {len(docs)} listing review(s) missing listing_owner_id "
          f"(won't appear on landlord profile page):")
    fixed = skipped = 0
    for doc in docs:
        listing_id = doc.get("target_id", "")
        listing = None
        if ObjectId.is_valid(listing_id):
            listing = await db.listings.find_one({"_id": ObjectId(listing_id)})

        if not listing:
            print(f" _id={doc['_id']} listing_id={listing_id} "
                  f"β listing not found in DB (orphan review)")
            skipped += 1
            continue

        # A stored null user_id must not become the truthy string "None",
        # which would then be written to the review as its owner id.
        raw_owner = listing.get("user_id")
        owner_id = str(raw_owner) if raw_owner else ""
        # A stored null title would otherwise fail on the slice below.
        title = listing.get("title") or ""
        print(f" _id={doc['_id']} listing='{title[:40]}' owner={owner_id}")
        if FIX and owner_id:
            await db.reviews.update_one(
                {"_id": doc["_id"]},
                {"$set": {
                    "listing_owner_id": owner_id,
                    "listing_title": title,
                }},
            )
            print(f" β fixed: listing_owner_id={owner_id}")
            fixed += 1

    if FIX:
        print(f" Fixed: {fixed} | Skipped (orphan): {skipped}")
async def _collect_orphan_review_ids(reviews, targets, kind):
    """Print and return the _id of each review whose target is absent from *targets*.

    Args:
        reviews: review documents to inspect.
        targets: collection the reviews' target_id should resolve in.
        kind: "user" or "listing"; used only to word the printed messages.
    """
    label = kind.capitalize()
    exists_cache = {}  # target_id string -> bool; one lookup per distinct target
    orphan_ids = []
    for doc in reviews:
        t_id = str(doc.get("target_id", ""))
        if not ObjectId.is_valid(t_id):
            print(f" β {label} review {doc['_id']}: target_id '{t_id}' is not a valid ObjectId")
            orphan_ids.append(doc["_id"])
            continue
        if t_id not in exists_cache:
            found = await targets.find_one({"_id": ObjectId(t_id)}, {"_id": 1})
            exists_cache[t_id] = found is not None
        if not exists_cache[t_id]:
            print(f" β {label} review {doc['_id']}: {kind} {t_id} no longer exists (orphan)")
            orphan_ids.append(doc["_id"])
    return orphan_ids


async def check_orphan_reviews(db):
    """CHECK 4 - find reviews pointing at users/listings that do not exist.

    A review is an orphan when its target_id is not a valid ObjectId string
    or when no document with that ``_id`` exists in ``users`` (for
    target_type "user") or ``listings`` (for target_type "listing").

    Orphans are detected in a single pass. When the module-level ``FIX`` flag
    is set, exactly the reviews that were reported are deleted, and every
    deletion is printed, including reviews with an invalid target_id.
    """
    print("\nββ CHECK 4: Orphan reviews (deleted target) ββββββββββββββββ")
    user_reviews = await db.reviews.find({"target_type": "user"}).to_list(length=None)
    listing_reviews = await db.reviews.find({"target_type": "listing"}).to_list(length=None)

    orphan_user_ids = await _collect_orphan_review_ids(user_reviews, db.users, "user")
    orphan_listing_ids = await _collect_orphan_review_ids(
        listing_reviews, db.listings, "listing"
    )

    if not orphan_user_ids and not orphan_listing_ids:
        print(" β No orphan reviews found.")
        return

    print(f" Orphan user reviews: {len(orphan_user_ids)} | "
          f"Orphan listing reviews: {len(orphan_listing_ids)}")
    if not FIX:
        print(" (Re-run with --fix to delete orphans)")
        return

    # Only delete when explicitly fixing; reuse the ids found above so the
    # deleted set is identical to the reported set.
    print(" Deleting orphan reviews...")
    for kind, review_ids in (("user", orphan_user_ids), ("listing", orphan_listing_ids)):
        for review_id in review_ids:
            await db.reviews.delete_one({"_id": review_id})
            print(f" Deleted orphan {kind} review {review_id}")
async def check_rating_sync(db):
    """CHECK 5 - compare each user's stored rating/reviews_count with the reviews.

    For every distinct target_id among target_type='user' reviews, the
    reviews stored under that id (as a string) are counted and their ratings
    averaged to two decimals. A user is reported when the stored ``rating``
    differs from that average by more than 0.01 or the stored
    ``reviews_count`` differs from the count. When the module-level ``FIX``
    flag is set, the user document is updated with the computed values.

    Target ids that are not valid ObjectId strings, and ids with no matching
    user document, are skipped.
    """
    print("\nββ CHECK 5: User rating/count out of sync ββββββββββββββββββ")
    # Every target id that has at least one user review. The same id can come
    # back both as an ObjectId and as a string, so normalise to strings and
    # de-duplicate (order preserved) so that no user is reported twice.
    raw_targets = await db.reviews.distinct("target_id", {"target_type": "user"})
    target_ids = dict.fromkeys(str(raw) for raw in raw_targets)

    out_of_sync = 0
    for uid_str in target_ids:
        if not ObjectId.is_valid(uid_str):
            continue
        reviews = await db.reviews.find({
            "target_type": "user", "target_id": uid_str
        }).to_list(length=None)
        count = len(reviews)
        # `or 0` treats a stored null rating as 0 instead of raising in sum().
        total = sum((r.get("rating") or 0) for r in reviews)
        avg = round(total / count, 2) if count else 0.0

        user = await db.users.find_one({"_id": ObjectId(uid_str)})
        if not user:
            continue
        stored_rating = user.get("rating", 0.0)
        stored_count = user.get("reviews_count", 0)
        name = f"{user.get('firstName','')} {user.get('lastName','')}".strip() or uid_str

        if abs((stored_rating or 0) - avg) > 0.01 or (stored_count or 0) != count:
            print(f" β {name}: stored={stored_rating}β /{stored_count}rev "
                  f"actual={avg}β /{count}rev")
            out_of_sync += 1
            if FIX:
                await db.users.update_one(
                    {"_id": ObjectId(uid_str)},
                    {"$set": {"rating": avg, "reviews_count": count}},
                )
                print(" β fixed")

    if out_of_sync == 0:
        print(" β All user ratings are in sync.")
    elif not FIX:
        print(f" {out_of_sync} user(s) out of sync. Re-run with --fix to correct.")
async def print_summary(db):
    """Print a snapshot of review counts, broken down by target type and source.

    Issues one ``count_documents`` query per printed figure: all reviews,
    user reviews, listing reviews, and user reviews whose review_source is
    "personal", "viewing", or absent.
    """
    print("\nββ COLLECTION SNAPSHOT βββββββββββββββββββββββββββββββββββββ")
    total = await db.reviews.count_documents({})
    user_total = await db.reviews.count_documents({"target_type": "user"})
    listing_total = await db.reviews.count_documents({"target_type": "listing"})
    personal = await db.reviews.count_documents({"target_type": "user", "review_source": "personal"})
    viewing = await db.reviews.count_documents({"target_type": "user", "review_source": "viewing"})
    no_source = await db.reviews.count_documents({"target_type": "user", "review_source": {"$exists": False}})
    print(f" Total reviews : {total}")
    print(f" ββ user reviews : {user_total}")
    print(f" β ββ personal : {personal}")
    print(f" β ββ viewing : {viewing}")
    print(f" β ββ no source : {no_source} β should be 0 after --fix")
    print(f" ββ listing reviews : {listing_total} (short-stay)")
| # ----------------------------------------------------------------------------- | |
async def main():
    """Run the full audit: banner, DB connection, snapshot, then checks 1-5 in order."""
    rule = "=" * 60
    stamp_format = '%Y-%m-%d %H:%M:%S'

    print("\n" + rule)
    print("REVIEWS COLLECTION AUDIT")
    if FIX:
        mode = "FIX MODE β issues will be corrected"
    else:
        mode = "REPORT ONLY β no changes written (pass --fix to correct)"
    print(f"MODE: {mode}")
    print(rule)
    print(f"Started: {datetime.now().strftime(stamp_format)}")

    db = await get_db()
    print(f"Connected β db: {settings.MONGODB_DATABASE}")

    # Steps run strictly in this order against the same database handle.
    steps = (
        print_summary,
        check_objectid_target_ids,
        check_missing_review_source,
        check_listing_reviews_missing_owner,
        check_orphan_reviews,
        check_rating_sync,
    )
    for step in steps:
        await step(db)

    print("\n" + rule)
    print("AUDIT COMPLETE")
    print(f"Finished: {datetime.now().strftime(stamp_format)}")
    print(rule)
# Script entry point: run the async audit only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())