#!/usr/bin/env python3
"""
Audit the reviews collection and report every class of bad data.

Checks:
  1. target_id stored as ObjectId instead of string
  2. target_type="user" reviews missing review_source field
  3. target_type="listing" reviews missing listing_owner_id
     (breaks the profile page query that finds short-stay reviews)
  4. Reviews pointing at non-existent users or listings
  5. User rating/reviews_count out of sync with actual review docs

Run:
  python scripts/audit_reviews.py          # report only
  python scripts/audit_reviews.py --fix    # auto-fix what it safely can
"""

import asyncio
import sys
import os
from datetime import datetime

# Make the project root importable so `app.config` resolves when the script
# is run directly from the scripts/ directory.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from motor.motor_asyncio import AsyncIOMotorClient
from bson import ObjectId
from app.config import settings

# When True the checks write corrections; otherwise they only report.
FIX = "--fix" in sys.argv


def _as_str(value):
    """Return ``str(value)``, but map ``None`` to ``""``.

    A plain ``str(None)`` yields the truthy string ``"None"``, which would
    otherwise be treated as a real id by the checks below.
    """
    return "" if value is None else str(value)


async def get_db():
    """Connect to MongoDB, verify the connection with a ping, return the database.

    The client is closed again if the ping fails. On success the caller owns
    the connection and can release it through ``db.client.close()``.
    """
    client = AsyncIOMotorClient(settings.MONGODB_URL, serverSelectionTimeoutMS=5000)
    try:
        await client.admin.command("ping")
    except Exception:
        # Do not leak the client when the server is unreachable.
        client.close()
        raise
    return client[settings.MONGODB_DATABASE]


# ─────────────────────────────────────────────────────────────────────────────

async def check_objectid_target_ids(db):
    """CHECK 1 — target_id stored as BSON ObjectId instead of string.

    Prints every offending review; with --fix the target_id is rewritten as
    its string form.

    Returns:
        The string form of each offending target_id (empty list if none).
    """
    print("\n── CHECK 1: ObjectId target_id ─────────────────────────────")
    docs = await db.reviews.find(
        {"target_id": {"$type": "objectId"}}
    ).to_list(length=None)

    if not docs:
        print(" ✅ All target_id fields are strings.")
        return []

    print(f" ⚠ {len(docs)} review(s) with ObjectId target_id:")
    affected = []
    for doc in docs:
        target_str = str(doc["target_id"])
        print(f" _id={doc['_id']} target_type={doc.get('target_type')} "
              f"target_id={doc['target_id']} reviewer={doc.get('reviewer_id')}")
        if FIX:
            await db.reviews.update_one(
                {"_id": doc["_id"]},
                {"$set": {"target_id": target_str}},
            )
            print(f" → fixed: target_id → '{target_str}'")
        affected.append(target_str)
    return affected


async def check_missing_review_source(db):
    """CHECK 2 — target_type='user' reviews with no review_source.

    With --fix, a review is labelled "viewing" when a document with the same
    (reviewer, target) pair exists in the person_reviews collection, and
    "personal" otherwise.
    """
    print("\n── CHECK 2: Missing review_source on user reviews ──────────")
    docs = await db.reviews.find({
        "target_type": "user",
        "review_source": {"$exists": False},
    }).to_list(length=None)

    if not docs:
        print(" ✅ All user reviews have review_source.")
        return

    print(f" ⚠ {len(docs)} user review(s) missing review_source:")
    for doc in docs:
        print(f" _id={doc['_id']} reviewer={doc.get('reviewer_id')} "
              f"target={doc.get('target_id')} rating={doc.get('rating')} "
              f"text=\"{str(doc.get('review_text',''))[:60]}\"")

    if not FIX:
        return

    # Cannot determine 'viewing' vs 'personal' automatically for old unlabelled
    # docs — cross-reference with person_reviews to identify viewing reviews.
    person_review_keys = set()
    async for pr in db["person_reviews"].find({}):
        r_id = _as_str(pr.get("reviewer_id"))
        t_id = _as_str(pr.get("reviewed_id") or pr.get("target_id"))
        # Skip incomplete rows so they can never match a review by accident.
        if r_id and t_id:
            person_review_keys.add((r_id, t_id))

    for doc in docs:
        key = (_as_str(doc.get("reviewer_id")), _as_str(doc.get("target_id")))
        source = "viewing" if key in person_review_keys else "personal"
        await db.reviews.update_one(
            {"_id": doc["_id"]},
            {"$set": {"review_source": source}},
        )
        print(f" → fixed _id={doc['_id']}: review_source={source}")


async def check_listing_reviews_missing_owner(db):
    """CHECK 3 — target_type='listing' reviews missing listing_owner_id.

    With --fix, listing_owner_id and listing_title are copied from the listing
    the review points at. Reviews whose listing cannot be found are reported
    and counted as skipped.
    """
    print("\n── CHECK 3: Listing reviews missing listing_owner_id ───────")
    docs = await db.reviews.find({
        "target_type": "listing",
        "listing_owner_id": {"$exists": False},
    }).to_list(length=None)

    if not docs:
        print(" ✅ All listing reviews have listing_owner_id.")
        return

    print(f" ⚠ {len(docs)} listing review(s) missing listing_owner_id "
          f"(won't appear on landlord profile page):")

    fixed = skipped = 0
    for doc in docs:
        listing_id = doc.get("target_id", "")
        listing = None
        if ObjectId.is_valid(listing_id):
            listing = await db.listings.find_one({"_id": ObjectId(listing_id)})

        if not listing:
            print(f" _id={doc['_id']} listing_id={listing_id} "
                  f"⚠ listing not found in DB (orphan review)")
            skipped += 1
            continue

        # A listing with user_id/title explicitly set to None must not yield
        # the literal owner id "None" or crash the title slice below.
        owner_id = _as_str(listing.get("user_id"))
        title = listing.get("title") or ""
        print(f" _id={doc['_id']} listing='{title[:40]}' owner={owner_id}")
        if FIX and owner_id:
            await db.reviews.update_one(
                {"_id": doc["_id"]},
                {"$set": {
                    "listing_owner_id": owner_id,
                    "listing_title": title,
                }},
            )
            print(f" → fixed: listing_owner_id={owner_id}")
            fixed += 1

    if FIX:
        print(f" Fixed: {fixed} | Skipped (orphan): {skipped}")


async def _find_orphan_review_ids(reviews, collection, label):
    """Report reviews whose target is invalid or missing; return their _ids.

    Args:
        reviews: review documents to inspect.
        collection: collection in which the review's target should exist.
        label: "User" or "Listing"; used verbatim in the report lines.
    """
    noun = label.lower()
    target_exists = {}  # target id string -> bool, one lookup per target
    orphan_ids = []
    for doc in reviews:
        t_id = str(doc.get("target_id", ""))
        if not ObjectId.is_valid(t_id):
            print(f" ⚠ {label} review {doc['_id']}: target_id '{t_id}' is not a valid ObjectId")
            orphan_ids.append(doc["_id"])
            continue
        if t_id not in target_exists:
            found = await collection.find_one({"_id": ObjectId(t_id)}, {"_id": 1})
            target_exists[t_id] = bool(found)
        if not target_exists[t_id]:
            print(f" ⚠ {label} review {doc['_id']}: {noun} {t_id} no longer exists (orphan)")
            orphan_ids.append(doc["_id"])
    return orphan_ids


async def check_orphan_reviews(db):
    """CHECK 4 — reviews pointing at users/listings that no longer exist.

    A review counts as an orphan when its target_id is not a valid ObjectId
    or no document with that _id exists in users / listings. With --fix the
    orphans found by the report pass are deleted.
    """
    print("\n── CHECK 4: Orphan reviews (deleted target) ────────────────")

    user_reviews = await db.reviews.find({"target_type": "user"}).to_list(length=None)
    listing_reviews = await db.reviews.find({"target_type": "listing"}).to_list(length=None)

    orphan_user_ids = await _find_orphan_review_ids(user_reviews, db.users, "User")
    orphan_listing_ids = await _find_orphan_review_ids(listing_reviews, db.listings, "Listing")
    orphan_user = len(orphan_user_ids)
    orphan_listing = len(orphan_listing_ids)

    if orphan_user == 0 and orphan_listing == 0:
        print(" ✅ No orphan reviews found.")
        return

    print(f" Orphan user reviews: {orphan_user} | Orphan listing reviews: {orphan_listing}")
    if not FIX:
        print(" (Re-run with --fix to delete orphans)")
        return

    # Only delete if explicitly fixing. Reuse the ids collected above instead
    # of repeating every existence lookup, and log each deletion.
    print(" Deleting orphan reviews...")
    for review_id in orphan_user_ids:
        await db.reviews.delete_one({"_id": review_id})
        print(f" Deleted orphan user review {review_id}")
    for review_id in orphan_listing_ids:
        await db.reviews.delete_one({"_id": review_id})
        print(f" Deleted orphan listing review {review_id}")


async def check_rating_sync(db):
    """CHECK 5 — user.rating / reviews_count out of sync with actual reviews.

    For every user targeted by at least one user review, recomputes the review
    count and the average rating (rounded to 2 decimals) and compares them
    with the values stored on the user document. With --fix the stored values
    are overwritten.
    """
    print("\n── CHECK 5: User rating/count out of sync ──────────────────")

    # Get all users who have at least one review.
    reviewer_targets = await db.reviews.distinct("target_id", {"target_type": "user"})

    # The same user can come back both as an ObjectId and as a string;
    # collapse to unique strings so nobody is processed or reported twice.
    unique_ids = dict.fromkeys(str(uid) for uid in reviewer_targets)

    out_of_sync = 0
    for uid_str in unique_ids:
        if not ObjectId.is_valid(uid_str):
            continue

        reviews = await db.reviews.find({
            "target_type": "user",
            "target_id": uid_str,
        }).to_list(length=None)

        count = len(reviews)
        # `or 0` keeps a review with rating=None from raising in sum().
        total = sum((r.get("rating") or 0) for r in reviews)
        avg = round(total / count, 2) if count else 0.0

        user = await db.users.find_one({"_id": ObjectId(uid_str)})
        if not user:
            continue

        stored_rating = user.get("rating", 0.0)
        stored_count = user.get("reviews_count", 0)
        name = f"{user.get('firstName','')} {user.get('lastName','')}".strip() or uid_str

        if abs((stored_rating or 0) - avg) > 0.01 or (stored_count or 0) != count:
            print(f" ⚠ {name}: stored={stored_rating}★/{stored_count}rev "
                  f"actual={avg}★/{count}rev")
            out_of_sync += 1
            if FIX:
                await db.users.update_one(
                    {"_id": ObjectId(uid_str)},
                    {"$set": {"rating": avg, "reviews_count": count}},
                )
                print(" → fixed")

    if out_of_sync == 0:
        print(" ✅ All user ratings are in sync.")
    elif not FIX:
        print(f" {out_of_sync} user(s) out of sync. Re-run with --fix to correct.")


async def print_summary(db):
    """Print document counts for the reviews collection, by type and source."""
    print("\n── COLLECTION SNAPSHOT ─────────────────────────────────────")
    total = await db.reviews.count_documents({})
    user_total = await db.reviews.count_documents({"target_type": "user"})
    listing_total = await db.reviews.count_documents({"target_type": "listing"})
    personal = await db.reviews.count_documents({"target_type": "user", "review_source": "personal"})
    viewing = await db.reviews.count_documents({"target_type": "user", "review_source": "viewing"})
    no_source = await db.reviews.count_documents({"target_type": "user", "review_source": {"$exists": False}})

    print(f" Total reviews : {total}")
    print(f" ├─ user reviews : {user_total}")
    print(f" │ ├─ personal : {personal}")
    print(f" │ ├─ viewing : {viewing}")
    print(f" │ └─ no source : {no_source} ← should be 0 after --fix")
    print(f" └─ listing reviews : {listing_total} (short-stay)")


# ─────────────────────────────────────────────────────────────────────────────

async def main():
    """Run the snapshot and all five checks in order, then close the connection."""
    print("\n" + "=" * 60)
    print("REVIEWS COLLECTION AUDIT")
    mode = "FIX MODE — issues will be corrected" if FIX else "REPORT ONLY — no changes written (pass --fix to correct)"
    print(f"MODE: {mode}")
    print("=" * 60)
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    db = await get_db()
    try:
        print(f"Connected — db: {settings.MONGODB_DATABASE}")

        await print_summary(db)
        await check_objectid_target_ids(db)
        await check_missing_review_source(db)
        await check_listing_reviews_missing_owner(db)
        await check_orphan_reviews(db)
        await check_rating_sync(db)
    finally:
        # Release the MongoDB connection even if a check raised.
        db.client.close()

    print("\n" + "=" * 60)
    print("AUDIT COMPLETE")
    print(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)


if __name__ == "__main__":
    asyncio.run(main())