AIDA / scripts /audit_reviews.py
destinyebuka's picture
new update
9a0c720
#!/usr/bin/env python3
"""
Audit the reviews collection and report every class of bad data.
Checks:
1. target_id stored as ObjectId instead of string
2. target_type="user" reviews missing review_source field
3. target_type="listing" reviews missing listing_owner_id
(breaks the profile page query that finds short-stay reviews)
4. Reviews pointing at non-existent users or listings
5. User rating/reviews_count out of sync with actual review docs
Run:
python scripts/audit_reviews.py # report only
python scripts/audit_reviews.py --fix # auto-fix what it safely can
"""
import asyncio
import sys
import os
from datetime import datetime
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from motor.motor_asyncio import AsyncIOMotorClient
from bson import ObjectId
from app.config import settings
# True when the script is invoked with --fix: the check functions then write
# corrections to the database instead of only reporting what they find.
FIX = "--fix" in sys.argv
async def get_db():
    """Connect to MongoDB and return the configured database handle.

    The server is pinged before returning so an unreachable deployment fails
    fast (server selection is capped at 5 seconds) rather than on the first
    real query.

    Returns:
        The database named by ``settings.MONGODB_DATABASE``.

    Raises:
        Whatever the driver raises when the ping fails; the client is closed
        before the exception propagates.
    """
    client = AsyncIOMotorClient(settings.MONGODB_URL, serverSelectionTimeoutMS=5000)
    try:
        await client.admin.command("ping")
    except Exception:
        # Do not leak the connection pool when the server cannot be reached.
        client.close()
        raise
    return client[settings.MONGODB_DATABASE]
# ─────────────────────────────────────────────────────────────────────────────
async def check_objectid_target_ids(db):
    """CHECK 1 — target_id stored as BSON ObjectId instead of string.

    Prints every review whose ``target_id`` has BSON type ``objectId`` and,
    when ``FIX`` is set, rewrites the field as its string form.

    Args:
        db: Motor database handle exposing a ``reviews`` collection.

    Returns:
        list[str]: The stringified ``target_id`` of every offending review
        (empty when none were found), whether or not they were fixed.
    """
    print("\n── CHECK 1: ObjectId target_id ─────────────────────────────")
    docs = await db.reviews.find(
        {"target_id": {"$type": "objectId"}}
    ).to_list(length=None)
    if not docs:
        print(" ✅ All target_id fields are strings.")
        return []

    print(f" ⚠ {len(docs)} review(s) with ObjectId target_id:")
    affected = []
    for doc in docs:
        # Stringify once; the same value is printed, written back and returned.
        target_id = str(doc["target_id"])
        print(f" _id={doc['_id']} target_type={doc.get('target_type')} "
              f"target_id={target_id} reviewer={doc.get('reviewer_id')}")
        if FIX:
            await db.reviews.update_one(
                {"_id": doc["_id"]},
                {"$set": {"target_id": target_id}},
            )
            print(f" → fixed: target_id → '{target_id}'")
        affected.append(target_id)
    return affected
async def check_missing_review_source(db):
    """CHECK 2 — target_type='user' reviews with no review_source.

    Prints every user review lacking a ``review_source`` field. When ``FIX``
    is set, each one is labelled ``"viewing"`` if a document in the
    ``person_reviews`` collection has the same (reviewer, target) pair, and
    ``"personal"`` otherwise.

    Args:
        db: Motor database handle exposing ``reviews`` and ``person_reviews``.

    Returns:
        None.
    """
    print("\n── CHECK 2: Missing review_source on user reviews ──────────")
    docs = await db.reviews.find({
        "target_type": "user",
        "review_source": {"$exists": False},
    }).to_list(length=None)
    if not docs:
        print(" ✅ All user reviews have review_source.")
        return

    print(f" ⚠ {len(docs)} user review(s) missing review_source:")
    for doc in docs:
        print(f" _id={doc['_id']} reviewer={doc.get('reviewer_id')} "
              f"target={doc.get('target_id')} rating={doc.get('rating')} "
              f"text=\"{str(doc.get('review_text',''))[:60]}\"")

    if not FIX:
        return

    # Cannot determine 'viewing' vs 'personal' automatically for old unlabelled docs —
    # cross-reference with person_reviews to identify viewing reviews.
    person_review_keys = set()
    # Only the id fields are needed, so do not pull whole documents.
    id_fields = {"reviewer_id": 1, "reviewed_id": 1, "target_id": 1}
    async for pr in db["person_reviews"].find({}, id_fields):
        reviewer = pr.get("reviewer_id")
        target = pr.get("reviewed_id") or pr.get("target_id")
        # Test the raw values: str(None) would yield the truthy key "None".
        if reviewer and target:
            person_review_keys.add((str(reviewer), str(target)))

    for doc in docs:
        r_id = str(doc.get("reviewer_id") or "")
        t_id = str(doc.get("target_id") or "")
        source = "viewing" if (r_id, t_id) in person_review_keys else "personal"
        await db.reviews.update_one(
            {"_id": doc["_id"]},
            {"$set": {"review_source": source}},
        )
        print(f" → fixed _id={doc['_id']}: review_source={source}")
async def check_listing_reviews_missing_owner(db):
    """CHECK 3 — target_type='listing' reviews missing listing_owner_id.

    For every listing review without ``listing_owner_id`` the referenced
    listing is looked up. When ``FIX`` is set and the listing has a
    ``user_id``, the review gains ``listing_owner_id`` and ``listing_title``.
    Reviews whose listing cannot be found are reported as orphans and left
    untouched here.

    Args:
        db: Motor database handle exposing ``reviews`` and ``listings``.

    Returns:
        None.
    """
    print("\n── CHECK 3: Listing reviews missing listing_owner_id ───────")
    docs = await db.reviews.find({
        "target_type": "listing",
        "listing_owner_id": {"$exists": False},
    }).to_list(length=None)
    if not docs:
        print(" ✅ All listing reviews have listing_owner_id.")
        return

    print(f" ⚠ {len(docs)} listing review(s) missing listing_owner_id "
          f"(won't appear on landlord profile page):")
    fixed = skipped = ownerless = 0
    # Several reviews usually point at the same listing; fetch each one once.
    listing_cache = {}
    for doc in docs:
        listing_id = doc.get("target_id", "")
        cache_key = str(listing_id)
        if cache_key not in listing_cache:
            listing = None
            if ObjectId.is_valid(listing_id):
                listing = await db.listings.find_one(
                    {"_id": ObjectId(listing_id)},
                    {"user_id": 1, "title": 1},
                )
            listing_cache[cache_key] = listing
        listing = listing_cache[cache_key]

        if not listing:
            print(f" _id={doc['_id']} listing_id={listing_id} "
                  f"⚠ listing not found in DB (orphan review)")
            skipped += 1
            continue

        # A null user_id must not become the literal string "None".
        raw_owner = listing.get("user_id")
        owner_id = str(raw_owner) if raw_owner else ""
        # A null title would otherwise crash on slicing.
        title = listing.get("title") or ""
        print(f" _id={doc['_id']} listing='{title[:40]}' owner={owner_id}")
        if not owner_id:
            # Nothing to copy onto the review; count it so it is not lost silently.
            ownerless += 1
            continue
        if FIX:
            await db.reviews.update_one(
                {"_id": doc["_id"]},
                {"$set": {
                    "listing_owner_id": owner_id,
                    "listing_title": title,
                }},
            )
            print(f" → fixed: listing_owner_id={owner_id}")
            fixed += 1

    if ownerless:
        print(f" ⚠ {ownerless} review(s) point at a listing with no user_id (cannot be fixed)")
    if FIX:
        print(f" Fixed: {fixed} | Skipped (orphan): {skipped}")
async def _collect_orphan_review_ids(reviews, collection, label):
    """Report reviews whose target is missing from ``collection``; return their _ids."""
    noun = label.lower()
    # target_id -> bool, so a target shared by many reviews is queried once.
    target_exists = {}
    orphan_ids = []
    for doc in reviews:
        t_id = str(doc.get("target_id", ""))
        if not ObjectId.is_valid(t_id):
            print(f" ⚠ {label} review {doc['_id']}: target_id '{t_id}' is not a valid ObjectId")
            orphan_ids.append(doc["_id"])
            continue
        if t_id not in target_exists:
            found = await collection.find_one({"_id": ObjectId(t_id)}, {"_id": 1})
            target_exists[t_id] = found is not None
        if not target_exists[t_id]:
            print(f" ⚠ {label} review {doc['_id']}: {noun} {t_id} no longer exists (orphan)")
            orphan_ids.append(doc["_id"])
    return orphan_ids


async def check_orphan_reviews(db):
    """CHECK 4 — reviews pointing at users/listings that no longer exist.

    A review is an orphan when its ``target_id`` is not a valid ObjectId
    string or no document with that ``_id`` exists in ``users`` (for
    ``target_type='user'``) or ``listings`` (for ``target_type='listing'``).
    When ``FIX`` is set, exactly the reviews reported are deleted, and every
    deletion is logged.

    Args:
        db: Motor database handle exposing ``reviews``, ``users``, ``listings``.

    Returns:
        None.
    """
    print("\n── CHECK 4: Orphan reviews (deleted target) ────────────────")
    # Only _id and target_id are inspected, so do not load whole reviews.
    user_reviews = await db.reviews.find(
        {"target_type": "user"}, {"target_id": 1}
    ).to_list(length=None)
    listing_reviews = await db.reviews.find(
        {"target_type": "listing"}, {"target_id": 1}
    ).to_list(length=None)

    orphan_user_ids = await _collect_orphan_review_ids(user_reviews, db.users, "User")
    orphan_listing_ids = await _collect_orphan_review_ids(
        listing_reviews, db.listings, "Listing"
    )
    orphan_user = len(orphan_user_ids)
    orphan_listing = len(orphan_listing_ids)

    if orphan_user == 0 and orphan_listing == 0:
        print(" ✅ No orphan reviews found.")
        return

    print(f" Orphan user reviews: {orphan_user} | Orphan listing reviews: {orphan_listing}")
    if not FIX:
        print(" (Re-run with --fix to delete orphans)")
        return

    # Only delete if explicitly fixing
    print(" Deleting orphan reviews...")
    for review_id in orphan_user_ids:
        await db.reviews.delete_one({"_id": review_id})
        print(f" Deleted orphan user review {review_id}")
    for review_id in orphan_listing_ids:
        await db.reviews.delete_one({"_id": review_id})
        print(f" Deleted orphan listing review {review_id}")
async def check_rating_sync(db):
    """CHECK 5 — user.rating / reviews_count out of sync with actual reviews.

    For every distinct ``target_id`` among ``target_type='user'`` reviews, the
    review count and mean rating (rounded to 2 decimals) are recomputed and
    compared with the ``rating`` / ``reviews_count`` stored on the user. When
    ``FIX`` is set, mismatching users are updated with the recomputed values.

    NOTE(review): only users that are the target of at least one review are
    visited, so a user whose reviews were all removed but who still carries a
    non-zero ``reviews_count`` is not detected here — confirm whether that
    case needs handling.

    Args:
        db: Motor database handle exposing ``reviews`` and ``users``.

    Returns:
        None.
    """
    print("\n── CHECK 5: User rating/count out of sync ──────────────────")
    # Get all users who have at least one review
    reviewer_targets = await db.reviews.distinct("target_id", {"target_type": "user"})
    # distinct() can yield both the ObjectId and the string form of one id;
    # collapse them (order-preserving) so no user is processed twice.
    user_ids = list(dict.fromkeys(
        str(uid) for uid in reviewer_targets if ObjectId.is_valid(str(uid))
    ))

    out_of_sync = 0
    for uid_str in user_ids:
        reviews = await db.reviews.find(
            {"target_type": "user", "target_id": uid_str},
            {"rating": 1},
        ).to_list(length=None)
        count = len(reviews)
        # `or 0` also covers a rating stored as null, which would break sum().
        total = sum((r.get("rating") or 0) for r in reviews)
        avg = round(total / count, 2) if count else 0.0

        user = await db.users.find_one(
            {"_id": ObjectId(uid_str)},
            {"rating": 1, "reviews_count": 1, "firstName": 1, "lastName": 1},
        )
        if not user:
            continue
        stored_rating = user.get("rating", 0.0)
        stored_count = user.get("reviews_count", 0)
        name = f"{user.get('firstName','')} {user.get('lastName','')}".strip() or uid_str

        rating_differs = abs((stored_rating or 0) - avg) > 0.01
        count_differs = (stored_count or 0) != count
        if not (rating_differs or count_differs):
            continue

        print(f" ⚠ {name}: stored={stored_rating}★/{stored_count}rev "
              f"actual={avg}★/{count}rev")
        out_of_sync += 1
        if FIX:
            await db.users.update_one(
                {"_id": ObjectId(uid_str)},
                {"$set": {"rating": avg, "reviews_count": count}},
            )
            print(" → fixed")

    if out_of_sync == 0:
        print(" ✅ All user ratings are in sync.")
    elif not FIX:
        print(f" {out_of_sync} user(s) out of sync. Re-run with --fix to correct.")
async def print_summary(db):
    """Print a count breakdown of the reviews collection.

    Shows the total number of reviews, the user reviews split by
    ``review_source`` (personal / viewing / field missing), and the number of
    listing reviews.

    Args:
        db: Motor database handle exposing a ``reviews`` collection.

    Returns:
        None.
    """
    print("\n── COLLECTION SNAPSHOT ─────────────────────────────────────")
    total = await db.reviews.count_documents({})
    user_total = await db.reviews.count_documents({"target_type": "user"})
    listing_total = await db.reviews.count_documents({"target_type": "listing"})
    personal = await db.reviews.count_documents({"target_type": "user", "review_source": "personal"})
    viewing = await db.reviews.count_documents({"target_type": "user", "review_source": "viewing"})
    no_source = await db.reviews.count_documents({"target_type": "user", "review_source": {"$exists": False}})
    print(f" Total reviews : {total}")
    print(f" ├─ user reviews : {user_total}")
    print(f" │ ├─ personal : {personal}")
    print(f" │ ├─ viewing : {viewing}")
    print(f" │ └─ no source : {no_source} ← should be 0 after --fix")
    print(f" └─ listing reviews : {listing_total} (short-stay)")
# ─────────────────────────────────────────────────────────────────────────────
async def main():
    """Run the full audit: print the snapshot, then checks 1-5 in order.

    The checks run in a fixed order; in fix mode Check 1 rewrites ObjectId
    ``target_id`` values as strings before the later checks query by string
    id. The snapshot is printed before any fix is applied, so it reflects
    the state at start-up.

    Returns:
        None.
    """
    print("\n" + "=" * 60)
    print("REVIEWS COLLECTION AUDIT")
    mode = "FIX MODE — issues will be corrected" if FIX else "REPORT ONLY — no changes written (pass --fix to correct)"
    print(f"MODE: {mode}")
    print("=" * 60)
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    db = await get_db()
    try:
        print(f"Connected — db: {settings.MONGODB_DATABASE}")
        await print_summary(db)
        await check_objectid_target_ids(db)
        await check_missing_review_source(db)
        await check_listing_reviews_missing_owner(db)
        await check_orphan_reviews(db)
        await check_rating_sync(db)
    finally:
        # get_db() hands back only the database; release its client's
        # connection pool whether the audit finished or raised.
        db.client.close()
    print("\n" + "=" * 60)
    print("AUDIT COMPLETE")
    print(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)
if __name__ == "__main__":
asyncio.run(main())