AIDA / scripts /audit_missing_geolocation.py
destinyebuka's picture
fyp
0fc788f
Raw
History Blame Contribute Delete
3.58 kB
"""
Read-only audit for listings missing usable geolocation.
Prints listings where latitude/longitude are missing, null, blank,
non-numeric, or outside valid coordinate ranges.
"""
import asyncio
import math
from collections import Counter
from typing import Any, Optional

from app.database import connect_db, disconnect_db, get_db
def _to_float(value: Any) -> Optional[float]:
if value is None:
return None
if isinstance(value, str) and not value.strip():
return None
try:
return float(value)
except (TypeError, ValueError):
return None
def _geo_problem(doc: dict) -> Optional[str]:
    """Describe what is wrong with a listing's coordinates, if anything.

    Checks run in a fixed order: absent ``latitude``/``longitude`` keys,
    then values that ``_to_float`` cannot convert, then values outside
    [-90, 90] for latitude or [-180, 180] for longitude. Returns the first
    matching description, or ``None`` when both coordinates are usable.
    """
    lat_present = "latitude" in doc
    lon_present = "longitude" in doc
    lat_value = _to_float(doc.get("latitude"))
    lon_value = _to_float(doc.get("longitude"))

    # Keys that are absent from the document altogether.
    absent_message = {
        (False, False): "missing latitude and longitude fields",
        (False, True): "missing latitude field",
        (True, False): "missing longitude field",
    }.get((lat_present, lon_present))
    if absent_message is not None:
        return absent_message

    # Keys that exist but whose values did not convert to a number.
    invalid_message = {
        (True, True): "empty/invalid latitude and longitude",
        (True, False): "empty/invalid latitude",
        (False, True): "empty/invalid longitude",
    }.get((lat_value is None, lon_value is None))
    if invalid_message is not None:
        return invalid_message

    # Kept as a negated chained comparison (not split into two inequalities)
    # so a value that compares false on both sides is still reported.
    range_checks = (
        (lat_value, 90, "latitude out of range"),
        (lon_value, 180, "longitude out of range"),
    )
    for coordinate, limit, message in range_checks:
        if not (-limit <= coordinate <= limit):
            return message
    return None
async def main():
    """Audit every listing and print those lacking usable coordinates.

    Connects to the database, scans the ``listings`` collection sorted by
    ``created_at`` descending, classifies each document with
    ``_geo_problem``, and prints overall totals, per-status and per-problem
    counts, and one detail entry per affected listing. Only ``count_documents``
    and ``find`` are issued against the collection, and the connection is
    closed even if the scan raises.
    """
    await connect_db()
    try:
        db = await get_db()
        total = await db.listings.count_documents({})
        active_total = await db.listings.count_documents({"status": "active"})

        # Fetch only the fields the report prints or inspects.
        projection = {
            "title": 1,
            "location": 1,
            "address": 1,
            "status": 1,
            "listing_type": 1,
            "latitude": 1,
            "longitude": 1,
            "created_at": 1,
            "createdAt": 1,
        }
        # NOTE(review): the projection also fetches "createdAt", but only
        # "created_at" drives the ordering -- confirm which spelling the
        # collection actually uses.
        cursor = db.listings.find({}, projection).sort("created_at", -1)

        missing = []
        async for doc in cursor:
            problem = _geo_problem(doc)
            if problem:
                missing.append((problem, doc))

        print("=== Listings Missing Usable Geolocation ===")
        print(f"Total listings: {total}")
        print(f"Active listings: {active_total}")
        print(f"Need geolocation: {len(missing)}")
        print()

        # Status keys are stringified so an unexpected non-string status can
        # neither break hashing nor make sorted() compare mixed types.
        by_status = Counter(
            str(doc.get("status") or "unknown") for _, doc in missing
        )
        by_problem = Counter(problem for problem, _ in missing)

        print("By status:")
        for status, count in sorted(by_status.items()):
            print(f" {status}: {count}")
        print()

        print("By problem:")
        for problem, count in sorted(by_problem.items()):
            print(f" {problem}: {count}")
        print()

        for idx, (problem, doc) in enumerate(missing, start=1):
            # Coerce to str first: a malformed record may carry a non-string
            # title, and the audit should report it rather than crash.
            title = str(doc.get("title") or "No title").replace("\n", " ")[:80]
            print(f"{idx}. {title}")
            print(f" id: {doc.get('_id')}")
            print(f" status: {doc.get('status') or 'unknown'}")
            print(f" type: {doc.get('listing_type') or 'unknown'}")
            print(f" location: {doc.get('location')}")
            print(f" address: {doc.get('address')}")
            print(f" latitude: {doc.get('latitude')!r}")
            print(f" longitude: {doc.get('longitude')!r}")
            print(f" problem: {problem}")
            print()
    finally:
        await disconnect_db()
# Script entry point: run the audit only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())