# sheami/tests/dedup_trends.py
# Uploaded by vikramvasudevan using huggingface_hub (commit 9d8f87b, verified).
from modules.db import SheamiDB
# Module-level database handle. Starts as None and is replaced with a
# SheamiDB() instance the first time deduplicate_trend_data() runs.
db = None  # global placeholder
def _max_entry_per_date(trend_data: list) -> list:
    """Return one entry per date, keeping the entry with the largest value.

    Each entry is expected to be a mapping with ``"date"`` and ``"value"``
    keys. Values are compared after conversion with ``float()``. Entries
    whose value cannot be converted (non-numeric string, ``None``, or any
    other unsupported type) are reported and left out of the result.

    The result preserves the order in which each date was first seen; on a
    tie the earliest entry for that date is kept.
    """
    # date -> (parsed value, original entry). Keeping the parsed value next
    # to the entry avoids re-running float() on the current maximum for
    # every comparison.
    best_by_date = {}
    for entry in trend_data:
        date = entry["date"]
        try:
            value = float(entry["value"])
        except (TypeError, ValueError) as e:
            # TypeError covers None / non-string, non-numeric values, which
            # would otherwise abort the whole run.
            print("Error converting str to float ", e)
            continue
        current = best_by_date.get(date)
        if current is None or value > current[0]:
            best_by_date[date] = (value, entry)
    return [entry for _, entry in best_by_date.values()]


async def deduplicate_trend_data() -> None:
    """Collapse duplicate dates in the ``trend_data`` list of every trends document.

    Iterates over everything returned by ``db.trends.find({})``. For each
    document, the ``trend_data`` list is reduced to one entry per date (the
    one with the highest numeric value). A document is written back with
    ``$set`` only when the reduced list is shorter than the stored one.

    NOTE: entries whose value cannot be parsed as a float are dropped from
    the reduced list, so a document containing such entries is rewritten
    without them even if it had no duplicate dates.

    The module-level ``db`` handle is created on first use. The number of
    rewritten documents is printed at the end.
    """
    global db
    if db is None:
        db = SheamiDB()

    updated_count = 0
    async for doc in db.trends.find({}):
        # ``or []`` also covers a document whose trend_data is explicitly null.
        trend_data = doc.get("trend_data") or []
        deduped_trend_data = _max_entry_per_date(trend_data)

        # Only update if changes are made
        if len(deduped_trend_data) != len(trend_data):
            await db.trends.update_one(
                {"_id": doc["_id"]},
                {"$set": {"trend_data": deduped_trend_data}},
            )
            updated_count += 1

    print(f"Updated {updated_count} documents.")
# Script entry point: when the file is executed directly, run a single
# deduplication pass to completion on a fresh asyncio event loop.
if __name__ == "__main__":
    import asyncio

    dedup_job = deduplicate_trend_data()
    asyncio.run(dedup_job)