# proofly / update_data.py
# Author: Pragthedon — "Initial backend API deployment" (commit 4f48a4e)
"""
update_data.py
Enhanced Data Updater Script with:
- Time-based pruning (30 days)
- Robust multi-source fetching
"""
import os
import sys
from datetime import datetime, timedelta, timezone
# Ensure the root directory is in the path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from project.config import FAISS_FILE
from project.database import init_db, clear_db, get_total_evidence_count, prune_old_evidence
from model import (
embed_model, fetch_rss, fetch_gdelt, fetch_newsapi,
fetch_wikipedia, fetch_duckduckgo, build_faiss
)
def update_evidence_database():
    """Refresh the evidence store end-to-end.

    Workflow (all side effects; returns ``None``):
      1. ``init_db()``            — ensure the SQLite schema exists.
      2. ``prune_old_evidence()`` — delete records older than 30 days.
      3. For each seed topic: embed the topic text and fetch fresh
         evidence from RSS, GDELT, NewsAPI and Wikipedia. A failure on
         one topic is logged and does not abort the remaining topics.
      4. ``build_faiss()``        — rebuild the FAISS vector index from
         whatever is now in SQLite.
    """
    print("=========================================")
    print(" Proofly - Data Updater Script")
    print("=========================================\n")

    print("[1/4] Initializing database...")
    init_db()
    before_count = get_total_evidence_count()
    print(f"Existing evidence count: {before_count}")

    print("\n[2/4] Pruning old data (30+ days)...")
    deleted = prune_old_evidence(days=30)
    print(f"Removed {deleted} outdated records.")

    seed_topics = [
        "breaking news today",
        "weekly global news summary",
        "news in India politics economy technology",
        "US China Europe international relations updates",
        "latest global news and international relations",
        "geopolitics conflicts diplomacy world leaders",
        "global economy inflation stock markets central banks",
        "business trends startups mergers acquisitions corporate news",
        "latest technology trends AI machine learning robotics",
        "cybersecurity data breaches and internet privacy",
        "semiconductors chips industry and hardware innovation",
        "space exploration NASA ESA missions satellites astronomy",
        "scientific discoveries physics climate research biology",
        "climate change global warming renewable energy sustainability",
        "natural disasters earthquakes floods hurricanes wildfire news",
        "global health updates diseases vaccines medical research",
        "biotechnology pharma industry and healthcare systems",
        "military developments defense technology global conflicts",
        "social media trends internet culture digital society",
        "education systems policies global learning developments",
        "international sports events football cricket olympics news",
        "movies tv shows celebrities entertainment industry news",
        "basic physics chemistry biology fundamental concepts",
        "earth science planets solar system geography facts",
        "Indian cities climate change and weather patterns",
    ]
    # Guard against blank entries (the original list ended with an empty
    # string, which would trigger a pointless fetch cycle with an empty query).
    seed_topics = [t for t in seed_topics if t.strip()]

    print("\n[3/4] Fetching new data from all connected sources...")
    for idx, topic in enumerate(seed_topics):
        print(f"\n--- Fetching Topic {idx+1}/{len(seed_topics)}: '{topic}' ---")
        claim_emb = embed_model.encode([topic], normalize_embeddings=True)
        try:
            fetch_rss(claim_emb)
            fetch_gdelt(topic, claim_emb)
            fetch_newsapi(topic, claim_emb)
            fetch_wikipedia(topic)
        except Exception as e:
            # Best-effort: one failing source/topic must not abort the full run.
            print(f"❌ Error fetching topic {topic}: {e}")

    print("\n[4/4] Building clean FAISS Vector Index from SQLite...")
    success = build_faiss()
    if success:
        after_count = get_total_evidence_count()
        # Newly fetched = net growth of the table plus the rows pruned earlier.
        new_data = after_count - before_count + deleted
        print("\n✅ UPDATE COMPLETE!")
        print(f"   Newly Fetched Chunks: {new_data}")
        print(f"   Total Indexed Chunks: {after_count}")
    else:
        print("\n❌ FAISS UPDATE FAILED! No evidence was gathered to index.")
# Script entry point: run a full evidence-database update when executed directly.
if __name__ == "__main__":
    update_evidence_database()