# NOTE(review): removed non-code hosting-UI residue ("Spaces: / Running / Running")
# that was captured in a copy-paste; it is not part of the script.
| """ | |
| update_data.py | |
| Enhanced Data Updater Script with: | |
| - Time-based pruning (30 days) | |
| - Robust multi-source fetching | |
| """ | |
| import os | |
| import sys | |
| from datetime import datetime, timedelta, timezone | |
| # Ensure the root directory is in the path | |
| sys.path.append(os.path.dirname(os.path.abspath(__file__))) | |
| from project.config import FAISS_FILE | |
| from project.database import init_db, clear_db, get_total_evidence_count, prune_old_evidence | |
| from model import ( | |
| embed_model, fetch_rss, fetch_gdelt, fetch_newsapi, | |
| fetch_wikipedia, fetch_duckduckgo, build_faiss | |
| ) | |
def update_evidence_database():
    """Refresh the evidence store end-to-end.

    Pipeline (4 steps, mirrored in the progress messages):
        1. Initialize the SQLite database.
        2. Prune evidence older than 30 days.
        3. Fetch fresh evidence for each seed topic from all connected
           sources (RSS, GDELT, NewsAPI, Wikipedia).
        4. Rebuild the FAISS vector index from SQLite.

    Prints a summary of newly fetched vs. total indexed chunks; no return
    value. Per-topic fetch failures are reported and skipped so one bad
    topic cannot abort the whole run.
    """
    print("=========================================")
    print("   Proofly - Data Updater Script")
    print("=========================================\n")

    print("[1/4] Initializing database...")
    init_db()
    before_count = get_total_evidence_count()
    print(f"Existing evidence count: {before_count}")

    print("\n[2/4] Pruning old data (30+ days)...")
    deleted = prune_old_evidence(days=30)
    print(f"Removed {deleted} outdated records.")

    # Broad query seeds covering news, science, tech, health, and culture.
    # (A stray empty-string entry was removed: it triggered a pointless
    # fetch cycle with an empty query against every source.)
    seed_topics = [
        "breaking news today",
        "weekly global news summary",
        "news in India politics economy technology",
        "US China Europe international relations updates",
        "latest global news and international relations",
        "geopolitics conflicts diplomacy world leaders",
        "global economy inflation stock markets central banks",
        "business trends startups mergers acquisitions corporate news",
        "latest technology trends AI machine learning robotics",
        "cybersecurity data breaches and internet privacy",
        "semiconductors chips industry and hardware innovation",
        "space exploration NASA ESA missions satellites astronomy",
        "scientific discoveries physics climate research biology",
        "climate change global warming renewable energy sustainability",
        "natural disasters earthquakes floods hurricanes wildfire news",
        "global health updates diseases vaccines medical research",
        "biotechnology pharma industry and healthcare systems",
        "military developments defense technology global conflicts",
        "social media trends internet culture digital society",
        "education systems policies global learning developments",
        "international sports events football cricket olympics news",
        "movies tv shows celebrities entertainment industry news",
        "basic physics chemistry biology fundamental concepts",
        "earth science planets solar system geography facts",
        "Indian cities climate change and weather patterns",
    ]

    print("\n[3/4] Fetching new data from all connected sources...")
    for idx, topic in enumerate(seed_topics, start=1):
        print(f"\n--- Fetching Topic {idx}/{len(seed_topics)}: '{topic}' ---")
        # One embedding per topic, reused by every source fetcher below.
        claim_emb = embed_model.encode([topic], normalize_embeddings=True)
        try:
            fetch_rss(claim_emb)
            fetch_gdelt(topic, claim_emb)
            fetch_newsapi(topic, claim_emb)
            fetch_wikipedia(topic)
        except Exception as e:
            # Best-effort: log and continue with the next topic.
            print(f"❌ Error fetching topic {topic}: {e}")

    print("\n[4/4] Building clean FAISS Vector Index from SQLite...")
    success = build_faiss()
    if success:
        after_count = get_total_evidence_count()
        # Net growth plus what pruning removed = chunks fetched this run.
        new_data = after_count - before_count + deleted
        print(f"\n✅ UPDATE COMPLETE!")
        print(f"   Newly Fetched Chunks: {new_data}")
        print(f"   Total Indexed Chunks: {after_count}")
    else:
        print("\n❌ FAISS UPDATE FAILED! No evidence was gathered to index.")
| if __name__ == "__main__": | |
| update_evidence_database() | |