"""Spot-check the scraped satellite database against the live source site.

Compares row counts in the local SQLite DB (``countries``, ``categories``,
``satellites`` tables) with counts freshly scraped from
https://space.skyrocket.de. Mismatches are logged as warnings; network or
parse failures are logged as errors and never abort the remaining checks.
"""

import random
import sqlite3
import sys

import requests
from bs4 import BeautifulSoup
from loguru import logger

# Setup logging: single stderr sink at INFO with a compact format.
logger.remove()
logger.add(sys.stderr, format="{level: <8} | {message}", level="INFO")

DB_PATH = "data/satellites.db"
BASE_URL = "https://space.skyrocket.de/directories/"
# Browser-like UA: the site may reject the default python-requests agent.
HEADERS = {"User-Agent": "Mozilla/5.0"}


def _fetch_soup(url):
    """GET *url* and return its parsed lxml soup.

    Raises ``requests.HTTPError`` on non-2xx responses (previously only
    ``verify_countries`` checked the status; the other verifiers would
    silently parse error pages) and ``requests.Timeout`` after 30s.
    """
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")


def verify_countries(conn):
    """Compare the site's country index length against the ``countries`` table.

    Args:
        conn: open ``sqlite3.Connection`` to the scraped database.
    """
    logger.info("VERIFYING COUNTRIES...")
    url = "https://space.skyrocket.de/directories/sat_c.htm"
    try:
        soup = _fetch_soup(url)
        # Count on page
        page_count = len(soup.select("ul.country-list li"))
        # Count in DB
        cursor = conn.cursor()
        db_count = cursor.execute("SELECT COUNT(*) FROM countries").fetchone()[0]
        if page_count == db_count:
            logger.info(f"✅ Countries match! Page: {page_count}, DB: {db_count}")
        else:
            logger.warning(f"❌ Countries mismatch! Page: {page_count}, DB: {db_count}")
    except Exception as e:
        # Best-effort check: log and continue with the other verifiers.
        logger.error(f"Error checking countries: {e}")


def verify_categories(conn, country="China"):
    """Compare category links on a country page against the ``categories`` table.

    On a mismatch, also lists category names present on the page but absent
    from the DB.

    Args:
        conn: open ``sqlite3.Connection``.
        country: country name as stored in the ``countries`` table.
    """
    logger.info(f"VERIFYING CATEGORIES FOR {country}...")
    # Get country URL from DB
    cursor = conn.cursor()
    row = cursor.execute(
        "SELECT url FROM countries WHERE country_name=?", (country,)
    ).fetchone()
    if not row:
        logger.error(f"Country {country} not found in DB")
        return
    url = row[0]
    try:
        soup = _fetch_soup(url)
        table = soup.find("table", class_="index")
        if not table:
            logger.warning("No category table found on page")
            return
        # Count links inside the table
        page_links = table.select("ul li a")
        page_count = len(page_links)
        # Count in DB
        db_count = cursor.execute(
            "SELECT COUNT(*) FROM categories WHERE country_name=?", (country,)
        ).fetchone()[0]
        if page_count == db_count:
            logger.info(
                f"✅ Categories match for {country}! Page: {page_count}, DB: {db_count}"
            )
        else:
            logger.warning(
                f"❌ Categories mismatch for {country}! Page: {page_count}, DB: {db_count}"
            )
            # Optional: Find which ones are missing
            db_cats = [
                r[0]
                for r in cursor.execute(
                    "SELECT category_name FROM categories WHERE country_name=?",
                    (country,),
                ).fetchall()
            ]
            page_cats = [link.text.strip() for link in page_links]
            missing = set(page_cats) - set(db_cats)
            if missing:
                logger.warning(f"Missing in DB: {missing}")
    except Exception as e:
        logger.error(f"Error checking categories: {e}")


def verify_satellites(conn, country="China", sample_size=3):
    """Heuristically compare satellite counts for a random sample of categories.

    For each sampled category page, counts anchors pointing at
    ``doc_sdat`` satellite pages (skipping ones whose parent is marked
    ``cancelled``) and compares against the ``satellites`` table.

    Args:
        conn: open ``sqlite3.Connection``.
        country: country name as stored in the DB.
        sample_size: max number of categories to spot-check.
    """
    logger.info(f"VERIFYING SATELLITES FOR {country}...")
    cursor = conn.cursor()
    categories = cursor.execute(
        "SELECT category_name, url FROM categories WHERE country_name=?", (country,)
    ).fetchall()
    # Sample without replacement; min() guards against small/empty category lists.
    sampled_categories = random.sample(categories, min(sample_size, len(categories)))
    for cat_name, url in sampled_categories:
        logger.info(f"  Checking Category: {cat_name}")
        try:
            soup = _fetch_soup(url)
            table = soup.find("table", class_="index")
            if not table:
                logger.warning("  No satellite table found query skipping")
                continue
            # Heuristic page count: column layout varies between pages (the
            # satellite column may be index 2 or 3), so instead of walking
            # rows we count every link in the table that targets a satellite
            # detail page (href contains "doc_sdat"), excluding entries whose
            # parent element is flagged "cancelled" — mirrors what the
            # scraper inserts. (The original code also ran a per-row loop
            # whose result was never used; removed as dead work.)
            anchors = table.find_all("a")
            valid_links = [
                a
                for a in anchors
                if "doc_sdat" in a.get("href", "")
                and "cancelled" not in a.parent.get("class", [])
            ]
            page_sat_count = len(valid_links)
            # DB Count
            db_count = cursor.execute(
                "SELECT COUNT(*) FROM satellites WHERE country_name=? AND category_name=?",
                (country, cat_name),
            ).fetchone()[0]
            if page_sat_count == db_count:
                logger.info(f"    ✅ Match! Page: {page_sat_count}, DB: {db_count}")
            else:
                # Note: heuristic might be imperfect if the scraper ignores
                # some specific links or the table structure is unusual.
                logger.warning(
                    f"    ⚠️ Mismatch. Page (heuristic): {page_sat_count}, DB: {db_count}"
                )
        except Exception as e:
            logger.error(f"    Error: {e}")


if __name__ == "__main__":
    conn = sqlite3.connect(DB_PATH)
    try:
        verify_countries(conn)
        verify_categories(conn)
        verify_satellites(conn)
    finally:
        # Ensure the DB handle is released even if a verifier raises.
        conn.close()