import sqlite3
import requests
from bs4 import BeautifulSoup
from loguru import logger
import sys
# Setup logging: replace loguru's default sink with a plain stderr sink
# that prints only the padded level name and the message, at INFO and above.
logger.remove()
logger.add(sys.stderr, format="{level: <8} | {message}", level="INFO")
# Path to the SQLite database produced by the scraper (relative to the CWD).
DB_PATH = "data/satellites.db"
# Root of the Gunter's Space Page directory listings.
# NOTE(review): appears unused in this file — confirm before removing.
BASE_URL = "https://space.skyrocket.de/directories/"
def verify_countries(conn):
    """Compare the country count on the index page with the ``countries`` table.

    Fetches the country directory page, counts its list entries, compares
    that to ``COUNT(*)`` of the ``countries`` table, and logs match/mismatch.

    Args:
        conn: An open :mod:`sqlite3` connection to the scraper database.

    Returns:
        None. All outcomes (including fetch/DB errors) are reported via logging.
    """
    logger.info("VERIFYING COUNTRIES...")
    url = "https://space.skyrocket.de/directories/sat_c.htm"
    try:
        # timeout prevents the script from hanging forever on a stalled server
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "lxml")
        # Count of countries listed on the page.
        page_count = len(soup.select("ul.country-list li"))
        # Count of countries stored in the DB.
        cursor = conn.cursor()
        db_count = cursor.execute("SELECT COUNT(*) FROM countries").fetchone()[0]
        if page_count == db_count:
            logger.info(f"✅ Countries match! Page: {page_count}, DB: {db_count}")
        else:
            logger.warning(f"❌ Countries mismatch! Page: {page_count}, DB: {db_count}")
    except Exception as e:
        # Best-effort check: log and continue so the other verifications still run.
        logger.error(f"Error checking countries: {e}")
def verify_categories(conn, country="China"):
    """Compare category counts for one country between its page and the DB.

    Looks up the country's directory URL in the ``countries`` table, counts
    the category links in the page's index table, and compares against the
    ``categories`` table. Also logs any category names present on the page
    but missing from the DB.

    Args:
        conn: An open :mod:`sqlite3` connection to the scraper database.
        country: Country name to verify (must exist in ``countries``).

    Returns:
        None. Results and errors are reported via logging.
    """
    logger.info(f"VERIFYING CATEGORIES FOR {country}...")
    # Resolve the country's directory URL from the DB.
    cursor = conn.cursor()
    row = cursor.execute("SELECT url FROM countries WHERE country_name=?", (country,)).fetchone()
    if not row:
        logger.error(f"Country {country} not found in DB")
        return
    url = row[0]
    try:
        # timeout prevents hanging; raise_for_status catches HTTP errors that
        # would otherwise silently produce a page count of 0 (fix: was missing).
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "lxml")
        table = soup.find("table", class_="index")
        if not table:
            logger.warning("No category table found on page")
            return
        # Count category links inside the index table.
        page_links = table.select("ul li a")
        page_count = len(page_links)
        # Count of categories stored in the DB for this country.
        db_count = cursor.execute(
            "SELECT COUNT(*) FROM categories WHERE country_name=?", (country,)
        ).fetchone()[0]
        if page_count == db_count:
            logger.info(f"✅ Categories match for {country}! Page: {page_count}, DB: {db_count}")
        else:
            logger.warning(f"❌ Categories mismatch for {country}! Page: {page_count}, DB: {db_count}")
            # Name the categories that are on the page but absent from the DB.
            db_cats = [
                r[0]
                for r in cursor.execute(
                    "SELECT category_name FROM categories WHERE country_name=?", (country,)
                ).fetchall()
            ]
            page_cats = [l.text.strip() for l in page_links]
            missing = set(page_cats) - set(db_cats)
            if missing:
                logger.warning(f"Missing in DB: {missing}")
    except Exception as e:
        # Best-effort check: log and continue.
        logger.error(f"Error checking categories: {e}")
def verify_satellites(conn, country="China", sample_size=3):
    """Spot-check satellite counts for a random sample of a country's categories.

    For up to ``sample_size`` randomly chosen categories of ``country``,
    fetches the category page, counts satellite detail links (hrefs
    containing ``doc_sdat``, excluding rows whose parent element carries a
    ``cancelled`` class), and compares against the ``satellites`` table.

    The page count is a heuristic: the scraper's exact column logic is not
    reproduced here, so small mismatches may be false positives.

    Args:
        conn: An open :mod:`sqlite3` connection to the scraper database.
        country: Country whose categories are sampled.
        sample_size: Maximum number of categories to check.

    Returns:
        None. Results and errors are reported via logging.
    """
    import random  # local import: only needed for sampling in this check

    logger.info(f"VERIFYING SATELLITES FOR {country}...")
    cursor = conn.cursor()
    categories = cursor.execute(
        "SELECT category_name, url FROM categories WHERE country_name=?", (country,)
    ).fetchall()
    # min() guards against sample_size exceeding the available categories.
    sampled_categories = random.sample(categories, min(sample_size, len(categories)))
    for cat_name, url in sampled_categories:
        logger.info(f" Checking Category: {cat_name}")
        try:
            # timeout + raise_for_status: avoid hangs and silently counting a
            # 404/error page as zero satellites (fix: both were missing).
            resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "lxml")
            table = soup.find("table", class_="index")
            if not table:
                logger.warning(" No satellite table found, skipping")
                continue
            # Heuristic satellite count: detail links use href="../doc_sdat/...".
            # Fix: the original recomputed this whole-table count once per <tr>
            # (discarding every result but the last) and built an unused `cols`
            # list per row — the dead per-row loop is removed.
            links = table.find_all("a")
            valid_links = [
                l
                for l in links
                if "doc_sdat" in l.get("href", "")
                and "cancelled" not in l.parent.get("class", [])
            ]
            page_sat_count = len(valid_links)
            # Count of satellites stored in the DB for this country+category.
            db_count = cursor.execute(
                "SELECT COUNT(*) FROM satellites WHERE country_name=? AND category_name=?",
                (country, cat_name),
            ).fetchone()[0]
            if page_sat_count == db_count:
                logger.info(f" ✅ Match! Page: {page_sat_count}, DB: {db_count}")
            else:
                # Heuristic may be imperfect if the scraper skips some links
                # or the table structure is unusual.
                logger.warning(f" ⚠️ Mismatch. Page (heuristic): {page_sat_count}, DB: {db_count}")
        except Exception as e:
            # Best-effort check: log and continue with the next category.
            logger.error(f" Error: {e}")
if __name__ == "__main__":
    # Run all three verifications against the scraper's database.
    conn = sqlite3.connect(DB_PATH)
    try:
        verify_countries(conn)
        verify_categories(conn)
        verify_satellites(conn)
    finally:
        # Fix: close the connection even if a verification raises; the
        # original leaked the handle on any uncaught exception.
        conn.close()