""" Script to restore valid http image links from `data/books_basic_info.csv` into `data/books.db`. This fixes the issue where processed data contained local file paths instead of original network URLs. """ import sqlite3 import pandas as pd from pathlib import Path import sys # Add project root to path PROJECT_ROOT = Path(__file__).resolve().parent.parent sys.path.append(str(PROJECT_ROOT)) from src.config import DATA_DIR from src.utils import setup_logger logger = setup_logger("fix_covers") def fix_covers(): db_path = DATA_DIR / "books.db" csv_path = DATA_DIR / "books_basic_info.csv" if not db_path.exists(): logger.error(f"Database not found at {db_path}") return if not csv_path.exists(): logger.error(f"Source CSV not found at {csv_path}") return logger.info("Loading source CSV (this may take a moment)...") # Read strict columns to save memory try: df = pd.read_csv(csv_path, usecols=["isbn10", "isbn13", "image"], dtype=str) except ValueError: # Fallback if names differ df = pd.read_csv(csv_path, dtype=str) logger.info(f"Loaded {len(df)} rows from CSV.") # Filter for valid images only df = df[df["image"].str.startswith("http", na=False)] logger.info(f"Found {len(df)} rows with valid http links.") conn = sqlite3.connect(db_path) cursor = conn.cursor() updated_count = 0 # We'll prioritize ISBN13, then ISBN10 # Prepare batch update updates = [] logger.info("Preparing updates...") for _, row in df.iterrows(): url = row["image"] isbn13 = row.get("isbn13") isbn10 = row.get("isbn10") if pd.notna(isbn13) and str(isbn13).strip(): updates.append((url, str(isbn13))) elif pd.notna(isbn10) and str(isbn10).strip(): # If we updates based on isbn10, we need a different query or handle it # Let's simple try to update by isbn10 if isbn13 matches nothing? # Actually, let's just create a list of (url, id, type) pass # Strategy: # The DB looks like it might have 10-digit ISBNs in the 'isbn13' column for some rows. # So we use the valid ISBN from CSV (finding either 10 or 13) and match against BOTH columns in DB. updates = [] for _, row in df.iterrows(): url = row["image"] # Get whatever ID we have i13 = str(row.get("isbn13", "")).strip() i10 = str(row.get("isbn10", "")).strip() # Helper: is this a valid-looking ID? has_13 = len(i13) > 5 and i13.lower() != "nan" has_10 = len(i10) > 5 and i10.lower() != "nan" # We will try to match whatever we have against BOTH columns in DB to be safe ids_to_try = [] if has_13: ids_to_try.append(i13) if has_10: ids_to_try.append(i10) # If we have both, we probably only need to queue one update set that tries both # But separate updates are safer if specific logic handles it # Actually, let's just create a flattened list of (url, id_to_match) # And run query: UPDATE books SET thumbnail = ? WHERE isbn13 = ? OR isbn10 = ? for pid in set(ids_to_try): updates.append((url, url, pid, pid)) logger.info(f"Prepared {len(updates)} update params.") # Executing aggressive update # Note: We update `thumbnail` AND `image` columns. chunk_size = 10000 for i in range(0, len(updates), chunk_size): chunk = updates[i:i + chunk_size] cursor.executemany("UPDATE books SET thumbnail = ?, image = ? WHERE isbn13 = ? OR isbn10 = ?", chunk) if i % 50000 == 0: logger.info(f"Processed {i} updates...") # conn.commit() # Optional commit/checkpoint conn.commit() logger.info("Update complete. Vacuuming...") # Check result cursor.execute("SELECT count(*) FROM books WHERE thumbnail LIKE 'http%'") count = cursor.fetchone()[0] logger.info(f"Total books with http thumbnails now: {count}") conn.close() if __name__ == "__main__": fix_covers()