"""
Script to restore valid http image links from `data/books_basic_info.csv` into `data/books.db`.
This fixes the issue where processed data contained local file paths instead of original network URLs.
"""
import sqlite3
import pandas as pd
from pathlib import Path
import sys
# Add project root to path so `src.*` imports resolve when run as a script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.append(str(PROJECT_ROOT))
from src.config import DATA_DIR
from src.utils import setup_logger
# Module-level logger tagged with this script's name.
logger = setup_logger("fix_covers")
def fix_covers():
    """Restore http image URLs from books_basic_info.csv into books.db.

    The processed DB ended up holding local file paths instead of the
    original network URLs. This re-reads the source CSV, keeps only rows
    whose `image` field is a real http(s) link, and writes that URL into
    both the `thumbnail` and `image` columns of the `books` table.

    Matching note: some DB rows are known to store a 10-digit ISBN in the
    `isbn13` column, so every ISBN we have (10- and 13-digit) is matched
    against BOTH `isbn13` and `isbn10` columns in the DB.
    """
    db_path = DATA_DIR / "books.db"
    csv_path = DATA_DIR / "books_basic_info.csv"

    if not db_path.exists():
        logger.error(f"Database not found at {db_path}")
        return
    if not csv_path.exists():
        logger.error(f"Source CSV not found at {csv_path}")
        return

    logger.info("Loading source CSV (this may take a moment)...")
    try:
        # Restrict to the three needed columns up front to save memory.
        df = pd.read_csv(csv_path, usecols=["isbn10", "isbn13", "image"], dtype=str)
    except ValueError:
        # Fallback if the column names differ from the expected set.
        df = pd.read_csv(csv_path, dtype=str)

    # The fallback read may lack the column we need; fail clearly instead
    # of raising a KeyError below.
    if "image" not in df.columns:
        logger.error("CSV has no 'image' column; nothing to restore.")
        return

    logger.info(f"Loaded {len(df)} rows from CSV.")

    # Keep only rows whose image field is an http(s) network URL.
    df = df[df["image"].str.startswith("http", na=False)]
    logger.info(f"Found {len(df)} rows with valid http links.")

    # Build (url, url, isbn, isbn) parameter tuples for:
    #   UPDATE books SET thumbnail = ?, image = ? WHERE isbn13 = ? OR isbn10 = ?
    updates = []
    for _, row in df.iterrows():
        url = row["image"]
        candidates = set()
        for key in ("isbn13", "isbn10"):
            value = str(row.get(key, "")).strip()
            # Reject pandas NaN stringified as "nan" and junk/too-short IDs.
            if len(value) > 5 and value.lower() != "nan":
                candidates.add(value)
        for isbn in candidates:
            updates.append((url, url, isbn, isbn))

    logger.info(f"Prepared {len(updates)} update params.")

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # Batch the updates so a crash mid-run loses at most one chunk of
        # progress reporting; the single commit below keeps it atomic.
        chunk_size = 10000
        for i in range(0, len(updates), chunk_size):
            chunk = updates[i:i + chunk_size]
            cursor.executemany(
                "UPDATE books SET thumbnail = ?, image = ? "
                "WHERE isbn13 = ? OR isbn10 = ?",
                chunk,
            )
            if i % 50000 == 0:
                logger.info(f"Processed {i} updates...")
        conn.commit()
        logger.info("Update complete.")

        # Sanity check: count rows that now carry a network URL.
        cursor.execute("SELECT count(*) FROM books WHERE thumbnail LIKE 'http%'")
        count = cursor.fetchone()[0]
        logger.info(f"Total books with http thumbnails now: {count}")
    finally:
        # Always release the DB handle, even if an update fails.
        conn.close()
# Script entry point: run the one-shot fix when executed directly.
if __name__ == "__main__":
    fix_covers()