"""
Script to restore valid http image links from `data/books_basic_info.csv` into `data/books.db`.
This fixes the issue where processed data contained local file paths instead of original network URLs.
"""
import sqlite3
import pandas as pd
from pathlib import Path
import sys
# Add project root to path so `src.*` imports resolve when run as a script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.append(str(PROJECT_ROOT))
from src.config import DATA_DIR
from src.utils import setup_logger
# Module-level logger tagged with this script's name.
logger = setup_logger("fix_covers")
def fix_covers():
    """Restore http image URLs from books_basic_info.csv into books.db.

    The processed DB ended up holding local file paths instead of the
    original network URLs. This re-reads the source CSV, keeps only rows
    whose `image` field is a real http(s) link, and writes that URL into
    both the `thumbnail` and `image` columns of the `books` table.

    Matching note: some DB rows are known to store a 10-digit ISBN in the
    `isbn13` column, so every ISBN we have (10- and 13-digit) is matched
    against BOTH `isbn13` and `isbn10` columns in the DB.
    """
    db_path = DATA_DIR / "books.db"
    csv_path = DATA_DIR / "books_basic_info.csv"

    if not db_path.exists():
        logger.error(f"Database not found at {db_path}")
        return
    if not csv_path.exists():
        logger.error(f"Source CSV not found at {csv_path}")
        return

    logger.info("Loading source CSV (this may take a moment)...")
    try:
        # Restrict to the three needed columns up front to save memory.
        df = pd.read_csv(csv_path, usecols=["isbn10", "isbn13", "image"], dtype=str)
    except ValueError:
        # Fallback if the column names differ from the expected set.
        df = pd.read_csv(csv_path, dtype=str)

    # The fallback read may lack the column we need; fail clearly instead
    # of raising a KeyError below.
    if "image" not in df.columns:
        logger.error("CSV has no 'image' column; nothing to restore.")
        return

    logger.info(f"Loaded {len(df)} rows from CSV.")

    # Keep only rows whose image field is an http(s) network URL.
    df = df[df["image"].str.startswith("http", na=False)]
    logger.info(f"Found {len(df)} rows with valid http links.")

    # Build (url, url, isbn, isbn) parameter tuples for:
    #   UPDATE books SET thumbnail = ?, image = ? WHERE isbn13 = ? OR isbn10 = ?
    updates = []
    for _, row in df.iterrows():
        url = row["image"]
        candidates = set()
        for key in ("isbn13", "isbn10"):
            value = str(row.get(key, "")).strip()
            # Reject pandas NaN stringified as "nan" and junk/too-short IDs.
            if len(value) > 5 and value.lower() != "nan":
                candidates.add(value)
        for isbn in candidates:
            updates.append((url, url, isbn, isbn))

    logger.info(f"Prepared {len(updates)} update params.")

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # Batch the updates so a crash mid-run loses at most one chunk of
        # progress reporting; the single commit below keeps it atomic.
        chunk_size = 10000
        for i in range(0, len(updates), chunk_size):
            chunk = updates[i:i + chunk_size]
            cursor.executemany(
                "UPDATE books SET thumbnail = ?, image = ? "
                "WHERE isbn13 = ? OR isbn10 = ?",
                chunk,
            )
            if i % 50000 == 0:
                logger.info(f"Processed {i} updates...")
        conn.commit()
        logger.info("Update complete.")

        # Sanity check: count rows that now carry a network URL.
        cursor.execute("SELECT count(*) FROM books WHERE thumbnail LIKE 'http%'")
        count = cursor.fetchone()[0]
        logger.info(f"Total books with http thumbnails now: {count}")
    finally:
        # Always release the DB handle, even if an update fails.
        conn.close()
# Script entry point: run the one-shot fix when executed directly.
if __name__ == "__main__":
    fix_covers()