File size: 4,219 Bytes
eb63144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""
Script to restore valid http image links from `data/books_basic_info.csv` into `data/books.db`.
This fixes the issue where processed data contained local file paths instead of original network URLs.
"""
import sqlite3
import pandas as pd
from pathlib import Path
import sys

# Add project root to path
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.append(str(PROJECT_ROOT))

from src.config import DATA_DIR
from src.utils import setup_logger

logger = setup_logger("fix_covers")

def fix_covers():
    """Restore original http image URLs from the source CSV into the books DB.

    Reads ``books_basic_info.csv``, keeps only rows whose ``image`` column is
    an http(s) URL, and writes that URL into both the ``thumbnail`` and
    ``image`` columns of ``books.db``. Each available ISBN is matched against
    BOTH the ``isbn13`` and ``isbn10`` DB columns, because some DB rows appear
    to store 10-digit ISBNs in the ``isbn13`` column.

    Logs progress via the module logger; returns None. Exits early (with an
    error log) if either the database or the source CSV is missing.
    """
    db_path = DATA_DIR / "books.db"
    csv_path = DATA_DIR / "books_basic_info.csv"

    if not db_path.exists():
        logger.error(f"Database not found at {db_path}")
        return

    if not csv_path.exists():
        logger.error(f"Source CSV not found at {csv_path}")
        return

    logger.info("Loading source CSV (this may take a moment)...")
    # Read only the needed columns to save memory; fall back to reading
    # everything if the expected column names are absent.
    try:
        df = pd.read_csv(csv_path, usecols=["isbn10", "isbn13", "image"], dtype=str)
    except ValueError:
        df = pd.read_csv(csv_path, dtype=str)

    logger.info(f"Loaded {len(df)} rows from CSV.")

    # Keep only rows whose image is a network URL (drops local file paths).
    df = df[df["image"].str.startswith("http", na=False)]
    logger.info(f"Found {len(df)} rows with valid http links.")

    # Build (url, url, isbn, isbn) parameter tuples for:
    #   UPDATE books SET thumbnail = ?, image = ? WHERE isbn13 = ? OR isbn10 = ?
    # Matching every ISBN we have against both DB columns is deliberately
    # aggressive — see the docstring note about mixed-up ISBN columns.
    updates = []
    logger.info("Preparing updates...")
    for _, row in df.iterrows():
        url = row["image"]
        candidates = set()
        for raw in (row.get("isbn13"), row.get("isbn10")):
            value = str(raw).strip() if raw is not None else ""
            # Heuristic validity check: non-trivial length and not a
            # stringified NaN from the dtype=str read.
            if len(value) > 5 and value.lower() != "nan":
                candidates.add(value)
        for isbn in candidates:
            updates.append((url, url, isbn, isbn))

    logger.info(f"Prepared {len(updates)} update params.")

    # Open the connection only now that the parameters are ready, and make
    # sure it is closed even if an update raises.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        chunk_size = 10000
        for i in range(0, len(updates), chunk_size):
            chunk = updates[i:i + chunk_size]
            cursor.executemany(
                "UPDATE books SET thumbnail = ?, image = ? WHERE isbn13 = ? OR isbn10 = ?",
                chunk,
            )
            # Periodic progress log; skip the pointless "Processed 0" line.
            if i and i % 50000 == 0:
                logger.info(f"Processed {i} updates...")

        conn.commit()
        logger.info("Update complete.")

        # Sanity check: how many rows now carry an http thumbnail.
        cursor.execute("SELECT count(*) FROM books WHERE thumbnail LIKE 'http%'")
        count = cursor.fetchone()[0]
        logger.info(f"Total books with http thumbnails now: {count}")
    finally:
        conn.close()

# Script entry point: run the one-off cover-URL repair when executed directly.
if __name__ == "__main__":
    fix_covers()