# book-rec-with-LLMs / scripts / fix_covers.py
# (fix: VectorDB MetadataStore, dynamic cover fetching — commit eb63144)
"""
Script to restore valid http image links from `data/books_basic_info.csv` into `data/books.db`.
This fixes the issue where processed data contained local file paths instead of original network URLs.
"""
import sqlite3
import pandas as pd
from pathlib import Path
import sys
# Add the project root to sys.path so the `src` package resolves when this
# script is executed directly (e.g. `python scripts/fix_covers.py`).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.append(str(PROJECT_ROOT))

from src.config import DATA_DIR
from src.utils import setup_logger

# Module-level logger named after this one-off maintenance task.
logger = setup_logger("fix_covers")
def fix_covers():
    """Restore valid http image URLs from books_basic_info.csv into books.db.

    Reads the original CSV, keeps only rows whose ``image`` column is a
    network URL, and writes that URL into both the ``thumbnail`` and
    ``image`` columns of the ``books`` table. Logs progress and the final
    count of rows with http thumbnails; returns None.
    """
    db_path = DATA_DIR / "books.db"
    csv_path = DATA_DIR / "books_basic_info.csv"
    if not db_path.exists():
        logger.error(f"Database not found at {db_path}")
        return
    if not csv_path.exists():
        logger.error(f"Source CSV not found at {csv_path}")
        return

    logger.info("Loading source CSV (this may take a moment)...")
    # Read strict columns to save memory
    try:
        df = pd.read_csv(csv_path, usecols=["isbn10", "isbn13", "image"], dtype=str)
    except ValueError:
        # Fallback if column names differ from the expected schema.
        df = pd.read_csv(csv_path, dtype=str)
    logger.info(f"Loaded {len(df)} rows from CSV.")

    # Filter for valid images only (na=False drops NaN cells safely).
    df = df[df["image"].str.startswith("http", na=False)]
    logger.info(f"Found {len(df)} rows with valid http links.")

    # Strategy: the DB may carry 10-digit ISBNs in its 'isbn13' column for
    # some rows, so every candidate ISBN from the CSV is matched against
    # BOTH columns (WHERE isbn13 = ? OR isbn10 = ?) to be safe.
    updates = []
    logger.info("Preparing updates...")
    for _, row in df.iterrows():
        url = row["image"]
        i13 = str(row.get("isbn13", "")).strip()
        i10 = str(row.get("isbn10", "")).strip()
        # Keep only plausible identifiers; pandas renders missing cells
        # as the string "nan" after the str() conversion above.
        ids_to_try = {i for i in (i13, i10) if len(i) > 5 and i.lower() != "nan"}
        for pid in ids_to_try:
            # Params for: SET thumbnail = ?, image = ? WHERE isbn13 = ? OR isbn10 = ?
            updates.append((url, url, pid, pid))
    logger.info(f"Prepared {len(updates)} update params.")

    # Open the connection only now that the prep work is done, and make
    # sure it is closed even if an UPDATE raises.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # We update `thumbnail` AND `image` columns, in chunks to keep
        # executemany batches a manageable size.
        chunk_size = 10000
        for i in range(0, len(updates), chunk_size):
            chunk = updates[i:i + chunk_size]
            cursor.executemany(
                "UPDATE books SET thumbnail = ?, image = ? WHERE isbn13 = ? OR isbn10 = ?",
                chunk,
            )
            if i % 50000 == 0:
                logger.info(f"Processed {i} updates...")
        conn.commit()
        logger.info("Update complete.")

        # Sanity check: count how many rows now carry a network thumbnail.
        cursor.execute("SELECT count(*) FROM books WHERE thumbnail LIKE 'http%'")
        count = cursor.fetchone()[0]
        logger.info(f"Total books with http thumbnails now: {count}")
    finally:
        conn.close()
# Allow running this module directly as a standalone maintenance script.
if __name__ == "__main__":
    fix_covers()