# deepshelf-api/scripts/enrich_book_covers.py
# Revision: cdb73a8 ("initial commit", by nice-bill)
import pandas as pd
import requests
import time
import logging
from pathlib import Path
# Root logger: timestamped records at INFO level and above.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Raw-data locations. The paths are relative, so they resolve against the
# directory the script is launched from.
DATA_DIR = Path("data") / "raw"
INPUT_FILE = DATA_DIR.joinpath("books_prepared.csv")
def get_openlibrary_cover(title, author):
    """Look up a large cover-image URL for a book via Open Library search.

    The title is simplified before searching: ampersands are dropped and
    everything from the first "(" onward is cut off. Only the first
    comma-separated part of the author string is used.

    Args:
        title: Book title. A non-string value (for example the NaN pandas
            uses for an empty cell) or a title that is blank after cleaning
            yields ``None`` without any network request.
        author: Author string, possibly comma-separated. When it is missing,
            non-string, or blank, the search is done by title only.

    Returns:
        The ``-L`` cover URL on covers.openlibrary.org built from the first
        search hit's ``cover_i`` id, or ``None`` when there is no hit, the
        hit has no cover id, the response is not HTTP 200, or the request
        or JSON decoding fails.
    """
    if not isinstance(title, str):
        return None

    # Simple cleaning
    clean_title = title.replace('&', '').split('(')[0].strip()
    if not clean_title:
        # An empty title would turn this into an author-only (or empty)
        # search whose first hit is unrelated to the book.
        return None

    params = {"title": clean_title, "limit": 1}
    if isinstance(author, str):
        clean_author = author.split(',')[0].strip()
        if clean_author:
            params["author"] = clean_author

    try:
        # Let requests build the query string so characters such as '#',
        # '?', '+' or '&' in the values are percent-encoded instead of
        # corrupting the URL.
        response = requests.get(
            "https://openlibrary.org/search.json",
            params=params,
            timeout=5,
        )
        if response.status_code != 200:
            logger.warning(
                "Open Library returned HTTP %s for %s",
                response.status_code,
                title,
            )
            return None
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Best effort: a failed lookup must not abort the whole batch.
        logger.warning("Error fetching cover for %s: %s", title, e)
        return None

    docs = data.get("docs") if isinstance(data, dict) else None
    if not isinstance(docs, list) or not docs:
        return None

    first_hit = docs[0]
    cover_id = first_hit.get("cover_i") if isinstance(first_hit, dict) else None
    if cover_id is None:
        return None
    return f"https://covers.openlibrary.org/b/id/{cover_id}-L.jpg"
def enrich_data(batch_size: int = 20) -> None:
    """Fill in missing ``cover_image_url`` values in the prepared books CSV.

    Reads ``INPUT_FILE``, calls ``get_openlibrary_cover`` for up to
    ``batch_size`` rows whose ``cover_image_url`` is NaN or an empty string,
    and writes the table back to the same file.

    Args:
        batch_size: Maximum number of books to look up in one run. Defaults
            to 20. Pass a larger value, or run the script repeatedly, to
            cover more rows. Negative values are treated as 0.

    Returns:
        None. A missing input file or missing ``title``/``authors`` columns
        are logged as errors and the function returns without writing.

    Note:
        Rows for which no cover is found stay empty, so they are selected
        again on the next run.
    """
    if not INPUT_FILE.exists():
        logger.error(f"File not found: {INPUT_FILE}")
        return

    df = pd.read_csv(INPUT_FILE)
    logger.info(f"Loaded {len(df)} books.")

    absent = [col for col in ("title", "authors") if col not in df.columns]
    if absent:
        logger.error(
            "Missing required column(s) in %s: %s", INPUT_FILE, ", ".join(absent)
        )
        return

    if "cover_image_url" not in df.columns:
        df["cover_image_url"] = None
    # A column that is entirely empty in the CSV is parsed as float64; force
    # object dtype so URL strings can be stored in it.
    df["cover_image_url"] = df["cover_image_url"].astype(object)

    # Rows without a cover: NaN or empty string.
    mask = df["cover_image_url"].isna() | (df["cover_image_url"] == "")
    indices = df[mask].index
    logger.info(f"Found {len(indices)} books missing covers.")

    # Only a bounded batch is processed per run to keep the run short.
    batch = indices[:max(batch_size, 0)]
    total = len(batch)

    for position, idx in enumerate(batch, start=1):
        title = df.at[idx, "title"]
        author = df.at[idx, "authors"]

        logger.info(f"[{position}/{total}] Fetching cover for: {title}")
        cover_url = get_openlibrary_cover(title, author)

        if cover_url:
            df.at[idx, "cover_image_url"] = cover_url
            logger.info(f" -> Found: {cover_url}")
        else:
            logger.info(" -> No cover found.")

        time.sleep(0.2)  # Polite delay

    # Save back. Write to a sibling temp file first and then swap it in, so
    # an interrupted write cannot leave the input CSV truncated.
    tmp_file = INPUT_FILE.with_name(INPUT_FILE.name + ".tmp")
    df.to_csv(tmp_file, index=False)
    tmp_file.replace(INPUT_FILE)
    logger.info(f"Saved enriched data to {INPUT_FILE}")
# Script entry point: run one enrichment batch when executed directly;
# importing this module does not trigger it.
if __name__ == "__main__":
    enrich_data()