Spaces:

nice-bill
/

deepshelf-api

Sleeping

App Files Files Community

deepshelf-api / scripts /enrich_book_covers.py

nice-bill

initial commit

cdb73a8 3 months ago

raw

history blame contribute delete

2.54 kB

	import pandas as pd
	import requests
	import time
	import logging
	from pathlib import Path

	# Script-wide logging: timestamped, level-tagged records at INFO and above.
	logging.basicConfig(
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)

	# Location of the prepared book catalogue that this script reads.
	DATA_DIR = Path("data/raw")
	INPUT_FILE = DATA_DIR.joinpath("books_prepared.csv")

	def get_openlibrary_cover(title, author):
	    """Look up a large cover image URL for a book via Open Library search.

	    Args:
	        title: Book title. "&" characters are removed and anything from the
	            first "(" onward is dropped before searching.
	        author: Author string; only the part before the first comma is used.
	            Non-string values (e.g. NaN for an empty CSV cell) are treated
	            as "no author" instead of aborting the lookup.

	    Returns:
	        The ``-L.jpg`` cover URL built from the first search hit's
	        ``cover_i`` field, or ``None`` when the title is unusable, the
	        request fails, or the first hit has no cover.
	    """
	    if not isinstance(title, str):
	        logger.warning(f"Error fetching cover for {title}: title is not a string")
	        return None

	    # Simple cleaning
	    clean_title = title.replace('&', '').split('(')[0].strip()
	    if not clean_title:
	        return None
	    clean_author = author.split(',')[0].strip() if isinstance(author, str) else ""

	    # Let requests build the query string so spaces, '&', '#', '+' and
	    # non-ASCII characters in titles/authors are percent-encoded correctly.
	    params = {"title": clean_title, "limit": 1}
	    if clean_author:
	        params["author"] = clean_author

	    try:
	        response = requests.get(
	            "https://openlibrary.org/search.json", params=params, timeout=5
	        )
	        if response.status_code != 200:
	            logger.warning(
	                f"Error fetching cover for {title}: HTTP {response.status_code}"
	            )
	            return None
	        data = response.json()
	    except (requests.RequestException, ValueError) as e:
	        # Best-effort lookup: network errors and malformed JSON are logged
	        # and reported as "no cover" so one bad row cannot stop a batch.
	        logger.warning(f"Error fetching cover for {title}: {e}")
	        return None

	    docs = data.get("docs") if isinstance(data, dict) else None
	    if not isinstance(docs, list) or not docs or not isinstance(docs[0], dict):
	        return None

	    cover_id = docs[0].get("cover_i")
	    if cover_id is None:
	        return None
	    return f"https://covers.openlibrary.org/b/id/{cover_id}-L.jpg"

	def enrich_data(batch_size=20):
	    """Fill in missing ``cover_image_url`` values in the prepared books CSV.

	    Loads ``INPUT_FILE``, calls ``get_openlibrary_cover`` for up to
	    ``batch_size`` rows whose ``cover_image_url`` is NaN or an empty string,
	    and writes the whole frame back to ``INPUT_FILE``. Run the script
	    repeatedly (or pass a larger ``batch_size``) to work through the
	    remaining rows.

	    Args:
	        batch_size: Maximum number of rows to look up in one run.

	    Returns:
	        None. A missing input file or missing ``title``/``authors`` columns
	        are logged as errors and the function returns without writing.
	    """
	    if not INPUT_FILE.exists():
	        logger.error(f"File not found: {INPUT_FILE}")
	        return

	    df = pd.read_csv(INPUT_FILE)
	    logger.info(f"Loaded {len(df)} books.")

	    missing_columns = [col for col in ("title", "authors") if col not in df.columns]
	    if missing_columns:
	        logger.error(f"Missing required column(s) in {INPUT_FILE}: {missing_columns}")
	        return

	    if "cover_image_url" not in df.columns:
	        df["cover_image_url"] = None
	    # A column with no values is parsed as float64; cast to object so that
	    # URL strings can be stored without an incompatible-dtype assignment.
	    df["cover_image_url"] = df["cover_image_url"].astype(object)

	    # Rows without covers: NaN or empty string.
	    mask = df["cover_image_url"].isna() | (df["cover_image_url"] == "")
	    indices = df[mask].index

	    logger.info(f"Found {len(indices)} books missing covers.")

	    # Only a limited batch is processed per run to keep the run short.
	    batch = indices[:max(batch_size, 0)]
	    total = len(batch)

	    for position, idx in enumerate(batch, start=1):
	        title = df.at[idx, 'title']
	        author = df.at[idx, 'authors']

	        logger.info(f"[{position}/{total}] Fetching cover for: {title}")
	        cover_url = get_openlibrary_cover(title, author)

	        if cover_url:
	            df.at[idx, 'cover_image_url'] = cover_url
	            logger.info(f" -> Found: {cover_url}")
	        else:
	            logger.info(" -> No cover found.")

	        if position < total:
	            time.sleep(0.2)  # Polite delay between requests

	    # Save back
	    df.to_csv(INPUT_FILE, index=False)
	    logger.info(f"Saved enriched data to {INPUT_FILE}")

	# Run the enrichment only when executed as a script, not when imported.
	if __name__ == "__main__":
	    enrich_data()