Spaces:

nice-bill
/

deepshelf-api

Sleeping

App Files Files Community

deepshelf-api / src /book_recommender /utils.py

nice-bill

initial commit

cdb73a8 3 months ago

raw

history blame contribute delete

6.24 kB

	import logging
	import os
	import urllib.parse
	from concurrent.futures import ThreadPoolExecutor
	from functools import lru_cache
	from typing import Optional
	import difflib

	import requests


	logger = logging.getLogger(__name__)

	# Stock Unsplash photos, all requested with the same 300x450 crop.
	# In the code visible here only the first entry is used, as the fallback
	# when a batch cover lookup raises.
	_PLACEHOLDER_PHOTO_IDS = (
	    "photo-1543002588-bfa74002ed7e",
	    "photo-1512820790803-83ca734da794",
	    "photo-1495446815901-a7297e633e8d",
	)
	PLACEHOLDER_IMAGES = [
	    f"https://images.unsplash.com/{photo_id}?w=300&h=450&fit=crop"
	    for photo_id in _PLACEHOLDER_PHOTO_IDS
	]


	def _strings_are_similar(s1: str, s2: str, threshold: float = 0.6) -> bool:
	    """Return True when two strings plausibly name the same thing.

	    The comparison is case-insensitive. Two strings are similar when one
	    contains the other (so "Harry Potter" matches "Harry Potter and the ..."),
	    or when their ``difflib.SequenceMatcher`` ratio is strictly greater than
	    ``threshold``. An empty (or otherwise falsy) argument is never similar
	    to anything.
	    """
	    if not (s1 and s2):
	        return False

	    left = s1.lower()
	    right = s2.lower()

	    # Containment lets a short title match a longer one that adds a subtitle.
	    if left in right or right in left:
	        return True

	    similarity = difflib.SequenceMatcher(None, left, right).ratio()
	    return similarity > threshold


	def ensure_dir_exists(file_path: str):
	    """Create the parent directory of ``file_path`` when it is missing.

	    A path with no directory component (a bare filename) is a no-op.
	    Already-existing directories are accepted silently.

	    Raises:
	        OSError: if the directory cannot be created; the error is logged
	            before being re-raised.
	    """
	    try:
	        parent_dir = os.path.dirname(file_path)
	        if not parent_dir:
	            return
	        os.makedirs(parent_dir, exist_ok=True)
	    except OSError as err:
	        logger.error(f"Error creating directory for {file_path}: {err}")
	        raise


	@lru_cache(maxsize=256)
	def get_cover_url_multi_source(title: str, author: str) -> Optional[str]:
	    """
	    Look up a book cover URL, trying each source in priority order.

	    Priority order:
	    1. Google Books API
	    2. Open Library API

	    No placeholder image is substituted here: when neither source yields a
	    cover the result is None, so the frontend can render a generated
	    gradient cover instead.

	    Results are memoized per (title, author) pair by ``lru_cache``, so both
	    arguments must be hashable. None results are memoized as well, and the
	    source helpers turn request errors into None, so a miss caused by a
	    temporary failure is remembered until the entry is evicted or
	    ``get_cover_url_multi_source.cache_clear()`` is called.

	    Args:
	        title (str): Book title
	        author (str): Book author

	    Returns:
	        Optional[str]: URL of the cover, or None if no source found one
	    """
	    cover = _get_cover_from_google_books(title, author)
	    if cover:
	        return cover

	    cover = _get_cover_from_openlibrary(title, author)
	    if cover:
	        return cover

	    # Return None so the frontend can render a generated gradient cover
	    return None


	def _get_cover_from_google_books(title: str, author: str) -> Optional[str]:
	    """Look up a cover image URL through the Google Books volumes API.

	    Searches for "<title> <author>" and inspects only the top hit
	    (``maxResults=1``). The hit is rejected when its title is not similar to
	    the requested one. Otherwise the first available image link, in order of
	    preference from "large" down to "smallThumbnail", is returned with
	    ``http://`` rewritten to ``https://``.

	    When the ``GOOGLE_BOOKS_API_KEY`` environment variable is set, its value
	    is appended to the request URL.

	    Returns:
	        The cover URL, or None when there is no hit, the hit's title does
	        not match, it carries no image link, or any error occurs. Errors
	        are logged at debug level and never raised.
	    """
	    size_preference = ("large", "medium", "small", "thumbnail", "smallThumbnail")

	    try:
	        search_terms = urllib.parse.quote(f"{title} {author}".strip())
	        request_url = f"https://www.googleapis.com/books/v1/volumes?q={search_terms}&maxResults=1"

	        # Add API key if available to avoid rate limiting
	        key = os.getenv("GOOGLE_BOOKS_API_KEY")
	        if key:
	            request_url += f"&key={key}"

	        reply = requests.get(request_url, timeout=5)
	        reply.raise_for_status()
	        payload = reply.json()

	        if not (payload.get("totalItems", 0) > 0):
	            return None

	        volumes = payload.get("items", [])
	        if not volumes or "volumeInfo" not in volumes[0]:
	            return None

	        info = volumes[0]["volumeInfo"]

	        # Validate match to avoid false positives
	        matched_title = info.get("title", "")
	        if not _strings_are_similar(title, matched_title):
	            logger.info(f"Google Books mismatch: queried '{title}', got '{matched_title}'. Skipping.")
	            return None

	        links = info.get("imageLinks", {})
	        best_size = next((size for size in size_preference if size in links), None)
	        if best_size is None:
	            return None

	        secure_url: str = links[best_size]
	        secure_url = secure_url.replace("http://", "https://")
	        logger.info(f"Found Google Books cover for '{title}'")
	        return secure_url
	    except Exception as err:
	        logger.debug(f"Google Books API failed for '{title}': {err}")
	        return None


	def _get_cover_from_openlibrary(title: str, author: str) -> Optional[str]:
	    """Fetch a cover URL from the Open Library search API.

	    Search results are scanned in the order returned. For each result the
	    cover ID (``cover_i``) is preferred, because it is only present when a
	    cover is on file. The first ISBN is used as a fallback; per the Open
	    Library Covers API an ISBN with no cover serves a blank image, so that
	    URL is a best-effort guess.

	    Args:
	        title: Book title. An empty title returns None without a request.
	        author: Book author. An empty or None author is left out of the
	            search instead of being sent as an empty criterion.

	    Returns:
	        URL of the large ("-L") cover image, or None when nothing usable is
	        found or any error occurs. Errors are logged at debug level and
	        never raised.
	    """
	    if not title:
	        return None

	    try:
	        criteria = {"title": title, "author": author}
	        params = {name: value for name, value in criteria.items() if value}
	        # Ask explicitly for the two fields read below rather than relying
	        # on the API's default field set.
	        params["fields"] = "cover_i,isbn"
	        query = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
	        search_url = f"https://openlibrary.org/search.json?{query}"

	        response = requests.get(search_url, timeout=5)
	        response.raise_for_status()
	        data = response.json()

	        for doc in data.get("docs") or []:
	            cover_id = doc.get("cover_i")
	            # Only a positive integer is treated as a real cover ID.
	            if isinstance(cover_id, int) and cover_id > 0:
	                cover_url = f"https://covers.openlibrary.org/b/id/{cover_id}-L.jpg"
	            elif doc.get("isbn"):
	                isbn = doc["isbn"][0]
	                cover_url = f"https://covers.openlibrary.org/b/isbn/{isbn}-L.jpg"
	            else:
	                continue

	            logger.info(f"Found Open Library cover for '{title}'")
	            return cover_url

	        return None
	    except Exception as e:
	        logger.debug(f"Open Library API failed for '{title}': {e}")
	        return None


	def load_book_covers_batch(books):
	    """Resolve a cover URL for every book, fetching only the missing ones.

	    A book whose ``cover_image_url`` is a string longer than 10 characters
	    keeps that URL. Every other book is looked up through
	    ``get_cover_url_multi_source`` on a pool of up to 10 worker threads,
	    using its ``title`` and ``authors`` (default ``""``) entries.

	    Args:
	        books: Iterable of dict-like records; ``title`` is read by key,
	            ``cover_image_url`` and ``authors`` via ``.get``.

	    Returns:
	        Dict mapping each book's title to its cover URL. A lookup that
	        finds nothing maps to None; a lookup that raises maps to the first
	        placeholder image. Books sharing a title share one entry.
	    """
	    covers = {}
	    missing = []

	    for entry in books:
	        # Check if we already have a valid URL from our enriched dataset
	        known_url = entry.get("cover_image_url")
	        if known_url and isinstance(known_url, str) and len(known_url) > 10:
	            covers[entry["title"]] = known_url
	            continue
	        missing.append(entry)

	    if not missing:
	        return covers

	    with ThreadPoolExecutor(max_workers=10) as pool:
	        submitted = {}
	        for entry in missing:
	            job = pool.submit(get_cover_url_multi_source, entry["title"], entry.get("authors", ""))
	            submitted[job] = entry

	        # Collect in submission order; result() blocks until each job is done.
	        for job, entry in submitted.items():
	            try:
	                covers[entry["title"]] = job.result()
	            except Exception as err:
	                logger.error(f"Error loading cover for {entry['title']}: {err}")
	                covers[entry["title"]] = PLACEHOLDER_IMAGES[0]

	    return covers


	def fetch_book_cover(title: str, author: str) -> Optional[str]:
	    """
	    Fetch a book cover URL by delegating to get_cover_url_multi_source.

	    Priority order of the underlying lookup:
	    1. Google Books API
	    2. Open Library API

	    No placeholder image is substituted: when neither source finds a cover
	    the result is None.

	    Args:
	        title (str): Book title
	        author (str): Book author

	    Returns:
	        Optional[str]: URL to book cover or None if not found
	    """
	    return get_cover_url_multi_source(title, author)