import logging import os import urllib.parse from concurrent.futures import ThreadPoolExecutor from functools import lru_cache from typing import Optional import difflib import requests logger = logging.getLogger(__name__) PLACEHOLDER_IMAGES = [ "https://images.unsplash.com/photo-1543002588-bfa74002ed7e?w=300&h=450&fit=crop", "https://images.unsplash.com/photo-1512820790803-83ca734da794?w=300&h=450&fit=crop", "https://images.unsplash.com/photo-1495446815901-a7297e633e8d?w=300&h=450&fit=crop", ] def _strings_are_similar(s1: str, s2: str, threshold: float = 0.6) -> bool: """Check if two strings are similar using sequence matching or containment.""" if not s1 or not s2: return False s1, s2 = s1.lower(), s2.lower() # Check for containment (handles substrings like "Harry Potter" in "Harry Potter and the...") if s1 in s2 or s2 in s1: return True return difflib.SequenceMatcher(None, s1, s2).ratio() > threshold def ensure_dir_exists(file_path: str): """Ensures that the directory for a given file path exists.""" try: output_dir = os.path.dirname(file_path) if output_dir: os.makedirs(output_dir, exist_ok=True) except OSError as e: logger.error(f"Error creating directory for {file_path}: {e}") raise @lru_cache(maxsize=256) def get_cover_url_multi_source(title: str, author: str) -> str: """ Fetch book cover from multiple sources with fallback chain. Priority order: 1. Google Books API (best quality, most reliable) 2. Open Library API 3. Beautiful placeholder from Unsplash """ cover = _get_cover_from_google_books(title, author) if cover: return cover cover = _get_cover_from_openlibrary(title, author) if cover: return cover # Return None so the frontend can render a generated gradient cover return None def _get_cover_from_google_books(title: str, author: str) -> Optional[str]: """Fetch cover from Google Books API.""" try: query = f"{title} {author}".strip() encoded_query = urllib.parse.quote(query) base_url = "https://www.googleapis.com/books/v1/volumes" url = f"{base_url}?q={encoded_query}&maxResults=1" # Add API key if available to avoid rate limiting api_key = os.getenv("GOOGLE_BOOKS_API_KEY") if api_key: url += f"&key={api_key}" response = requests.get(url, timeout=5) response.raise_for_status() data = response.json() if data.get("totalItems", 0) > 0: items = data.get("items", []) if items and "volumeInfo" in items[0]: volume_info = items[0]["volumeInfo"] # Validate match to avoid false positives found_title = volume_info.get("title", "") if not _strings_are_similar(title, found_title): logger.info(f"Google Books mismatch: queried '{title}', got '{found_title}'. Skipping.") return None image_links = volume_info.get("imageLinks", {}) for size in ["large", "medium", "small", "thumbnail", "smallThumbnail"]: if size in image_links: cover_url: str = image_links[size] cover_url = cover_url.replace("http://", "https://") logger.info(f"Found Google Books cover for '{title}'") return cover_url return None except Exception as e: logger.debug(f"Google Books API failed for '{title}': {e}") return None def _get_cover_from_openlibrary(title: str, author: str) -> Optional[str]: """Fetch cover from Open Library API.""" try: search_url = ( f"https://openlibrary.org/search.json?title={urllib.parse.quote(title)}&author={urllib.parse.quote(author)}" ) response = requests.get(search_url, timeout=5) response.raise_for_status() data = response.json() if data.get("numFound", 0) > 0: docs = data.get("docs", []) if docs: for doc in docs: if "isbn" in doc and doc["isbn"]: isbn = doc["isbn"][0] cover_url = f"https://covers.openlibrary.org/b/isbn/{isbn}-L.jpg" logger.info(f"Found Open Library cover for '{title}'") return cover_url return None except Exception as e: logger.debug(f"Open Library API failed for '{title}': {e}") return None def load_book_covers_batch(books): """Pre-fetch covers in batch, using existing URLs if available.""" results = {} books_to_fetch = [] for book in books: # Check if we already have a valid URL from our enriched dataset existing_url = book.get("cover_image_url") if existing_url and isinstance(existing_url, str) and len(existing_url) > 10: results[book["title"]] = existing_url else: books_to_fetch.append(book) if not books_to_fetch: return results with ThreadPoolExecutor(max_workers=10) as executor: futures = { executor.submit(get_cover_url_multi_source, book["title"], book.get("authors", "")): book for book in books_to_fetch } for future in futures: book = futures[future] try: results[book["title"]] = future.result() except Exception as e: logger.error(f"Error loading cover for {book['title']}: {e}") results[book["title"]] = PLACEHOLDER_IMAGES[0] return results def fetch_book_cover(title: str, author: str) -> Optional[str]: """ Fetch book cover from multiple sources with fallback chain. Priority order: 1. Google Books API (best quality, most reliable) 2. Open Library API 3. Placeholder image Args: title (str): Book title author (str): Book author Returns: Optional[str]: URL to book cover or None if not found """ return get_cover_url_multi_source(title, author)