import logging
import os
import urllib.parse
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
from typing import Optional
import difflib

import requests


# Module-level logger; handlers/level are configured by the application.
logger = logging.getLogger(__name__)

# Generic book-photo URLs used as a last-resort fallback by
# load_book_covers_batch when a cover fetch raises.
PLACEHOLDER_IMAGES = [
    "https://images.unsplash.com/photo-1543002588-bfa74002ed7e?w=300&h=450&fit=crop",
    "https://images.unsplash.com/photo-1512820790803-83ca734da794?w=300&h=450&fit=crop",
    "https://images.unsplash.com/photo-1495446815901-a7297e633e8d?w=300&h=450&fit=crop",
]


def _strings_are_similar(s1: str, s2: str, threshold: float = 0.6) -> bool:
    """Check if two strings are similar using sequence matching or containment."""
    if not s1 or not s2:
        return False
    s1, s2 = s1.lower(), s2.lower()
    # Check for containment (handles substrings like "Harry Potter" in "Harry Potter and the...")
    if s1 in s2 or s2 in s1:
        return True
    return difflib.SequenceMatcher(None, s1, s2).ratio() > threshold


def ensure_dir_exists(file_path: str):
    """Create the parent directory of ``file_path`` if it does not exist.

    Paths with no directory component (bare filenames) are a no-op.
    Logs and re-raises any OSError from directory creation.
    """
    output_dir = os.path.dirname(file_path)
    if not output_dir:
        return
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        logger.error(f"Error creating directory for {file_path}: {e}")
        raise


@lru_cache(maxsize=256)
def get_cover_url_multi_source(title: str, author: str) -> Optional[str]:
    """
    Fetch a book cover URL, trying sources in priority order.

    Priority order:
    1. Google Books API (best quality, most reliable)
    2. Open Library API
    3. None — so the frontend can render a generated gradient cover.

    Results are memoized per (title, author). NOTE: None results are cached
    too, so a transient network failure sticks for the process lifetime.

    Args:
        title: Book title to search for.
        author: Book author to search for.

    Returns:
        Optional[str]: Cover image URL, or None if no source matched.
    """
    for source in (_get_cover_from_google_books, _get_cover_from_openlibrary):
        cover = source(title, author)
        if cover:
            return cover
    return None


def _get_cover_from_google_books(title: str, author: str) -> Optional[str]:
    """Query the Google Books volumes API for a cover image URL.

    Returns an https image link for the largest available size, or None
    when the request fails, nothing matches, or the top hit's title is
    too different from the requested one.
    """
    try:
        encoded_query = urllib.parse.quote(f"{title} {author}".strip())
        url = f"https://www.googleapis.com/books/v1/volumes?q={encoded_query}&maxResults=1"

        # An API key is optional but helps avoid rate limiting.
        api_key = os.getenv("GOOGLE_BOOKS_API_KEY")
        if api_key:
            url = f"{url}&key={api_key}"

        response = requests.get(url, timeout=5)
        response.raise_for_status()
        payload = response.json()

        if payload.get("totalItems", 0) <= 0:
            return None
        items = payload.get("items", [])
        if not items or "volumeInfo" not in items[0]:
            return None

        volume_info = items[0]["volumeInfo"]
        found_title = volume_info.get("title", "")
        # Guard against false positives: the top hit must resemble the query.
        if not _strings_are_similar(title, found_title):
            logger.info(f"Google Books mismatch: queried '{title}', got '{found_title}'. Skipping.")
            return None

        image_links = volume_info.get("imageLinks", {})
        # Prefer the largest rendition Google offers.
        for size in ("large", "medium", "small", "thumbnail", "smallThumbnail"):
            if size in image_links:
                # Google sometimes serves plain-http links; upgrade them.
                secure_url = image_links[size].replace("http://", "https://")
                logger.info(f"Found Google Books cover for '{title}'")
                return secure_url

        return None
    except Exception as e:
        logger.debug(f"Google Books API failed for '{title}': {e}")
        return None


def _get_cover_from_openlibrary(title: str, author: str) -> Optional[str]:
    """Fetch a cover URL from the Open Library search + covers APIs.

    Searches by title and author, then builds a covers.openlibrary.org URL
    from the first matching document that carries an ISBN.

    Args:
        title: Book title to search for.
        author: Book author to search for.

    Returns:
        Optional[str]: Cover image URL, or None on failure / no match.
    """
    try:
        search_url = (
            f"https://openlibrary.org/search.json?title={urllib.parse.quote(title)}&author={urllib.parse.quote(author)}"
        )

        response = requests.get(search_url, timeout=5)
        response.raise_for_status()
        data = response.json()

        for doc in data.get("docs", []):
            # Validate the match, consistent with the Google Books path,
            # so we don't return a cover for an unrelated book.
            if not _strings_are_similar(title, doc.get("title", "")):
                continue
            isbns = doc.get("isbn")
            if isbns:
                # -L suffix requests the large cover rendition.
                cover_url = f"https://covers.openlibrary.org/b/isbn/{isbns[0]}-L.jpg"
                logger.info(f"Found Open Library cover for '{title}'")
                return cover_url

        return None
    except Exception as e:
        logger.debug(f"Open Library API failed for '{title}': {e}")
        return None


def load_book_covers_batch(books):
    """Resolve cover URLs for a batch of book dicts.

    Books that already carry a plausible ``cover_image_url`` are used
    as-is; the rest are fetched concurrently via
    get_cover_url_multi_source.

    Args:
        books: Iterable of dicts with at least a "title" key; "authors"
            and "cover_image_url" are read when present.

    Returns:
        dict: title -> cover URL. Values may be None when no source
            matched, or a placeholder when the fetch raised.
            NOTE(review): duplicate titles overwrite each other — confirm
            titles are unique upstream.
    """
    results = {}
    books_to_fetch = []

    for book in books:
        # Trust a pre-enriched URL; the length check filters junk values.
        existing_url = book.get("cover_image_url")
        if existing_url and isinstance(existing_url, str) and len(existing_url) > 10:
            results[book["title"]] = existing_url
        else:
            books_to_fetch.append(book)

    if not books_to_fetch:
        return results

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {
            executor.submit(get_cover_url_multi_source, book["title"], book.get("authors", "")): book
            for book in books_to_fetch
        }

        for future in futures:
            book = futures[future]
            try:
                results[book["title"]] = future.result()
            except Exception as e:
                logger.error(f"Error loading cover for {book['title']}: {e}")
                # NOTE(review): successful lookups may yield None while this
                # error path yields a placeholder — confirm callers handle both.
                results[book["title"]] = PLACEHOLDER_IMAGES[0]

    return results


def fetch_book_cover(title: str, author: str) -> Optional[str]:
    """
    Fetch a book cover URL. Thin wrapper around get_cover_url_multi_source
    (and therefore memoized per (title, author)).

    Priority order:
    1. Google Books API (best quality, most reliable)
    2. Open Library API

    Args:
        title (str): Book title
        author (str): Book author

    Returns:
        Optional[str]: URL to book cover, or None if no source matched —
        callers are expected to render their own placeholder in that case.
    """
    return get_cover_url_multi_source(title, author)