"""Fill in missing book cover URLs using the Open Library search API.

Reads ``data/raw/books_prepared.csv``, looks up a cover for a limited batch
of rows whose ``cover_image_url`` is empty, and writes the result back to the
same file.  The script can be run repeatedly; each run handles the next batch
of rows that still lack a cover.
"""
import logging
import time
from pathlib import Path
from typing import Optional

import pandas as pd
import requests

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

DATA_DIR = Path("data/raw")
INPUT_FILE = DATA_DIR / "books_prepared.csv"

SEARCH_URL = "https://openlibrary.org/search.json"
COVER_URL_TEMPLATE = "https://covers.openlibrary.org/b/id/{cover_id}-L.jpg"

# Rows processed per run; kept small so a single run finishes quickly.
DEFAULT_BATCH_SIZE = 20
# Pause between consecutive requests to stay polite towards the API.
REQUEST_DELAY_SECONDS = 0.2


def _as_text(value) -> str:
    """Return *value* as a stripped string, or ``""`` when it is missing.

    pandas represents empty CSV cells as float NaN, which is truthy and has
    no string methods, so a plain ``if value`` check is not sufficient.
    """
    if isinstance(value, str):
        return value.strip()
    if value is None or pd.isna(value):
        return ""
    return str(value).strip()


def get_openlibrary_cover(title, author) -> Optional[str]:
    """Look up a large cover image URL for a book on Open Library.

    Args:
        title: Book title. Anything from the first ``(`` onwards is dropped
            and ``&`` characters are removed before searching.
        author: Author string. Only the part before the first comma is used.
            May be empty, ``None`` or NaN, in which case the search is done
            by title alone.

    Returns:
        The cover URL of the first search hit, or ``None`` when the title is
        missing, the request fails, or the hit has no cover.
    """
    clean_title = _as_text(title).replace('&', '').split('(')[0].strip()
    if not clean_title:
        # Nothing meaningful to search for; skip the network round trip.
        return None
    clean_author = _as_text(author).split(',')[0].strip()

    # Passing a params dict lets requests URL-encode spaces, '#', '?', '+',
    # and non-ASCII characters instead of interpolating them raw.
    params = {"title": clean_title, "limit": 1}
    if clean_author:
        params["author"] = clean_author

    try:
        response = requests.get(SEARCH_URL, params=params, timeout=5)
        if response.status_code != 200:
            logger.warning(
                "Open Library returned HTTP %s for %s", response.status_code, title
            )
            return None
        docs = response.json().get("docs")
        if docs:
            cover_id = docs[0].get("cover_i")
            if cover_id is not None:
                return COVER_URL_TEMPLATE.format(cover_id=cover_id)
    except Exception as e:
        # Deliberately best-effort: one failed lookup (network error, bad
        # JSON, unexpected payload) must not abort the whole batch.
        logger.warning("Error fetching cover for %s: %s", title, e)
    return None


def enrich_data(batch_size: int = DEFAULT_BATCH_SIZE) -> None:
    """Fetch covers for up to *batch_size* rows that lack one and save the CSV.

    Args:
        batch_size: Maximum number of rows to look up in this run.

    The input file is overwritten in place.  If it does not exist, an error
    is logged and nothing is written.
    """
    if not INPUT_FILE.exists():
        logger.error("File not found: %s", INPUT_FILE)
        return

    df = pd.read_csv(INPUT_FILE)
    logger.info("Loaded %d books.", len(df))

    if "cover_image_url" not in df.columns:
        df["cover_image_url"] = None
    # A fully empty column is read back as float64; force object dtype so
    # that assigning URL strings below is a valid operation.
    df["cover_image_url"] = df["cover_image_url"].astype(object)

    # A cover counts as missing when the cell is NaN or an empty string.
    mask = df["cover_image_url"].isna() | (df["cover_image_url"] == "")
    indices = df.index[mask]
    logger.info("Found %d books missing covers.", len(indices))

    # Only a bounded batch is processed per run; rerun the script (or pass a
    # larger batch_size) to work through the remaining rows.
    batch = indices[:max(batch_size, 0)]
    total = len(batch)

    for position, idx in enumerate(batch, start=1):
        title = df.at[idx, 'title']
        author = df.at[idx, 'authors']

        logger.info("[%d/%d] Fetching cover for: %s", position, total, title)
        cover_url = get_openlibrary_cover(title, author)

        if cover_url:
            df.at[idx, 'cover_image_url'] = cover_url
            logger.info("  -> Found: %s", cover_url)
        else:
            logger.info("  -> No cover found.")

        time.sleep(REQUEST_DELAY_SECONDS)  # Polite delay

    # Save back
    df.to_csv(INPUT_FILE, index=False)
    logger.info("Saved enriched data to %s", INPUT_FILE)


if __name__ == "__main__":
    enrich_data()