Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import requests | |
| import time | |
| import logging | |
| from pathlib import Path | |
# Configure root logging once for the script: timestamp, severity, message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Location of the prepared book catalogue that this script reads and rewrites.
DATA_DIR = Path("data/raw")
INPUT_FILE = DATA_DIR.joinpath("books_prepared.csv")
def get_openlibrary_cover(title, author):
    """Look up a cover image URL for a book via the Open Library search API.

    The title is cleaned before searching: ``&`` is removed and everything
    from the first ``(`` onwards is dropped. Only the part of ``author``
    before the first comma is used.

    Args:
        title: Book title. Non-string values (e.g. ``None`` or the float NaN
            that pandas uses for an empty CSV cell) are treated as missing.
        author: Author string, or a missing value (``None``, NaN, empty), in
            which case the search is done by title only.

    Returns:
        A URL of the form
        ``https://covers.openlibrary.org/b/id/<cover_i>-L.jpg`` built from the
        first search hit, or ``None`` when the title is unusable, the request
        fails, the response status is not 200, there are no hits, or the first
        hit carries no cover id.
    """
    # Without a usable title the search cannot identify the book; querying by
    # author alone would return an arbitrary book's cover, so bail out early.
    if not isinstance(title, str):
        return None
    clean_title = title.replace('&', '').split('(')[0].strip()
    if not clean_title:
        return None

    params = {"title": clean_title, "limit": 1}
    # A missing author must not raise (NaN is truthy and has no .split);
    # fall back to a title-only search instead.
    if isinstance(author, str):
        clean_author = author.split(',')[0].strip()
        if clean_author:
            params["author"] = clean_author

    try:
        # Let requests build the query string so that spaces, '#', '?', '&'
        # and non-ASCII characters are URL-encoded correctly.
        response = requests.get(
            "https://openlibrary.org/search.json",
            params=params,
            timeout=5,
        )
        if response.status_code != 200:
            return None

        docs = response.json().get("docs")
        if docs:
            cover_id = docs[0].get("cover_i")
            if cover_id is not None:
                return f"https://covers.openlibrary.org/b/id/{cover_id}-L.jpg"
    except Exception as e:
        # Deliberately broad: this is best-effort enrichment, so any failure
        # (network error, invalid JSON, unexpected payload shape) should skip
        # this book rather than abort the caller's batch.
        logger.warning("Error fetching cover for %s: %s", title, e)
    return None
def enrich_data(batch_size=20, delay=0.2):
    """Fill in missing ``cover_image_url`` values in ``INPUT_FILE``.

    Reads the CSV at ``INPUT_FILE``, selects rows whose ``cover_image_url`` is
    NaN or an empty string, looks up a cover for the first ``batch_size`` of
    them with ``get_openlibrary_cover`` (using the ``title`` and ``authors``
    columns), and writes the frame back to the same file.

    NOTE: rows for which no cover is found stay empty, so they are selected
    again (and first) on the next run. If the leading rows have no cover on
    Open Library, repeated runs with a small ``batch_size`` will not advance
    past them; raise ``batch_size`` or pass ``None`` in that case.

    Args:
        batch_size: Maximum number of rows to look up in this run. ``None``
            means no limit. Defaults to 20.
        delay: Seconds to sleep between consecutive lookups. Defaults to 0.2.

    Returns:
        None. If ``INPUT_FILE`` does not exist an error is logged and nothing
        is written.
    """
    if not INPUT_FILE.exists():
        logger.error(f"File not found: {INPUT_FILE}")
        return

    df = pd.read_csv(INPUT_FILE)
    logger.info(f"Loaded {len(df)} books.")

    if "cover_image_url" not in df.columns:
        df["cover_image_url"] = None
    # A column whose cells are all empty is parsed as float64; storing a URL
    # string into it is deprecated and an error in newer pandas, so make the
    # column object-typed up front.
    df["cover_image_url"] = df["cover_image_url"].astype(object)

    # Rows without a cover: NaN or empty string.
    mask = df["cover_image_url"].isna() | (df["cover_image_url"] == "")
    indices = df[mask].index
    logger.info(f"Found {len(indices)} books missing covers.")

    # Only a bounded batch is processed per run; the script can be re-run or
    # called with a larger batch_size.
    batch = indices if batch_size is None else indices[:max(batch_size, 0)]
    total = len(batch)

    try:
        for position, idx in enumerate(batch, start=1):
            title = df.at[idx, 'title']
            author = df.at[idx, 'authors']
            logger.info(f"[{position}/{total}] Fetching cover for: {title}")

            cover_url = get_openlibrary_cover(title, author)
            if cover_url:
                df.at[idx, 'cover_image_url'] = cover_url
                logger.info(f" -> Found: {cover_url}")
            else:
                logger.info(" -> No cover found.")

            # Polite delay between requests; not needed after the last one.
            if position < total:
                time.sleep(delay)
    finally:
        # Save even when the loop is interrupted (e.g. Ctrl-C) so covers
        # fetched so far are not lost.
        df.to_csv(INPUT_FILE, index=False)
        logger.info(f"Saved enriched data to {INPUT_FILE}")
# Script entry point: run one enrichment batch when executed directly,
# but do nothing when this module is merely imported.
if __name__ == "__main__":
    enrich_data()