# app/services/project_gutenberg.py import httpx, logging, re, urllib.parse from tenacity import retry, stop_after_attempt, wait_fixed logger = logging.getLogger("book-query") GUTENDEX = "https://gutendex.com/books/?search=" # Query for items return @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) async def search(q: str): """Return at most 5 PDF-downloadable results from Gutendex.""" url = f"{GUTENDEX}{urllib.parse.quote_plus(q)}" async with httpx.AsyncClient(timeout=10, follow_redirects=True) as client: r = await client.get(url) r.raise_for_status() books = r.json().get("results", [])[:10] results = [] for b in books: pdf_link = next( (v for k, v in b["formats"].items() if k.lower().endswith("pdf")), None ) # Link not from public details if not pdf_link: try: # Attempt fallback hardcoded PDF URL fallback_url = f"https://www.gutenberg.org/files/{b['id']}/{b['id']}-pdf.pdf" async with httpx.AsyncClient(timeout=5) as client: head_resp = await client.head(fallback_url) if head_resp.status_code == 200: pdf_link = fallback_url # PDF not accessible from except Exception as e: logger.debug(f"[GUT] fallback failed for {b['id']}: {e}") # Fallback book not having preview/download url from both details and hardcode method if not pdf_link: logger.debug(f"[GUT] skipped (no PDF): {b['title']}") continue # Final JSON results.append( { "title": b["title"], "author": ", ".join(a["name"] for a in b["authors"]), "edition": "", # not supplied "year": b.get("copyright_year"), "source": "gutenberg", "isbn": "", "download_available": True, "download_url": pdf_link, "ref": {"id": b["id"]}, } ) logger.info(f"[GUT] returned {len(results)} pdf titles for “{q}”") return results # Fetch items async def fetch(ref: dict): """For import: return direct PDF link if available via Gutendex or fallback to Gutenberg.""" gid = ref.get("id") if not gid: return None # Trailing to preview page for PDF confirmation gutendex_url = f"https://gutendex.com/books/{gid}/" # ensure trailing slash try: async with httpx.AsyncClient(timeout=10, follow_redirects=True) as client: r = await client.get(gutendex_url) if r.status_code == 200: data = r.json() pdf_link = next( (v for k, v in data["formats"].items() if k.lower().endswith("pdf")), None ) if pdf_link: return {"download_available": True, "download_url": pdf_link} except Exception as e: logger.warning(f"[GUT] Gutendex metadata failed for {gid}: {e}") # Fallback to static Gutenberg URL fallback_url = f"https://www.gutenberg.org/files/{gid}/{gid}-pdf.pdf" async with httpx.AsyncClient(timeout=10) as client: head = await client.head(fallback_url) if head.status_code == 200: logger.info(f"[GUT] Using fallback PDF: {fallback_url}") return {"download_available": True, "download_url": fallback_url} # Log logger.warning(f"[GUT] No PDF for book {gid}") return None