Spaces:

ymlin105
/

book-rec-with-LLMs

Sleeping

App Files Files Community

book-rec-with-LLMs / src /utils.py

ymlin105

feat: integrate A/B testing framework and enhance RAG diversity in recommendation system

b4bfa19 11 days ago

raw

history blame contribute delete

3.6 kB

	import logging
	import sys
	import re
	import html

	def setup_logger(name: str) -> logging.Logger:
	    """Configure and return the logger registered under ``name``.

	    The level is DEBUG when ``src.config.DEBUG`` is truthy and WARNING
	    otherwise (the project describes this as "use DEBUG=1 for verbose
	    output"; how ``src.config`` derives the flag is not visible here).
	    A stdout handler is attached only when the logger has no handlers yet,
	    so calling this repeatedly with the same name does not add duplicates.

	    Args:
	        name: Logger name passed to ``logging.getLogger``.

	    Returns:
	        The configured ``logging.Logger`` instance.
	    """
	    # Imported inside the function rather than at module level.
	    # NOTE(review): presumably to avoid an import cycle with src.config - confirm.
	    from src.config import DEBUG

	    logger = logging.getLogger(name)
	    logger.setLevel(logging.DEBUG if DEBUG else logging.WARNING)

	    if not logger.handlers:
	        handler = logging.StreamHandler(sys.stdout)
	        # Plain "|" separators: a backslash-escaped pipe is an invalid escape
	        # sequence and would emit a literal backslash in every log line.
	        handler.setFormatter(logging.Formatter("%(levelname)s | %(name)s | %(message)s"))
	        logger.addHandler(handler)
	    return logger


	def summarize_description(text: str, max_sentences: int = 2, max_chars: int = 240) -> str:
	    """Create a clean, sentence-based summary for a book description.

	    - Decodes HTML entities (e.g., ``&amp;`` → ``&``)
	    - Normalizes every run of whitespace to a single space
	    - Keeps leading complete sentences (not raw words) while they fit
	    - Falls back to a character trim with an ellipsis when no sentence fits

	    Args:
	        text: Raw description. Falsy values (``None``, ``""``) and
	            whitespace-only strings yield the placeholder ``"—"``.
	        max_sentences: Maximum number of leading sentences to keep.
	        max_chars: Character budget for the joined sentences, including
	            the single spaces between them.

	    Returns:
	        The joined leading sentences when at least one fits the budget.
	        Otherwise (first sentence longer than ``max_chars``, or
	        ``max_sentences`` below 1) the text cut at ``max_chars``; if
	        anything was cut off, the last space-separated chunk is dropped
	        and ``"…"`` is appended. The cap is soft: with the ellipsis and no
	        space to cut at, the result can be one character over.
	    """
	    if not text:
	        return "—"

	    # Decode HTML entities first so entity text never counts toward the budget.
	    cleaned = re.sub(r"\s+", " ", html.unescape(str(text))).strip()
	    if not cleaned:
	        return "—"

	    selected: list[str] = []
	    used = 0
	    # A sentence ends at '.', '!' or '?' followed by whitespace.
	    for sentence in re.split(r"(?<=[.!?])\s+", cleaned):
	        if not sentence:
	            continue
	        if len(selected) >= max_sentences:
	            break
	        # A joining space is needed before every sentence except the first,
	        # so `used` always equals the length of the joined result.
	        cost = len(sentence) + (1 if selected else 0)
	        if used + cost > max_chars:
	            break
	        selected.append(sentence)
	        used += cost

	    if selected:
	        # Already within max_chars by construction; no further trimming needed.
	        return " ".join(selected)

	    # Fallback: hard trim characters, then back off to the previous space so
	    # the ellipsis does not follow a partially cut word.
	    summary = cleaned[:max_chars].rstrip()
	    if len(cleaned) > max_chars:
	        summary = summary.rsplit(" ", 1)[0].rstrip() + "…"
	    return summary


	def enrich_book_metadata(meta: dict \| None, isbn: str) -> dict:
	"""
	Enrich book metadata with dynamic cover fetching if missing.
	Mutates and returns the meta dictionary.
	"""
	if not meta:
	meta = {}

	# 1. Get available metadata
	title = meta.get("title")
	thumbnail = meta.get("thumbnail")
	author = meta.get("authors", "Unknown")

	# 2. Validation Check
	is_valid_thumb = thumbnail and str(thumbnail).lower() not in ["nan", "none", "", "null"] and "/assets/cover-not-found.jpg" not in str(thumbnail) and "cover-not-found" not in str(thumbnail)

	# 3. Fetch if needed
	if not title or not is_valid_thumb:
	# Lazy import to avoid circular dependency
	from src.cover_fetcher import fetch_book_cover

	fetched_cover, fetched_authors, fetched_desc = fetch_book_cover(str(isbn))

	# Update if we found better data
	if not is_valid_thumb and "cover-not-found" not in fetched_cover:
	meta["thumbnail"] = fetched_cover

	if not title:
	meta["title"] = f"Book {isbn}"

	if author == "Unknown" and fetched_authors != "Unknown":
	meta["authors"] = fetched_authors

	# 4. Final Fallback
	final_thumb = meta.get("thumbnail")
	if not final_thumb or str(final_thumb).lower() in ["nan", "none", "", "null"] or "cover-not-found" in str(final_thumb):
	meta["thumbnail"] = "/content/cover-not-found.jpg"

	return meta