ymlin105's picture
feat: integrate A/B testing framework and enhance RAG diversity in recommendation system
b4bfa19
import logging
import sys
import re
import html
def setup_logger(name: str):
    """Return the logger registered under ``name``, set up for console output.

    The level is ``logging.DEBUG`` when ``src.config.DEBUG`` is truthy and
    ``logging.WARNING`` otherwise (the original note says to use DEBUG=1 for
    verbose output). A stdout handler is attached only when the logger has no
    handlers yet, so calling this repeatedly for the same name does not stack
    duplicate handlers.
    """
    # Resolved at call time rather than at module import.
    from src.config import DEBUG

    log = logging.getLogger(name)
    level = logging.DEBUG if DEBUG else logging.WARNING
    log.setLevel(level)

    # Already configured: leave the existing handlers alone.
    if log.handlers:
        return log

    stream = logging.StreamHandler(sys.stdout)
    stream.setFormatter(logging.Formatter("%(levelname)s | %(name)s | %(message)s"))
    log.addHandler(stream)
    return log
def summarize_description(text: str, max_sentences: int = 2, max_chars: int = 240) -> str:
    """Create a clean, sentence-based summary for a book description.

    - Decodes HTML entities (e.g., ``&amp;`` → ``&``)
    - Normalizes whitespace
    - Truncates by complete sentences (not raw words)
    - Applies a soft character cap with an ellipsis if needed

    Args:
        text: Raw description. Non-string values are passed through ``str()``.
        max_sentences: Maximum number of whole sentences to keep.
        max_chars: Soft cap on the summary length. A sentence-based summary
            never exceeds it; the hard-trim fallback may exceed it by the
            single trailing ellipsis character.

    Returns:
        The summary, or ``"—"`` when ``text`` is falsy or contains only
        whitespace. If no whole sentence can be kept (the first sentence is
        longer than ``max_chars``, or ``max_sentences`` is below 1), the
        cleaned text is cut at ``max_chars``; when something was cut off, the
        last (possibly partial) word is dropped and ``"…"`` is appended.
    """
    if not text:
        return "—"

    # Decode HTML entities and collapse every whitespace run to one space.
    cleaned = html.unescape(str(text))
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    if not cleaned:
        return "—"

    # Split after sentence-ending punctuation that is followed by whitespace.
    sentences = re.split(r"(?<=[.!?])\s+", cleaned)

    selected: list[str] = []
    total_len = 0  # exact length of " ".join(selected)
    for s in sentences:
        if not s:
            continue
        if len(selected) >= max_sentences:
            break
        # The joining space is only needed from the second sentence onwards.
        # Computing it BEFORE appending keeps total_len exact, so a summary
        # of exactly max_chars characters is accepted.
        separator = 1 if selected else 0
        candidate_len = total_len + separator + len(s)
        if candidate_len > max_chars:
            break
        selected.append(s)
        total_len = candidate_len

    if selected:
        # The loop guarantees the joined length is <= max_chars, so no
        # further capping is required here.
        return " ".join(selected)

    # Fallback: hard trim characters with ellipsis.
    summary = cleaned[:max_chars].rstrip()
    if len(cleaned) > max_chars:
        summary = summary.rsplit(" ", 1)[0].rstrip() + "…"
    return summary
def enrich_book_metadata(meta: dict | None, isbn: str) -> dict:
"""
Enrich book metadata with dynamic cover fetching if missing.
Mutates and returns the meta dictionary.
"""
if not meta:
meta = {}
# 1. Get available metadata
title = meta.get("title")
thumbnail = meta.get("thumbnail")
author = meta.get("authors", "Unknown")
# 2. Validation Check
is_valid_thumb = thumbnail and str(thumbnail).lower() not in ["nan", "none", "", "null"] and "/assets/cover-not-found.jpg" not in str(thumbnail) and "cover-not-found" not in str(thumbnail)
# 3. Fetch if needed
if not title or not is_valid_thumb:
# Lazy import to avoid circular dependency
from src.cover_fetcher import fetch_book_cover
fetched_cover, fetched_authors, fetched_desc = fetch_book_cover(str(isbn))
# Update if we found better data
if not is_valid_thumb and "cover-not-found" not in fetched_cover:
meta["thumbnail"] = fetched_cover
if not title:
meta["title"] = f"Book {isbn}"
if author == "Unknown" and fetched_authors != "Unknown":
meta["authors"] = fetched_authors
# 4. Final Fallback
final_thumb = meta.get("thumbnail")
if not final_thumb or str(final_thumb).lower() in ["nan", "none", "", "null"] or "cover-not-found" in str(final_thumb):
meta["thumbnail"] = "/content/cover-not-found.jpg"
return meta