Spaces:

nice-bill
/

personalisation-engine

Sleeping

App Files Community

nice-bill committed on Dec 6, 2025

Commit

2d773b1

1 Parent(s): aefb5af

Readme updated

Browse files

Files changed (14) hide show

.dockerignore +0 -1
.gitignore +3 -4
Dockerfile +0 -16
README.md +9 -9
scripts/download_artifacts.py +0 -4
scripts/download_model.py +0 -1
scripts/evaluate_quality.py +0 -8
scripts/evaluate_system.py +2 -23
scripts/inspect_data.py +0 -33
scripts/optimize_index.py +0 -1
scripts/visualize_users.py +2 -13
src/personalization/api/main.py +0 -9
src/personalization/config.py +0 -1
test_api.py +0 -1

.dockerignore CHANGED Viewed

@@ -11,6 +11,5 @@ venv/
 .mypy_cache
 .pytest_cache
-# Ignore raw data if any, but keep catalog/index/embeddings
 data/raw
 data/synthetic

 .mypy_cache
 .pytest_cache
 data/raw
 data/synthetic

.gitignore CHANGED Viewed

@@ -1,4 +1,3 @@
-# Python
 __pycache__/
 *.py[cod]
 *$py.class
@@ -6,13 +5,13 @@ __pycache__/
 venv/
 .env
-# Data & Models (Too large for git)
 data/
 *.pt
 *.pth
 *.parquet
 *.csv
-# IDE
 .vscode/
-.idea/

 __pycache__/
 *.py[cod]
 *$py.class
 venv/
 .env
 data/
 *.pt
 *.pth
 *.parquet
 *.csv
 .vscode/
+.idea/
+docs/

Dockerfile CHANGED Viewed

@@ -1,4 +1,3 @@
-# Build Stage
 FROM python:3.10-slim AS builder
 WORKDIR /app
@@ -6,7 +5,6 @@ WORKDIR /app
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
-# Install build dependencies
 RUN apt-get update && apt-get install -y \
     gcc \
     python3-dev \
@@ -14,17 +12,14 @@ RUN apt-get update && apt-get install -y \
     git \
     && rm -rf /var/lib/apt/lists/*
-# Install uv
 RUN pip install --no-cache-dir uv
 COPY requirements.txt .
-# Create virtual environment and install dependencies
 ENV UV_HTTP_TIMEOUT=300
 RUN uv venv .venv && \
     uv pip install --no-cache -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
-# --- Runtime Stage ---
 FROM python:3.10-slim
 WORKDIR /app
@@ -35,37 +30,26 @@ ENV PATH="/app/.venv/bin:$PATH"
 ENV LANG=C.UTF-8
 ENV LC_ALL=C.UTF-8
-# Install runtime dependencies
 RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
-# Copy virtual environment from builder
 COPY --from=builder /app/.venv /app/.venv
-# Copy Scripts
 COPY scripts/ ./scripts/
-# Data & Model Baking
 ENV HF_HOME=/app/data/model_cache
-# Download Model
 RUN /app/.venv/bin/python scripts/download_model.py
-# Download Data
-# Ensure data directory exists
 RUN mkdir -p data/catalog data/index
 RUN /app/.venv/bin/python scripts/download_artifacts.py
-# Copy Code (Last to maximize layer caching)
 COPY src/ ./src/
-# Create directories and permissions
 # RUN addgroup --system app && adduser --system --group app && \
 #     chown -R app:app /app
 # USER app
-# Expose port
 EXPOSE 7860
-# Run Command
 CMD ["/app/.venv/bin/uvicorn", "src.personalization.api.main:app", "--host", "0.0.0.0", "--port", "7860"]

 FROM python:3.10-slim AS builder
 WORKDIR /app
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 RUN apt-get update && apt-get install -y \
     gcc \
     python3-dev \
     git \
     && rm -rf /var/lib/apt/lists/*
 RUN pip install --no-cache-dir uv
 COPY requirements.txt .
 ENV UV_HTTP_TIMEOUT=300
 RUN uv venv .venv && \
     uv pip install --no-cache -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
 FROM python:3.10-slim
 WORKDIR /app
 ENV LANG=C.UTF-8
 ENV LC_ALL=C.UTF-8
 RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
 COPY --from=builder /app/.venv /app/.venv
 COPY scripts/ ./scripts/
 ENV HF_HOME=/app/data/model_cache
 RUN /app/.venv/bin/python scripts/download_model.py
 RUN mkdir -p data/catalog data/index
 RUN /app/.venv/bin/python scripts/download_artifacts.py
 COPY src/ ./src/
 # RUN addgroup --system app && adduser --system --group app && \
 #     chown -R app:app /app
 # USER app
 EXPOSE 7860
 CMD ["/app/.venv/bin/uvicorn", "src.personalization.api.main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -14,7 +14,7 @@ A high-performance, standalone recommendation service that uses **Semantic Searc
 Unlike traditional recommenders that rely on collaborative filtering (which fails without massive user data), this engine uses **Sentence Transformers** to understand the *content* of books (Title + Author + Genre + Description), allowing it to work effectively from Day 1 ("Cold Start").
-## 🚀 Key Features
 *   **Semantic Understanding:** Connects "The Haunted School" to "Ghost Beach" based on plot descriptions, not just title keywords.
 *   **Hybrid Scoring:** Combines **Semantic Similarity** (85%) with **Book Ratings** (15%) to recommend high-quality matches.
@@ -23,7 +23,7 @@ Unlike traditional recommenders that rely on collaborative filtering (which fail
 *   **Evaluation:** Achieves **40% Exact Hit Rate @ 10** on held-out author tests.
 *   **Standalone API:** Runs as a separate microservice (FastAPI) on Port 8001.
-## 🏗️ Architecture
 This project uses a **retrieval-based** approach:
@@ -35,7 +35,7 @@ This project uses a **retrieval-based** approach:
     *   The engine searches the FAISS index for the nearest neighbors.
     *   Results are re-ranked using the book's rating.
-## 📦 Installation & Setup
 ### Prerequisites
 *   Python 3.10+ (or Docker)
@@ -77,7 +77,7 @@ python scripts/1b_generate_semantic_data.py
 python scripts/optimize_index.py
 ```
-## 🏃 Run the Application
 ### Option A: Run Locally
 ```bash
@@ -96,7 +96,7 @@ docker build -t personalise .
 docker run -p 8001:7860 personalise
 ```
-## 🧪 Evaluation & Demo
 We have included a synthetic dataset of 10,000 users to validate the model.
 **Run the Offline Evaluation:**
@@ -121,7 +121,7 @@ python scripts/visualize_users.py
 python scripts/inspect_data.py
 ```
-## 📡 API Usage
 #### POST `/personalize/recommend`
 Get personalized books based on reading history.
@@ -141,7 +141,7 @@ Semantic search by plot or vibe.
 }
 ```
-## 📊 Performance Stats
 | Metric | Brute Force (Flat) | Optimized (IVF-PQ) |
 | :--- | :--- | :--- |
@@ -150,10 +150,10 @@ Semantic search by plot or vibe.
 | **Speed** | ~10ms | ~2ms |
 | **Hit Rate @ 10** | N/A | **40.0%** |
-## 🗺️ Roadmap & Future Improvements
 *   **Model Compression (ONNX):** Replace the heavy PyTorch dependency with **ONNX Runtime**. This would reduce the Docker image size from ~3GB to ~500MB and improve CPU inference latency by 2-3x.
 *   **Real-Time Learning:** Implement a "Session-Based" Recommender (using RNNs or Transformers) to adapt to user intent within a single session, rather than just long-term history.
 *   **A/B Testing Framework:** Add infrastructure to serve different model versions to different user segments to scientifically measure engagement.
-## 📄 License
 MIT

 Unlike traditional recommenders that rely on collaborative filtering (which fails without massive user data), this engine uses **Sentence Transformers** to understand the *content* of books (Title + Author + Genre + Description), allowing it to work effectively from Day 1 ("Cold Start").
+## Key Features
 *   **Semantic Understanding:** Connects "The Haunted School" to "Ghost Beach" based on plot descriptions, not just title keywords.
 *   **Hybrid Scoring:** Combines **Semantic Similarity** (85%) with **Book Ratings** (15%) to recommend high-quality matches.
 *   **Evaluation:** Achieves **40% Exact Hit Rate @ 10** on held-out author tests.
 *   **Standalone API:** Runs as a separate microservice (FastAPI) on Port 8001.
+## Architecture
 This project uses a **retrieval-based** approach:
     *   The engine searches the FAISS index for the nearest neighbors.
     *   Results are re-ranked using the book's rating.
+## Installation & Setup
 ### Prerequisites
 *   Python 3.10+ (or Docker)
 python scripts/optimize_index.py
 ```
+## Run the Application
 ### Option A: Run Locally
 ```bash
 docker run -p 8001:7860 personalise
 ```
+## Evaluation & Demo
 We have included a synthetic dataset of 10,000 users to validate the model.
 **Run the Offline Evaluation:**
 python scripts/inspect_data.py
 ```
+## API Usage
 #### POST `/personalize/recommend`
 Get personalized books based on reading history.
 }
 ```
+## Performance Stats
 | Metric | Brute Force (Flat) | Optimized (IVF-PQ) |
 | :--- | :--- | :--- |
 | **Speed** | ~10ms | ~2ms |
 | **Hit Rate @ 10** | N/A | **40.0%** |
+## Roadmap & Future Improvements
 *   **Model Compression (ONNX):** Replace the heavy PyTorch dependency with **ONNX Runtime**. This would reduce the Docker image size from ~3GB to ~500MB and improve CPU inference latency by 2-3x.
 *   **Real-Time Learning:** Implement a "Session-Based" Recommender (using RNNs or Transformers) to adapt to user intent within a single session, rather than just long-term history.
 *   **A/B Testing Framework:** Add infrastructure to serve different model versions to different user segments to scientifically measure engagement.
+## License
 MIT

scripts/download_artifacts.py CHANGED Viewed

@@ -6,7 +6,6 @@ import shutil
 HF_REPO_ID = "nice-bill/book-recommender-artifacts"
 REPO_TYPE = "dataset"
-# Local Paths
 DATA_DIR = Path("data")
 CATALOG_DIR = DATA_DIR / "catalog"
 INDEX_DIR = DATA_DIR / "index"
@@ -20,7 +19,6 @@ FILES_TO_DOWNLOAD = {
 def main():
     print(f"--- Checking artifacts from {HF_REPO_ID} ---")
-    # Ensure directories exist
     for dir_path in [DATA_DIR, CATALOG_DIR, INDEX_DIR]:
         dir_path.mkdir(parents=True, exist_ok=True)
@@ -33,14 +31,12 @@ def main():
         print(f"Downloading {filename}...")
         try:
-            # Download to local cache
             cached_path = hf_hub_download(
                 repo_id=HF_REPO_ID,
                 filename=filename,
                 repo_type=REPO_TYPE
             )
-            # Copy from cache to our project structure
             shutil.copy(cached_path, dest_path)
             print(f"   Saved to {dest_path}")

 HF_REPO_ID = "nice-bill/book-recommender-artifacts"
 REPO_TYPE = "dataset"
 DATA_DIR = Path("data")
 CATALOG_DIR = DATA_DIR / "catalog"
 INDEX_DIR = DATA_DIR / "index"
 def main():
     print(f"--- Checking artifacts from {HF_REPO_ID} ---")
     for dir_path in [DATA_DIR, CATALOG_DIR, INDEX_DIR]:
         dir_path.mkdir(parents=True, exist_ok=True)
         print(f"Downloading {filename}...")
         try:
             cached_path = hf_hub_download(
                 repo_id=HF_REPO_ID,
                 filename=filename,
                 repo_type=REPO_TYPE
             )
             shutil.copy(cached_path, dest_path)
             print(f"   Saved to {dest_path}")

scripts/download_model.py CHANGED Viewed

@@ -5,7 +5,6 @@ MODEL_NAME = "all-MiniLM-L6-v2"
 def download():
     print(f"Downloading {MODEL_NAME}...")
-    # This will download to HF_HOME (set in Dockerfile)
     SentenceTransformer(MODEL_NAME)
     print("Done.")

 def download():
     print(f"Downloading {MODEL_NAME}...")
     SentenceTransformer(MODEL_NAME)
     print("Done.")

scripts/evaluate_quality.py CHANGED Viewed

@@ -6,11 +6,9 @@ import sys
 from tqdm import tqdm
 from pathlib import Path
-# Add src to path
 sys.path.append(str(Path(__file__).parent.parent))
 from src.personalization.config import settings
-# Config
 CATALOG_PATH = Path("data/catalog/books_catalog.csv")
 NUM_SAMPLES = 100
@@ -30,7 +28,6 @@ def main():
     df = pd.read_csv(CATALOG_PATH)
-    # Filter authors with at least 5 books
     author_counts = df['authors'].value_counts()
     valid_authors = author_counts[author_counts >= 5].index.tolist()
@@ -43,20 +40,17 @@ def main():
     print(f"Running {args.samples} evaluation queries against {api_url}...")
     for _ in tqdm(range(args.samples)):
-        # 1. Pick a random author
         author = random.choice(valid_authors)
         books = df[df['authors'] == author]
         if len(books) < 5:
             continue
-        # 2. Split: History (3 books) -> Target (1 book)
         sample = books.sample(n=4, replace=False)
         history = sample.iloc[:3]['title'].tolist()
         target_book = sample.iloc[3]
         target_title = target_book['title']
-        # 3. Call API
         try:
             payload = {"user_history": history, "top_k": 10}
             resp = requests.post(api_url, json=payload)
@@ -67,11 +61,9 @@ def main():
             recs = resp.json()
             rec_titles = [r['title'] for r in recs]
-            # Metrics
             if target_title in rec_titles:
                 hits += 1
-            # Author Match
             rec_authors = df[df['title'].isin(rec_titles)]['authors'].tolist()
             if author in rec_authors:
                 matches = rec_authors.count(author)

 from tqdm import tqdm
 from pathlib import Path
 sys.path.append(str(Path(__file__).parent.parent))
 from src.personalization.config import settings
 CATALOG_PATH = Path("data/catalog/books_catalog.csv")
 NUM_SAMPLES = 100
     df = pd.read_csv(CATALOG_PATH)
     author_counts = df['authors'].value_counts()
     valid_authors = author_counts[author_counts >= 5].index.tolist()
     print(f"Running {args.samples} evaluation queries against {api_url}...")
     for _ in tqdm(range(args.samples)):
         author = random.choice(valid_authors)
         books = df[df['authors'] == author]
         if len(books) < 5:
             continue
         sample = books.sample(n=4, replace=False)
         history = sample.iloc[:3]['title'].tolist()
         target_book = sample.iloc[3]
         target_title = target_book['title']
         try:
             payload = {"user_history": history, "top_k": 10}
             resp = requests.post(api_url, json=payload)
             recs = resp.json()
             rec_titles = [r['title'] for r in recs]
             if target_title in rec_titles:
                 hits += 1
             rec_authors = df[df['title'].isin(rec_titles)]['authors'].tolist()
             if author in rec_authors:
                 matches = rec_authors.count(author)

scripts/evaluate_system.py CHANGED Viewed

@@ -6,11 +6,9 @@ import logging
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm
-# Setup
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("Evaluator")
-# Paths
 DATA_DIR = Path("data")
 SYNTHETIC_DATA_PATH = DATA_DIR / "synthetic" / "user_sequences.parquet"
 CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
@@ -23,22 +21,17 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
     metric: Hit Rate @ k
     """
-    # 1. Load Resources
     logger.info("Loading Catalog and Embeddings...")
     if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
         logger.error("Missing Data! Run download scripts first.")
         return
-    # Load Titles for mapping
     df_catalog = pd.read_csv(CATALOG_PATH)
     titles = df_catalog['title'].tolist()
-    # Create Title -> Index map (normalized)
     title_to_idx = {t.lower().strip(): i for i, t in enumerate(titles)}
-    # Load Embeddings
     embeddings = np.load(EMBEDDINGS_PATH)
-    # Load Index
     logger.info("Loading FAISS Index...")
     if INDEX_PATH.exists():
         index = faiss.read_index(str(INDEX_PATH))
@@ -50,11 +43,9 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
         faiss.normalize_L2(embeddings)
         index.add(embeddings)
-    # 2. Load Synthetic Users
     logger.info(f"Loading Synthetic Data from {SYNTHETIC_DATA_PATH}...")
     df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
-    # Sample users if dataset is too large
     if len(df_users) > sample_size:
         df_users = df_users.sample(sample_size, random_state=42)
@@ -66,15 +57,12 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
     for _, row in tqdm(df_users.iterrows(), total=len(df_users)):
         history = row['book_sequence']
-        # Need at least 2 books (1 for history, 1 for test)
         if len(history) < 2:
             continue
-        # Leave-One-Out Split
         target_book = history[-1]
         context_books = history[:-1]
-        # 3. Convert Context to Vector
         valid_indices = []
         for book in context_books:
             norm_title = book.lower().strip()
@@ -84,10 +72,8 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
         if not valid_indices:
             continue
-        # Get vectors and average (Time Decay Simulation)
         context_vectors = embeddings[valid_indices]
-        # Simple Time Decay
         n = len(valid_indices)
         decay_factor = 0.9
         weights = np.array([decay_factor ** (n - 1 - i) for i in range(n)])
@@ -96,14 +82,11 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
         user_vector = np.average(context_vectors, axis=0, weights=weights).reshape(1, -1).astype(np.float32)
         faiss.normalize_L2(user_vector)
-        # 4. Search
-        # We search for top_k + len(context) because the model might return books the user already read
         search_k = top_k + len(valid_indices) + 5
         scores, indices = index.search(user_vector, search_k)
-        # Filter results
         recommended_titles = []
-        seen_indices = set(valid_indices) # Don't recommend what they just read
         for idx in indices[0]:
             if idx in seen_indices:
@@ -115,10 +98,7 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
             if len(recommended_titles) >= top_k:
                 break
-        # 5. Check Hit
-        # We check if the TARGET book title is in the recommended list
-        # Using loose matching (substring or exact) can be generous, but strict is better for ML metrics
-        # We'll stick to exact string match (normalized)
         target_norm = target_book.lower().strip()
         rec_norm = [t.lower().strip() for t in recommended_titles]
@@ -128,7 +108,6 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
         processed_users += 1
-    # 6. Report
     if processed_users == 0:
         print("No valid users found for evaluation.")
         return

 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("Evaluator")
 DATA_DIR = Path("data")
 SYNTHETIC_DATA_PATH = DATA_DIR / "synthetic" / "user_sequences.parquet"
 CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
     metric: Hit Rate @ k
     """
     logger.info("Loading Catalog and Embeddings...")
     if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
         logger.error("Missing Data! Run download scripts first.")
         return
     df_catalog = pd.read_csv(CATALOG_PATH)
     titles = df_catalog['title'].tolist()
     title_to_idx = {t.lower().strip(): i for i, t in enumerate(titles)}
     embeddings = np.load(EMBEDDINGS_PATH)
     logger.info("Loading FAISS Index...")
     if INDEX_PATH.exists():
         index = faiss.read_index(str(INDEX_PATH))
         faiss.normalize_L2(embeddings)
         index.add(embeddings)
     logger.info(f"Loading Synthetic Data from {SYNTHETIC_DATA_PATH}...")
     df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
     if len(df_users) > sample_size:
         df_users = df_users.sample(sample_size, random_state=42)
     for _, row in tqdm(df_users.iterrows(), total=len(df_users)):
         history = row['book_sequence']
         if len(history) < 2:
             continue
         target_book = history[-1]
         context_books = history[:-1]
         valid_indices = []
         for book in context_books:
             norm_title = book.lower().strip()
         if not valid_indices:
             continue
         context_vectors = embeddings[valid_indices]
         n = len(valid_indices)
         decay_factor = 0.9
         weights = np.array([decay_factor ** (n - 1 - i) for i in range(n)])
         user_vector = np.average(context_vectors, axis=0, weights=weights).reshape(1, -1).astype(np.float32)
         faiss.normalize_L2(user_vector)
         search_k = top_k + len(valid_indices) + 5
         scores, indices = index.search(user_vector, search_k)
         recommended_titles = []
+        seen_indices = set(valid_indices)
         for idx in indices[0]:
             if idx in seen_indices:
             if len(recommended_titles) >= top_k:
                 break
         target_norm = target_book.lower().strip()
         rec_norm = [t.lower().strip() for t in recommended_titles]
         processed_users += 1
     if processed_users == 0:
         print("No valid users found for evaluation.")
         return

scripts/inspect_data.py DELETED Viewed

@@ -1,33 +0,0 @@
-import pandas as pd
-from pathlib import Path
-# Config
-DATA_PATH = Path("data/synthetic/user_sequences.parquet")
-def inspect():
-    if not DATA_PATH.exists():
-        print(f"Error: File not found at {DATA_PATH}")
-        return
-    try:
-        print(f"Reading {DATA_PATH}...")
-        df = pd.read_parquet(DATA_PATH)
-        print("\n--- Schema ---")
-        print(df.info())
-        print("\n--- First 5 Rows ---")
-        print(df.head().to_string())
-        print("\n--- Sample User History ---")
-        # Show the full history of the first user
-        first_user = df.iloc[0]
-        print(f"User ID: {first_user.get('user_id', 'N/A')}")
-        print(f"History: {first_user.get('book_history', 'N/A')}")
-    except Exception as e:
-        print(f"Failed to read parquet: {e}")
-if __name__ == "__main__":
-    inspect()

scripts/optimize_index.py CHANGED Viewed

@@ -4,7 +4,6 @@ from pathlib import Path
 import time
 import sys
-# Config
 DATA_DIR = Path("data")
 EMBEDDINGS_PATH = DATA_DIR / "embeddings_cache.npy"
 OUTPUT_PATH = DATA_DIR / "index" / "optimized.index"

 import time
 import sys
 DATA_DIR = Path("data")
 EMBEDDINGS_PATH = DATA_DIR / "embeddings_cache.npy"
 OUTPUT_PATH = DATA_DIR / "index" / "optimized.index"

scripts/visualize_users.py CHANGED Viewed

@@ -4,11 +4,9 @@ import logging
 from pathlib import Path
 from tqdm import tqdm
-# Setup Logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("Visualizer")
-# Paths
 DATA_DIR = Path("data")
 SYNTHETIC_DATA_PATH = DATA_DIR / "synthetic" / "user_sequences.parquet"
 CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
@@ -29,24 +27,19 @@ def visualize_clusters(sample_size=2000):
         logger.error("Please run: uv pip install matplotlib seaborn")
         return
-    # 1. Load Resources
     logger.info("Loading Data...")
     if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
         logger.error("Missing Data! Run download scripts first.")
         return
-    # Load Titles for mapping
     df_catalog = pd.read_csv(CATALOG_PATH)
     titles = df_catalog['title'].tolist()
     title_to_idx = {t.lower().strip(): i for i, t in enumerate(titles)}
-    # Load Embeddings
     embeddings = np.load(EMBEDDINGS_PATH)
-    # Load Users
     df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
-    # Sample
     if len(df_users) > sample_size:
         df_users = df_users.sample(sample_size, random_state=42)
@@ -55,7 +48,6 @@ def visualize_clusters(sample_size=2000):
     user_vectors = []
     user_personas = []
-    # 2. Calculate User Vectors
     valid_users = 0
     for _, row in tqdm(df_users.iterrows(), total=len(df_users)):
         history = row['book_sequence']
@@ -70,7 +62,6 @@ def visualize_clusters(sample_size=2000):
         if not valid_indices:
             continue
-        # Average Embeddings
         vectors = embeddings[valid_indices]
         user_vec = np.mean(vectors, axis=0)
@@ -80,12 +71,10 @@ def visualize_clusters(sample_size=2000):
     X = np.array(user_vectors)
-    # 3. t-SNE Reduction
-    logger.info("Running t-SNE (this might take a moment)...")
     tsne = TSNE(n_components=2, random_state=42, perplexity=30)
     X_embedded = tsne.fit_transform(X)
-    # 4. Plotting
     logger.info("Generating Plot...")
     OUTPUT_DIR.mkdir(exist_ok=True)
@@ -106,7 +95,7 @@ def visualize_clusters(sample_size=2000):
     plt.tight_layout()
     plt.savefig(OUTPUT_IMAGE, dpi=300)
-    logger.info(f"✅ Visualization saved to {OUTPUT_IMAGE}")
     print(f"Success! Check {OUTPUT_IMAGE} to see your user clusters.")
 if __name__ == "__main__":

 from pathlib import Path
 from tqdm import tqdm
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("Visualizer")
 DATA_DIR = Path("data")
 SYNTHETIC_DATA_PATH = DATA_DIR / "synthetic" / "user_sequences.parquet"
 CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
         logger.error("Please run: uv pip install matplotlib seaborn")
         return
     logger.info("Loading Data...")
     if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
         logger.error("Missing Data! Run download scripts first.")
         return
     df_catalog = pd.read_csv(CATALOG_PATH)
     titles = df_catalog['title'].tolist()
     title_to_idx = {t.lower().strip(): i for i, t in enumerate(titles)}
     embeddings = np.load(EMBEDDINGS_PATH)
     df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
     if len(df_users) > sample_size:
         df_users = df_users.sample(sample_size, random_state=42)
     user_vectors = []
     user_personas = []
     valid_users = 0
     for _, row in tqdm(df_users.iterrows(), total=len(df_users)):
         history = row['book_sequence']
         if not valid_indices:
             continue
         vectors = embeddings[valid_indices]
         user_vec = np.mean(vectors, axis=0)
     X = np.array(user_vectors)
+    logger.info("Running t-SNE")
     tsne = TSNE(n_components=2, random_state=42, perplexity=30)
     X_embedded = tsne.fit_transform(X)
     logger.info("Generating Plot...")
     OUTPUT_DIR.mkdir(exist_ok=True)
     plt.tight_layout()
     plt.savefig(OUTPUT_IMAGE, dpi=300)
+    logger.info(f"Visualization saved to {OUTPUT_IMAGE}")
     print(f"Success! Check {OUTPUT_IMAGE} to see your user clusters.")
 if __name__ == "__main__":

src/personalization/api/main.py CHANGED Viewed

@@ -11,17 +11,14 @@ import faiss
 import numpy as np
 import time
-# Setup Logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Config
 DATA_DIR = Path("data")
 CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
 EMBEDDINGS_PATH = DATA_DIR / "embeddings_cache.npy"
 MODEL_NAME = "all-MiniLM-L6-v2"
-# Global State
 state = {
     "titles": [],
     "title_to_idx": {},
@@ -40,7 +37,6 @@ async def lifespan(app: FastAPI):
     if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
         logger.error("Missing catalog or embeddings! Run scripts/1b... first.")
-        # We continue but service might be degraded
     else:
         try:
             df = pd.read_csv(CATALOG_PATH)
@@ -51,7 +47,6 @@ async def lifespan(app: FastAPI):
             max_rating = raw_ratings.max()
             state["ratings"] = (raw_ratings / max_rating).tolist() if max_rating > 0 else [0.5] * len(df)
-            # Use normalized keys for robust lookup
             state["title_to_idx"] = {t.lower().strip(): i for i, t in enumerate(state["titles"])}
             state["popular_indices"] = np.argsort(raw_ratings)[::-1][:50].tolist()
@@ -80,23 +75,19 @@ async def lifespan(app: FastAPI):
             logger.info(f"Ready! Loaded {len(state['titles'])} books in {time.time() - start_time:.2f}s")
         except Exception as e:
             logger.error(f"Failed to load resources: {e}")
-            # Consider raising if critical
     yield
     logger.info("Shutting down...")
-    # Clean up resources if needed
 app = FastAPI(title="Semantic Book Discovery Engine", lifespan=lifespan)
-# Add a root endpoint to redirect to docs
 from fastapi.responses import RedirectResponse
 @app.get("/")
 async def read_root():
     return RedirectResponse(url="/docs")
-# Add Prometheus Instrumentation
 Instrumentator().instrument(app).expose(app)
 class RecommendationRequest(BaseModel):

 import numpy as np
 import time
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 DATA_DIR = Path("data")
 CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
 EMBEDDINGS_PATH = DATA_DIR / "embeddings_cache.npy"
 MODEL_NAME = "all-MiniLM-L6-v2"
 state = {
     "titles": [],
     "title_to_idx": {},
     if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
         logger.error("Missing catalog or embeddings! Run scripts/1b... first.")
     else:
         try:
             df = pd.read_csv(CATALOG_PATH)
             max_rating = raw_ratings.max()
             state["ratings"] = (raw_ratings / max_rating).tolist() if max_rating > 0 else [0.5] * len(df)
             state["title_to_idx"] = {t.lower().strip(): i for i, t in enumerate(state["titles"])}
             state["popular_indices"] = np.argsort(raw_ratings)[::-1][:50].tolist()
             logger.info(f"Ready! Loaded {len(state['titles'])} books in {time.time() - start_time:.2f}s")
         except Exception as e:
             logger.error(f"Failed to load resources: {e}")
     yield
     logger.info("Shutting down...")
 app = FastAPI(title="Semantic Book Discovery Engine", lifespan=lifespan)
 from fastapi.responses import RedirectResponse
 @app.get("/")
 async def read_root():
     return RedirectResponse(url="/docs")
 Instrumentator().instrument(app).expose(app)
 class RecommendationRequest(BaseModel):

src/personalization/config.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import os
 class Settings:
-    # Default to 8001, but allow Env Override
     HOST = os.getenv("API_HOST", "localhost")
     PORT = int(os.getenv("API_PORT", 8001))

 import os
 class Settings:
     HOST = os.getenv("API_HOST", "localhost")
     PORT = int(os.getenv("API_PORT", 8001))

test_api.py CHANGED Viewed

@@ -3,7 +3,6 @@ import argparse
 import sys
 from pathlib import Path
-# Add src to path to import config
 sys.path.append(str(Path(__file__).parent))
 from src.personalization.config import settings

 import sys
 from pathlib import Path
 sys.path.append(str(Path(__file__).parent))
 from src.personalization.config import settings