nice-bill committed on
Commit
2d773b1
·
1 Parent(s): aefb5af

Readme updated

Browse files
.dockerignore CHANGED
@@ -11,6 +11,5 @@ venv/
11
  .mypy_cache
12
  .pytest_cache
13
 
14
- # Ignore raw data if any, but keep catalog/index/embeddings
15
  data/raw
16
  data/synthetic
 
11
  .mypy_cache
12
  .pytest_cache
13
 
 
14
  data/raw
15
  data/synthetic
.gitignore CHANGED
@@ -1,4 +1,3 @@
1
- # Python
2
  __pycache__/
3
  *.py[cod]
4
  *$py.class
@@ -6,13 +5,13 @@ __pycache__/
6
  venv/
7
  .env
8
 
9
- # Data & Models (Too large for git)
10
  data/
11
  *.pt
12
  *.pth
13
  *.parquet
14
  *.csv
15
 
16
- # IDE
17
  .vscode/
18
- .idea/
 
 
 
 
1
  __pycache__/
2
  *.py[cod]
3
  *$py.class
 
5
  venv/
6
  .env
7
 
 
8
  data/
9
  *.pt
10
  *.pth
11
  *.parquet
12
  *.csv
13
 
 
14
  .vscode/
15
+ .idea/
16
+
17
+ docs/
Dockerfile CHANGED
@@ -1,4 +1,3 @@
1
- # Build Stage
2
  FROM python:3.10-slim AS builder
3
 
4
  WORKDIR /app
@@ -6,7 +5,6 @@ WORKDIR /app
6
  ENV PYTHONDONTWRITEBYTECODE=1
7
  ENV PYTHONUNBUFFERED=1
8
 
9
- # Install build dependencies
10
  RUN apt-get update && apt-get install -y \
11
  gcc \
12
  python3-dev \
@@ -14,17 +12,14 @@ RUN apt-get update && apt-get install -y \
14
  git \
15
  && rm -rf /var/lib/apt/lists/*
16
 
17
- # Install uv
18
  RUN pip install --no-cache-dir uv
19
 
20
  COPY requirements.txt .
21
 
22
- # Create virtual environment and install dependencies
23
  ENV UV_HTTP_TIMEOUT=300
24
  RUN uv venv .venv && \
25
  uv pip install --no-cache -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
26
 
27
- # --- Runtime Stage ---
28
  FROM python:3.10-slim
29
 
30
  WORKDIR /app
@@ -35,37 +30,26 @@ ENV PATH="/app/.venv/bin:$PATH"
35
  ENV LANG=C.UTF-8
36
  ENV LC_ALL=C.UTF-8
37
 
38
- # Install runtime dependencies
39
  RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
40
 
41
- # Copy virtual environment from builder
42
  COPY --from=builder /app/.venv /app/.venv
43
 
44
- # Copy Scripts
45
  COPY scripts/ ./scripts/
46
 
47
- # Data & Model Baking
48
  ENV HF_HOME=/app/data/model_cache
49
 
50
- # Download Model
51
  RUN /app/.venv/bin/python scripts/download_model.py
52
 
53
- # Download Data
54
- # Ensure data directory exists
55
  RUN mkdir -p data/catalog data/index
56
  RUN /app/.venv/bin/python scripts/download_artifacts.py
57
 
58
- # Copy Code (Last to maximize layer caching)
59
  COPY src/ ./src/
60
 
61
- # Create directories and permissions
62
  # RUN addgroup --system app && adduser --system --group app && \
63
  # chown -R app:app /app
64
 
65
  # USER app
66
 
67
- # Expose port
68
  EXPOSE 7860
69
 
70
- # Run Command
71
  CMD ["/app/.venv/bin/uvicorn", "src.personalization.api.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
1
  FROM python:3.10-slim AS builder
2
 
3
  WORKDIR /app
 
5
  ENV PYTHONDONTWRITEBYTECODE=1
6
  ENV PYTHONUNBUFFERED=1
7
 
 
8
  RUN apt-get update && apt-get install -y \
9
  gcc \
10
  python3-dev \
 
12
  git \
13
  && rm -rf /var/lib/apt/lists/*
14
 
 
15
  RUN pip install --no-cache-dir uv
16
 
17
  COPY requirements.txt .
18
 
 
19
  ENV UV_HTTP_TIMEOUT=300
20
  RUN uv venv .venv && \
21
  uv pip install --no-cache -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
22
 
 
23
  FROM python:3.10-slim
24
 
25
  WORKDIR /app
 
30
  ENV LANG=C.UTF-8
31
  ENV LC_ALL=C.UTF-8
32
 
 
33
  RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
34
 
 
35
  COPY --from=builder /app/.venv /app/.venv
36
 
 
37
  COPY scripts/ ./scripts/
38
 
 
39
  ENV HF_HOME=/app/data/model_cache
40
 
 
41
  RUN /app/.venv/bin/python scripts/download_model.py
42
 
 
 
43
  RUN mkdir -p data/catalog data/index
44
  RUN /app/.venv/bin/python scripts/download_artifacts.py
45
 
 
46
  COPY src/ ./src/
47
 
 
48
  # RUN addgroup --system app && adduser --system --group app && \
49
  # chown -R app:app /app
50
 
51
  # USER app
52
 
 
53
  EXPOSE 7860
54
 
 
55
  CMD ["/app/.venv/bin/uvicorn", "src.personalization.api.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -14,7 +14,7 @@ A high-performance, standalone recommendation service that uses **Semantic Searc
14
 
15
  Unlike traditional recommenders that rely on collaborative filtering (which fails without massive user data), this engine uses **Sentence Transformers** to understand the *content* of books (Title + Author + Genre + Description), allowing it to work effectively from Day 1 ("Cold Start").
16
 
17
- ## 🚀 Key Features
18
 
19
  * **Semantic Understanding:** Connects "The Haunted School" to "Ghost Beach" based on plot descriptions, not just title keywords.
20
  * **Hybrid Scoring:** Combines **Semantic Similarity** (85%) with **Book Ratings** (15%) to recommend high-quality matches.
@@ -23,7 +23,7 @@ Unlike traditional recommenders that rely on collaborative filtering (which fail
23
  * **Evaluation:** Achieves **40% Exact Hit Rate @ 10** on held-out author tests.
24
  * **Standalone API:** Runs as a separate microservice (FastAPI) on Port 8001.
25
 
26
- ## 🏗️ Architecture
27
 
28
  This project uses a **retrieval-based** approach:
29
 
@@ -35,7 +35,7 @@ This project uses a **retrieval-based** approach:
35
  * The engine searches the FAISS index for the nearest neighbors.
36
  * Results are re-ranked using the book's rating.
37
 
38
- ## 📦 Installation & Setup
39
 
40
  ### Prerequisites
41
  * Python 3.10+ (or Docker)
@@ -77,7 +77,7 @@ python scripts/1b_generate_semantic_data.py
77
  python scripts/optimize_index.py
78
  ```
79
 
80
- ## 🏃 Run the Application
81
 
82
  ### Option A: Run Locally
83
  ```bash
@@ -96,7 +96,7 @@ docker build -t personalise .
96
  docker run -p 8001:7860 personalise
97
  ```
98
 
99
- ## 🧪 Evaluation & Demo
100
  We have included a synthetic dataset of 10,000 users to validate the model.
101
 
102
  **Run the Offline Evaluation:**
@@ -121,7 +121,7 @@ python scripts/visualize_users.py
121
  python scripts/inspect_data.py
122
  ```
123
 
124
- ## 📡 API Usage
125
 
126
  #### POST `/personalize/recommend`
127
  Get personalized books based on reading history.
@@ -141,7 +141,7 @@ Semantic search by plot or vibe.
141
  }
142
  ```
143
 
144
- ## 📊 Performance Stats
145
 
146
  | Metric | Brute Force (Flat) | Optimized (IVF-PQ) |
147
  | :--- | :--- | :--- |
@@ -150,10 +150,10 @@ Semantic search by plot or vibe.
150
  | **Speed** | ~10ms | ~2ms |
151
  | **Hit Rate @ 10** | N/A | **40.0%** |
152
 
153
- ## 🗺️ Roadmap & Future Improvements
154
  * **Model Compression (ONNX):** Replace the heavy PyTorch dependency with **ONNX Runtime**. This would reduce the Docker image size from ~3GB to ~500MB and improve CPU inference latency by 2-3x.
155
  * **Real-Time Learning:** Implement a "Session-Based" Recommender (using RNNs or Transformers) to adapt to user intent within a single session, rather than just long-term history.
156
  * **A/B Testing Framework:** Add infrastructure to serve different model versions to different user segments to scientifically measure engagement.
157
 
158
- ## 📄 License
159
  MIT
 
14
 
15
  Unlike traditional recommenders that rely on collaborative filtering (which fails without massive user data), this engine uses **Sentence Transformers** to understand the *content* of books (Title + Author + Genre + Description), allowing it to work effectively from Day 1 ("Cold Start").
16
 
17
+ ## Key Features
18
 
19
  * **Semantic Understanding:** Connects "The Haunted School" to "Ghost Beach" based on plot descriptions, not just title keywords.
20
  * **Hybrid Scoring:** Combines **Semantic Similarity** (85%) with **Book Ratings** (15%) to recommend high-quality matches.
 
23
  * **Evaluation:** Achieves **40% Exact Hit Rate @ 10** on held-out author tests.
24
  * **Standalone API:** Runs as a separate microservice (FastAPI) on Port 8001.
25
 
26
+ ## Architecture
27
 
28
  This project uses a **retrieval-based** approach:
29
 
 
35
  * The engine searches the FAISS index for the nearest neighbors.
36
  * Results are re-ranked using the book's rating.
37
 
38
+ ## Installation & Setup
39
 
40
  ### Prerequisites
41
  * Python 3.10+ (or Docker)
 
77
  python scripts/optimize_index.py
78
  ```
79
 
80
+ ## Run the Application
81
 
82
  ### Option A: Run Locally
83
  ```bash
 
96
  docker run -p 8001:7860 personalise
97
  ```
98
 
99
+ ## Evaluation & Demo
100
  We have included a synthetic dataset of 10,000 users to validate the model.
101
 
102
  **Run the Offline Evaluation:**
 
121
  python scripts/inspect_data.py
122
  ```
123
 
124
+ ## API Usage
125
 
126
  #### POST `/personalize/recommend`
127
  Get personalized books based on reading history.
 
141
  }
142
  ```
143
 
144
+ ## Performance Stats
145
 
146
  | Metric | Brute Force (Flat) | Optimized (IVF-PQ) |
147
  | :--- | :--- | :--- |
 
150
  | **Speed** | ~10ms | ~2ms |
151
  | **Hit Rate @ 10** | N/A | **40.0%** |
152
 
153
+ ## Roadmap & Future Improvements
154
  * **Model Compression (ONNX):** Replace the heavy PyTorch dependency with **ONNX Runtime**. This would reduce the Docker image size from ~3GB to ~500MB and improve CPU inference latency by 2-3x.
155
  * **Real-Time Learning:** Implement a "Session-Based" Recommender (using RNNs or Transformers) to adapt to user intent within a single session, rather than just long-term history.
156
  * **A/B Testing Framework:** Add infrastructure to serve different model versions to different user segments to scientifically measure engagement.
157
 
158
+ ## License
159
  MIT
scripts/download_artifacts.py CHANGED
@@ -6,7 +6,6 @@ import shutil
6
  HF_REPO_ID = "nice-bill/book-recommender-artifacts"
7
  REPO_TYPE = "dataset"
8
 
9
- # Local Paths
10
  DATA_DIR = Path("data")
11
  CATALOG_DIR = DATA_DIR / "catalog"
12
  INDEX_DIR = DATA_DIR / "index"
@@ -20,7 +19,6 @@ FILES_TO_DOWNLOAD = {
20
  def main():
21
  print(f"--- Checking artifacts from {HF_REPO_ID} ---")
22
 
23
- # Ensure directories exist
24
  for dir_path in [DATA_DIR, CATALOG_DIR, INDEX_DIR]:
25
  dir_path.mkdir(parents=True, exist_ok=True)
26
 
@@ -33,14 +31,12 @@ def main():
33
 
34
  print(f"Downloading {filename}...")
35
  try:
36
- # Download to local cache
37
  cached_path = hf_hub_download(
38
  repo_id=HF_REPO_ID,
39
  filename=filename,
40
  repo_type=REPO_TYPE
41
  )
42
 
43
- # Copy from cache to our project structure
44
  shutil.copy(cached_path, dest_path)
45
  print(f" Saved to {dest_path}")
46
 
 
6
  HF_REPO_ID = "nice-bill/book-recommender-artifacts"
7
  REPO_TYPE = "dataset"
8
 
 
9
  DATA_DIR = Path("data")
10
  CATALOG_DIR = DATA_DIR / "catalog"
11
  INDEX_DIR = DATA_DIR / "index"
 
19
  def main():
20
  print(f"--- Checking artifacts from {HF_REPO_ID} ---")
21
 
 
22
  for dir_path in [DATA_DIR, CATALOG_DIR, INDEX_DIR]:
23
  dir_path.mkdir(parents=True, exist_ok=True)
24
 
 
31
 
32
  print(f"Downloading {filename}...")
33
  try:
 
34
  cached_path = hf_hub_download(
35
  repo_id=HF_REPO_ID,
36
  filename=filename,
37
  repo_type=REPO_TYPE
38
  )
39
 
 
40
  shutil.copy(cached_path, dest_path)
41
  print(f" Saved to {dest_path}")
42
 
scripts/download_model.py CHANGED
@@ -5,7 +5,6 @@ MODEL_NAME = "all-MiniLM-L6-v2"
5
 
6
  def download():
7
  print(f"Downloading {MODEL_NAME}...")
8
- # This will download to HF_HOME (set in Dockerfile)
9
  SentenceTransformer(MODEL_NAME)
10
  print("Done.")
11
 
 
5
 
6
  def download():
7
  print(f"Downloading {MODEL_NAME}...")
 
8
  SentenceTransformer(MODEL_NAME)
9
  print("Done.")
10
 
scripts/evaluate_quality.py CHANGED
@@ -6,11 +6,9 @@ import sys
6
  from tqdm import tqdm
7
  from pathlib import Path
8
 
9
- # Add src to path
10
  sys.path.append(str(Path(__file__).parent.parent))
11
  from src.personalization.config import settings
12
 
13
- # Config
14
  CATALOG_PATH = Path("data/catalog/books_catalog.csv")
15
  NUM_SAMPLES = 100
16
 
@@ -30,7 +28,6 @@ def main():
30
 
31
  df = pd.read_csv(CATALOG_PATH)
32
 
33
- # Filter authors with at least 5 books
34
  author_counts = df['authors'].value_counts()
35
  valid_authors = author_counts[author_counts >= 5].index.tolist()
36
 
@@ -43,20 +40,17 @@ def main():
43
  print(f"Running {args.samples} evaluation queries against {api_url}...")
44
 
45
  for _ in tqdm(range(args.samples)):
46
- # 1. Pick a random author
47
  author = random.choice(valid_authors)
48
  books = df[df['authors'] == author]
49
 
50
  if len(books) < 5:
51
  continue
52
 
53
- # 2. Split: History (3 books) -> Target (1 book)
54
  sample = books.sample(n=4, replace=False)
55
  history = sample.iloc[:3]['title'].tolist()
56
  target_book = sample.iloc[3]
57
  target_title = target_book['title']
58
 
59
- # 3. Call API
60
  try:
61
  payload = {"user_history": history, "top_k": 10}
62
  resp = requests.post(api_url, json=payload)
@@ -67,11 +61,9 @@ def main():
67
  recs = resp.json()
68
  rec_titles = [r['title'] for r in recs]
69
 
70
- # Metrics
71
  if target_title in rec_titles:
72
  hits += 1
73
 
74
- # Author Match
75
  rec_authors = df[df['title'].isin(rec_titles)]['authors'].tolist()
76
  if author in rec_authors:
77
  matches = rec_authors.count(author)
 
6
  from tqdm import tqdm
7
  from pathlib import Path
8
 
 
9
  sys.path.append(str(Path(__file__).parent.parent))
10
  from src.personalization.config import settings
11
 
 
12
  CATALOG_PATH = Path("data/catalog/books_catalog.csv")
13
  NUM_SAMPLES = 100
14
 
 
28
 
29
  df = pd.read_csv(CATALOG_PATH)
30
 
 
31
  author_counts = df['authors'].value_counts()
32
  valid_authors = author_counts[author_counts >= 5].index.tolist()
33
 
 
40
  print(f"Running {args.samples} evaluation queries against {api_url}...")
41
 
42
  for _ in tqdm(range(args.samples)):
 
43
  author = random.choice(valid_authors)
44
  books = df[df['authors'] == author]
45
 
46
  if len(books) < 5:
47
  continue
48
 
 
49
  sample = books.sample(n=4, replace=False)
50
  history = sample.iloc[:3]['title'].tolist()
51
  target_book = sample.iloc[3]
52
  target_title = target_book['title']
53
 
 
54
  try:
55
  payload = {"user_history": history, "top_k": 10}
56
  resp = requests.post(api_url, json=payload)
 
61
  recs = resp.json()
62
  rec_titles = [r['title'] for r in recs]
63
 
 
64
  if target_title in rec_titles:
65
  hits += 1
66
 
 
67
  rec_authors = df[df['title'].isin(rec_titles)]['authors'].tolist()
68
  if author in rec_authors:
69
  matches = rec_authors.count(author)
scripts/evaluate_system.py CHANGED
@@ -6,11 +6,9 @@ import logging
6
  from sentence_transformers import SentenceTransformer
7
  from tqdm import tqdm
8
 
9
- # Setup
10
  logging.basicConfig(level=logging.INFO)
11
  logger = logging.getLogger("Evaluator")
12
 
13
- # Paths
14
  DATA_DIR = Path("data")
15
  SYNTHETIC_DATA_PATH = DATA_DIR / "synthetic" / "user_sequences.parquet"
16
  CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
@@ -23,22 +21,17 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
23
  metric: Hit Rate @ k
24
  """
25
 
26
- # 1. Load Resources
27
  logger.info("Loading Catalog and Embeddings...")
28
  if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
29
  logger.error("Missing Data! Run download scripts first.")
30
  return
31
 
32
- # Load Titles for mapping
33
  df_catalog = pd.read_csv(CATALOG_PATH)
34
  titles = df_catalog['title'].tolist()
35
- # Create Title -> Index map (normalized)
36
  title_to_idx = {t.lower().strip(): i for i, t in enumerate(titles)}
37
 
38
- # Load Embeddings
39
  embeddings = np.load(EMBEDDINGS_PATH)
40
 
41
- # Load Index
42
  logger.info("Loading FAISS Index...")
43
  if INDEX_PATH.exists():
44
  index = faiss.read_index(str(INDEX_PATH))
@@ -50,11 +43,9 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
50
  faiss.normalize_L2(embeddings)
51
  index.add(embeddings)
52
 
53
- # 2. Load Synthetic Users
54
  logger.info(f"Loading Synthetic Data from {SYNTHETIC_DATA_PATH}...")
55
  df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
56
 
57
- # Sample users if dataset is too large
58
  if len(df_users) > sample_size:
59
  df_users = df_users.sample(sample_size, random_state=42)
60
 
@@ -66,15 +57,12 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
66
  for _, row in tqdm(df_users.iterrows(), total=len(df_users)):
67
  history = row['book_sequence']
68
 
69
- # Need at least 2 books (1 for history, 1 for test)
70
  if len(history) < 2:
71
  continue
72
 
73
- # Leave-One-Out Split
74
  target_book = history[-1]
75
  context_books = history[:-1]
76
 
77
- # 3. Convert Context to Vector
78
  valid_indices = []
79
  for book in context_books:
80
  norm_title = book.lower().strip()
@@ -84,10 +72,8 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
84
  if not valid_indices:
85
  continue
86
 
87
- # Get vectors and average (Time Decay Simulation)
88
  context_vectors = embeddings[valid_indices]
89
 
90
- # Simple Time Decay
91
  n = len(valid_indices)
92
  decay_factor = 0.9
93
  weights = np.array([decay_factor ** (n - 1 - i) for i in range(n)])
@@ -96,14 +82,11 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
96
  user_vector = np.average(context_vectors, axis=0, weights=weights).reshape(1, -1).astype(np.float32)
97
  faiss.normalize_L2(user_vector)
98
 
99
- # 4. Search
100
- # We search for top_k + len(context) because the model might return books the user already read
101
  search_k = top_k + len(valid_indices) + 5
102
  scores, indices = index.search(user_vector, search_k)
103
 
104
- # Filter results
105
  recommended_titles = []
106
- seen_indices = set(valid_indices) # Don't recommend what they just read
107
 
108
  for idx in indices[0]:
109
  if idx in seen_indices:
@@ -115,10 +98,7 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
115
  if len(recommended_titles) >= top_k:
116
  break
117
 
118
- # 5. Check Hit
119
- # We check if the TARGET book title is in the recommended list
120
- # Using loose matching (substring or exact) can be generous, but strict is better for ML metrics
121
- # We'll stick to exact string match (normalized)
122
 
123
  target_norm = target_book.lower().strip()
124
  rec_norm = [t.lower().strip() for t in recommended_titles]
@@ -128,7 +108,6 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
128
 
129
  processed_users += 1
130
 
131
- # 6. Report
132
  if processed_users == 0:
133
  print("No valid users found for evaluation.")
134
  return
 
6
  from sentence_transformers import SentenceTransformer
7
  from tqdm import tqdm
8
 
 
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger("Evaluator")
11
 
 
12
  DATA_DIR = Path("data")
13
  SYNTHETIC_DATA_PATH = DATA_DIR / "synthetic" / "user_sequences.parquet"
14
  CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
 
21
  metric: Hit Rate @ k
22
  """
23
 
 
24
  logger.info("Loading Catalog and Embeddings...")
25
  if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
26
  logger.error("Missing Data! Run download scripts first.")
27
  return
28
 
 
29
  df_catalog = pd.read_csv(CATALOG_PATH)
30
  titles = df_catalog['title'].tolist()
 
31
  title_to_idx = {t.lower().strip(): i for i, t in enumerate(titles)}
32
 
 
33
  embeddings = np.load(EMBEDDINGS_PATH)
34
 
 
35
  logger.info("Loading FAISS Index...")
36
  if INDEX_PATH.exists():
37
  index = faiss.read_index(str(INDEX_PATH))
 
43
  faiss.normalize_L2(embeddings)
44
  index.add(embeddings)
45
 
 
46
  logger.info(f"Loading Synthetic Data from {SYNTHETIC_DATA_PATH}...")
47
  df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
48
 
 
49
  if len(df_users) > sample_size:
50
  df_users = df_users.sample(sample_size, random_state=42)
51
 
 
57
  for _, row in tqdm(df_users.iterrows(), total=len(df_users)):
58
  history = row['book_sequence']
59
 
 
60
  if len(history) < 2:
61
  continue
62
 
 
63
  target_book = history[-1]
64
  context_books = history[:-1]
65
 
 
66
  valid_indices = []
67
  for book in context_books:
68
  norm_title = book.lower().strip()
 
72
  if not valid_indices:
73
  continue
74
 
 
75
  context_vectors = embeddings[valid_indices]
76
 
 
77
  n = len(valid_indices)
78
  decay_factor = 0.9
79
  weights = np.array([decay_factor ** (n - 1 - i) for i in range(n)])
 
82
  user_vector = np.average(context_vectors, axis=0, weights=weights).reshape(1, -1).astype(np.float32)
83
  faiss.normalize_L2(user_vector)
84
 
 
 
85
  search_k = top_k + len(valid_indices) + 5
86
  scores, indices = index.search(user_vector, search_k)
87
 
 
88
  recommended_titles = []
89
+ seen_indices = set(valid_indices)
90
 
91
  for idx in indices[0]:
92
  if idx in seen_indices:
 
98
  if len(recommended_titles) >= top_k:
99
  break
100
 
101
+
 
 
 
102
 
103
  target_norm = target_book.lower().strip()
104
  rec_norm = [t.lower().strip() for t in recommended_titles]
 
108
 
109
  processed_users += 1
110
 
 
111
  if processed_users == 0:
112
  print("No valid users found for evaluation.")
113
  return
scripts/inspect_data.py DELETED
@@ -1,33 +0,0 @@
1
- import pandas as pd
2
- from pathlib import Path
3
-
4
- # Config
5
- DATA_PATH = Path("data/synthetic/user_sequences.parquet")
6
-
7
- def inspect():
8
- if not DATA_PATH.exists():
9
- print(f"Error: File not found at {DATA_PATH}")
10
- return
11
-
12
- try:
13
- print(f"Reading {DATA_PATH}...")
14
- df = pd.read_parquet(DATA_PATH)
15
-
16
- print("\n--- Schema ---")
17
- print(df.info())
18
-
19
- print("\n--- First 5 Rows ---")
20
- print(df.head().to_string())
21
-
22
- print("\n--- Sample User History ---")
23
- # Show the full history of the first user
24
- first_user = df.iloc[0]
25
- print(f"User ID: {first_user.get('user_id', 'N/A')}")
26
- print(f"History: {first_user.get('book_history', 'N/A')}")
27
-
28
- except Exception as e:
29
- print(f"Failed to read parquet: {e}")
30
-
31
- if __name__ == "__main__":
32
- inspect()
33
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/optimize_index.py CHANGED
@@ -4,7 +4,6 @@ from pathlib import Path
4
  import time
5
  import sys
6
 
7
- # Config
8
  DATA_DIR = Path("data")
9
  EMBEDDINGS_PATH = DATA_DIR / "embeddings_cache.npy"
10
  OUTPUT_PATH = DATA_DIR / "index" / "optimized.index"
 
4
  import time
5
  import sys
6
 
 
7
  DATA_DIR = Path("data")
8
  EMBEDDINGS_PATH = DATA_DIR / "embeddings_cache.npy"
9
  OUTPUT_PATH = DATA_DIR / "index" / "optimized.index"
scripts/visualize_users.py CHANGED
@@ -4,11 +4,9 @@ import logging
4
  from pathlib import Path
5
  from tqdm import tqdm
6
 
7
- # Setup Logging
8
  logging.basicConfig(level=logging.INFO)
9
  logger = logging.getLogger("Visualizer")
10
 
11
- # Paths
12
  DATA_DIR = Path("data")
13
  SYNTHETIC_DATA_PATH = DATA_DIR / "synthetic" / "user_sequences.parquet"
14
  CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
@@ -29,24 +27,19 @@ def visualize_clusters(sample_size=2000):
29
  logger.error("Please run: uv pip install matplotlib seaborn")
30
  return
31
 
32
- # 1. Load Resources
33
  logger.info("Loading Data...")
34
  if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
35
  logger.error("Missing Data! Run download scripts first.")
36
  return
37
 
38
- # Load Titles for mapping
39
  df_catalog = pd.read_csv(CATALOG_PATH)
40
  titles = df_catalog['title'].tolist()
41
  title_to_idx = {t.lower().strip(): i for i, t in enumerate(titles)}
42
 
43
- # Load Embeddings
44
  embeddings = np.load(EMBEDDINGS_PATH)
45
 
46
- # Load Users
47
  df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
48
 
49
- # Sample
50
  if len(df_users) > sample_size:
51
  df_users = df_users.sample(sample_size, random_state=42)
52
 
@@ -55,7 +48,6 @@ def visualize_clusters(sample_size=2000):
55
  user_vectors = []
56
  user_personas = []
57
 
58
- # 2. Calculate User Vectors
59
  valid_users = 0
60
  for _, row in tqdm(df_users.iterrows(), total=len(df_users)):
61
  history = row['book_sequence']
@@ -70,7 +62,6 @@ def visualize_clusters(sample_size=2000):
70
  if not valid_indices:
71
  continue
72
 
73
- # Average Embeddings
74
  vectors = embeddings[valid_indices]
75
  user_vec = np.mean(vectors, axis=0)
76
 
@@ -80,12 +71,10 @@ def visualize_clusters(sample_size=2000):
80
 
81
  X = np.array(user_vectors)
82
 
83
- # 3. t-SNE Reduction
84
- logger.info("Running t-SNE (this might take a moment)...")
85
  tsne = TSNE(n_components=2, random_state=42, perplexity=30)
86
  X_embedded = tsne.fit_transform(X)
87
 
88
- # 4. Plotting
89
  logger.info("Generating Plot...")
90
  OUTPUT_DIR.mkdir(exist_ok=True)
91
 
@@ -106,7 +95,7 @@ def visualize_clusters(sample_size=2000):
106
  plt.tight_layout()
107
 
108
  plt.savefig(OUTPUT_IMAGE, dpi=300)
109
- logger.info(f"Visualization saved to {OUTPUT_IMAGE}")
110
  print(f"Success! Check {OUTPUT_IMAGE} to see your user clusters.")
111
 
112
  if __name__ == "__main__":
 
4
  from pathlib import Path
5
  from tqdm import tqdm
6
 
 
7
  logging.basicConfig(level=logging.INFO)
8
  logger = logging.getLogger("Visualizer")
9
 
 
10
  DATA_DIR = Path("data")
11
  SYNTHETIC_DATA_PATH = DATA_DIR / "synthetic" / "user_sequences.parquet"
12
  CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
 
27
  logger.error("Please run: uv pip install matplotlib seaborn")
28
  return
29
 
 
30
  logger.info("Loading Data...")
31
  if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
32
  logger.error("Missing Data! Run download scripts first.")
33
  return
34
 
 
35
  df_catalog = pd.read_csv(CATALOG_PATH)
36
  titles = df_catalog['title'].tolist()
37
  title_to_idx = {t.lower().strip(): i for i, t in enumerate(titles)}
38
 
 
39
  embeddings = np.load(EMBEDDINGS_PATH)
40
 
 
41
  df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
42
 
 
43
  if len(df_users) > sample_size:
44
  df_users = df_users.sample(sample_size, random_state=42)
45
 
 
48
  user_vectors = []
49
  user_personas = []
50
 
 
51
  valid_users = 0
52
  for _, row in tqdm(df_users.iterrows(), total=len(df_users)):
53
  history = row['book_sequence']
 
62
  if not valid_indices:
63
  continue
64
 
 
65
  vectors = embeddings[valid_indices]
66
  user_vec = np.mean(vectors, axis=0)
67
 
 
71
 
72
  X = np.array(user_vectors)
73
 
74
+ logger.info("Running t-SNE")
 
75
  tsne = TSNE(n_components=2, random_state=42, perplexity=30)
76
  X_embedded = tsne.fit_transform(X)
77
 
 
78
  logger.info("Generating Plot...")
79
  OUTPUT_DIR.mkdir(exist_ok=True)
80
 
 
95
  plt.tight_layout()
96
 
97
  plt.savefig(OUTPUT_IMAGE, dpi=300)
98
+ logger.info(f"Visualization saved to {OUTPUT_IMAGE}")
99
  print(f"Success! Check {OUTPUT_IMAGE} to see your user clusters.")
100
 
101
  if __name__ == "__main__":
src/personalization/api/main.py CHANGED
@@ -11,17 +11,14 @@ import faiss
11
  import numpy as np
12
  import time
13
 
14
- # Setup Logging
15
  logging.basicConfig(level=logging.INFO)
16
  logger = logging.getLogger(__name__)
17
 
18
- # Config
19
  DATA_DIR = Path("data")
20
  CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
21
  EMBEDDINGS_PATH = DATA_DIR / "embeddings_cache.npy"
22
  MODEL_NAME = "all-MiniLM-L6-v2"
23
 
24
- # Global State
25
  state = {
26
  "titles": [],
27
  "title_to_idx": {},
@@ -40,7 +37,6 @@ async def lifespan(app: FastAPI):
40
 
41
  if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
42
  logger.error("Missing catalog or embeddings! Run scripts/1b... first.")
43
- # We continue but service might be degraded
44
  else:
45
  try:
46
  df = pd.read_csv(CATALOG_PATH)
@@ -51,7 +47,6 @@ async def lifespan(app: FastAPI):
51
  max_rating = raw_ratings.max()
52
  state["ratings"] = (raw_ratings / max_rating).tolist() if max_rating > 0 else [0.5] * len(df)
53
 
54
- # Use normalized keys for robust lookup
55
  state["title_to_idx"] = {t.lower().strip(): i for i, t in enumerate(state["titles"])}
56
 
57
  state["popular_indices"] = np.argsort(raw_ratings)[::-1][:50].tolist()
@@ -80,23 +75,19 @@ async def lifespan(app: FastAPI):
80
  logger.info(f"Ready! Loaded {len(state['titles'])} books in {time.time() - start_time:.2f}s")
81
  except Exception as e:
82
  logger.error(f"Failed to load resources: {e}")
83
- # Consider raising if critical
84
 
85
  yield
86
 
87
  logger.info("Shutting down...")
88
- # Clean up resources if needed
89
 
90
  app = FastAPI(title="Semantic Book Discovery Engine", lifespan=lifespan)
91
 
92
- # Add a root endpoint to redirect to docs
93
  from fastapi.responses import RedirectResponse
94
 
95
  @app.get("/")
96
  async def read_root():
97
  return RedirectResponse(url="/docs")
98
 
99
- # Add Prometheus Instrumentation
100
  Instrumentator().instrument(app).expose(app)
101
 
102
  class RecommendationRequest(BaseModel):
 
11
  import numpy as np
12
  import time
13
 
 
14
  logging.basicConfig(level=logging.INFO)
15
  logger = logging.getLogger(__name__)
16
 
 
17
  DATA_DIR = Path("data")
18
  CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
19
  EMBEDDINGS_PATH = DATA_DIR / "embeddings_cache.npy"
20
  MODEL_NAME = "all-MiniLM-L6-v2"
21
 
 
22
  state = {
23
  "titles": [],
24
  "title_to_idx": {},
 
37
 
38
  if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
39
  logger.error("Missing catalog or embeddings! Run scripts/1b... first.")
 
40
  else:
41
  try:
42
  df = pd.read_csv(CATALOG_PATH)
 
47
  max_rating = raw_ratings.max()
48
  state["ratings"] = (raw_ratings / max_rating).tolist() if max_rating > 0 else [0.5] * len(df)
49
 
 
50
  state["title_to_idx"] = {t.lower().strip(): i for i, t in enumerate(state["titles"])}
51
 
52
  state["popular_indices"] = np.argsort(raw_ratings)[::-1][:50].tolist()
 
75
  logger.info(f"Ready! Loaded {len(state['titles'])} books in {time.time() - start_time:.2f}s")
76
  except Exception as e:
77
  logger.error(f"Failed to load resources: {e}")
 
78
 
79
  yield
80
 
81
  logger.info("Shutting down...")
 
82
 
83
  app = FastAPI(title="Semantic Book Discovery Engine", lifespan=lifespan)
84
 
 
85
  from fastapi.responses import RedirectResponse
86
 
87
  @app.get("/")
88
  async def read_root():
89
  return RedirectResponse(url="/docs")
90
 
 
91
  Instrumentator().instrument(app).expose(app)
92
 
93
  class RecommendationRequest(BaseModel):
src/personalization/config.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
 
3
  class Settings:
4
- # Default to 8001, but allow Env Override
5
  HOST = os.getenv("API_HOST", "localhost")
6
  PORT = int(os.getenv("API_PORT", 8001))
7
 
 
1
  import os
2
 
3
  class Settings:
 
4
  HOST = os.getenv("API_HOST", "localhost")
5
  PORT = int(os.getenv("API_PORT", 8001))
6
 
test_api.py CHANGED
@@ -3,7 +3,6 @@ import argparse
3
  import sys
4
  from pathlib import Path
5
 
6
- # Add src to path to import config
7
  sys.path.append(str(Path(__file__).parent))
8
  from src.personalization.config import settings
9
 
 
3
  import sys
4
  from pathlib import Path
5
 
 
6
  sys.path.append(str(Path(__file__).parent))
7
  from src.personalization.config import settings
8