Spaces:
Sleeping
Sleeping
Readme updated
Browse files
- .dockerignore +0 -1
- .gitignore +3 -4
- Dockerfile +0 -16
- README.md +9 -9
- scripts/download_artifacts.py +0 -4
- scripts/download_model.py +0 -1
- scripts/evaluate_quality.py +0 -8
- scripts/evaluate_system.py +2 -23
- scripts/inspect_data.py +0 -33
- scripts/optimize_index.py +0 -1
- scripts/visualize_users.py +2 -13
- src/personalization/api/main.py +0 -9
- src/personalization/config.py +0 -1
- test_api.py +0 -1
.dockerignore
CHANGED
|
@@ -11,6 +11,5 @@ venv/
|
|
| 11 |
.mypy_cache
|
| 12 |
.pytest_cache
|
| 13 |
|
| 14 |
-
# Ignore raw data if any, but keep catalog/index/embeddings
|
| 15 |
data/raw
|
| 16 |
data/synthetic
|
|
|
|
| 11 |
.mypy_cache
|
| 12 |
.pytest_cache
|
| 13 |
|
|
|
|
| 14 |
data/raw
|
| 15 |
data/synthetic
|
.gitignore
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
# Python
|
| 2 |
__pycache__/
|
| 3 |
*.py[cod]
|
| 4 |
*$py.class
|
|
@@ -6,13 +5,13 @@ __pycache__/
|
|
| 6 |
venv/
|
| 7 |
.env
|
| 8 |
|
| 9 |
-
# Data & Models (Too large for git)
|
| 10 |
data/
|
| 11 |
*.pt
|
| 12 |
*.pth
|
| 13 |
*.parquet
|
| 14 |
*.csv
|
| 15 |
|
| 16 |
-
# IDE
|
| 17 |
.vscode/
|
| 18 |
-
.idea/
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
__pycache__/
|
| 2 |
*.py[cod]
|
| 3 |
*$py.class
|
|
|
|
| 5 |
venv/
|
| 6 |
.env
|
| 7 |
|
|
|
|
| 8 |
data/
|
| 9 |
*.pt
|
| 10 |
*.pth
|
| 11 |
*.parquet
|
| 12 |
*.csv
|
| 13 |
|
|
|
|
| 14 |
.vscode/
|
| 15 |
+
.idea/
|
| 16 |
+
|
| 17 |
+
docs/
|
Dockerfile
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
# Build Stage
|
| 2 |
FROM python:3.10-slim AS builder
|
| 3 |
|
| 4 |
WORKDIR /app
|
|
@@ -6,7 +5,6 @@ WORKDIR /app
|
|
| 6 |
ENV PYTHONDONTWRITEBYTECODE=1
|
| 7 |
ENV PYTHONUNBUFFERED=1
|
| 8 |
|
| 9 |
-
# Install build dependencies
|
| 10 |
RUN apt-get update && apt-get install -y \
|
| 11 |
gcc \
|
| 12 |
python3-dev \
|
|
@@ -14,17 +12,14 @@ RUN apt-get update && apt-get install -y \
|
|
| 14 |
git \
|
| 15 |
&& rm -rf /var/lib/apt/lists/*
|
| 16 |
|
| 17 |
-
# Install uv
|
| 18 |
RUN pip install --no-cache-dir uv
|
| 19 |
|
| 20 |
COPY requirements.txt .
|
| 21 |
|
| 22 |
-
# Create virtual environment and install dependencies
|
| 23 |
ENV UV_HTTP_TIMEOUT=300
|
| 24 |
RUN uv venv .venv && \
|
| 25 |
uv pip install --no-cache -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
| 26 |
|
| 27 |
-
# --- Runtime Stage ---
|
| 28 |
FROM python:3.10-slim
|
| 29 |
|
| 30 |
WORKDIR /app
|
|
@@ -35,37 +30,26 @@ ENV PATH="/app/.venv/bin:$PATH"
|
|
| 35 |
ENV LANG=C.UTF-8
|
| 36 |
ENV LC_ALL=C.UTF-8
|
| 37 |
|
| 38 |
-
# Install runtime dependencies
|
| 39 |
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
|
| 40 |
|
| 41 |
-
# Copy virtual environment from builder
|
| 42 |
COPY --from=builder /app/.venv /app/.venv
|
| 43 |
|
| 44 |
-
# Copy Scripts
|
| 45 |
COPY scripts/ ./scripts/
|
| 46 |
|
| 47 |
-
# Data & Model Baking
|
| 48 |
ENV HF_HOME=/app/data/model_cache
|
| 49 |
|
| 50 |
-
# Download Model
|
| 51 |
RUN /app/.venv/bin/python scripts/download_model.py
|
| 52 |
|
| 53 |
-
# Download Data
|
| 54 |
-
# Ensure data directory exists
|
| 55 |
RUN mkdir -p data/catalog data/index
|
| 56 |
RUN /app/.venv/bin/python scripts/download_artifacts.py
|
| 57 |
|
| 58 |
-
# Copy Code (Last to maximize layer caching)
|
| 59 |
COPY src/ ./src/
|
| 60 |
|
| 61 |
-
# Create directories and permissions
|
| 62 |
# RUN addgroup --system app && adduser --system --group app && \
|
| 63 |
# chown -R app:app /app
|
| 64 |
|
| 65 |
# USER app
|
| 66 |
|
| 67 |
-
# Expose port
|
| 68 |
EXPOSE 7860
|
| 69 |
|
| 70 |
-
# Run Command
|
| 71 |
CMD ["/app/.venv/bin/uvicorn", "src.personalization.api.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
|
|
|
| 1 |
FROM python:3.10-slim AS builder
|
| 2 |
|
| 3 |
WORKDIR /app
|
|
|
|
| 5 |
ENV PYTHONDONTWRITEBYTECODE=1
|
| 6 |
ENV PYTHONUNBUFFERED=1
|
| 7 |
|
|
|
|
| 8 |
RUN apt-get update && apt-get install -y \
|
| 9 |
gcc \
|
| 10 |
python3-dev \
|
|
|
|
| 12 |
git \
|
| 13 |
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
|
|
|
|
| 15 |
RUN pip install --no-cache-dir uv
|
| 16 |
|
| 17 |
COPY requirements.txt .
|
| 18 |
|
|
|
|
| 19 |
ENV UV_HTTP_TIMEOUT=300
|
| 20 |
RUN uv venv .venv && \
|
| 21 |
uv pip install --no-cache -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
| 22 |
|
|
|
|
| 23 |
FROM python:3.10-slim
|
| 24 |
|
| 25 |
WORKDIR /app
|
|
|
|
| 30 |
ENV LANG=C.UTF-8
|
| 31 |
ENV LC_ALL=C.UTF-8
|
| 32 |
|
|
|
|
| 33 |
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
|
| 34 |
|
|
|
|
| 35 |
COPY --from=builder /app/.venv /app/.venv
|
| 36 |
|
|
|
|
| 37 |
COPY scripts/ ./scripts/
|
| 38 |
|
|
|
|
| 39 |
ENV HF_HOME=/app/data/model_cache
|
| 40 |
|
|
|
|
| 41 |
RUN /app/.venv/bin/python scripts/download_model.py
|
| 42 |
|
|
|
|
|
|
|
| 43 |
RUN mkdir -p data/catalog data/index
|
| 44 |
RUN /app/.venv/bin/python scripts/download_artifacts.py
|
| 45 |
|
|
|
|
| 46 |
COPY src/ ./src/
|
| 47 |
|
|
|
|
| 48 |
# RUN addgroup --system app && adduser --system --group app && \
|
| 49 |
# chown -R app:app /app
|
| 50 |
|
| 51 |
# USER app
|
| 52 |
|
|
|
|
| 53 |
EXPOSE 7860
|
| 54 |
|
|
|
|
| 55 |
CMD ["/app/.venv/bin/uvicorn", "src.personalization.api.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -14,7 +14,7 @@ A high-performance, standalone recommendation service that uses **Semantic Searc
|
|
| 14 |
|
| 15 |
Unlike traditional recommenders that rely on collaborative filtering (which fails without massive user data), this engine uses **Sentence Transformers** to understand the *content* of books (Title + Author + Genre + Description), allowing it to work effectively from Day 1 ("Cold Start").
|
| 16 |
|
| 17 |
-
##
|
| 18 |
|
| 19 |
* **Semantic Understanding:** Connects "The Haunted School" to "Ghost Beach" based on plot descriptions, not just title keywords.
|
| 20 |
* **Hybrid Scoring:** Combines **Semantic Similarity** (85%) with **Book Ratings** (15%) to recommend high-quality matches.
|
|
@@ -23,7 +23,7 @@ Unlike traditional recommenders that rely on collaborative filtering (which fail
|
|
| 23 |
* **Evaluation:** Achieves **40% Exact Hit Rate @ 10** on held-out author tests.
|
| 24 |
* **Standalone API:** Runs as a separate microservice (FastAPI) on Port 8001.
|
| 25 |
|
| 26 |
-
##
|
| 27 |
|
| 28 |
This project uses a **retrieval-based** approach:
|
| 29 |
|
|
@@ -35,7 +35,7 @@ This project uses a **retrieval-based** approach:
|
|
| 35 |
* The engine searches the FAISS index for the nearest neighbors.
|
| 36 |
* Results are re-ranked using the book's rating.
|
| 37 |
|
| 38 |
-
##
|
| 39 |
|
| 40 |
### Prerequisites
|
| 41 |
* Python 3.10+ (or Docker)
|
|
@@ -77,7 +77,7 @@ python scripts/1b_generate_semantic_data.py
|
|
| 77 |
python scripts/optimize_index.py
|
| 78 |
```
|
| 79 |
|
| 80 |
-
##
|
| 81 |
|
| 82 |
### Option A: Run Locally
|
| 83 |
```bash
|
|
@@ -96,7 +96,7 @@ docker build -t personalise .
|
|
| 96 |
docker run -p 8001:7860 personalise
|
| 97 |
```
|
| 98 |
|
| 99 |
-
##
|
| 100 |
We have included a synthetic dataset of 10,000 users to validate the model.
|
| 101 |
|
| 102 |
**Run the Offline Evaluation:**
|
|
@@ -121,7 +121,7 @@ python scripts/visualize_users.py
|
|
| 121 |
python scripts/inspect_data.py
|
| 122 |
```
|
| 123 |
|
| 124 |
-
##
|
| 125 |
|
| 126 |
#### POST `/personalize/recommend`
|
| 127 |
Get personalized books based on reading history.
|
|
@@ -141,7 +141,7 @@ Semantic search by plot or vibe.
|
|
| 141 |
}
|
| 142 |
```
|
| 143 |
|
| 144 |
-
##
|
| 145 |
|
| 146 |
| Metric | Brute Force (Flat) | Optimized (IVF-PQ) |
|
| 147 |
| :--- | :--- | :--- |
|
|
@@ -150,10 +150,10 @@ Semantic search by plot or vibe.
|
|
| 150 |
| **Speed** | ~10ms | ~2ms |
|
| 151 |
| **Hit Rate @ 10** | N/A | **40.0%** |
|
| 152 |
|
| 153 |
-
##
|
| 154 |
* **Model Compression (ONNX):** Replace the heavy PyTorch dependency with **ONNX Runtime**. This would reduce the Docker image size from ~3GB to ~500MB and improve CPU inference latency by 2-3x.
|
| 155 |
* **Real-Time Learning:** Implement a "Session-Based" Recommender (using RNNs or Transformers) to adapt to user intent within a single session, rather than just long-term history.
|
| 156 |
* **A/B Testing Framework:** Add infrastructure to serve different model versions to different user segments to scientifically measure engagement.
|
| 157 |
|
| 158 |
-
##
|
| 159 |
MIT
|
|
|
|
| 14 |
|
| 15 |
Unlike traditional recommenders that rely on collaborative filtering (which fails without massive user data), this engine uses **Sentence Transformers** to understand the *content* of books (Title + Author + Genre + Description), allowing it to work effectively from Day 1 ("Cold Start").
|
| 16 |
|
| 17 |
+
## Key Features
|
| 18 |
|
| 19 |
* **Semantic Understanding:** Connects "The Haunted School" to "Ghost Beach" based on plot descriptions, not just title keywords.
|
| 20 |
* **Hybrid Scoring:** Combines **Semantic Similarity** (85%) with **Book Ratings** (15%) to recommend high-quality matches.
|
|
|
|
| 23 |
* **Evaluation:** Achieves **40% Exact Hit Rate @ 10** on held-out author tests.
|
| 24 |
* **Standalone API:** Runs as a separate microservice (FastAPI) on Port 8001.
|
| 25 |
|
| 26 |
+
## Architecture
|
| 27 |
|
| 28 |
This project uses a **retrieval-based** approach:
|
| 29 |
|
|
|
|
| 35 |
* The engine searches the FAISS index for the nearest neighbors.
|
| 36 |
* Results are re-ranked using the book's rating.
|
| 37 |
|
| 38 |
+
## Installation & Setup
|
| 39 |
|
| 40 |
### Prerequisites
|
| 41 |
* Python 3.10+ (or Docker)
|
|
|
|
| 77 |
python scripts/optimize_index.py
|
| 78 |
```
|
| 79 |
|
| 80 |
+
## Run the Application
|
| 81 |
|
| 82 |
### Option A: Run Locally
|
| 83 |
```bash
|
|
|
|
| 96 |
docker run -p 8001:7860 personalise
|
| 97 |
```
|
| 98 |
|
| 99 |
+
## Evaluation & Demo
|
| 100 |
We have included a synthetic dataset of 10,000 users to validate the model.
|
| 101 |
|
| 102 |
**Run the Offline Evaluation:**
|
|
|
|
| 121 |
python scripts/inspect_data.py
|
| 122 |
```
|
| 123 |
|
| 124 |
+
## API Usage
|
| 125 |
|
| 126 |
#### POST `/personalize/recommend`
|
| 127 |
Get personalized books based on reading history.
|
|
|
|
| 141 |
}
|
| 142 |
```
|
| 143 |
|
| 144 |
+
## Performance Stats
|
| 145 |
|
| 146 |
| Metric | Brute Force (Flat) | Optimized (IVF-PQ) |
|
| 147 |
| :--- | :--- | :--- |
|
|
|
|
| 150 |
| **Speed** | ~10ms | ~2ms |
|
| 151 |
| **Hit Rate @ 10** | N/A | **40.0%** |
|
| 152 |
|
| 153 |
+
## Roadmap & Future Improvements
|
| 154 |
* **Model Compression (ONNX):** Replace the heavy PyTorch dependency with **ONNX Runtime**. This would reduce the Docker image size from ~3GB to ~500MB and improve CPU inference latency by 2-3x.
|
| 155 |
* **Real-Time Learning:** Implement a "Session-Based" Recommender (using RNNs or Transformers) to adapt to user intent within a single session, rather than just long-term history.
|
| 156 |
* **A/B Testing Framework:** Add infrastructure to serve different model versions to different user segments to scientifically measure engagement.
|
| 157 |
|
| 158 |
+
## License
|
| 159 |
MIT
|
scripts/download_artifacts.py
CHANGED
|
@@ -6,7 +6,6 @@ import shutil
|
|
| 6 |
HF_REPO_ID = "nice-bill/book-recommender-artifacts"
|
| 7 |
REPO_TYPE = "dataset"
|
| 8 |
|
| 9 |
-
# Local Paths
|
| 10 |
DATA_DIR = Path("data")
|
| 11 |
CATALOG_DIR = DATA_DIR / "catalog"
|
| 12 |
INDEX_DIR = DATA_DIR / "index"
|
|
@@ -20,7 +19,6 @@ FILES_TO_DOWNLOAD = {
|
|
| 20 |
def main():
|
| 21 |
print(f"--- Checking artifacts from {HF_REPO_ID} ---")
|
| 22 |
|
| 23 |
-
# Ensure directories exist
|
| 24 |
for dir_path in [DATA_DIR, CATALOG_DIR, INDEX_DIR]:
|
| 25 |
dir_path.mkdir(parents=True, exist_ok=True)
|
| 26 |
|
|
@@ -33,14 +31,12 @@ def main():
|
|
| 33 |
|
| 34 |
print(f"Downloading {filename}...")
|
| 35 |
try:
|
| 36 |
-
# Download to local cache
|
| 37 |
cached_path = hf_hub_download(
|
| 38 |
repo_id=HF_REPO_ID,
|
| 39 |
filename=filename,
|
| 40 |
repo_type=REPO_TYPE
|
| 41 |
)
|
| 42 |
|
| 43 |
-
# Copy from cache to our project structure
|
| 44 |
shutil.copy(cached_path, dest_path)
|
| 45 |
print(f" Saved to {dest_path}")
|
| 46 |
|
|
|
|
| 6 |
HF_REPO_ID = "nice-bill/book-recommender-artifacts"
|
| 7 |
REPO_TYPE = "dataset"
|
| 8 |
|
|
|
|
| 9 |
DATA_DIR = Path("data")
|
| 10 |
CATALOG_DIR = DATA_DIR / "catalog"
|
| 11 |
INDEX_DIR = DATA_DIR / "index"
|
|
|
|
| 19 |
def main():
|
| 20 |
print(f"--- Checking artifacts from {HF_REPO_ID} ---")
|
| 21 |
|
|
|
|
| 22 |
for dir_path in [DATA_DIR, CATALOG_DIR, INDEX_DIR]:
|
| 23 |
dir_path.mkdir(parents=True, exist_ok=True)
|
| 24 |
|
|
|
|
| 31 |
|
| 32 |
print(f"Downloading {filename}...")
|
| 33 |
try:
|
|
|
|
| 34 |
cached_path = hf_hub_download(
|
| 35 |
repo_id=HF_REPO_ID,
|
| 36 |
filename=filename,
|
| 37 |
repo_type=REPO_TYPE
|
| 38 |
)
|
| 39 |
|
|
|
|
| 40 |
shutil.copy(cached_path, dest_path)
|
| 41 |
print(f" Saved to {dest_path}")
|
| 42 |
|
scripts/download_model.py
CHANGED
|
@@ -5,7 +5,6 @@ MODEL_NAME = "all-MiniLM-L6-v2"
|
|
| 5 |
|
| 6 |
def download():
|
| 7 |
print(f"Downloading {MODEL_NAME}...")
|
| 8 |
-
# This will download to HF_HOME (set in Dockerfile)
|
| 9 |
SentenceTransformer(MODEL_NAME)
|
| 10 |
print("Done.")
|
| 11 |
|
|
|
|
| 5 |
|
| 6 |
def download():
|
| 7 |
print(f"Downloading {MODEL_NAME}...")
|
|
|
|
| 8 |
SentenceTransformer(MODEL_NAME)
|
| 9 |
print("Done.")
|
| 10 |
|
scripts/evaluate_quality.py
CHANGED
|
@@ -6,11 +6,9 @@ import sys
|
|
| 6 |
from tqdm import tqdm
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
-
# Add src to path
|
| 10 |
sys.path.append(str(Path(__file__).parent.parent))
|
| 11 |
from src.personalization.config import settings
|
| 12 |
|
| 13 |
-
# Config
|
| 14 |
CATALOG_PATH = Path("data/catalog/books_catalog.csv")
|
| 15 |
NUM_SAMPLES = 100
|
| 16 |
|
|
@@ -30,7 +28,6 @@ def main():
|
|
| 30 |
|
| 31 |
df = pd.read_csv(CATALOG_PATH)
|
| 32 |
|
| 33 |
-
# Filter authors with at least 5 books
|
| 34 |
author_counts = df['authors'].value_counts()
|
| 35 |
valid_authors = author_counts[author_counts >= 5].index.tolist()
|
| 36 |
|
|
@@ -43,20 +40,17 @@ def main():
|
|
| 43 |
print(f"Running {args.samples} evaluation queries against {api_url}...")
|
| 44 |
|
| 45 |
for _ in tqdm(range(args.samples)):
|
| 46 |
-
# 1. Pick a random author
|
| 47 |
author = random.choice(valid_authors)
|
| 48 |
books = df[df['authors'] == author]
|
| 49 |
|
| 50 |
if len(books) < 5:
|
| 51 |
continue
|
| 52 |
|
| 53 |
-
# 2. Split: History (3 books) -> Target (1 book)
|
| 54 |
sample = books.sample(n=4, replace=False)
|
| 55 |
history = sample.iloc[:3]['title'].tolist()
|
| 56 |
target_book = sample.iloc[3]
|
| 57 |
target_title = target_book['title']
|
| 58 |
|
| 59 |
-
# 3. Call API
|
| 60 |
try:
|
| 61 |
payload = {"user_history": history, "top_k": 10}
|
| 62 |
resp = requests.post(api_url, json=payload)
|
|
@@ -67,11 +61,9 @@ def main():
|
|
| 67 |
recs = resp.json()
|
| 68 |
rec_titles = [r['title'] for r in recs]
|
| 69 |
|
| 70 |
-
# Metrics
|
| 71 |
if target_title in rec_titles:
|
| 72 |
hits += 1
|
| 73 |
|
| 74 |
-
# Author Match
|
| 75 |
rec_authors = df[df['title'].isin(rec_titles)]['authors'].tolist()
|
| 76 |
if author in rec_authors:
|
| 77 |
matches = rec_authors.count(author)
|
|
|
|
| 6 |
from tqdm import tqdm
|
| 7 |
from pathlib import Path
|
| 8 |
|
|
|
|
| 9 |
sys.path.append(str(Path(__file__).parent.parent))
|
| 10 |
from src.personalization.config import settings
|
| 11 |
|
|
|
|
| 12 |
CATALOG_PATH = Path("data/catalog/books_catalog.csv")
|
| 13 |
NUM_SAMPLES = 100
|
| 14 |
|
|
|
|
| 28 |
|
| 29 |
df = pd.read_csv(CATALOG_PATH)
|
| 30 |
|
|
|
|
| 31 |
author_counts = df['authors'].value_counts()
|
| 32 |
valid_authors = author_counts[author_counts >= 5].index.tolist()
|
| 33 |
|
|
|
|
| 40 |
print(f"Running {args.samples} evaluation queries against {api_url}...")
|
| 41 |
|
| 42 |
for _ in tqdm(range(args.samples)):
|
|
|
|
| 43 |
author = random.choice(valid_authors)
|
| 44 |
books = df[df['authors'] == author]
|
| 45 |
|
| 46 |
if len(books) < 5:
|
| 47 |
continue
|
| 48 |
|
|
|
|
| 49 |
sample = books.sample(n=4, replace=False)
|
| 50 |
history = sample.iloc[:3]['title'].tolist()
|
| 51 |
target_book = sample.iloc[3]
|
| 52 |
target_title = target_book['title']
|
| 53 |
|
|
|
|
| 54 |
try:
|
| 55 |
payload = {"user_history": history, "top_k": 10}
|
| 56 |
resp = requests.post(api_url, json=payload)
|
|
|
|
| 61 |
recs = resp.json()
|
| 62 |
rec_titles = [r['title'] for r in recs]
|
| 63 |
|
|
|
|
| 64 |
if target_title in rec_titles:
|
| 65 |
hits += 1
|
| 66 |
|
|
|
|
| 67 |
rec_authors = df[df['title'].isin(rec_titles)]['authors'].tolist()
|
| 68 |
if author in rec_authors:
|
| 69 |
matches = rec_authors.count(author)
|
scripts/evaluate_system.py
CHANGED
|
@@ -6,11 +6,9 @@ import logging
|
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
from tqdm import tqdm
|
| 8 |
|
| 9 |
-
# Setup
|
| 10 |
logging.basicConfig(level=logging.INFO)
|
| 11 |
logger = logging.getLogger("Evaluator")
|
| 12 |
|
| 13 |
-
# Paths
|
| 14 |
DATA_DIR = Path("data")
|
| 15 |
SYNTHETIC_DATA_PATH = DATA_DIR / "synthetic" / "user_sequences.parquet"
|
| 16 |
CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
|
|
@@ -23,22 +21,17 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
|
|
| 23 |
metric: Hit Rate @ k
|
| 24 |
"""
|
| 25 |
|
| 26 |
-
# 1. Load Resources
|
| 27 |
logger.info("Loading Catalog and Embeddings...")
|
| 28 |
if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
|
| 29 |
logger.error("Missing Data! Run download scripts first.")
|
| 30 |
return
|
| 31 |
|
| 32 |
-
# Load Titles for mapping
|
| 33 |
df_catalog = pd.read_csv(CATALOG_PATH)
|
| 34 |
titles = df_catalog['title'].tolist()
|
| 35 |
-
# Create Title -> Index map (normalized)
|
| 36 |
title_to_idx = {t.lower().strip(): i for i, t in enumerate(titles)}
|
| 37 |
|
| 38 |
-
# Load Embeddings
|
| 39 |
embeddings = np.load(EMBEDDINGS_PATH)
|
| 40 |
|
| 41 |
-
# Load Index
|
| 42 |
logger.info("Loading FAISS Index...")
|
| 43 |
if INDEX_PATH.exists():
|
| 44 |
index = faiss.read_index(str(INDEX_PATH))
|
|
@@ -50,11 +43,9 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
|
|
| 50 |
faiss.normalize_L2(embeddings)
|
| 51 |
index.add(embeddings)
|
| 52 |
|
| 53 |
-
# 2. Load Synthetic Users
|
| 54 |
logger.info(f"Loading Synthetic Data from {SYNTHETIC_DATA_PATH}...")
|
| 55 |
df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
|
| 56 |
|
| 57 |
-
# Sample users if dataset is too large
|
| 58 |
if len(df_users) > sample_size:
|
| 59 |
df_users = df_users.sample(sample_size, random_state=42)
|
| 60 |
|
|
@@ -66,15 +57,12 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
|
|
| 66 |
for _, row in tqdm(df_users.iterrows(), total=len(df_users)):
|
| 67 |
history = row['book_sequence']
|
| 68 |
|
| 69 |
-
# Need at least 2 books (1 for history, 1 for test)
|
| 70 |
if len(history) < 2:
|
| 71 |
continue
|
| 72 |
|
| 73 |
-
# Leave-One-Out Split
|
| 74 |
target_book = history[-1]
|
| 75 |
context_books = history[:-1]
|
| 76 |
|
| 77 |
-
# 3. Convert Context to Vector
|
| 78 |
valid_indices = []
|
| 79 |
for book in context_books:
|
| 80 |
norm_title = book.lower().strip()
|
|
@@ -84,10 +72,8 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
|
|
| 84 |
if not valid_indices:
|
| 85 |
continue
|
| 86 |
|
| 87 |
-
# Get vectors and average (Time Decay Simulation)
|
| 88 |
context_vectors = embeddings[valid_indices]
|
| 89 |
|
| 90 |
-
# Simple Time Decay
|
| 91 |
n = len(valid_indices)
|
| 92 |
decay_factor = 0.9
|
| 93 |
weights = np.array([decay_factor ** (n - 1 - i) for i in range(n)])
|
|
@@ -96,14 +82,11 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
|
|
| 96 |
user_vector = np.average(context_vectors, axis=0, weights=weights).reshape(1, -1).astype(np.float32)
|
| 97 |
faiss.normalize_L2(user_vector)
|
| 98 |
|
| 99 |
-
# 4. Search
|
| 100 |
-
# We search for top_k + len(context) because the model might return books the user already read
|
| 101 |
search_k = top_k + len(valid_indices) + 5
|
| 102 |
scores, indices = index.search(user_vector, search_k)
|
| 103 |
|
| 104 |
-
# Filter results
|
| 105 |
recommended_titles = []
|
| 106 |
-
seen_indices = set(valid_indices)
|
| 107 |
|
| 108 |
for idx in indices[0]:
|
| 109 |
if idx in seen_indices:
|
|
@@ -115,10 +98,7 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
|
|
| 115 |
if len(recommended_titles) >= top_k:
|
| 116 |
break
|
| 117 |
|
| 118 |
-
|
| 119 |
-
# We check if the TARGET book title is in the recommended list
|
| 120 |
-
# Using loose matching (substring or exact) can be generous, but strict is better for ML metrics
|
| 121 |
-
# We'll stick to exact string match (normalized)
|
| 122 |
|
| 123 |
target_norm = target_book.lower().strip()
|
| 124 |
rec_norm = [t.lower().strip() for t in recommended_titles]
|
|
@@ -128,7 +108,6 @@ def evaluate_hit_rate(top_k=10, sample_size=1000):
|
|
| 128 |
|
| 129 |
processed_users += 1
|
| 130 |
|
| 131 |
-
# 6. Report
|
| 132 |
if processed_users == 0:
|
| 133 |
print("No valid users found for evaluation.")
|
| 134 |
return
|
|
|
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
from tqdm import tqdm
|
| 8 |
|
|
|
|
| 9 |
logging.basicConfig(level=logging.INFO)
|
| 10 |
logger = logging.getLogger("Evaluator")
|
| 11 |
|
|
|
|
| 12 |
DATA_DIR = Path("data")
|
| 13 |
SYNTHETIC_DATA_PATH = DATA_DIR / "synthetic" / "user_sequences.parquet"
|
| 14 |
CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
|
|
|
|
| 21 |
metric: Hit Rate @ k
|
| 22 |
"""
|
| 23 |
|
|
|
|
| 24 |
logger.info("Loading Catalog and Embeddings...")
|
| 25 |
if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
|
| 26 |
logger.error("Missing Data! Run download scripts first.")
|
| 27 |
return
|
| 28 |
|
|
|
|
| 29 |
df_catalog = pd.read_csv(CATALOG_PATH)
|
| 30 |
titles = df_catalog['title'].tolist()
|
|
|
|
| 31 |
title_to_idx = {t.lower().strip(): i for i, t in enumerate(titles)}
|
| 32 |
|
|
|
|
| 33 |
embeddings = np.load(EMBEDDINGS_PATH)
|
| 34 |
|
|
|
|
| 35 |
logger.info("Loading FAISS Index...")
|
| 36 |
if INDEX_PATH.exists():
|
| 37 |
index = faiss.read_index(str(INDEX_PATH))
|
|
|
|
| 43 |
faiss.normalize_L2(embeddings)
|
| 44 |
index.add(embeddings)
|
| 45 |
|
|
|
|
| 46 |
logger.info(f"Loading Synthetic Data from {SYNTHETIC_DATA_PATH}...")
|
| 47 |
df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
|
| 48 |
|
|
|
|
| 49 |
if len(df_users) > sample_size:
|
| 50 |
df_users = df_users.sample(sample_size, random_state=42)
|
| 51 |
|
|
|
|
| 57 |
for _, row in tqdm(df_users.iterrows(), total=len(df_users)):
|
| 58 |
history = row['book_sequence']
|
| 59 |
|
|
|
|
| 60 |
if len(history) < 2:
|
| 61 |
continue
|
| 62 |
|
|
|
|
| 63 |
target_book = history[-1]
|
| 64 |
context_books = history[:-1]
|
| 65 |
|
|
|
|
| 66 |
valid_indices = []
|
| 67 |
for book in context_books:
|
| 68 |
norm_title = book.lower().strip()
|
|
|
|
| 72 |
if not valid_indices:
|
| 73 |
continue
|
| 74 |
|
|
|
|
| 75 |
context_vectors = embeddings[valid_indices]
|
| 76 |
|
|
|
|
| 77 |
n = len(valid_indices)
|
| 78 |
decay_factor = 0.9
|
| 79 |
weights = np.array([decay_factor ** (n - 1 - i) for i in range(n)])
|
|
|
|
| 82 |
user_vector = np.average(context_vectors, axis=0, weights=weights).reshape(1, -1).astype(np.float32)
|
| 83 |
faiss.normalize_L2(user_vector)
|
| 84 |
|
|
|
|
|
|
|
| 85 |
search_k = top_k + len(valid_indices) + 5
|
| 86 |
scores, indices = index.search(user_vector, search_k)
|
| 87 |
|
|
|
|
| 88 |
recommended_titles = []
|
| 89 |
+
seen_indices = set(valid_indices)
|
| 90 |
|
| 91 |
for idx in indices[0]:
|
| 92 |
if idx in seen_indices:
|
|
|
|
| 98 |
if len(recommended_titles) >= top_k:
|
| 99 |
break
|
| 100 |
|
| 101 |
+
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
target_norm = target_book.lower().strip()
|
| 104 |
rec_norm = [t.lower().strip() for t in recommended_titles]
|
|
|
|
| 108 |
|
| 109 |
processed_users += 1
|
| 110 |
|
|
|
|
| 111 |
if processed_users == 0:
|
| 112 |
print("No valid users found for evaluation.")
|
| 113 |
return
|
scripts/inspect_data.py
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
|
| 4 |
-
# Config
|
| 5 |
-
DATA_PATH = Path("data/synthetic/user_sequences.parquet")
|
| 6 |
-
|
| 7 |
-
def inspect():
|
| 8 |
-
if not DATA_PATH.exists():
|
| 9 |
-
print(f"Error: File not found at {DATA_PATH}")
|
| 10 |
-
return
|
| 11 |
-
|
| 12 |
-
try:
|
| 13 |
-
print(f"Reading {DATA_PATH}...")
|
| 14 |
-
df = pd.read_parquet(DATA_PATH)
|
| 15 |
-
|
| 16 |
-
print("\n--- Schema ---")
|
| 17 |
-
print(df.info())
|
| 18 |
-
|
| 19 |
-
print("\n--- First 5 Rows ---")
|
| 20 |
-
print(df.head().to_string())
|
| 21 |
-
|
| 22 |
-
print("\n--- Sample User History ---")
|
| 23 |
-
# Show the full history of the first user
|
| 24 |
-
first_user = df.iloc[0]
|
| 25 |
-
print(f"User ID: {first_user.get('user_id', 'N/A')}")
|
| 26 |
-
print(f"History: {first_user.get('book_history', 'N/A')}")
|
| 27 |
-
|
| 28 |
-
except Exception as e:
|
| 29 |
-
print(f"Failed to read parquet: {e}")
|
| 30 |
-
|
| 31 |
-
if __name__ == "__main__":
|
| 32 |
-
inspect()
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/optimize_index.py
CHANGED
|
@@ -4,7 +4,6 @@ from pathlib import Path
|
|
| 4 |
import time
|
| 5 |
import sys
|
| 6 |
|
| 7 |
-
# Config
|
| 8 |
DATA_DIR = Path("data")
|
| 9 |
EMBEDDINGS_PATH = DATA_DIR / "embeddings_cache.npy"
|
| 10 |
OUTPUT_PATH = DATA_DIR / "index" / "optimized.index"
|
|
|
|
| 4 |
import time
|
| 5 |
import sys
|
| 6 |
|
|
|
|
| 7 |
DATA_DIR = Path("data")
|
| 8 |
EMBEDDINGS_PATH = DATA_DIR / "embeddings_cache.npy"
|
| 9 |
OUTPUT_PATH = DATA_DIR / "index" / "optimized.index"
|
scripts/visualize_users.py
CHANGED
|
@@ -4,11 +4,9 @@ import logging
|
|
| 4 |
from pathlib import Path
|
| 5 |
from tqdm import tqdm
|
| 6 |
|
| 7 |
-
# Setup Logging
|
| 8 |
logging.basicConfig(level=logging.INFO)
|
| 9 |
logger = logging.getLogger("Visualizer")
|
| 10 |
|
| 11 |
-
# Paths
|
| 12 |
DATA_DIR = Path("data")
|
| 13 |
SYNTHETIC_DATA_PATH = DATA_DIR / "synthetic" / "user_sequences.parquet"
|
| 14 |
CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
|
|
@@ -29,24 +27,19 @@ def visualize_clusters(sample_size=2000):
|
|
| 29 |
logger.error("Please run: uv pip install matplotlib seaborn")
|
| 30 |
return
|
| 31 |
|
| 32 |
-
# 1. Load Resources
|
| 33 |
logger.info("Loading Data...")
|
| 34 |
if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
|
| 35 |
logger.error("Missing Data! Run download scripts first.")
|
| 36 |
return
|
| 37 |
|
| 38 |
-
# Load Titles for mapping
|
| 39 |
df_catalog = pd.read_csv(CATALOG_PATH)
|
| 40 |
titles = df_catalog['title'].tolist()
|
| 41 |
title_to_idx = {t.lower().strip(): i for i, t in enumerate(titles)}
|
| 42 |
|
| 43 |
-
# Load Embeddings
|
| 44 |
embeddings = np.load(EMBEDDINGS_PATH)
|
| 45 |
|
| 46 |
-
# Load Users
|
| 47 |
df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
|
| 48 |
|
| 49 |
-
# Sample
|
| 50 |
if len(df_users) > sample_size:
|
| 51 |
df_users = df_users.sample(sample_size, random_state=42)
|
| 52 |
|
|
@@ -55,7 +48,6 @@ def visualize_clusters(sample_size=2000):
|
|
| 55 |
user_vectors = []
|
| 56 |
user_personas = []
|
| 57 |
|
| 58 |
-
# 2. Calculate User Vectors
|
| 59 |
valid_users = 0
|
| 60 |
for _, row in tqdm(df_users.iterrows(), total=len(df_users)):
|
| 61 |
history = row['book_sequence']
|
|
@@ -70,7 +62,6 @@ def visualize_clusters(sample_size=2000):
|
|
| 70 |
if not valid_indices:
|
| 71 |
continue
|
| 72 |
|
| 73 |
-
# Average Embeddings
|
| 74 |
vectors = embeddings[valid_indices]
|
| 75 |
user_vec = np.mean(vectors, axis=0)
|
| 76 |
|
|
@@ -80,12 +71,10 @@ def visualize_clusters(sample_size=2000):
|
|
| 80 |
|
| 81 |
X = np.array(user_vectors)
|
| 82 |
|
| 83 |
-
|
| 84 |
-
logger.info("Running t-SNE (this might take a moment)...")
|
| 85 |
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
|
| 86 |
X_embedded = tsne.fit_transform(X)
|
| 87 |
|
| 88 |
-
# 4. Plotting
|
| 89 |
logger.info("Generating Plot...")
|
| 90 |
OUTPUT_DIR.mkdir(exist_ok=True)
|
| 91 |
|
|
@@ -106,7 +95,7 @@ def visualize_clusters(sample_size=2000):
|
|
| 106 |
plt.tight_layout()
|
| 107 |
|
| 108 |
plt.savefig(OUTPUT_IMAGE, dpi=300)
|
| 109 |
-
logger.info(f"
|
| 110 |
print(f"Success! Check {OUTPUT_IMAGE} to see your user clusters.")
|
| 111 |
|
| 112 |
if __name__ == "__main__":
|
|
|
|
| 4 |
from pathlib import Path
|
| 5 |
from tqdm import tqdm
|
| 6 |
|
|
|
|
| 7 |
logging.basicConfig(level=logging.INFO)
|
| 8 |
logger = logging.getLogger("Visualizer")
|
| 9 |
|
|
|
|
| 10 |
DATA_DIR = Path("data")
|
| 11 |
SYNTHETIC_DATA_PATH = DATA_DIR / "synthetic" / "user_sequences.parquet"
|
| 12 |
CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
|
|
|
|
| 27 |
logger.error("Please run: uv pip install matplotlib seaborn")
|
| 28 |
return
|
| 29 |
|
|
|
|
| 30 |
logger.info("Loading Data...")
|
| 31 |
if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
|
| 32 |
logger.error("Missing Data! Run download scripts first.")
|
| 33 |
return
|
| 34 |
|
|
|
|
| 35 |
df_catalog = pd.read_csv(CATALOG_PATH)
|
| 36 |
titles = df_catalog['title'].tolist()
|
| 37 |
title_to_idx = {t.lower().strip(): i for i, t in enumerate(titles)}
|
| 38 |
|
|
|
|
| 39 |
embeddings = np.load(EMBEDDINGS_PATH)
|
| 40 |
|
|
|
|
| 41 |
df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
|
| 42 |
|
|
|
|
| 43 |
if len(df_users) > sample_size:
|
| 44 |
df_users = df_users.sample(sample_size, random_state=42)
|
| 45 |
|
|
|
|
| 48 |
user_vectors = []
|
| 49 |
user_personas = []
|
| 50 |
|
|
|
|
| 51 |
valid_users = 0
|
| 52 |
for _, row in tqdm(df_users.iterrows(), total=len(df_users)):
|
| 53 |
history = row['book_sequence']
|
|
|
|
| 62 |
if not valid_indices:
|
| 63 |
continue
|
| 64 |
|
|
|
|
| 65 |
vectors = embeddings[valid_indices]
|
| 66 |
user_vec = np.mean(vectors, axis=0)
|
| 67 |
|
|
|
|
| 71 |
|
| 72 |
X = np.array(user_vectors)
|
| 73 |
|
| 74 |
+
logger.info("Running t-SNE")
|
|
|
|
| 75 |
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
|
| 76 |
X_embedded = tsne.fit_transform(X)
|
| 77 |
|
|
|
|
| 78 |
logger.info("Generating Plot...")
|
| 79 |
OUTPUT_DIR.mkdir(exist_ok=True)
|
| 80 |
|
|
|
|
| 95 |
plt.tight_layout()
|
| 96 |
|
| 97 |
plt.savefig(OUTPUT_IMAGE, dpi=300)
|
| 98 |
+
logger.info(f"Visualization saved to {OUTPUT_IMAGE}")
|
| 99 |
print(f"Success! Check {OUTPUT_IMAGE} to see your user clusters.")
|
| 100 |
|
| 101 |
if __name__ == "__main__":
|
src/personalization/api/main.py
CHANGED
|
@@ -11,17 +11,14 @@ import faiss
|
|
| 11 |
import numpy as np
|
| 12 |
import time
|
| 13 |
|
| 14 |
-
# Setup Logging
|
| 15 |
logging.basicConfig(level=logging.INFO)
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
-
# Config
|
| 19 |
DATA_DIR = Path("data")
|
| 20 |
CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
|
| 21 |
EMBEDDINGS_PATH = DATA_DIR / "embeddings_cache.npy"
|
| 22 |
MODEL_NAME = "all-MiniLM-L6-v2"
|
| 23 |
|
| 24 |
-
# Global State
|
| 25 |
state = {
|
| 26 |
"titles": [],
|
| 27 |
"title_to_idx": {},
|
|
@@ -40,7 +37,6 @@ async def lifespan(app: FastAPI):
|
|
| 40 |
|
| 41 |
if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
|
| 42 |
logger.error("Missing catalog or embeddings! Run scripts/1b... first.")
|
| 43 |
-
# We continue but service might be degraded
|
| 44 |
else:
|
| 45 |
try:
|
| 46 |
df = pd.read_csv(CATALOG_PATH)
|
|
@@ -51,7 +47,6 @@ async def lifespan(app: FastAPI):
|
|
| 51 |
max_rating = raw_ratings.max()
|
| 52 |
state["ratings"] = (raw_ratings / max_rating).tolist() if max_rating > 0 else [0.5] * len(df)
|
| 53 |
|
| 54 |
-
# Use normalized keys for robust lookup
|
| 55 |
state["title_to_idx"] = {t.lower().strip(): i for i, t in enumerate(state["titles"])}
|
| 56 |
|
| 57 |
state["popular_indices"] = np.argsort(raw_ratings)[::-1][:50].tolist()
|
|
@@ -80,23 +75,19 @@ async def lifespan(app: FastAPI):
|
|
| 80 |
logger.info(f"Ready! Loaded {len(state['titles'])} books in {time.time() - start_time:.2f}s")
|
| 81 |
except Exception as e:
|
| 82 |
logger.error(f"Failed to load resources: {e}")
|
| 83 |
-
# Consider raising if critical
|
| 84 |
|
| 85 |
yield
|
| 86 |
|
| 87 |
logger.info("Shutting down...")
|
| 88 |
-
# Clean up resources if needed
|
| 89 |
|
| 90 |
app = FastAPI(title="Semantic Book Discovery Engine", lifespan=lifespan)
|
| 91 |
|
| 92 |
-
# Add a root endpoint to redirect to docs
|
| 93 |
from fastapi.responses import RedirectResponse
|
| 94 |
|
| 95 |
@app.get("/")
|
| 96 |
async def read_root():
|
| 97 |
return RedirectResponse(url="/docs")
|
| 98 |
|
| 99 |
-
# Add Prometheus Instrumentation
|
| 100 |
Instrumentator().instrument(app).expose(app)
|
| 101 |
|
| 102 |
class RecommendationRequest(BaseModel):
|
|
|
|
| 11 |
import numpy as np
|
| 12 |
import time
|
| 13 |
|
|
|
|
| 14 |
logging.basicConfig(level=logging.INFO)
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
|
|
|
| 17 |
DATA_DIR = Path("data")
|
| 18 |
CATALOG_PATH = DATA_DIR / "catalog" / "books_catalog.csv"
|
| 19 |
EMBEDDINGS_PATH = DATA_DIR / "embeddings_cache.npy"
|
| 20 |
MODEL_NAME = "all-MiniLM-L6-v2"
|
| 21 |
|
|
|
|
| 22 |
state = {
|
| 23 |
"titles": [],
|
| 24 |
"title_to_idx": {},
|
|
|
|
| 37 |
|
| 38 |
if not CATALOG_PATH.exists() or not EMBEDDINGS_PATH.exists():
|
| 39 |
logger.error("Missing catalog or embeddings! Run scripts/1b... first.")
|
|
|
|
| 40 |
else:
|
| 41 |
try:
|
| 42 |
df = pd.read_csv(CATALOG_PATH)
|
|
|
|
| 47 |
max_rating = raw_ratings.max()
|
| 48 |
state["ratings"] = (raw_ratings / max_rating).tolist() if max_rating > 0 else [0.5] * len(df)
|
| 49 |
|
|
|
|
| 50 |
state["title_to_idx"] = {t.lower().strip(): i for i, t in enumerate(state["titles"])}
|
| 51 |
|
| 52 |
state["popular_indices"] = np.argsort(raw_ratings)[::-1][:50].tolist()
|
|
|
|
| 75 |
logger.info(f"Ready! Loaded {len(state['titles'])} books in {time.time() - start_time:.2f}s")
|
| 76 |
except Exception as e:
|
| 77 |
logger.error(f"Failed to load resources: {e}")
|
|
|
|
| 78 |
|
| 79 |
yield
|
| 80 |
|
| 81 |
logger.info("Shutting down...")
|
|
|
|
| 82 |
|
| 83 |
app = FastAPI(title="Semantic Book Discovery Engine", lifespan=lifespan)
|
| 84 |
|
|
|
|
| 85 |
from fastapi.responses import RedirectResponse
|
| 86 |
|
| 87 |
@app.get("/")
|
| 88 |
async def read_root():
|
| 89 |
return RedirectResponse(url="/docs")
|
| 90 |
|
|
|
|
| 91 |
Instrumentator().instrument(app).expose(app)
|
| 92 |
|
| 93 |
class RecommendationRequest(BaseModel):
|
src/personalization/config.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
class Settings:
|
| 4 |
-
# Default to 8001, but allow Env Override
|
| 5 |
HOST = os.getenv("API_HOST", "localhost")
|
| 6 |
PORT = int(os.getenv("API_PORT", 8001))
|
| 7 |
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
class Settings:
|
|
|
|
| 4 |
HOST = os.getenv("API_HOST", "localhost")
|
| 5 |
PORT = int(os.getenv("API_PORT", 8001))
|
| 6 |
|
test_api.py
CHANGED
|
@@ -3,7 +3,6 @@ import argparse
|
|
| 3 |
import sys
|
| 4 |
from pathlib import Path
|
| 5 |
|
| 6 |
-
# Add src to path to import config
|
| 7 |
sys.path.append(str(Path(__file__).parent))
|
| 8 |
from src.personalization.config import settings
|
| 9 |
|
|
|
|
| 3 |
import sys
|
| 4 |
from pathlib import Path
|
| 5 |
|
|
|
|
| 6 |
sys.path.append(str(Path(__file__).parent))
|
| 7 |
from src.personalization.config import settings
|
| 8 |
|