Upload folder using huggingface_hub
- .gitignore +5 -0
- Dockerfile +31 -0
- LICENSE +12 -0
- README.md +66 -7
- app.py +26 -0
- machine_learning/__init__.py +0 -0
- machine_learning/datasets/__init__.py +0 -0
- machine_learning/datasets/embeddings_csv.py +20 -0
- machine_learning/datasets/embeddings_databricks.py +50 -0
- machine_learning/load_models.py +35 -0
- machine_learning/models/__init__.py +0 -0
- machine_learning/models/rfy.py +57 -0
- machine_learning/models/similarity.py +24 -0
- machine_learning/prefetched/embeddings.csv.gz +3 -0
- machine_learning/transformers/__init__.py +0 -0
- machine_learning/transformers/inverter.py +9 -0
- machine_learning/transformers/item_encoder.py +21 -0
- machine_learning/transformers/scores_to_dict.py +40 -0
- requirements.txt +6 -0
- server/__init__.py +0 -0
- server/api.py +73 -0
- setup.py +0 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
+venv/
+__pycache__/
+*.pyc
+*.pyo
+.DS_Store
Dockerfile
ADDED
@@ -0,0 +1,31 @@
+FROM python:3.11-slim
+
+# Set environment variables
+ENV PYTHONPATH="/app"
+ENV PYTHONUNBUFFERED=1
+
+# Create non-root user for Hugging Face Spaces
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+WORKDIR /app
+
+# Copy requirements first for better caching
+COPY --chown=user:user requirements.txt .
+RUN pip install --no-cache-dir --user -r requirements.txt
+
+# Copy application code
+COPY --chown=user:user . .
+
+# Expose port (7860 for HF Spaces, can be overridden)
+EXPOSE 7860
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:7860/health || exit 1
+
+# Start the server
+# Use single worker to minimize memory usage
+CMD ["python", "-m", "gunicorn", "-b", "0.0.0.0:7860", "--workers", "1", "--threads", "2", "--timeout", "120", "server.api:app"]
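One caveat with the Dockerfile above: `python:3.11-slim` does not ship `curl`, so the `HEALTHCHECK` as written will report unhealthy unless curl is installed into the image. A minimal curl-free sketch using only the standard library; the file name `healthcheck.py` and the `CMD python /app/healthcheck.py` wiring are assumptions, not part of this commit:

```python
# healthcheck.py - hypothetical helper; invoke from the Dockerfile as
#   HEALTHCHECK ... CMD python /app/healthcheck.py
# so the slim image does not need curl installed.
import sys
import urllib.request

try:
    with urllib.request.urlopen("http://localhost:7860/health", timeout=5) as resp:
        sys.exit(0 if resp.status == 200 else 1)
except Exception:
    sys.exit(1)
```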
LICENSE
ADDED
@@ -0,0 +1,12 @@
+Warner Bros. Discovery Hackathon License (WBDHL)
+
+Copyright (c) 2025 - present Warner Bros. Discovery, Inc. or its subsidiaries and affiliates.
+
+This software and associated documentation files (the “Software”) are proprietary and confidential,
+and Warner Bros. Discovery, Inc. or its subsidiaries and affiliates (“WBD”) reserves all rights to
+the Software. Authorized participants in a hackathon hosted by WBD may use the Software solely for
+internal research and development purposes within WBD. Any other use of the Software is strictly
+prohibited. The Software may not be published, distributed, redistributed, sublicensed, rented, sold,
+exported, or lent, unless WBD expressly provides separate authorization to do so. The Software may
+not be copied, modified, or merged for any purpose other than internal research and development
+purposes within WBD, unless WBD expressly provides separate authorization to do so.
README.md
CHANGED
@@ -1,12 +1,71 @@
 ---
-title:
-emoji:
+title: FF1000 Recommendation Service
+emoji: 🎬
 colorFrom: purple
-colorTo:
+colorTo: blue
 sdk: docker
-
-license: mit
-short_description: Prototype
+app_port: 7860
 ---
 
-
+# FF1000 - ML Recommendation Service
+
+A pretrained recommendation service for content discovery, providing similarity-based and personalized recommendations.
+
+## API Endpoints
+
+### Health Check
+```
+GET /health
+```
+Returns `{"status": "ok"}` when the service is running.
+
+### Predict Endpoint
+```
+POST /predict/<model_name>
+Content-Type: application/json
+```
+
+**Available models:**
+- `similarity` - Find similar content based on embeddings
+- `rfy` - "Recommended for you" personalized recommendations
+- `nfm` - "Not for me" content filtering
+
+**Request body:**
+```json
+{
+  "items": ["item_id_1", "item_id_2"]
+}
+```
+
+**Response:**
+```json
+{
+  "model": "similarity",
+  "predictions": [
+    {
+      "item_ids": ["..."],
+      "titles": ["..."],
+      "scores": [0.95, 0.87, ...],
+      "posters": ["https://...", ...],
+      "premiere_years": [2023, 2022, ...]
+    }
+  ]
+}
+```
+
+## Example Usage
+
+```bash
+curl -X POST https://YOUR-SPACE.hf.space/predict/similarity \
+  -H "Content-Type: application/json" \
+  -d '{"items": ["ab553cdc-e15d-4597-b65f-bec9201fd2dd"]}'
+```
+
+## Architecture
+
+The service loads pre-computed embeddings and serves three recommendation models:
+- **Similarity**: Cosine similarity between content embeddings
+- **RFY**: Bayesian lower-confidence-bound recommendations for personalization
+- **NFM**: Negative preference modeling
+
+Built with Flask and scikit-learn.
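For callers working in Python rather than curl, here is a minimal client sketch against the documented predict endpoint; the base URL is a placeholder and the `requests` package is assumed to be installed:

```python
import requests

BASE_URL = "https://YOUR-SPACE.hf.space"  # placeholder Space URL

resp = requests.post(
    f"{BASE_URL}/predict/similarity",
    json={"items": ["ab553cdc-e15d-4597-b65f-bec9201fd2dd"]},
    timeout=30,
)
resp.raise_for_status()
body = resp.json()

# One prediction block per submitted item list, per the README response schema.
for pred in body["predictions"]:
    for title, score in zip(pred["titles"], pred["scores"]):
        print(f"{score:.3f}  {title}")
```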
app.py
ADDED
@@ -0,0 +1,26 @@
+"""
+FF1000 Recommendation Service - Hugging Face Spaces Entry Point
+"""
+import os
+import sys
+import logging
+
+# Set up logging
+logging.basicConfig(
+    level=os.environ.get("LOG_LEVEL", "INFO"),
+    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+    stream=sys.stdout,
+)
+log = logging.getLogger("ff1000-hf")
+
+# Import the Flask app from server.api
+from server.api import app
+
+# Hugging Face Spaces expects the app to be available as 'app'
+# The app will be run by the Spaces infrastructure
+
+if __name__ == "__main__":
+    # For local testing
+    port = int(os.environ.get("PORT", 7860))
+    app.run(host="0.0.0.0", port=port, debug=False)
machine_learning/__init__.py
ADDED
File without changes
machine_learning/datasets/__init__.py
ADDED
File without changes
machine_learning/datasets/embeddings_csv.py
ADDED
@@ -0,0 +1,20 @@
+import json
+import pandas as pd
+import os
+
+
+class EmbeddingsDataLoader:
+    def __init__(
+        self,
+        filepath=None,
+    ):
+        if filepath is None:
+            # Use relative path from this file's location
+            current_dir = os.path.dirname(os.path.abspath(__file__))
+            filepath = os.path.join(current_dir, '..', 'prefetched', 'embeddings.csv.gz')
+        self.filepath = filepath
+
+    def load(self) -> pd.DataFrame:
+        df = pd.read_csv(self.filepath, compression='gzip')
+        df.embedding = df.embedding.apply(lambda vec: [float(v) for v in json.loads(vec)])
+        return df
machine_learning/datasets/embeddings_databricks.py
ADDED
@@ -0,0 +1,50 @@
+import pandas as pd
+from pyspark.sql.session import SparkSession
+
+
+EMBEDDINGS_SQL = """
+SELECT DISTINCT
+    e.item_id,
+    s.seriesMainTitle AS title,
+    e.embedding
+FROM
+    {item_embeddings} e
+INNER JOIN
+    {s2s_content_entities} s
+    ON e.item_id = s.unpackedValue.series.id.id
+INNER JOIN
+    {series_offering_dim} off
+ON
+    s.unpackedValue.series.id.id = off.series_id
+    AND SIZE(ARRAY_INTERSECT(off.country_codes, ARRAY('US'))) > 0
+WHERE
+    date = (SELECT MAX(date) FROM bolt_recs_prod.gold.item_embeddings)
+    AND n_dimensions = 1536
+"""
+
+
+class EmbeddingsDataLoader:
+    def __init__(
+        self,
+        env: str = "prod",
+        spark_session: SparkSession = None,
+        item_embeddings: str = "bolt_recs_prod.gold.item_embeddings",
+        s2s_content_entities: str = "bolt_cep_prod.gold.s2s_content_entities",
+        series_offering_dim: str = "bolt_dai_ckg_prod.gold.series_offering_dim",
+    ):
+        if not spark_session:
+            spark_session = SparkSession.builder.appName(
+                "embeddings"
+            ).getOrCreate()
+
+        self._env = env
+        self._spark_session = spark_session
+        self._table_names = {
+            "item_embeddings": item_embeddings,
+            "s2s_content_entities": s2s_content_entities,
+            "series_offering_dim": series_offering_dim,
+        }
+
+    def load(self) -> pd.DataFrame:
+        query = EMBEDDINGS_SQL.format(**self._table_names)
+        return self._spark_session.sql(query).toPandas()
machine_learning/load_models.py
ADDED
@@ -0,0 +1,35 @@
+import numpy as np
+
+from sklearn.pipeline import Pipeline
+
+from machine_learning.datasets.embeddings_csv import EmbeddingsDataLoader
+from machine_learning.models.rfy import BayesianRecommender
+from machine_learning.models.similarity import SimilarityRecommender
+from machine_learning.transformers.inverter import Inverter
+from machine_learning.transformers.item_encoder import ItemIdOneHotEncoder
+from machine_learning.transformers.scores_to_dict import ScoresToDict
+
+
+catalog = EmbeddingsDataLoader().load()
+embeddings = np.array(catalog.embedding.tolist())
+posters = catalog.poster if 'poster' in catalog.columns else None
+premiere_years = catalog.premiere_year if 'premiere_year' in catalog.columns else None
+
+recommended_for_you = Pipeline([
+    ('encoder', ItemIdOneHotEncoder(catalog.item_id)),
+    ('ranker', BayesianRecommender(embeddings)),
+    ('scores_to_dict', ScoresToDict(catalog.item_id, catalog.title, posters, premiere_years)),
+]).fit([])
+
+not_for_me = Pipeline([
+    ('encoder', ItemIdOneHotEncoder(catalog.item_id)),
+    ('inverter', Inverter()),
+    ('ranker', BayesianRecommender(embeddings)),
+    ('scores_to_dict', ScoresToDict(catalog.item_id, catalog.title, posters, premiere_years)),
+]).fit([])
+
+similarity = Pipeline([
+    ('encoder', ItemIdOneHotEncoder(catalog.item_id)),
+    ('ranker', SimilarityRecommender(embeddings)),
+    ('scores_to_dict', ScoresToDict(catalog.item_id, catalog.title, posters, premiere_years)),
+]).fit([])
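Because `ScoresToDict` is the final pipeline step and defines `predict`, calling `Pipeline.predict` on any of these objects runs the encoder (and, where present, the inverter and ranker) as transforms and returns top-10 dictionaries. A minimal usage sketch mirroring how `server/api.py` calls the models; the item ids are placeholders and must exist in the embeddings catalog:

```python
from machine_learning.load_models import similarity

# One "user" is expressed as the list of item ids they have interacted with.
user_items = ["item_id_1", "item_id_2"]  # placeholder ids

# ItemIdOneHotEncoder -> SimilarityRecommender run as transforms, then
# ScoresToDict.predict returns the top-10 recommendations for each user.
results = similarity.predict([user_items])
top = results[0]
print(top["titles"][:3], top["scores"][:3])
```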
machine_learning/models/__init__.py
ADDED
File without changes
machine_learning/models/rfy.py
ADDED
@@ -0,0 +1,57 @@
+import numpy as np
+from sklearn.base import BaseEstimator
+
+
+class BayesianRecommender(BaseEstimator):
+    def __init__(self,
+                 item_embeddings: np.ndarray,
+                 lambda_reg: float = 1.0,
+                 sigma2: float = 1.0,
+                 z: float = -1.1645,  # -1.645=<10% LCB
+                 mask_value: float = -np.inf):
+
+        self.item_embeddings = np.asarray(item_embeddings, dtype=np.float64)
+        self.N_, self.d_ = self.item_embeddings.shape
+
+        self.lambda_reg = float(lambda_reg)
+        self.sigma2 = float(sigma2)
+        self.z = float(z)
+        self.mask_value = mask_value
+
+        self.X_items = self.item_embeddings
+        self.XT_items = self.item_embeddings.T
+
+    def fit(self, X=None, y=None):
+        return self
+
+    def _user_posterior_and_scores(self, y_vec: np.ndarray):
+        seen_mask = y_vec != 0
+        X_obs = self.X_items[seen_mask]
+        y_obs = y_vec[seen_mask].astype(np.float64)
+        A = self.lambda_reg * np.eye(self.d_, dtype=np.float64) + (X_obs.T @ X_obs) / self.sigma2
+        invA = np.linalg.inv(A)
+        mu = invA @ (X_obs.T @ y_obs) / self.sigma2
+
+        m = self.X_items @ mu
+        XinvA = self.X_items @ invA
+        s2 = np.einsum('ij,ij->i', XinvA, self.X_items)
+        s = np.sqrt(np.clip(s2, 0.0, None))
+
+        scores = m + self.z * s
+        scores[seen_mask] = self.mask_value
+
+        return scores
+
+    def transform(self, X):
+        X = np.asarray(X, dtype=np.float64)
+        if X.ndim == 1:
+            X = X[None, :]
+
+        B, N = X.shape
+        if N != self.N_:
+            raise ValueError(f"Input width {N} != number of items {self.N_}.")
+
+        out = np.empty((B, N), dtype=np.float64)
+        for b in range(B):
+            out[b] = self._user_posterior_and_scores(X[b])
+        return out
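For reference, `_user_posterior_and_scores` is a Bayesian ridge regression over a latent user preference vector, scored with a lower confidence bound. In the notation of the code, with `X_obs` the embeddings of seen items and `y_obs` the observed interactions, it computes:

```latex
A = \lambda I + \frac{X_{\mathrm{obs}}^{\top} X_{\mathrm{obs}}}{\sigma^{2}},
\qquad
\mu = A^{-1}\,\frac{X_{\mathrm{obs}}^{\top} y_{\mathrm{obs}}}{\sigma^{2}},
\qquad
\mathrm{score}(i) = x_i^{\top}\mu + z\,\sqrt{x_i^{\top} A^{-1} x_i}
```

With the default negative `z`, items whose predicted rating is uncertain are penalized, and already-seen items are masked to `-inf` so they never reach the top-k.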
machine_learning/models/similarity.py
ADDED
@@ -0,0 +1,24 @@
+import numpy as np
+from sklearn.base import BaseEstimator
+
+
+class SimilarityRecommender(BaseEstimator):
+    def __init__(self,
+                 item_embeddings: np.ndarray,
+                 mask_value: float = -np.inf):
+
+        E = np.asarray(item_embeddings, dtype=np.float64)
+        self.item_embeddings = E / np.linalg.norm(E, axis=1, keepdims=True)
+        self.N_, self.d_ = self.item_embeddings.shape
+        self.mask_value = mask_value
+
+    def fit(self, X=None, y=None):
+        return self
+
+    def transform(self, X):
+        X = np.asarray(X, dtype=np.float64)
+        U = X @ self.item_embeddings
+        U /= np.linalg.norm(U, axis=1, keepdims=True)
+        scores = U @ self.item_embeddings.T
+        scores[X != 0] = self.mask_value
+        return scores
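A small self-contained check of the behaviour above: a multi-hot input row is projected to a user embedding, renormalized, and scored by cosine similarity against every item, with the input items masked out. The toy embeddings below are made up purely for illustration:

```python
import numpy as np
from machine_learning.models.similarity import SimilarityRecommender

# Toy 4-item, 2-dimensional catalog (illustrative values only).
toy_embeddings = np.array([
    [1.0, 0.0],
    [0.9, 0.1],
    [0.0, 1.0],
    [0.1, 0.9],
])
rec = SimilarityRecommender(toy_embeddings).fit()

# Multi-hot row selecting item 0: item 0 is masked to -inf, and item 1,
# its nearest neighbour by cosine similarity, gets the highest finite score.
scores = rec.transform(np.array([[1.0, 0.0, 0.0, 0.0]]))
print(scores[0])
```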
machine_learning/prefetched/embeddings.csv.gz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40897c6ec6916d39983608eb81ad85700db92ad6aec8bb7167cc79d100337f6d
+size 42019593
machine_learning/transformers/__init__.py
ADDED
File without changes
machine_learning/transformers/inverter.py
ADDED
@@ -0,0 +1,9 @@
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+class Inverter(BaseEstimator, TransformerMixin):
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, scores_matrix):
+        return -scores_matrix
machine_learning/transformers/item_encoder.py
ADDED
@@ -0,0 +1,21 @@
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing import MultiLabelBinarizer
+
+
+class ItemIdOneHotEncoder(BaseEstimator, TransformerMixin):
+    def __init__(self, all_item_ids):
+        self.all_item_ids = list(all_item_ids)
+        self._mlb = MultiLabelBinarizer(classes=self.all_item_ids)
+
+    def fit(self, X, y=None):
+        self._mlb.fit([[]])
+        return self
+
+    def transform(self, X):
+        M = self._mlb.transform(X).astype(np.float64)
+        return M
+
+    @property
+    def vocab_(self):
+        return self.all_item_ids
machine_learning/transformers/scores_to_dict.py
ADDED
@@ -0,0 +1,40 @@
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+class ScoresToDict(BaseEstimator, TransformerMixin):
+    def __init__(self, item_ids, titles, posters=None, premiere_years=None):
+        self.item_ids = list(item_ids)
+        self.titles = list(titles)
+        self.posters = list(posters) if posters is not None else [None] * len(item_ids)
+        self.premiere_years = list(premiere_years) if premiere_years is not None else [None] * len(item_ids)
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, scores_matrix):
+        scores_matrix = np.asarray(scores_matrix, dtype=np.float64)
+        B, N = scores_matrix.shape
+        out = []
+        for b in range(B):
+            out.append({
+                "item_ids": self.item_ids,
+                "scores": scores_matrix[b].tolist(),
+            })
+        return out
+
+    def predict(self, scores_matrix, limit=10):
+        scores_matrix = np.asarray(scores_matrix, dtype=np.float64)
+        B, N = scores_matrix.shape
+        out = []
+        for b in range(B):
+            scores = scores_matrix[b]
+            idx = np.argsort(-scores)[:limit]  # descending top-k
+            out.append({
+                "item_ids": [self.item_ids[i] for i in idx],
+                "titles": [self.titles[i] for i in idx],
+                "posters": [self.posters[i] for i in idx],
+                "premiere_years": [self.premiere_years[i] for i in idx],
+                "scores": scores[idx].tolist(),
+            })
+        return out
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+joblib==1.5.2
+numpy==2.0.2
+pandas==2.3.3
+scikit-learn==1.6.1
+Flask==3.0.3
+gunicorn==21.2.0
server/__init__.py
ADDED
File without changes
server/api.py
ADDED
@@ -0,0 +1,73 @@
+import os
+import sys
+import logging
+
+from typing import Dict, Any
+from flask import Flask, request, jsonify
+from werkzeug.exceptions import HTTPException
+from machine_learning.load_models import (
+    not_for_me,
+    recommended_for_you,
+    similarity,
+)
+
+
+logging.basicConfig(
+    level=os.environ.get("LOG_LEVEL", "INFO"),
+    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+    stream=sys.stdout,
+)
+log = logging.getLogger("ff1000-api")
+
+
+MODELS: Dict[str, Any] = {
+    "nfm": not_for_me,
+    "rfy": recommended_for_you,
+    "similarity": similarity,
+}
+
+
+def create_app() -> Flask:
+    app = Flask(__name__)
+
+    @app.errorhandler(Exception)
+    def handle_exception(e):
+        if isinstance(e, HTTPException):
+            return jsonify(error=e.name, message=e.description), e.code
+        log.exception("Unhandled exception")
+        return jsonify(error="InternalServerError", message=str(e)), 500
+
+    @app.get("/health")
+    def healthz():
+        return jsonify(status="ok")
+
+    @app.post("/predict/<model_name>")
+    def predict(model_name: str):
+        if model_name not in MODELS:
+            return jsonify(error="UnknownModel", message=f"valid models: {list(MODELS.keys())}"), 400
+
+        try:
+            payload = request.get_json(force=True, silent=False)
+        except Exception:
+            return jsonify(error="InvalidJSON", message="body must be valid JSON"), 400
+
+        if not isinstance(payload, dict) or "items" not in payload:
+            return jsonify(error="BadRequest", message="json must have key 'items'"), 400
+
+        inputs = payload["items"]
+        if not isinstance(inputs, list):
+            return jsonify(error="BadRequest", message="'items' must be a list"), 400
+
+        model = MODELS[model_name]
+        try:
+            preds = model.predict([inputs])
+        except Exception as e:
+            log.exception("Prediction failed")
+            return jsonify(error="PredictionError", message=str(e)), 500
+
+        return jsonify(model=model_name, predictions=preds)
+
+    return app
+
+
+app = create_app()
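For local smoke testing without Docker or gunicorn, Flask's built-in test client can exercise both routes directly. This sketch assumes the prefetched embeddings file is present (importing `server.api` loads the models at import time) and uses a placeholder item id that must exist in the catalog:

```python
from server.api import app

client = app.test_client()

# Health probe used by the Dockerfile HEALTHCHECK.
print(client.get("/health").get_json())  # {"status": "ok"}

# Prediction request matching the README schema.
resp = client.post("/predict/similarity", json={"items": ["item_id_1"]})
print(resp.status_code, resp.get_json())
```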
setup.py
ADDED
File without changes