iiewa committed
Commit 2cc98e1 · verified · 1 Parent(s): a88eaf7

Upload folder using huggingface_hub
.gitignore ADDED
@@ -0,0 +1,5 @@
+ venv/
+ __pycache__/
+ *.pyc
+ *.pyo
+ .DS_Store
Dockerfile ADDED
@@ -0,0 +1,31 @@
+ FROM python:3.11-slim
+
+ # Set environment variables
+ ENV PYTHONPATH="/app"
+ ENV PYTHONUNBUFFERED=1
+
+ # Create non-root user for Hugging Face Spaces
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR /app
+
+ # Copy requirements first for better caching
+ COPY --chown=user:user requirements.txt .
+ RUN pip install --no-cache-dir --user -r requirements.txt
+
+ # Copy application code
+ COPY --chown=user:user . .
+
+ # Expose port (7860 for HF Spaces, can be overridden)
+ EXPOSE 7860
+
+ # Health check (probe with Python's urllib; curl is not installed in python:3.11-slim)
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+     CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')" || exit 1
+
+ # Start the server
+ # Use a single worker to minimize memory usage
+ CMD ["python", "-m", "gunicorn", "-b", "0.0.0.0:7860", "--workers", "1", "--threads", "2", "--timeout", "120", "server.api:app"]
LICENSE ADDED
@@ -0,0 +1,12 @@
+ Warner Bros. Discovery Hackathon License (WBDHL)
+
+ Copyright (c) 2025 - present Warner Bros. Discovery, Inc. or its subsidiaries and affiliates.
+
+ This software and associated documentation files (the "Software") are proprietary and confidential,
+ and Warner Bros. Discovery, Inc. or its subsidiaries and affiliates ("WBD") reserves all rights to
+ the Software. Authorized participants in a hackathon hosted by WBD may use the Software solely for
+ internal research and development purposes within WBD. Any other use of the Software is strictly
+ prohibited. The Software may not be published, distributed, redistributed, sublicensed, rented, sold,
+ exported, or lent, unless WBD expressly provides separate authorization to do so. The Software may
+ not be copied, modified, or merged for any purpose other than internal research and development
+ purposes within WBD, unless WBD expressly provides separate authorization to do so.
README.md CHANGED
@@ -1,12 +1,71 @@
  ---
- title: DynamicRail
- emoji: 🌍
+ title: FF1000 Recommendation Service
+ emoji: 🎬
  colorFrom: purple
- colorTo: red
+ colorTo: blue
  sdk: docker
- pinned: false
- license: mit
- short_description: Prototype
+ app_port: 7860
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # FF1000 - ML Recommendation Service
+
+ A pretrained recommendation service for content discovery, providing similarity-based and personalized recommendations.
+
+ ## API Endpoints
+
+ ### Health Check
+ ```
+ GET /health
+ ```
+ Returns `{"status": "ok"}` when the service is running.
+
+ ### Predict Endpoint
+ ```
+ POST /predict/<model_name>
+ Content-Type: application/json
+ ```
+
+ **Available models:**
+ - `similarity` - Find similar content based on embeddings
+ - `rfy` - "Recommended for you" personalized recommendations
+ - `nfm` - "Not for me" content filtering
+
+ **Request body:**
+ ```json
+ {
+   "items": ["item_id_1", "item_id_2"]
+ }
+ ```
+
+ **Response:**
+ ```json
+ {
+   "model": "similarity",
+   "predictions": [
+     {
+       "item_ids": ["..."],
+       "titles": ["..."],
+       "scores": [0.95, 0.87, ...],
+       "posters": ["https://...", ...],
+       "premiere_years": [2023, 2022, ...]
+     }
+   ]
+ }
+ ```
+
+ ## Example Usage
+
+ ```bash
+ curl -X POST https://YOUR-SPACE.hf.space/predict/similarity \
+   -H "Content-Type: application/json" \
+   -d '{"items": ["ab553cdc-e15d-4597-b65f-bec9201fd2dd"]}'
+ ```
+
+ ## Architecture
+
+ The service loads pre-computed embeddings and serves three recommendation models:
+ - **Similarity**: Cosine distance between content embeddings
+ - **RFY**: Bayesian recommendations with variance-penalized (lower-confidence-bound) scoring
+ - **NFM**: Negative preference modeling
+
+ Built with Flask and scikit-learn.
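For reference, a Python equivalent of the README's curl example — a minimal sketch assuming the `requests` package; the Space URL and item id are placeholders:

```python
import requests

BASE_URL = "https://YOUR-SPACE.hf.space"  # placeholder Space URL

# POST a list of item ids to one of the three models (similarity, rfy, nfm).
resp = requests.post(
    f"{BASE_URL}/predict/similarity",
    json={"items": ["ab553cdc-e15d-4597-b65f-bec9201fd2dd"]},
    timeout=30,
)
resp.raise_for_status()
body = resp.json()
print(body["model"], body["predictions"][0]["titles"][:3])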
app.py ADDED
@@ -0,0 +1,26 @@
+ """
+ FF1000 Recommendation Service - Hugging Face Spaces Entry Point
+ """
+ import os
+ import sys
+ import logging
+
+ # Set up logging
+ logging.basicConfig(
+     level=os.environ.get("LOG_LEVEL", "INFO"),
+     format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+     stream=sys.stdout,
+ )
+ log = logging.getLogger("ff1000-hf")
+
+ # Import the Flask app from server.api
+ from server.api import app
+
+ # Hugging Face Spaces expects the app to be available as 'app';
+ # the app will be run by the Spaces infrastructure.
+
+ if __name__ == "__main__":
+     # For local testing
+     port = int(os.environ.get("PORT", 7860))
+     app.run(host="0.0.0.0", port=port, debug=False)
machine_learning/__init__.py ADDED
File without changes
machine_learning/datasets/__init__.py ADDED
File without changes
machine_learning/datasets/embeddings_csv.py ADDED
@@ -0,0 +1,20 @@
+ import json
+ import pandas as pd
+ import os
+
+
+ class EmbeddingsDataLoader:
+     def __init__(
+         self,
+         filepath=None,
+     ):
+         if filepath is None:
+             # Use a path relative to this file's location
+             current_dir = os.path.dirname(os.path.abspath(__file__))
+             filepath = os.path.join(current_dir, '..', 'prefetched', 'embeddings.csv.gz')
+         self.filepath = filepath
+
+     def load(self) -> pd.DataFrame:
+         df = pd.read_csv(self.filepath, compression='gzip')
+         # Each embedding is stored as a JSON-encoded list; decode it to a list of floats.
+         df.embedding = df.embedding.apply(lambda vec: [float(v) for v in json.loads(vec)])
+         return df
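For orientation, a minimal sketch of how this loader is consumed downstream (mirroring `machine_learning/load_models.py`; assumes the prefetched `embeddings.csv.gz` is present):

```python
import numpy as np

from machine_learning.datasets.embeddings_csv import EmbeddingsDataLoader

# Expected columns include item_id, title, and embedding (a list of floats per row).
catalog = EmbeddingsDataLoader().load()
embeddings = np.array(catalog.embedding.tolist())
print(embeddings.shape)  # (n_items, n_dimensions)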
machine_learning/datasets/embeddings_databricks.py ADDED
@@ -0,0 +1,50 @@
+ import pandas as pd
+ from pyspark.sql.session import SparkSession
+
+
+ EMBEDDINGS_SQL = """
+ SELECT DISTINCT
+     e.item_id,
+     s.seriesMainTitle AS title,
+     e.embedding
+ FROM
+     {item_embeddings} e
+ INNER JOIN
+     {s2s_content_entities} s
+     ON e.item_id = s.unpackedValue.series.id.id
+ INNER JOIN
+     {series_offering_dim} off
+     ON
+         s.unpackedValue.series.id.id = off.series_id
+         AND SIZE(ARRAY_INTERSECT(off.country_codes, ARRAY('US'))) > 0
+ WHERE
+     date = (SELECT MAX(date) FROM bolt_recs_prod.gold.item_embeddings)
+     AND n_dimensions = 1536
+ """
+
+
+ class EmbeddingsDataLoader:
+     def __init__(
+         self,
+         env: str = "prod",
+         spark_session: SparkSession = None,
+         item_embeddings: str = "bolt_recs_prod.gold.item_embeddings",
+         s2s_content_entities: str = "bolt_cep_prod.gold.s2s_content_entities",
+         series_offering_dim: str = "bolt_dai_ckg_prod.gold.series_offering_dim",
+     ):
+         if not spark_session:
+             spark_session = SparkSession.builder.appName(
+                 "embeddings"
+             ).getOrCreate()
+
+         self._env = env
+         self._spark_session = spark_session
+         self._table_names = {
+             "item_embeddings": item_embeddings,
+             "s2s_content_entities": s2s_content_entities,
+             "series_offering_dim": series_offering_dim,
+         }
+
+     def load(self) -> pd.DataFrame:
+         query = EMBEDDINGS_SQL.format(**self._table_names)
+         return self._spark_session.sql(query).toPandas()
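For completeness, a sketch of the Databricks-backed loader in use — this assumes a Spark environment where the `bolt_*` tables above are reachable; outside Databricks it will not run:

```python
from machine_learning.datasets.embeddings_databricks import EmbeddingsDataLoader

# The loader creates or reuses a SparkSession if none is passed in.
df = EmbeddingsDataLoader().load()
print(df.columns.tolist())  # ['item_id', 'title', 'embedding'], per the SQL above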
machine_learning/load_models.py ADDED
@@ -0,0 +1,35 @@
+ import numpy as np
+
+ from sklearn.pipeline import Pipeline
+
+ from machine_learning.datasets.embeddings_csv import EmbeddingsDataLoader
+ from machine_learning.models.rfy import BayesianRecommender
+ from machine_learning.models.similarity import SimilarityRecommender
+ from machine_learning.transformers.inverter import Inverter
+ from machine_learning.transformers.item_encoder import ItemIdOneHotEncoder
+ from machine_learning.transformers.scores_to_dict import ScoresToDict
+
+
+ # Load the catalog once at import time; all three pipelines share it.
+ catalog = EmbeddingsDataLoader().load()
+ embeddings = np.array(catalog.embedding.tolist())
+ posters = catalog.poster if 'poster' in catalog.columns else None
+ premiere_years = catalog.premiere_year if 'premiere_year' in catalog.columns else None
+
+ recommended_for_you = Pipeline([
+     ('encoder', ItemIdOneHotEncoder(catalog.item_id)),
+     ('ranker', BayesianRecommender(embeddings)),
+     ('scores_to_dict', ScoresToDict(catalog.item_id, catalog.title, posters, premiere_years)),
+ ]).fit([])
+
+ # "Not for me": invert the preference signal before ranking.
+ not_for_me = Pipeline([
+     ('encoder', ItemIdOneHotEncoder(catalog.item_id)),
+     ('inverter', Inverter()),
+     ('ranker', BayesianRecommender(embeddings)),
+     ('scores_to_dict', ScoresToDict(catalog.item_id, catalog.title, posters, premiere_years)),
+ ]).fit([])
+
+ similarity = Pipeline([
+     ('encoder', ItemIdOneHotEncoder(catalog.item_id)),
+     ('ranker', SimilarityRecommender(embeddings)),
+     ('scores_to_dict', ScoresToDict(catalog.item_id, catalog.title, posters, premiere_years)),
+ ]).fit([])
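For illustration, a minimal sketch of exercising one of these pipelines directly (the item id is the placeholder from the README; `Pipeline.predict` runs the encoder and ranker, then ends at `ScoresToDict.predict`, which returns the top 10 by default):

```python
from machine_learning.load_models import similarity

# A batch with one "user", described by a list of item ids.
preds = similarity.predict([["ab553cdc-e15d-4597-b65f-bec9201fd2dd"]])
top = preds[0]
print(list(zip(top["titles"][:3], top["scores"][:3])))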
machine_learning/models/__init__.py ADDED
File without changes
machine_learning/models/rfy.py ADDED
@@ -0,0 +1,57 @@
+ import numpy as np
+ from sklearn.base import BaseEstimator
+
+
+ class BayesianRecommender(BaseEstimator):
+     def __init__(self,
+                  item_embeddings: np.ndarray,
+                  lambda_reg: float = 1.0,
+                  sigma2: float = 1.0,
+                  z: float = -1.1645,  # -1.645 ≈ 10% LCB
+                  mask_value: float = -np.inf):
+
+         self.item_embeddings = np.asarray(item_embeddings, dtype=np.float64)
+         self.N_, self.d_ = self.item_embeddings.shape
+
+         self.lambda_reg = float(lambda_reg)
+         self.sigma2 = float(sigma2)
+         self.z = float(z)
+         self.mask_value = mask_value
+
+         self.X_items = self.item_embeddings
+         self.XT_items = self.item_embeddings.T
+
+     def fit(self, X=None, y=None):
+         return self
+
+     def _user_posterior_and_scores(self, y_vec: np.ndarray):
+         # Observed interactions: nonzero entries of the user's item vector.
+         seen_mask = y_vec != 0
+         X_obs = self.X_items[seen_mask]
+         y_obs = y_vec[seen_mask].astype(np.float64)
+         # Bayesian (ridge) linear-regression posterior over user preferences.
+         A = self.lambda_reg * np.eye(self.d_, dtype=np.float64) + (X_obs.T @ X_obs) / self.sigma2
+         invA = np.linalg.inv(A)
+         mu = invA @ (X_obs.T @ y_obs) / self.sigma2
+
+         # Posterior mean and standard deviation of each item's predicted score.
+         m = self.X_items @ mu
+         XinvA = self.X_items @ invA
+         s2 = np.einsum('ij,ij->i', XinvA, self.X_items)
+         s = np.sqrt(np.clip(s2, 0.0, None))
+
+         # Lower confidence bound (z < 0); already-seen items are masked out.
+         scores = m + self.z * s
+         scores[seen_mask] = self.mask_value
+
+         return scores
+
+     def transform(self, X):
+         X = np.asarray(X, dtype=np.float64)
+         if X.ndim == 1:
+             X = X[None, :]
+
+         B, N = X.shape
+         if N != self.N_:
+             raise ValueError(f"Input width {N} != number of items {self.N_}.")
+
+         out = np.empty((B, N), dtype=np.float64)
+         for b in range(B):
+             out[b] = self._user_posterior_and_scores(X[b])
+         return out
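Read off `_user_posterior_and_scores`, the scoring rule is a Bayesian linear-regression lower confidence bound:

$$
A = \lambda_{\text{reg}} I + \frac{1}{\sigma^2} X_{\text{obs}}^{\top} X_{\text{obs}}, \qquad
\mu = \frac{1}{\sigma^2} A^{-1} X_{\text{obs}}^{\top} y_{\text{obs}}, \qquad
\text{score}_i = x_i^{\top}\mu + z\,\sqrt{x_i^{\top} A^{-1} x_i},
$$

where $X_{\text{obs}}$ stacks the embeddings of items the user has interacted with, $y_{\text{obs}}$ holds the interaction signs, and $z < 0$ penalizes posterior uncertainty, so unseen items are ranked by a pessimistic estimate of affinity.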
machine_learning/models/similarity.py ADDED
@@ -0,0 +1,24 @@
+ import numpy as np
+ from sklearn.base import BaseEstimator
+
+
+ class SimilarityRecommender(BaseEstimator):
+     def __init__(self,
+                  item_embeddings: np.ndarray,
+                  mask_value: float = -np.inf):
+
+         # L2-normalize so the dot products below are cosine similarities.
+         E = np.asarray(item_embeddings, dtype=np.float64)
+         self.item_embeddings = E / np.linalg.norm(E, axis=1, keepdims=True)
+         self.N_, self.d_ = self.item_embeddings.shape
+         self.mask_value = mask_value
+
+     def fit(self, X=None, y=None):
+         return self
+
+     def transform(self, X):
+         X = np.asarray(X, dtype=np.float64)
+         # Combine the embeddings of the user's items into one vector (sum, then normalized).
+         U = X @ self.item_embeddings
+         U /= np.linalg.norm(U, axis=1, keepdims=True)
+         # Cosine similarity against the whole catalog, masking the input items.
+         scores = U @ self.item_embeddings.T
+         scores[X != 0] = self.mask_value
+         return scores
machine_learning/prefetched/embeddings.csv.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40897c6ec6916d39983608eb81ad85700db92ad6aec8bb7167cc79d100337f6d
+ size 42019593
machine_learning/transformers/__init__.py ADDED
File without changes
machine_learning/transformers/inverter.py ADDED
@@ -0,0 +1,9 @@
+ from sklearn.base import BaseEstimator, TransformerMixin
+
+
+ class Inverter(BaseEstimator, TransformerMixin):
+     """Negate scores, turning positive signals negative (used by the 'not for me' pipeline)."""
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, scores_matrix):
+         return -scores_matrix
machine_learning/transformers/item_encoder.py ADDED
@@ -0,0 +1,21 @@
+ import numpy as np
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from sklearn.preprocessing import MultiLabelBinarizer
+
+
+ class ItemIdOneHotEncoder(BaseEstimator, TransformerMixin):
+     def __init__(self, all_item_ids):
+         self.all_item_ids = list(all_item_ids)
+         self._mlb = MultiLabelBinarizer(classes=self.all_item_ids)
+
+     def fit(self, X, y=None):
+         # Fit on an empty example; the class vocabulary is fixed in __init__.
+         self._mlb.fit([[]])
+         return self
+
+     def transform(self, X):
+         # X is a batch of item-id lists; output is a (batch, n_items) multi-hot matrix.
+         M = self._mlb.transform(X).astype(np.float64)
+         return M
+
+     @property
+     def vocab_(self):
+         return self.all_item_ids
machine_learning/transformers/scores_to_dict.py ADDED
@@ -0,0 +1,40 @@
+ import numpy as np
+ from sklearn.base import BaseEstimator, TransformerMixin
+
+
+ class ScoresToDict(BaseEstimator, TransformerMixin):
+     def __init__(self, item_ids, titles, posters=None, premiere_years=None):
+         self.item_ids = list(item_ids)
+         self.titles = list(titles)
+         self.posters = list(posters) if posters is not None else [None] * len(self.item_ids)
+         self.premiere_years = list(premiere_years) if premiere_years is not None else [None] * len(self.item_ids)
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, scores_matrix):
+         scores_matrix = np.asarray(scores_matrix, dtype=np.float64)
+         B, N = scores_matrix.shape
+         out = []
+         for b in range(B):
+             out.append({
+                 "item_ids": self.item_ids,
+                 "scores": scores_matrix[b].tolist(),
+             })
+         return out
+
+     def predict(self, scores_matrix, limit=10):
+         scores_matrix = np.asarray(scores_matrix, dtype=np.float64)
+         B, N = scores_matrix.shape
+         out = []
+         for b in range(B):
+             scores = scores_matrix[b]
+             idx = np.argsort(-scores)[:limit]  # descending top-k
+             out.append({
+                 "item_ids": [self.item_ids[i] for i in idx],
+                 "titles": [self.titles[i] for i in idx],
+                 "posters": [self.posters[i] for i in idx],
+                 "premiere_years": [self.premiere_years[i] for i in idx],
+                 "scores": scores[idx].tolist(),
+             })
+         return out
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ joblib==1.5.2
+ numpy==2.0.2
+ pandas==2.3.3
+ scikit-learn==1.6.1
+ Flask==3.0.3
+ gunicorn==21.2.0
server/__init__.py ADDED
File without changes
server/api.py ADDED
@@ -0,0 +1,73 @@
+ import os
+ import sys
+ import logging
+
+ from typing import Dict, Any
+ from flask import Flask, request, jsonify
+ from werkzeug.exceptions import HTTPException
+ from machine_learning.load_models import (
+     not_for_me,
+     recommended_for_you,
+     similarity,
+ )
+
+
+ logging.basicConfig(
+     level=os.environ.get("LOG_LEVEL", "INFO"),
+     format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+     stream=sys.stdout,
+ )
+ log = logging.getLogger("ff1000-api")
+
+
+ MODELS: Dict[str, Any] = {
+     "nfm": not_for_me,
+     "rfy": recommended_for_you,
+     "similarity": similarity,
+ }
+
+
+ def create_app() -> Flask:
+     app = Flask(__name__)
+
+     @app.errorhandler(Exception)
+     def handle_exception(e):
+         if isinstance(e, HTTPException):
+             return jsonify(error=e.name, message=e.description), e.code
+         log.exception("Unhandled exception")
+         return jsonify(error="InternalServerError", message=str(e)), 500
+
+     @app.get("/health")
+     def healthz():
+         return jsonify(status="ok")
+
+     @app.post("/predict/<model_name>")
+     def predict(model_name: str):
+         if model_name not in MODELS:
+             return jsonify(error="UnknownModel", message=f"valid models: {list(MODELS.keys())}"), 400
+
+         try:
+             payload = request.get_json(force=True, silent=False)
+         except Exception:
+             return jsonify(error="InvalidJSON", message="body must be valid JSON"), 400
+
+         if not isinstance(payload, dict) or "items" not in payload:
+             return jsonify(error="BadRequest", message="json must have key 'items'"), 400
+
+         inputs = payload["items"]
+         if not isinstance(inputs, list):
+             return jsonify(error="BadRequest", message="'items' must be a list"), 400
+
+         model = MODELS[model_name]
+         try:
+             # Wrap in a list: the pipelines expect a batch of item-id lists.
+             preds = model.predict([inputs])
+         except Exception as e:
+             log.exception("Prediction failed")
+             return jsonify(error="PredictionError", message=str(e)), 500
+
+         return jsonify(model=model_name, predictions=preds)
+
+     return app
+
+
+ app = create_app()
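A quick smoke test via Flask's built-in test client — a sketch assuming the prefetched embeddings are present, since importing `server.api` loads the models at import time:

```python
from server.api import app

client = app.test_client()

# Health endpoint returns {"status": "ok"}.
assert client.get("/health").get_json() == {"status": "ok"}

# Placeholder item id, as in the README example.
resp = client.post(
    "/predict/similarity",
    json={"items": ["ab553cdc-e15d-4597-b65f-bec9201fd2dd"]},
)
print(resp.status_code, resp.get_json())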
setup.py ADDED
File without changes