Spaces:
Sleeping
Sleeping
Commit ·
a2f5871
1
Parent(s): c18b91b
Add application file
Browse files- .dockerignore +15 -0
- DEPLOY.md +92 -0
- Dockerfile +31 -0
- README.md +80 -8
- app.py +364 -0
- ml/__init__.py +0 -0
- ml/features.py +226 -0
- ml/features_v2.py +275 -0
- models/gaussian_nb__parity.v2.joblib +3 -0
- models/mlp__number.joblib +3 -0
- models/svc__color.v2.joblib +3 -0
- models/svc__column.v2.joblib +3 -0
- models/xgboost__dozen.joblib +3 -0
- requirements.txt +9 -0
.dockerignore
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.pyd
|
| 5 |
+
.Python
|
| 6 |
+
.pytest_cache
|
| 7 |
+
.ruff_cache
|
| 8 |
+
.mypy_cache
|
| 9 |
+
.venv
|
| 10 |
+
venv/
|
| 11 |
+
env/
|
| 12 |
+
.git
|
| 13 |
+
.gitignore
|
| 14 |
+
.DS_Store
|
| 15 |
+
*.egg-info
|
DEPLOY.md
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Deploying to HuggingFace Spaces
|
| 2 |
+
|
| 3 |
+
## Option A — web UI (easiest)
|
| 4 |
+
|
| 5 |
+
1. Create a new Space at https://huggingface.co/new-space
|
| 6 |
+
2. Owner: your username (or org). Name: e.g. `roulette-predictor`.
|
| 7 |
+
3. SDK: **Docker**. License: MIT. Hardware: CPU basic (2 vCPU / 16 GB RAM is enough).
|
| 8 |
+
4. Visibility: Public or Private.
|
| 9 |
+
5. Click **Create Space**, then on the Space page choose **Files → Upload files**
|
| 10 |
+
and upload **everything inside `deployment/`**:
|
| 11 |
+
|
| 12 |
+
```
|
| 13 |
+
app.py
|
| 14 |
+
requirements.txt
|
| 15 |
+
Dockerfile
|
| 16 |
+
README.md
|
| 17 |
+
.dockerignore
|
| 18 |
+
ml/
|
| 19 |
+
models/
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
Keep directory structure (drag the `ml/` and `models/` folders as-is).
|
| 23 |
+
|
| 24 |
+
6. HuggingFace will build the container automatically. First build takes
|
| 25 |
+
3–5 minutes. When it finishes, the Space serves at:
|
| 26 |
+
```
|
| 27 |
+
https://<username>-roulette-predictor.hf.space
|
| 28 |
+
```
|
| 29 |
+
Interactive docs live at `/docs`.
|
| 30 |
+
|
| 31 |
+
## Option B — git push (repeatable)
|
| 32 |
+
|
| 33 |
+
```bash
|
| 34 |
+
# 1. Create the Space (Docker SDK) on the HF website first.
|
| 35 |
+
# 2. Clone it locally:
|
| 36 |
+
git clone https://huggingface.co/spaces/<username>/roulette-predictor
|
| 37 |
+
cd roulette-predictor
|
| 38 |
+
|
| 39 |
+
# 3. Copy all deployment/ files into this directory:
|
| 40 |
+
cp -r /path/to/tej/deployment/. .   # trailing "/." also copies dotfiles such as .dockerignore
|
| 41 |
+
|
| 42 |
+
# 4. Commit and push:
|
| 43 |
+
git add .
|
| 44 |
+
git commit -m "initial deploy"
|
| 45 |
+
git push # may need: git lfs install && git lfs track "models/*"
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
> **LFS note:** `svc__column.v2.joblib` is ~4 MB and `xgboost__dozen.joblib` is
|
| 49 |
+
> ~3 MB — both fit under the 10 MB normal-git-push limit on HF. If you ever
|
| 50 |
+
> add a model over 10 MB, run `git lfs install && git lfs track "models/*.joblib"`
|
| 51 |
+
> first.
|
| 52 |
+
|
| 53 |
+
## Sanity checks after deploy
|
| 54 |
+
|
| 55 |
+
```bash
|
| 56 |
+
export SPACE=https://<username>-roulette-predictor.hf.space
|
| 57 |
+
|
| 58 |
+
# Health
|
| 59 |
+
curl -s "$SPACE/" | jq
|
| 60 |
+
|
| 61 |
+
# Predict via JSON
|
| 62 |
+
curl -s -X POST "$SPACE/predict" \
|
| 63 |
+
-H 'Content-Type: application/json' \
|
| 64 |
+
-d '{"numbers":[28,35,36,31,12,17,12,34,6,10,15,14,19,19,22,2,9,11,33,16],"steps":10}' \
|
| 65 |
+
| jq
|
| 66 |
+
|
| 67 |
+
# Predict via CSV upload (test.csv with a "Winner" or "number" column)
|
| 68 |
+
curl -s -X POST "$SPACE/predict/file?steps=10" -F "file=@test.csv" | jq
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
## Local test before deploying
|
| 72 |
+
|
| 73 |
+
```bash
|
| 74 |
+
cd deployment
|
| 75 |
+
docker build -t roulette-predictor .
|
| 76 |
+
docker run --rm -p 7860:7860 roulette-predictor
|
| 77 |
+
# then open http://localhost:7860/docs
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
## What's inside
|
| 81 |
+
|
| 82 |
+
| File | Purpose |
|
| 83 |
+
|------|---------|
|
| 84 |
+
| `app.py` | FastAPI service with `/predict`, `/predict/file`, `/models`, `/` |
|
| 85 |
+
| `Dockerfile` | Python 3.11-slim, non-root user UID 1000, port 7860 (HF convention) |
|
| 86 |
+
| `requirements.txt` | fastapi, uvicorn, pandas, numpy, scikit-learn, xgboost, joblib |
|
| 87 |
+
| `README.md` | HF Space metadata frontmatter + user docs |
|
| 88 |
+
| `ml/features.py` | v1 hand-crafted features (window=10, 25 dims) |
|
| 89 |
+
| `ml/features_v2.py` | v2 features (window=20, 51 dims, run-length, autocorrelation, wheel-neighbor) |
|
| 90 |
+
| `models/*.joblib` | Five winning model artefacts (number, color, parity, dozen, column) |
|
| 91 |
+
|
| 92 |
+
Total image size is ~1.3 GB (mostly sklearn + xgboost + numpy wheels).
|
Dockerfile
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# syntax=docker/dockerfile:1.6
|
| 2 |
+
FROM python:3.11-slim
|
| 3 |
+
|
| 4 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 5 |
+
PYTHONUNBUFFERED=1 \
|
| 6 |
+
PIP_NO_CACHE_DIR=1 \
|
| 7 |
+
HF_HOME=/tmp/hf \
|
| 8 |
+
XDG_CACHE_HOME=/tmp/cache
|
| 9 |
+
|
| 10 |
+
# System deps: libgomp needed by XGBoost/LightGBM runtime.
|
| 11 |
+
RUN apt-get update \
|
| 12 |
+
&& apt-get install -y --no-install-recommends libgomp1 \
|
| 13 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
+
|
| 15 |
+
# HuggingFace Spaces expects a non-root user with UID 1000 and writable /home.
|
| 16 |
+
RUN useradd -m -u 1000 app
|
| 17 |
+
WORKDIR /home/app
|
| 18 |
+
|
| 19 |
+
COPY --chown=app:app requirements.txt .
|
| 20 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 21 |
+
|
| 22 |
+
COPY --chown=app:app app.py ./
|
| 23 |
+
COPY --chown=app:app ml ./ml
|
| 24 |
+
COPY --chown=app:app models ./models
|
| 25 |
+
COPY --chown=app:app README.md ./README.md
|
| 26 |
+
|
| 27 |
+
USER app
|
| 28 |
+
|
| 29 |
+
EXPOSE 7860
|
| 30 |
+
|
| 31 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,12 +1,84 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Roulette Next-Spin Predictor
|
| 3 |
+
emoji: 🎰
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
|
|
|
| 8 |
pinned: false
|
| 9 |
+
license: mit
|
| 10 |
+
short_description: Predict next roulette spins from a history of past numbers
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# Roulette Next-Spin Predictor
|
| 14 |
+
|
| 15 |
+
FastAPI service that predicts the next N roulette spins from a history of past
|
| 16 |
+
winning numbers (European single-zero wheel, 0–36). The best-in-class model per
|
| 17 |
+
target was picked from a sweep across **30+ algorithms**: classical ML
|
| 18 |
+
(LogReg, RandomForest, XGBoost, LightGBM, CatBoost, SVC, ExtraTrees, KNN,
|
| 19 |
+
GaussianNB, MultinomialNB, BernoulliNB, AdaBoost, DecisionTree, Ridge, SGD,
|
| 20 |
+
MLP, HistGradientBoosting), deep learning (LSTM, GRU, Transformer, vanilla
|
| 21 |
+
RNN, 1D-CNN, TabNet), and ensembling (stacking with LogReg meta-learner).
|
| 22 |
+
|
| 23 |
+
## Endpoints
|
| 24 |
+
|
| 25 |
+
| Method | Path | Purpose |
|
| 26 |
+
|-------|------|---------|
|
| 27 |
+
| `GET` | `/` | Health check and route summary |
|
| 28 |
+
| `GET` | `/models` | Active model per target + rolling test accuracy |
|
| 29 |
+
| `POST` | `/predict` | Predict from JSON `{numbers: [...], steps: N}` |
|
| 30 |
+
| `POST` | `/predict/file` | Predict from uploaded CSV (column `Winner`/`number`) |
|
| 31 |
+
| `GET` | `/docs` | Interactive Swagger UI |
|
| 32 |
+
|
| 33 |
+
## Example — curl
|
| 34 |
+
|
| 35 |
+
```bash
|
| 36 |
+
curl -X POST https://<your-space>.hf.space/predict \
|
| 37 |
+
-H 'Content-Type: application/json' \
|
| 38 |
+
-d '{"numbers":[28,35,36,31,12,17,12,34,6,10,15,14,19,19,22,2,9,11,33,16],"steps":10}'
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
## Example — file upload
|
| 42 |
+
|
| 43 |
+
```bash
|
| 44 |
+
curl -X POST "https://<your-space>.hf.space/predict/file?steps=10" \
|
| 45 |
+
-F "file=@test.csv"
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
The uploaded CSV should have a column named `Winner`, `winning number`,
|
| 49 |
+
or `number` (matched case-insensitively) containing integers in `[0, 36]`. If none match, the last column
|
| 50 |
+
is used.
|
| 51 |
+
|
| 52 |
+
## Model selection (after the full sweep)
|
| 53 |
+
|
| 54 |
+
| Target | Winning algorithm | Rolling test accuracy |
|
| 55 |
+
|--------|-------------------|----------------------:|
|
| 56 |
+
| number (0–36) | MLPClassifier | **4.16%** |
|
| 57 |
+
| color (red/black/green) | SVC (RBF) | **52.63%** |
|
| 58 |
+
| parity (odd/even/none) | GaussianNB | **51.88%** |
|
| 59 |
+
| dozen (1st/2nd/3rd) | XGBoost | **38.14%** |
|
| 60 |
+
| column (1st/2nd/3rd) | SVC (RBF) | **38.85%** |
|
| 61 |
+
|
| 62 |
+
For parity the best absolute was a stacking ensemble (52.13%); GaussianNB is
|
| 63 |
+
used in deployment because it is a single cheap model with almost identical
|
| 64 |
+
accuracy.
|
| 65 |
+
|
| 66 |
+
## Honest disclaimer
|
| 67 |
+
|
| 68 |
+
On a fair roulette wheel, consecutive spins are **statistically independent**,
|
| 69 |
+
so past outcomes contain no information about the next spin beyond the wheel's
|
| 70 |
+
structural class imbalance (18 red / 18 black / 1 green). The `number`
|
| 71 |
+
prediction sits just above the 2.70% uniform-random baseline; higher
|
| 72 |
+
per-target numbers come almost entirely from that imbalance, not from learned
|
| 73 |
+
temporal patterns.
|
| 74 |
+
|
| 75 |
+
**Do not gamble money based on these outputs.** The service is built for
|
| 76 |
+
educational and demonstration purposes.
|
| 77 |
+
|
| 78 |
+
## Local run
|
| 79 |
+
|
| 80 |
+
```bash
|
| 81 |
+
docker build -t roulette-predictor .
|
| 82 |
+
docker run -p 7860:7860 roulette-predictor
|
| 83 |
+
# then open http://localhost:7860/docs in a browser
|
| 84 |
+
```
|
app.py
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Roulette next-spin prediction API.
|
| 2 |
+
|
| 3 |
+
FastAPI server exposing the best per-target models selected after an exhaustive
|
| 4 |
+
sweep across 30+ algorithms. Designed to run inside a HuggingFace Space with
|
| 5 |
+
Docker SDK on port 7860.
|
| 6 |
+
|
| 7 |
+
Endpoints
|
| 8 |
+
---------
|
| 9 |
+
GET / Health + metadata
|
| 10 |
+
GET /models Active model selection and their rolling-test accuracies
|
| 11 |
+
POST /predict Predict next N spins from a JSON list of past numbers
|
| 12 |
+
POST /predict/file Predict next N spins from an uploaded CSV (as test.csv)
|
| 13 |
+
|
| 14 |
+
The recommended minimum context is 20 past spins (matching WINDOW_V2). The
|
| 15 |
+
service automatically pads with zeros if the caller supplies fewer.
|
| 16 |
+
"""
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import io
|
| 20 |
+
import logging
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
from typing import Any
|
| 23 |
+
|
| 24 |
+
import joblib
|
| 25 |
+
import numpy as np
|
| 26 |
+
import pandas as pd
|
| 27 |
+
from fastapi import FastAPI, File, HTTPException, UploadFile
|
| 28 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 29 |
+
from pydantic import BaseModel, Field, field_validator
|
| 30 |
+
|
| 31 |
+
from ml.features import (
|
| 32 |
+
WINDOW,
|
| 33 |
+
_features_from_window,
|
| 34 |
+
derive_color,
|
| 35 |
+
derive_column,
|
| 36 |
+
derive_dozen,
|
| 37 |
+
derive_parity,
|
| 38 |
+
)
|
| 39 |
+
from ml.features_v2 import WINDOW_V2, _features_v2
|
| 40 |
+
|
| 41 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
| 42 |
+
LOG = logging.getLogger("app")
|
| 43 |
+
|
| 44 |
+
APP_ROOT = Path(__file__).resolve().parent
|
| 45 |
+
MODELS_DIR = APP_ROOT / "models"
|
| 46 |
+
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
# Metadata recorded after the full v1+v2+v3 sweep on the 419-row test.csv
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
BEST_MODELS: dict[str, dict[str, Any]] = {
|
| 51 |
+
"number": {
|
| 52 |
+
"algo": "MLPClassifier",
|
| 53 |
+
"test_accuracy": 0.0416,
|
| 54 |
+
"feature_version": "v1",
|
| 55 |
+
"window": WINDOW,
|
| 56 |
+
"file": "mlp__number.joblib",
|
| 57 |
+
},
|
| 58 |
+
"color": {
|
| 59 |
+
"algo": "SVC (RBF)",
|
| 60 |
+
"test_accuracy": 0.5263,
|
| 61 |
+
"feature_version": "v2",
|
| 62 |
+
"window": WINDOW_V2,
|
| 63 |
+
"file": "svc__color.v2.joblib",
|
| 64 |
+
},
|
| 65 |
+
"parity": {
|
| 66 |
+
"algo": "GaussianNB",
|
| 67 |
+
"test_accuracy": 0.5188,
|
| 68 |
+
"feature_version": "v2",
|
| 69 |
+
"window": WINDOW_V2,
|
| 70 |
+
"file": "gaussian_nb__parity.v2.joblib",
|
| 71 |
+
"notes": "Best stacking was +0.25pp higher but needs 5 base models; GaussianNB chosen for deployment simplicity.",
|
| 72 |
+
},
|
| 73 |
+
"dozen": {
|
| 74 |
+
"algo": "XGBoost",
|
| 75 |
+
"test_accuracy": 0.3814,
|
| 76 |
+
"feature_version": "v1",
|
| 77 |
+
"window": WINDOW,
|
| 78 |
+
"file": "xgboost__dozen.joblib",
|
| 79 |
+
},
|
| 80 |
+
"column": {
|
| 81 |
+
"algo": "SVC (RBF)",
|
| 82 |
+
"test_accuracy": 0.3885,
|
| 83 |
+
"feature_version": "v2",
|
| 84 |
+
"window": WINDOW_V2,
|
| 85 |
+
"file": "svc__column.v2.joblib",
|
| 86 |
+
},
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
# Loaded at startup
|
| 90 |
+
MODELS: dict[str, dict[str, Any]] = {}
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def load_models() -> None:
    """Fill the module-level ``MODELS`` registry from ``BEST_MODELS``.

    For each target the artefact named by ``spec["file"]`` is looked up under
    ``MODELS_DIR``. Files that exist are deserialised with ``joblib.load`` and
    stored under the target name; files that do not exist are logged as a
    warning and skipped, so ``MODELS`` may end up holding only a subset of
    the targets.
    """
    for target, spec in BEST_MODELS.items():
        path = MODELS_DIR / spec["file"]
        if path.exists():
            # NOTE(review): joblib artefacts are pickle-based; only load files
            # shipped with the image, never user-supplied ones.
            MODELS[target] = joblib.load(path)
            LOG.info("Loaded %s -> %s", target, path.name)
        else:
            LOG.warning("Model file missing for %s: %s", target, path)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
app = FastAPI(
|
| 104 |
+
title="Roulette Next-Spin Predictor",
|
| 105 |
+
description=(
|
| 106 |
+
"Predict the next spins of a European single-zero roulette wheel from a "
|
| 107 |
+
"history of past winning numbers. Best-in-class models per target selected "
|
| 108 |
+
"from a 30+ algorithm sweep."
|
| 109 |
+
),
|
| 110 |
+
version="1.0.0",
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
app.add_middleware(
|
| 114 |
+
CORSMiddleware,
|
| 115 |
+
allow_origins=["*"],
|
| 116 |
+
allow_methods=["*"],
|
| 117 |
+
allow_headers=["*"],
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
@app.on_event("startup")
def _startup() -> None:
    """Load the model artefacts once when the application starts."""
    load_models()
    loaded_targets = list(MODELS)
    LOG.info("Startup complete. Models: %s", loaded_targets)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# ---------------------------------------------------------------------------
|
| 128 |
+
# Schemas
|
| 129 |
+
# ---------------------------------------------------------------------------
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
class PredictRequest(BaseModel):
    """JSON body accepted by ``POST /predict``."""

    # History of winning numbers; the validator below requires a non-empty
    # list whose values all lie in [0, 36].
    numbers: list[int] = Field(
        ...,
        description="Sequence of past winning numbers (0-36). Most recent spin goes last.",
        examples=[[28, 35, 36, 31, 12, 17, 12, 34, 6, 10, 15, 14, 19, 19, 22, 2, 9, 11, 33, 16]],
    )
    # Forecast horizon, constrained to 1..50 by the field definition.
    steps: int = Field(10, ge=1, le=50, description="How many future spins to forecast.")

    @field_validator("numbers")
    @classmethod
    def _check_numbers(cls, v: list[int]) -> list[int]:
        """Reject an empty history or any value outside ``[0, 36]``."""
        if len(v) == 0:
            raise ValueError("numbers cannot be empty")
        if not all(0 <= n <= 36 for n in v):
            raise ValueError("all numbers must be in [0, 36]")
        return v
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
class Prediction(BaseModel):
    """Forecast for a single future spin."""

    # Position of this spin in the forecast horizon.
    step: int

    # Output of the dedicated "number" model: the most probable number, the
    # three most probable numbers (best first), and the probability assigned
    # to the most probable one.
    predicted_number: int
    top3_numbers: list[int]
    number_confidence: float

    # Labels predicted independently by the per-target models.
    predicted_color: str
    predicted_parity: str
    predicted_dozen: str
    predicted_column: str

    # Labels computed deterministically from ``predicted_number``; they can
    # disagree with the independently predicted labels above.
    derived_from_number_color: str
    derived_from_number_parity: str
    derived_from_number_dozen: str
    derived_from_number_column: str
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
class PredictResponse(BaseModel):
    """Response body shared by ``/predict`` and ``/predict/file``."""

    # Empty protected-namespace tuple; presumably set so that the
    # ``model_info`` field name does not clash with pydantic's reserved
    # ``model_`` prefix.
    model_config = {"protected_namespaces": ()}

    # Per-target summary of the algorithm in use and its recorded accuracy.
    model_info: dict[str, Any]
    # One entry per forecast step, in step order.
    predictions: list[Prediction]
    # Human-readable remarks about how the input was interpreted.
    notes: list[str]
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
# ---------------------------------------------------------------------------
|
| 174 |
+
# Helpers
|
| 175 |
+
# ---------------------------------------------------------------------------
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def _prepare_windows(sequence: list[int]) -> tuple[np.ndarray, np.ndarray]:
    """Slice the history into the v1 and v2 model input windows.

    The history is converted to an ``int64`` array and, when it holds fewer
    than ``WINDOW_V2`` values, left-padded with zeros up to that length.

    Returns:
        ``(window_v1, window_v2)``: the trailing ``WINDOW`` and the trailing
        ``WINDOW_V2`` values of the (possibly padded) history.
    """
    history = np.asarray(sequence, dtype=np.int64)
    shortfall = WINDOW_V2 - len(history)
    if shortfall > 0:
        # NOTE: 0 is also a real outcome on the wheel, so padded positions
        # are indistinguishable from genuine zero spins.
        leading = np.zeros(shortfall, dtype=np.int64)
        history = np.concatenate([leading, history])
    return history[-WINDOW:], history[-WINDOW_V2:]
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def _softmax(x: np.ndarray) -> np.ndarray:
|
| 193 |
+
e = np.exp(x - x.max())
|
| 194 |
+
return e / e.sum()
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def _predict_number(w_v1: np.ndarray) -> tuple[int, list[int], float]:
    """Predict the next winning number from a v1 window.

    Args:
        w_v1: The trailing ``WINDOW`` numbers of the history.

    Returns:
        ``(number, top3, confidence)``: the most probable number, the three
        most probable numbers (best first), and the probability assigned to
        the most probable one.

    Raises:
        KeyError: If the "number" bundle is not present in ``MODELS``.
    """
    bundle = MODELS["number"]
    model, scaler = bundle["model"], bundle.get("scaler")
    feats = _features_from_window(w_v1).reshape(1, -1)
    if scaler is not None:
        feats = scaler.transform(feats)
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(feats)[0]
    else:
        # No probability output: turn decision scores into a distribution.
        logits = np.atleast_1d(model.decision_function(feats)[0])
        proba = _softmax(logits)

    # Probability columns follow the estimator's ``classes_`` order, not the
    # roulette number itself. If some numbers never appeared in training the
    # columns are not 0..36, so scatter each column to the number it stands
    # for instead of assuming column index == number.
    aligned = None
    classes = getattr(model, "classes_", None)
    if classes is not None and len(classes) == len(proba):
        labels = np.asarray(classes)
        if (
            labels.ndim == 1
            and labels.size > 0
            and np.issubdtype(labels.dtype, np.integer)
            and labels.min() >= 0
            and labels.max() <= 36
        ):
            aligned = np.zeros(37)
            aligned[labels] = proba

    if aligned is not None:
        proba = aligned
    elif len(proba) < 37:
        # Fallback when class labels are unavailable or unusable: keep the
        # historical behaviour of zero-padding the tail.
        padded = np.zeros(37)
        padded[: len(proba)] = proba
        proba = padded

    n = int(np.argmax(proba))
    top3 = [int(i) for i in np.argsort(proba)[-3:][::-1]]
    return n, top3, float(proba[n])
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def _predict_target_v2(target: str, w_v2: np.ndarray) -> int:
    """Return the class index predicted for *target* from v2 features.

    The bundle stored under ``MODELS[target]`` supplies the estimator and an
    optional scaler, which is applied to the feature row when present.
    """
    entry = MODELS[target]
    estimator = entry["model"]
    scaler = entry.get("scaler")
    row = _features_v2(w_v2).reshape(1, -1)
    if scaler is not None:
        row = scaler.transform(row)
    label = estimator.predict(row)[0]
    return int(label)
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def _predict_target_v1(target: str, w_v1: np.ndarray) -> int:
    """Return the class index predicted for *target* from v1 features.

    The bundle stored under ``MODELS[target]`` supplies the estimator and an
    optional scaler, which is applied to the feature row when present.
    """
    entry = MODELS[target]
    estimator = entry["model"]
    scaler = entry.get("scaler")
    row = _features_from_window(w_v1).reshape(1, -1)
    if scaler is not None:
        row = scaler.transform(row)
    label = estimator.predict(row)[0]
    return int(label)
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
COLOR_LABELS = ("red", "black", "green")
|
| 236 |
+
PARITY_LABELS = ("odd", "even", "none")
|
| 237 |
+
DOZEN_LABELS = ("first", "second", "third", "none")
|
| 238 |
+
COLUMN_LABELS = ("first", "second", "third", "none")
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def _predict_one_step(w_v1: np.ndarray, w_v2: np.ndarray, step: int) -> Prediction:
    """Run every per-target model once and bundle the results.

    Args:
        w_v1: Window fed to the v1-feature models (number, dozen).
        w_v2: Window fed to the v2-feature models (color, parity, column).
        step: Step index recorded on the returned ``Prediction``.
    """
    num, top3, conf = _predict_number(w_v1)

    # Independent per-target predictions, mapped from class index to label.
    color = COLOR_LABELS[_predict_target_v2("color", w_v2)]
    parity = PARITY_LABELS[_predict_target_v2("parity", w_v2)]
    dozen = DOZEN_LABELS[_predict_target_v1("dozen", w_v1)]
    column = COLUMN_LABELS[_predict_target_v2("column", w_v2)]

    return Prediction(
        step=step,
        predicted_number=num,
        top3_numbers=top3,
        number_confidence=conf,
        predicted_color=color,
        predicted_parity=parity,
        predicted_dozen=dozen,
        predicted_column=column,
        # Properties implied by the predicted number itself.
        derived_from_number_color=derive_color(num),
        derived_from_number_parity=derive_parity(num),
        derived_from_number_dozen=derive_dozen(num),
        derived_from_number_column=derive_column(num),
    )
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def _forecast(sequence: list[int], steps: int) -> list[Prediction]:
    """Forecast *steps* future spins autoregressively.

    After each step the predicted number is pushed onto the end of both
    windows and the oldest value is dropped, so later steps are conditioned
    on earlier predictions rather than on observed spins.
    """
    w_v1, w_v2 = _prepare_windows(sequence)
    out: list[Prediction] = []
    for step in range(1, steps + 1):
        pred = _predict_one_step(w_v1, w_v2, step)
        out.append(pred)

        # Slide both windows left by one and write the new value last.
        latest = pred.predicted_number
        w_v1 = np.roll(w_v1, -1)
        w_v1[-1] = latest
        w_v2 = np.roll(w_v2, -1)
        w_v2[-1] = latest
    return out
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
# ---------------------------------------------------------------------------
|
| 271 |
+
# Routes
|
| 272 |
+
# ---------------------------------------------------------------------------
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
@app.get("/")
def root() -> dict[str, Any]:
    """Return service metadata, a route summary and the loaded targets."""
    endpoints = {
        "GET /models": "Active models and their rolling-test accuracy",
        "POST /predict": "Predict from JSON {numbers: [...], steps: N}",
        "POST /predict/file": "Predict from uploaded CSV (column 'Winner' or 'number')",
        "GET /docs": "Interactive Swagger UI",
    }
    return {
        "service": "Roulette Next-Spin Predictor",
        "version": "1.0.0",
        "wheel": "European single-zero (0-36)",
        "endpoints": endpoints,
        "models_loaded": list(MODELS),
    }
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
@app.get("/models")
def model_info() -> dict[str, Any]:
    """Return the per-target model metadata together with a disclaimer."""
    disclaimer = (
        "Roulette on a fair wheel produces independent draws. The 'number' "
        "prediction accuracy (~4%) is only marginally above the 2.70% uniform "
        "random baseline. Higher per-target accuracies come largely from the "
        "wheel's structural class imbalance (18 red / 18 black / 1 green), not "
        "from learned patterns. Do not gamble money based on these outputs."
    )
    return {"targets": BEST_MODELS, "disclaimer": disclaimer}
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
@app.post("/predict", response_model=PredictResponse)
def predict(req: PredictRequest) -> PredictResponse:
    """Forecast the next ``req.steps`` spins from a JSON history.

    Args:
        req: Validated request holding the history and the horizon.

    Returns:
        The forecast, a per-target model summary and any input notes.

    Raises:
        HTTPException: 503 when one or more required models are not loaded.
    """
    # Every target is needed for a prediction. ``load_models`` skips missing
    # artefacts, so a non-empty registry can still be incomplete; checking
    # each target avoids a KeyError (HTTP 500) further down.
    missing = [target for target in BEST_MODELS if target not in MODELS]
    if missing:
        detail = "models not loaded"
        if MODELS:
            detail += ": " + ", ".join(missing)
        raise HTTPException(status_code=503, detail=detail)

    notes: list[str] = []
    if len(req.numbers) < WINDOW_V2:
        notes.append(
            f"Input had {len(req.numbers)} numbers; padded with leading zeros up to {WINDOW_V2} for the v2 window."
        )
    preds = _forecast(req.numbers, req.steps)
    return PredictResponse(
        model_info={t: {"algo": s["algo"], "test_accuracy": s["test_accuracy"]} for t, s in BEST_MODELS.items()},
        predictions=preds,
        notes=notes,
    )
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
@app.post("/predict/file", response_model=PredictResponse)
async def predict_file(file: UploadFile = File(...), steps: int = 10) -> PredictResponse:
    """Forecast the next *steps* spins from an uploaded CSV history.

    The history column is the first one whose header, stripped and
    lower-cased, is ``winner``, ``winning number`` or ``number``; when none
    matches, the last column is used.

    Args:
        file: Uploaded CSV file.
        steps: Number of future spins to forecast (1-50).

    Returns:
        The forecast, a per-target model summary and notes on the input.

    Raises:
        HTTPException: 503 when one or more required models are not loaded;
            400 when ``steps`` is out of range, the CSV cannot be parsed, the
            column holds no rows, or a value is not a whole number in
            ``[0, 36]``.
    """
    # ``load_models`` skips missing artefacts, so check every target rather
    # than only whether the registry is non-empty.
    missing = [target for target in BEST_MODELS if target not in MODELS]
    if missing:
        detail = "models not loaded"
        if MODELS:
            detail += ": " + ", ".join(missing)
        raise HTTPException(status_code=503, detail=detail)

    # Cheap parameter check first, before reading and parsing the upload.
    if steps < 1 or steps > 50:
        raise HTTPException(status_code=400, detail="steps must be between 1 and 50")

    try:
        content = await file.read()
        # NOTE(review): parsing runs synchronously inside the event loop;
        # acceptable for small files, consider a thread pool for large ones.
        df = pd.read_csv(io.BytesIO(content))
    except Exception as exc:
        raise HTTPException(status_code=400, detail=f"could not read CSV: {exc}") from exc

    wanted = {"winner", "winning number", "number"}
    # ``str(...).strip()`` tolerates padded or non-string headers.
    col = next((c for c in df.columns if str(c).strip().lower() in wanted), None)
    if col is None:
        col = df.columns[-1]

    try:
        numbers: list[int] = []
        for raw in df[col].tolist():
            value = float(raw)
            # Reject NaN, infinities and fractional values instead of
            # silently truncating them to an integer.
            if not value.is_integer():
                raise ValueError(f"{raw!r} is not a whole number")
            numbers.append(int(value))
    except (TypeError, ValueError, OverflowError) as exc:
        raise HTTPException(status_code=400, detail=f"column {col!r} is not integer-coercible: {exc}") from exc

    # The JSON endpoint rejects an empty history; do the same here rather
    # than forecasting from a window made entirely of padding.
    if not numbers:
        raise HTTPException(status_code=400, detail=f"column {col!r} contains no rows")

    if any(n < 0 or n > 36 for n in numbers):
        raise HTTPException(status_code=400, detail="values must be in [0, 36]")

    notes = [f"Loaded column {col!r} with {len(numbers)} rows from upload."]
    if len(numbers) < WINDOW_V2:
        notes.append(f"Padded to window={WINDOW_V2} with leading zeros.")

    preds = _forecast(numbers, steps)
    return PredictResponse(
        model_info={t: {"algo": s["algo"], "test_accuracy": s["test_accuracy"]} for t, s in BEST_MODELS.items()},
        predictions=preds,
        notes=notes,
    )
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
if __name__ == "__main__":
|
| 362 |
+
import uvicorn
|
| 363 |
+
|
| 364 |
+
uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
|
ml/__init__.py
ADDED
|
File without changes
|
ml/features.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Sliding-window feature engineering for roulette next-spin prediction.
|
| 2 |
+
|
| 3 |
+
Features are built per source sequence so windows never cross source boundaries.
|
| 4 |
+
Every row of the feature matrix holds the last ``WINDOW`` winning numbers plus
|
| 5 |
+
derived counts; labels are the next spin's number, color, parity, dozen, column.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from typing import Final
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
import pandas as pd
|
| 14 |
+
|
| 15 |
+
WINDOW: Final[int] = 10
|
| 16 |
+
|
| 17 |
+
RED_NUMBERS: Final[frozenset[int]] = frozenset(
|
| 18 |
+
{1, 3, 5, 7, 9, 12, 14, 16, 18, 19, 21, 23, 25, 27, 30, 32, 34, 36}
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
NUMBER_CLASSES: Final[int] = 37
|
| 22 |
+
COLOR_CLASSES: Final[tuple[str, ...]] = ("red", "black", "green")
|
| 23 |
+
PARITY_CLASSES: Final[tuple[str, ...]] = ("odd", "even", "none")
|
| 24 |
+
DOZEN_CLASSES: Final[tuple[str, ...]] = ("first", "second", "third", "none")
|
| 25 |
+
COLUMN_CLASSES: Final[tuple[str, ...]] = ("first", "second", "third", "none")
|
| 26 |
+
|
| 27 |
+
TARGETS: Final[tuple[str, ...]] = ("number", "color", "parity", "dozen", "column")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass(frozen=True)
class WindowedDataset:
    """Immutable container for a windowed feature matrix and its labels."""

    # Feature matrix, one row per window.
    X: np.ndarray
    # Label arrays keyed by target name.
    y: dict[str, np.ndarray]
    # Names of the columns of ``X``.
    feature_names: list[str]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def derive_color(n: int) -> str:
    """Return ``"green"`` for 0, ``"red"`` for members of ``RED_NUMBERS``, else ``"black"``."""
    if n == 0:
        return "green"
    if n in RED_NUMBERS:
        return "red"
    return "black"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def derive_parity(n: int) -> str:
    """Return ``"none"`` for 0, otherwise ``"odd"`` or ``"even"``."""
    if n == 0:
        return "none"
    is_odd = n % 2 != 0
    return "odd" if is_odd else "even"
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def derive_dozen(n: int) -> str:
    """Return the dozen of *n*: ``"none"`` for 0, then 1-12, 13-24 and above."""
    if n == 0:
        return "none"
    # Upper bound of each dozen, checked in ascending order.
    for upper, label in ((12, "first"), (24, "second")):
        if n <= upper:
            return label
    return "third"
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def derive_column(n: int) -> str:
    """Return the table column of *n* from ``n % 3``; ``"none"`` for 0."""
    if n == 0:
        return "none"
    # Remainder 1 -> first column, 2 -> second, 0 -> third.
    by_remainder = {1: "first", 2: "second"}
    return by_remainder.get(n % 3, "third")
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def encode_label(value: str, classes: tuple[str, ...]) -> int:
    """Return the integer class id of *value*, i.e. its position in *classes*.

    Raises:
        ValueError: if *value* is not one of *classes*. The message names the
            offending value and the accepted labels.
    """
    try:
        return classes.index(value)
    except ValueError:
        # tuple.index's own message does not say which label was rejected.
        raise ValueError(
            f"unknown label {value!r}; expected one of {classes!r}"
        ) from None
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def build_windows_from_sequence(numbers: np.ndarray, window: int = WINDOW) -> WindowedDataset:
    """Turn one contiguous number sequence into sliding-window samples.

    Sample ``i`` uses ``numbers[i : i + window]`` as its window and
    ``numbers[i + window]`` as the spin to predict. A sequence that is not
    longer than *window* yields an empty dataset with the right column count.
    """
    width = _feature_count(window)
    names = _feature_names(window)
    count = len(numbers) - window

    if count <= 0:
        return WindowedDataset(
            X=np.empty((0, width), dtype=np.float32),
            y={name: np.empty(0, dtype=np.int64) for name in TARGETS},
            feature_names=names,
        )

    features = np.empty((count, width), dtype=np.float32)
    labels = {
        "number": np.empty(count, dtype=np.int64),
        "color": np.empty(count, dtype=np.int64),
        "parity": np.empty(count, dtype=np.int64),
        "dozen": np.empty(count, dtype=np.int64),
        "column": np.empty(count, dtype=np.int64),
    }

    for row in range(count):
        stop = row + window
        upcoming = int(numbers[stop])
        features[row] = _features_from_window(numbers[row:stop])
        labels["number"][row] = upcoming
        labels["color"][row] = encode_label(derive_color(upcoming), COLOR_CLASSES)
        labels["parity"][row] = encode_label(derive_parity(upcoming), PARITY_CLASSES)
        labels["dozen"][row] = encode_label(derive_dozen(upcoming), DOZEN_CLASSES)
        labels["column"][row] = encode_label(derive_column(upcoming), COLUMN_CLASSES)

    return WindowedDataset(X=features, y=labels, feature_names=names)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def build_windows_grouped(
    df: pd.DataFrame,
    number_col: str = "number",
    group_col: str = "source",
    window: int = WINDOW,
) -> WindowedDataset:
    """Build windows separately for each *group_col* value and stack them.

    Windows never span two groups. Groups too short to produce a sample are
    dropped; if no group produces any, an empty dataset is returned.
    """
    kept: list[WindowedDataset] = []
    for _key, frame in df.groupby(group_col, sort=False):
        sequence = frame[number_col].to_numpy(dtype=np.int64)
        part = build_windows_from_sequence(sequence, window=window)
        if len(part.X) > 0:
            kept.append(part)

    if len(kept) == 0:
        return build_windows_from_sequence(np.empty(0, dtype=np.int64), window=window)

    stacked_X = np.vstack([part.X for part in kept])
    stacked_y = {}
    for name in TARGETS:
        stacked_y[name] = np.concatenate([part.y[name] for part in kept])
    return WindowedDataset(X=stacked_X, y=stacked_y, feature_names=kept[0].feature_names)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _features_from_window(win: np.ndarray) -> np.ndarray:
    """Build the float32 feature row for one window of integer numbers.

    Layout: the raw window values, then red/black/zero, even/odd, low/high,
    three dozen counts, three column counts, the window mean, and the last
    value. Zeros only contribute to the zero count and the mean.
    """
    size = len(win)
    row = np.empty(_feature_count(size), dtype=np.float32)
    row[:size] = win

    values = [int(v) for v in win]
    nonzero = [v for v in values if v != 0]

    reds = sum(1 for v in nonzero if v in RED_NUMBERS)
    evens = sum(1 for v in nonzero if v % 2 == 0)
    lows = sum(1 for v in nonzero if v <= 18)

    per_dozen = [0, 0, 0]
    per_column = [0, 0, 0]
    for v in nonzero:
        per_dozen[0 if v <= 12 else (1 if v <= 24 else 2)] += 1
        # remainder 1 -> slot 0, remainder 2 -> slot 1, remainder 0 -> slot 2
        per_column[(v - 1) % 3] += 1

    summary = [
        reds,
        len(nonzero) - reds,
        len(values) - len(nonzero),
        evens,
        len(nonzero) - evens,
        lows,
        len(nonzero) - lows,
        *per_dozen,
        *per_column,
        sum(values) / size,
        values[-1],  # last number
    ]
    for position, value in enumerate(summary):
        row[size + position] = value
    return row
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def _feature_count(window: int) -> int:
|
| 204 |
+
return window + 15
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def _feature_names(window: int) -> list[str]:
|
| 208 |
+
lags = [f"lag_{i}" for i in range(window, 0, -1)]
|
| 209 |
+
extras = [
|
| 210 |
+
"red_count",
|
| 211 |
+
"black_count",
|
| 212 |
+
"zero_count",
|
| 213 |
+
"even_count",
|
| 214 |
+
"odd_count",
|
| 215 |
+
"low_count",
|
| 216 |
+
"high_count",
|
| 217 |
+
"dozen1_count",
|
| 218 |
+
"dozen2_count",
|
| 219 |
+
"dozen3_count",
|
| 220 |
+
"col1_count",
|
| 221 |
+
"col2_count",
|
| 222 |
+
"col3_count",
|
| 223 |
+
"mean_number",
|
| 224 |
+
"last_number",
|
| 225 |
+
]
|
| 226 |
+
return lags + extras
|
ml/features_v2.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Richer feature engineering + source-matching preprocessing.
|
| 2 |
+
|
| 3 |
+
Adds on top of the v1 sliding-window features:
|
| 4 |
+
- Run-length features (current streak of red/black, odd/even, same dozen, same column)
|
| 5 |
+
- Hot/cold counts within the window (most frequent number, its count, and the smallest count)
|
| 6 |
+
- Autocorrelation-lag features (repeat rate at lag 1..5)
|
| 7 |
+
- Wheel-position stats (mean and std of wheel positions in the window, distance between the last two positions)
|
| 8 |
+
|
| 9 |
+
Also computes Jensen-Shannon divergence between test.csv's number distribution
|
| 10 |
+
and each training source, so we can train on matched sources only.
|
| 11 |
+
"""
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
from dataclasses import dataclass
|
| 15 |
+
from typing import Final
|
| 16 |
+
|
| 17 |
+
import numpy as np
|
| 18 |
+
import pandas as pd
|
| 19 |
+
|
| 20 |
+
from ml.features import (
|
| 21 |
+
RED_NUMBERS,
|
| 22 |
+
TARGETS,
|
| 23 |
+
derive_color,
|
| 24 |
+
derive_column,
|
| 25 |
+
derive_dozen,
|
| 26 |
+
derive_parity,
|
| 27 |
+
encode_label,
|
| 28 |
+
COLOR_CLASSES,
|
| 29 |
+
PARITY_CLASSES,
|
| 30 |
+
DOZEN_CLASSES,
|
| 31 |
+
COLUMN_CLASSES,
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
# Default number of past spins per v2 feature window.
WINDOW_V2: Final[int] = 20

# Single-zero European wheel order (clockwise starting from 0)
WHEEL_ORDER: Final[tuple[int, ...]] = (
    0, 32, 15, 19, 4, 21, 2, 25, 17, 34, 6, 27, 13, 36, 11, 30, 8, 23, 10,
    5, 24, 16, 33, 1, 20, 14, 31, 9, 22, 18, 29, 7, 28, 12, 35, 3, 26,
)
# Maps each pocket number to its index in WHEEL_ORDER.
WHEEL_POS: Final[dict[int, int]] = {n: i for i, n in enumerate(WHEEL_ORDER)}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _color_id(n: int) -> int:
    """Return 0 for a red pocket, 1 for a black pocket, and 2 for zero."""
    if n != 0:
        return int(n not in RED_NUMBERS)
    return 2
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _parity_id(n: int) -> int:
|
| 51 |
+
if n == 0:
|
| 52 |
+
return 2
|
| 53 |
+
return 1 if n % 2 == 0 else 0
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _dozen_id(n: int) -> int:
|
| 57 |
+
if n == 0:
|
| 58 |
+
return 3
|
| 59 |
+
if n <= 12:
|
| 60 |
+
return 0
|
| 61 |
+
if n <= 24:
|
| 62 |
+
return 1
|
| 63 |
+
return 2
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _column_id(n: int) -> int:
|
| 67 |
+
if n == 0:
|
| 68 |
+
return 3
|
| 69 |
+
rem = n % 3
|
| 70 |
+
return 0 if rem == 1 else (1 if rem == 2 else 2)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
@dataclass(frozen=True)
class V2Dataset:
    """Windowed v2 features, encoded labels, and the originating source of each row."""

    # float32 feature matrix; columns follow feature_names.
    X: np.ndarray
    # Maps each target name in TARGETS to an int64 array of encoded labels.
    y: dict[str, np.ndarray]
    # Column names for X, as produced by _feature_names_v2.
    feature_names: list[str]
    source: np.ndarray  # per-row source label (for debugging)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _trailing_streak(values: list[int], key) -> int:
    """Count how many items at the end of *values* share ``key`` with the last item."""
    target = key(values[-1])
    streak = 0
    for value in reversed(values):
        if key(value) != target:
            break
        streak += 1
    return streak


def _features_v2(win: np.ndarray) -> np.ndarray:
    """Build the rich float32 feature vector for one window of numbers.

    The output order matches ``_feature_names_v2(len(win))``: raw lags, the
    v1 count block with mean/std/last, four trailing-streak lengths, repeat
    rates at lags 1..5, three wheel-position statistics, and three hot/cold
    values.

    Args:
        win: 1-D integer array of past spins, oldest first. Must be non-empty.

    Returns:
        A 1-D float32 array of features.
    """
    from collections import Counter

    w = len(win)
    feats: list[float] = []

    feats.extend(win.astype(np.float32).tolist())

    # Convert once; every block below works on plain Python ints.
    values = [int(x) for x in win]
    nonzero = [v for v in values if v != 0]

    red = sum(1 for v in nonzero if v in RED_NUMBERS)
    black = len(nonzero) - red
    zero = int(np.sum(win == 0))
    even = sum(1 for v in nonzero if v % 2 == 0)
    odd = len(nonzero) - even
    low = sum(1 for v in values if 1 <= v <= 18)
    high = sum(1 for v in values if 19 <= v <= 36)
    doz = [0, 0, 0]
    col = [0, 0, 0]
    for v in nonzero:
        if v <= 12:
            doz[0] += 1
        elif v <= 24:
            doz[1] += 1
        else:
            doz[2] += 1
        rem = v % 3
        if rem == 1:
            col[0] += 1
        elif rem == 2:
            col[1] += 1
        else:
            col[2] += 1
    feats.extend([red, black, zero, even, odd, low, high, *doz, *col])

    feats.append(float(np.mean(win)))
    feats.append(float(np.std(win)))
    feats.append(values[-1])

    # Run-length features: current streak of same color/parity/dozen/column at end
    feats.extend(
        _trailing_streak(values, key)
        for key in (_color_id, _parity_id, _dozen_id, _column_id)
    )

    # Autocorrelation-ish features: repeat rate at lags 1..5
    for lag in range(1, 6):
        if w > lag:
            same = sum(1 for i in range(lag, w) if values[i] == values[i - lag])
            feats.append(same / (w - lag))
        else:
            feats.append(0.0)

    # Wheel stats: mean wheel position, std, distance last -> prev.
    # The distance is the plain index difference in WHEEL_ORDER; it does not
    # wrap around the wheel. Numbers missing from WHEEL_POS count as position 0.
    positions = [WHEEL_POS.get(v, 0) for v in values]
    feats.append(float(np.mean(positions)))
    feats.append(float(np.std(positions)))
    if w >= 2:
        feats.append(float(abs(positions[-1] - positions[-2])))
    else:
        feats.append(0.0)

    # Hot/cold within this window only: most frequent number, its count, and
    # the smallest count among the numbers that appear.
    c = Counter(values)
    most = c.most_common(1)[0]
    feats.append(most[0])
    feats.append(most[1])
    feats.append(float(min(c.values())))

    return np.asarray(feats, dtype=np.float32)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def _feature_names_v2(window: int) -> list[str]:
|
| 180 |
+
lags = [f"lag_{i}" for i in range(window, 0, -1)]
|
| 181 |
+
block_counts = [
|
| 182 |
+
"red", "black", "zero", "even", "odd", "low", "high",
|
| 183 |
+
"doz1", "doz2", "doz3", "col1", "col2", "col3",
|
| 184 |
+
"mean", "std", "last",
|
| 185 |
+
]
|
| 186 |
+
streaks = ["streak_color", "streak_parity", "streak_dozen", "streak_column"]
|
| 187 |
+
autocorrs = [f"autocorr_lag{k}" for k in range(1, 6)]
|
| 188 |
+
wheel = ["wheel_mean_pos", "wheel_std_pos", "wheel_last_dist"]
|
| 189 |
+
hotcold = ["hot_num", "hot_count", "cold_count"]
|
| 190 |
+
return lags + block_counts + streaks + autocorrs + wheel + hotcold
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def build_windows_v2(
    df: pd.DataFrame,
    number_col: str = "number",
    group_col: str = "source",
    window: int = WINDOW_V2,
) -> V2Dataset:
    """Build rich windowed features per source (never crosses source boundaries).

    Args:
        df: Frame holding the spins, in order, with a number and a group column.
        number_col: Column with the spun numbers.
        group_col: Column identifying the source; windows are built per value.
        window: Number of past spins per sample.

    Returns:
        A V2Dataset whose rows are the samples of every group longer than
        *window*, in group order. Empty (zero rows) if no group qualifies.
    """
    # The column list depends only on `window`, so build it once.
    feature_names = _feature_names_v2(window)
    n_features = len(feature_names)

    X_parts: list[np.ndarray] = []
    y_parts: dict[str, list[np.ndarray]] = {t: [] for t in TARGETS}
    src_parts: list[np.ndarray] = []

    for source, group in df.groupby(group_col, sort=False):
        nums = group[number_col].to_numpy(dtype=np.int64)
        if len(nums) <= window:
            continue
        n = len(nums) - window
        Xg = np.empty((n, n_features), dtype=np.float32)
        yg_num = np.empty(n, dtype=np.int64)
        yg_col = np.empty(n, dtype=np.int64)
        yg_par = np.empty(n, dtype=np.int64)
        yg_doz = np.empty(n, dtype=np.int64)
        yg_colm = np.empty(n, dtype=np.int64)
        for i in range(n):
            win = nums[i : i + window]
            nxt = int(nums[i + window])  # the spin this sample predicts
            Xg[i] = _features_v2(win)
            yg_num[i] = nxt
            yg_col[i] = encode_label(derive_color(nxt), COLOR_CLASSES)
            yg_par[i] = encode_label(derive_parity(nxt), PARITY_CLASSES)
            yg_doz[i] = encode_label(derive_dozen(nxt), DOZEN_CLASSES)
            yg_colm[i] = encode_label(derive_column(nxt), COLUMN_CLASSES)
        X_parts.append(Xg)
        y_parts["number"].append(yg_num)
        y_parts["color"].append(yg_col)
        y_parts["parity"].append(yg_par)
        y_parts["dozen"].append(yg_doz)
        y_parts["column"].append(yg_colm)
        src_parts.append(np.array([str(source)] * n, dtype=object))

    X = np.vstack(X_parts) if X_parts else np.empty((0, n_features), dtype=np.float32)
    y = {
        t: (np.concatenate(parts) if parts else np.empty(0, dtype=np.int64))
        for t, parts in y_parts.items()
    }
    src = np.concatenate(src_parts) if src_parts else np.empty(0, dtype=object)
    return V2Dataset(X=X, y=y, feature_names=feature_names, source=src)
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
# ----------------------------------------------------------------------------
|
| 239 |
+
# Source-matching via Jensen-Shannon divergence
|
| 240 |
+
# ----------------------------------------------------------------------------
|
| 241 |
+
|
| 242 |
+
def _prob(counts: np.ndarray, smoothing: float = 1e-6) -> np.ndarray:
|
| 243 |
+
p = counts.astype(np.float64) + smoothing
|
| 244 |
+
return p / p.sum()
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def js_divergence(p: np.ndarray, q: np.ndarray) -> float:
    """Return the Jensen-Shannon divergence (natural log) between two count vectors.

    Both inputs are normalised with ``_prob`` before the comparison.
    """
    dist_p = _prob(p)
    dist_q = _prob(q)
    mixture = 0.5 * (dist_p + dist_q)
    # KL divergence of each distribution against the mixture.
    kl_p, kl_q = (
        float(np.sum(dist * np.log(dist / mixture))) for dist in (dist_p, dist_q)
    )
    return 0.5 * kl_p + 0.5 * kl_q
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def number_histogram(nums: np.ndarray, n_classes: int = 37) -> np.ndarray:
    """Count occurrences of each number, returning at least *n_classes* bins."""
    as_ints = nums.astype(np.int64)
    counts = np.bincount(as_ints, minlength=n_classes)
    return counts
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def rank_sources_by_similarity(
    train_df: pd.DataFrame,
    test_numbers: np.ndarray,
    number_col: str = "number",
    group_col: str = "source",
) -> list[tuple[str, float]]:
    """Return list of (source, js_divergence) sorted ascending (closest first).

    Args:
        train_df: Training spins with a number column and a source column.
        test_numbers: Numbers whose distribution the sources are compared to.
        number_col: Column of *train_df* holding the spun numbers.
        group_col: Column of *train_df* identifying the source.

    Returns:
        One ``(source, divergence)`` pair per source, smallest divergence
        first. Sources with equal divergence keep their order of appearance.
    """
    test_hist = number_histogram(test_numbers)
    scores: list[tuple[str, float]] = []
    for source, group in train_df.groupby(group_col, sort=False):
        src_hist = number_histogram(group[number_col].to_numpy())
        if src_hist.sum() == 0:
            # Nothing to compare against for this source.
            continue
        scores.append((str(source), js_divergence(test_hist, src_hist)))
    scores.sort(key=lambda pair: pair[1])
    return scores
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def select_training_df(
    train_df: pd.DataFrame,
    top_k_sources: list[str],
    group_col: str = "source",
) -> pd.DataFrame:
    """Return a copy of the rows of *train_df* whose source is in *top_k_sources*.

    Args:
        train_df: Training frame containing *group_col*.
        top_k_sources: Source labels to keep.
        group_col: Column holding the source label.

    Returns:
        A new DataFrame (original row order and index preserved); modifying it
        does not affect *train_df*.
    """
    mask = train_df[group_col].isin(top_k_sources)
    return train_df.loc[mask].copy()
|
models/gaussian_nb__parity.v2.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7814989dc6dee8828cb129a0cdce45b8544f61a8390717eed58f52687d7360b3
|
| 3 |
+
size 4864
|
models/mlp__number.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:48e230050e4985813daae68a1edd0e4ea12612b386378f8d677e0650737506b1
|
| 3 |
+
size 175048
|
models/svc__color.v2.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ed6be933d3fa20dce51d8ca4c6957d0bb35a00bbaafa7ddcc3e9842bb603514
|
| 3 |
+
size 2746407
|
models/svc__column.v2.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f8ba96b67344c79c86e9a260dde028c254544f9c97c49cf777fc9b147d12bcf4
|
| 3 |
+
size 4068072
|
models/xgboost__dozen.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ce55dc8fd72d946cdc4e04e512782503eb9482c1f5ced9bd1b2d8b0857e82252
|
| 3 |
+
size 3226293
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.109
|
| 2 |
+
uvicorn[standard]>=0.27
|
| 3 |
+
pydantic>=2.5
|
| 4 |
+
python-multipart>=0.0.9
|
| 5 |
+
numpy>=1.26,<2.3
|
| 6 |
+
pandas>=2.1
|
| 7 |
+
scikit-learn==1.6.1
|
| 8 |
+
xgboost>=2.1,<3.0
|
| 9 |
+
joblib>=1.3
|