Spaces:
Running
Running
Upload 16 files
Browse files- README.md +178 -9
- app.py +0 -0
- db.py +272 -0
- fashion_ai/__init__.py +12 -0
- fashion_ai/encoder.py +319 -0
- fashion_ai/ranker.py +308 -0
- fashion_ai/retriever.py +142 -0
- fashion_ai/schemas.py +50 -0
- fashion_ai/service.py +333 -0
- fashion_ai/training.py +258 -0
- packages.txt +0 -0
- requirements.txt +16 -0
- scoring.py +553 -0
- scraped_json/product_urls_20260413T214331Z.json +174 -0
- scraper.py +512 -0
- zalando_scraper.py +1073 -0
README.md
CHANGED
|
@@ -1,14 +1,183 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji: 👁
|
| 4 |
-
colorFrom: green
|
| 5 |
-
colorTo: yellow
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 6.12.0
|
| 8 |
-
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
license: openrail
|
| 11 |
-
short_description: Style Well Model Backend
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Wardrobe Backend API
|
|
|
|
|
|
|
|
|
|
| 3 |
sdk: gradio
|
|
|
|
|
|
|
| 4 |
pinned: false
|
|
|
|
|
|
|
| 5 |
---
|
| 6 |
|
| 7 |
+
# Wardrobe Backend API
|
| 8 |
+
|
| 9 |
+
Production backend for Wardrobe Assistant, designed to run on Hugging Face Spaces.
|
| 10 |
+
|
| 11 |
+
The service provides:
|
| 12 |
+
- garment classification from uploaded images,
|
| 13 |
+
- wardrobe item persistence,
|
| 14 |
+
- AI outfit scoring and recommendation,
|
| 15 |
+
- shopping suggestion and product URL extraction,
|
| 16 |
+
- lightweight feedback capture for preference signals.
|
| 17 |
+
|
| 18 |
+
The API is built with FastAPI, uses SQLite for persistence, and integrates external AI providers for inference.
|
| 19 |
+
|
| 20 |
+
## Architecture Summary
|
| 21 |
+
|
| 22 |
+
- Runtime: FastAPI + Uvicorn
|
| 23 |
+
- Storage: SQLite (persistent when `/data` is mounted on Hugging Face)
|
| 24 |
+
- Inference: Hugging Face-hosted fine-tuned Qwen model (primary); NVIDIA-hosted chat completions used as fallback (default fallback model: `qwen/qwen3.5-122b-a10b`)
|
| 25 |
+
- Retrieval: Web scraping pipeline for product discovery (Nike and Zalando logic in code)
|
| 26 |
+
|
| 27 |
+
Core modules:
|
| 28 |
+
- `app.py`: API routes, orchestration, inference calls, scraper flow
|
| 29 |
+
- `db.py`: SQLite schema and CRUD/caching helpers
|
| 30 |
+
- `scoring.py`: deterministic fallback scoring logic
|
| 31 |
+
- `fashion_ai/`: recommendation service and ranking support
|
| 32 |
+
|
| 33 |
+
## Repository Contents for Deployment
|
| 34 |
+
|
| 35 |
+
Upload this backend directory as your Hugging Face Space source (or sync it via Git):
|
| 36 |
+
|
| 37 |
+
- `app.py`
|
| 38 |
+
- `db.py`
|
| 39 |
+
- `scoring.py`
|
| 40 |
+
- `scraper.py`
|
| 41 |
+
- `zalando_scraper.py`
|
| 42 |
+
- `requirements.txt`
|
| 43 |
+
- `packages.txt`
|
| 44 |
+
- `fashion_ai/`
|
| 45 |
+
|
| 46 |
+
## Hugging Face Deployment
|
| 47 |
+
|
| 48 |
+
1. Create a new Space.
|
| 49 |
+
2. Select `Gradio` SDK.
|
| 50 |
+
3. Use CPU hardware (inference is delegated to external APIs).
|
| 51 |
+
4. Enable Persistent Storage if you want data durability across restarts.
|
| 52 |
+
5. Add the required environment variables.
|
| 53 |
+
6. Deploy the backend files.
|
| 54 |
+
|
| 55 |
+
### Required Environment Variables
|
| 56 |
+
|
| 57 |
+
- `HF_API_KEY`: API key for the primary Hugging Face-hosted fine-tuned Qwen model.
|
| 58 |
+
- `NVIDIA_API_KEY`: API key for the NVIDIA inference fallback.
|
| 59 |
+
|
| 60 |
+
### Common Optional Environment Variables
|
| 61 |
+
|
| 62 |
+
Inference and reliability:
|
| 63 |
+
- `HF_MODEL_ID` (default: your fine-tuned Qwen model on Hugging Face)
|
| 64 |
+
- `HF_INVOKE_URL` (default: Hugging Face Inference API endpoint for the fine-tuned model)
|
| 65 |
+
- `NVIDIA_MODEL_ID` (fallback; default: `qwen/qwen3.5-122b-a10b`)
|
| 66 |
+
- `NVIDIA_INVOKE_URL` (fallback; default: `https://integrate.api.nvidia.com/v1/chat/completions`)
|
| 67 |
+
- `OPENAI_MODEL_ID` (secondary fallback; OpenAI-compatible model ID if both primary and NVIDIA fallback are unavailable)
|
| 68 |
+
- `OPENAI_API_KEY` (secondary fallback; required only if OpenAI fallback is enabled)
|
| 69 |
+
- `NVIDIA_MAX_TOKENS` (default: `16384`)
|
| 70 |
+
- `NVIDIA_REASONING_MAX_TOKENS` (default: `16384`)
|
| 71 |
+
- `NVIDIA_TEMPERATURE` (default: `0.60`)
|
| 72 |
+
- `NVIDIA_TOP_P` (default: `0.95`)
|
| 73 |
+
- `NVIDIA_TIMEOUT_SECONDS` (default: `180`)
|
| 74 |
+
- `NVIDIA_MAX_RETRIES` (default: `3`)
|
| 75 |
+
- `NVIDIA_RETRY_BACKOFF_SECONDS` (default: `0.8`)
|
| 76 |
+
- `NVIDIA_ENABLE_THINKING` (default: `false`)
|
| 77 |
+
- `NVIDIA_FALLBACK_MODEL_IDS` (comma-separated fallback list)
|
| 78 |
+
|
| 79 |
+
Matching and cache:
|
| 80 |
+
- `MATCHING_RESULT_CACHE_MAX` (default: `500`)
|
| 81 |
+
- `MATCHING_RESULT_CACHE_TTL_SECONDS` (default: `86400`)
|
| 82 |
+
|
| 83 |
+
Scraper and planner:
|
| 84 |
+
- `SCRAPER_DEFAULT_STORE` (default: `nike`)
|
| 85 |
+
- `KIMI_MODEL_ID` (default: `moonshotai/kimi-k2.5`)
|
| 86 |
+
- `KIMI_MAX_TOKENS` (default: `800`)
|
| 87 |
+
|
| 88 |
+
Database path:
|
| 89 |
+
- `DB_PATH` (optional override)
|
| 90 |
+
|
| 91 |
+
When `DB_PATH` is not provided, the app uses:
|
| 92 |
+
- `/data/wardrobe.db` if `/data` exists,
|
| 93 |
+
- otherwise `./wardrobe.db`.
|
| 94 |
+
|
| 95 |
+
## Inference Priority
|
| 96 |
+
|
| 97 |
+
The service resolves inference providers in the following order:
|
| 98 |
+
|
| 99 |
+
1. **Primary** - Fine-tuned Qwen model hosted on Hugging Face (`HF_MODEL_ID`).
|
| 100 |
+
2. **Fallback 1** - NVIDIA-hosted chat completions (`NVIDIA_MODEL_ID`, default: `qwen/qwen3.5-122b-a10b`). Used when the primary model is unavailable or returns an error.
|
| 101 |
+
3. **Fallback 2** - OpenAI-compatible model (`OPENAI_MODEL_ID`). Used when both the primary and NVIDIA fallback are unavailable.
|
| 102 |
+
|
| 103 |
+
AI-powered routes return a service-level error only when all three providers are exhausted or unconfigured.
|
| 104 |
+
|
| 105 |
+
## API Endpoints
|
| 106 |
+
|
| 107 |
+
Health and service metadata:
|
| 108 |
+
- `GET /`
|
| 109 |
+
- `GET /health`
|
| 110 |
+
|
| 111 |
+
Wardrobe ingestion and CRUD:
|
| 112 |
+
- `POST /classify`
|
| 113 |
+
- `POST /upload`
|
| 114 |
+
- `GET /items`
|
| 115 |
+
- `PUT /items/{item_id}`
|
| 116 |
+
- `DELETE /items/{item_id}`
|
| 117 |
+
|
| 118 |
+
Outfit intelligence:
|
| 119 |
+
- `POST /ai/score-outfit`
|
| 120 |
+
- `POST /ai/gap-analysis`
|
| 121 |
+
- `POST /ai/recommend-outfits`
|
| 122 |
+
- `POST /feedback`
|
| 123 |
+
|
| 124 |
+
Shopping and scraping:
|
| 125 |
+
- `POST /product-urls`
|
| 126 |
+
- `POST /suggestions`
|
| 127 |
+
- `POST /api/suggestions`
|
| 128 |
+
- `POST /scraper/recommend`
|
| 129 |
+
- `GET /scraper`
|
| 130 |
+
- `GET /image-proxy`
|
| 131 |
+
|
| 132 |
+
## Local Development
|
| 133 |
+
|
| 134 |
+
### 1. Install dependencies
|
| 135 |
+
|
| 136 |
+
```bash
|
| 137 |
+
pip install -r requirements.txt
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
### 2. Export environment variables
|
| 141 |
+
|
| 142 |
+
Linux/macOS:
|
| 143 |
+
|
| 144 |
+
```bash
|
| 145 |
+
export HF_API_KEY=""
|
| 146 |
+
export NVIDIA_API_KEY="" # fallback
|
| 147 |
+
export OPENAI_API_KEY="" # secondary fallback, optional
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
Windows PowerShell:
|
| 151 |
+
|
| 152 |
+
```powershell
|
| 153 |
+
$env:HF_API_KEY = ""
|
| 154 |
+
$env:NVIDIA_API_KEY = "" # fallback
|
| 155 |
+
$env:OPENAI_API_KEY = "" # secondary fallback, optional
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
### 3. Run the API
|
| 159 |
+
|
| 160 |
+
```bash
|
| 161 |
+
python app.py
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
The service starts on `http://0.0.0.0:7860`.
|
| 165 |
+
|
| 166 |
+
## Smoke Checks
|
| 167 |
+
|
| 168 |
+
Health:
|
| 169 |
+
|
| 170 |
+
```bash
|
| 171 |
+
curl "http://127.0.0.1:7860/health"
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
Image classification:
|
| 175 |
+
|
| 176 |
+
```bash
|
| 177 |
+
curl -X POST "http://127.0.0.1:7860/classify" \
|
| 178 |
+
-F "image=@/path/to/garment.jpg"
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
Expected post-deploy health signal:
|
| 182 |
+
- `hf_api_configured` should be `"True"` (primary model).
|
| 183 |
+
- `nvidia_api_configured` should be `"True"` (fallback model).
|
app.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
db.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
db.py — persistent storage for Wardrobe Assistant on HF Spaces.
|
| 3 |
+
|
| 4 |
+
Setup:
|
| 5 |
+
1. HF Space → Settings → Persistent Storage → Enable (mounts at /data)
|
| 6 |
+
2. Drop this file next to app.py
|
| 7 |
+
3. No extra pip packages needed — sqlite3 is stdlib
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
import os
|
| 14 |
+
import sqlite3
|
| 15 |
+
import uuid
|
| 16 |
+
from contextlib import contextmanager
|
| 17 |
+
from datetime import datetime, timezone, timedelta
|
| 18 |
+
from typing import Any
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
# Path resolution
|
| 23 |
+
# ---------------------------------------------------------------------------
|
| 24 |
+
|
| 25 |
+
def _resolve_db_path() -> str:
    """Choose where the SQLite database file lives.

    Precedence:
      1. a non-empty ``DB_PATH`` environment variable,
      2. ``/data/wardrobe.db`` when the ``/data`` directory exists (the mount
         point of the HF Spaces persistent volume),
      3. ``./wardrobe.db`` as the local-development fallback.
    """
    override = os.getenv("DB_PATH")
    if not override:
        # No explicit override: prefer the persistent volume when it is mounted.
        return "/data/wardrobe.db" if os.path.isdir("/data") else "./wardrobe.db"
    return override


# Resolved once at import time; every connection opened by this module uses it.
DB_PATH = _resolve_db_path()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# ---------------------------------------------------------------------------
|
| 38 |
+
# Connection
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
|
| 41 |
+
@contextmanager
def _conn():
    """Yield a configured SQLite connection for one unit of work.

    The connection is opened on ``DB_PATH`` with ``sqlite3.Row`` rows, WAL
    journal mode and foreign-key enforcement requested. On a clean exit the
    transaction is committed; if the managed block (or the connection setup)
    raises, the transaction is rolled back and the exception re-raised. The
    connection is always closed.

    Yields:
        sqlite3.Connection: the open connection, valid only inside the
        ``with`` block.
    """
    con = sqlite3.connect(DB_PATH, check_same_thread=False)
    try:
        # Setup lives inside the try so a failing PRAGMA cannot leak the
        # already-open connection: the finally clause below still closes it.
        con.row_factory = sqlite3.Row
        con.execute("PRAGMA journal_mode=WAL")
        con.execute("PRAGMA foreign_keys=ON")
        yield con
        con.commit()
    except Exception:
        con.rollback()
        raise
    finally:
        con.close()
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# ---------------------------------------------------------------------------
|
| 59 |
+
# Schema
|
| 60 |
+
# ---------------------------------------------------------------------------
|
| 61 |
+
|
| 62 |
+
def init_db() -> None:
    """Create the schema if it does not exist yet.

    Every statement uses ``IF NOT EXISTS``, so calling this on each startup
    is safe. Creates three tables (``items``, ``outfit_feedback``,
    ``search_cache``) and four indexes.
    """
    with _conn() as con:
        # executescript runs the whole multi-statement schema in a single call.
        # outfit_feedback rows reference items(id) with ON DELETE CASCADE.
        con.executescript("""
            CREATE TABLE IF NOT EXISTS items (
                id          TEXT PRIMARY KEY,
                image_url   TEXT NOT NULL DEFAULT '',
                category    TEXT NOT NULL DEFAULT 'Unknown',
                color       TEXT NOT NULL DEFAULT 'Unknown',
                pattern     TEXT NOT NULL DEFAULT 'Solid',
                fabric      TEXT NOT NULL DEFAULT 'Unknown',
                fit         TEXT NOT NULL DEFAULT 'Unknown',
                season      TEXT NOT NULL DEFAULT 'All-Season',
                style       TEXT NOT NULL DEFAULT 'casual',
                type        TEXT NOT NULL DEFAULT 'unknown',
                description TEXT NOT NULL DEFAULT '{}',
                created_at  TEXT NOT NULL
            );

            CREATE TABLE IF NOT EXISTS outfit_feedback (
                id          TEXT PRIMARY KEY,
                top_id      TEXT NOT NULL,
                bottom_id   TEXT NOT NULL,
                occasion    TEXT NOT NULL DEFAULT 'casual',
                action      TEXT NOT NULL CHECK(action IN ('wear','skip','save')),
                score       INTEGER,
                created_at  TEXT NOT NULL,
                FOREIGN KEY (top_id) REFERENCES items(id) ON DELETE CASCADE,
                FOREIGN KEY (bottom_id) REFERENCES items(id) ON DELETE CASCADE
            );

            CREATE TABLE IF NOT EXISTS search_cache (
                cache_key   TEXT PRIMARY KEY,
                payload     TEXT NOT NULL,
                created_at  TEXT NOT NULL,
                expires_at  TEXT NOT NULL
            );

            CREATE INDEX IF NOT EXISTS idx_items_type ON items(type);
            CREATE INDEX IF NOT EXISTS idx_items_created ON items(created_at DESC);
            CREATE INDEX IF NOT EXISTS idx_fb_top ON outfit_feedback(top_id);
            CREATE INDEX IF NOT EXISTS idx_fb_bot ON outfit_feedback(bottom_id);
        """)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# ---------------------------------------------------------------------------
|
| 108 |
+
# Internal helpers
|
| 109 |
+
# ---------------------------------------------------------------------------
|
| 110 |
+
|
| 111 |
+
def _now() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _row_to_item(row: sqlite3.Row) -> dict[str, Any]:
    """Convert a database row into a plain dict with a decoded description.

    A string ``description`` is parsed as JSON; text that fails to parse is
    replaced by ``{}``. A non-string ``description`` is passed through
    unchanged, and a missing one becomes ``{}``.
    """
    item = dict(row)
    stored = item.get("description", "{}")
    if not isinstance(stored, str):
        # Already decoded (or not text at all): keep the value as-is.
        item["description"] = stored
        return item
    try:
        item["description"] = json.loads(stored)
    except (json.JSONDecodeError, TypeError):
        item["description"] = {}
    return item
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# ---------------------------------------------------------------------------
|
| 126 |
+
# Items CRUD
|
| 127 |
+
# ---------------------------------------------------------------------------
|
| 128 |
+
|
| 129 |
+
def item_insert(item: dict[str, Any]) -> dict[str, Any]:
    """Insert a wardrobe item and return the stored record.

    Missing or falsy fields fall back to the same defaults the table schema
    declares; ``id`` defaults to a fresh UUID4 and ``created_at`` to the
    current UTC time. ``description`` is stored as JSON text, but the
    returned record carries it in its original (decoded) form.
    """
    text_defaults = (
        ("image_url", ""),
        ("category", "Unknown"),
        ("color", "Unknown"),
        ("pattern", "Solid"),
        ("fabric", "Unknown"),
        ("fit", "Unknown"),
        ("season", "All-Season"),
        ("style", "casual"),
        ("type", "unknown"),
    )
    details = item.get("description") or {}

    record: dict[str, Any] = {"id": item.get("id") or str(uuid.uuid4())}
    for column, fallback in text_defaults:
        record[column] = str(item.get(column) or fallback)
    record["description"] = json.dumps(details)
    record["created_at"] = item.get("created_at") or _now()

    statement = """INSERT INTO items
               (id,image_url,category,color,pattern,fabric,fit,
                season,style,type,description,created_at)
               VALUES
               (:id,:image_url,:category,:color,:pattern,:fabric,:fit,
                :season,:style,:type,:description,:created_at)"""
    with _conn() as con:
        con.execute(statement, record)

    # Hand back the decoded description rather than the JSON text that was stored.
    record["description"] = details
    return record
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def item_get_all() -> list[dict[str, Any]]:
    """Return every wardrobe item, ordered by ``created_at`` descending."""
    query = "SELECT * FROM items ORDER BY created_at DESC"
    with _conn() as con:
        cursor = con.execute(query)
        rows = cursor.fetchall()
    return list(map(_row_to_item, rows))
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def item_get(item_id: str) -> dict[str, Any] | None:
    """Fetch one wardrobe item by id, or ``None`` when no row matches."""
    with _conn() as con:
        cursor = con.execute("SELECT * FROM items WHERE id=?", (item_id,))
        row = cursor.fetchone()
    if row:
        return _row_to_item(row)
    return None
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def item_update(item_id: str, patch: dict[str, Any]) -> dict[str, Any] | None:
    """Apply a partial update to an item and return the refreshed record.

    Only the whitelisted text columns are written. A dict ``description`` in
    ``patch`` is shallow-merged into the stored description; any other value
    leaves the stored description untouched.

    Args:
        item_id: Primary key of the item to update.
        patch: Field -> new value. Keys outside the whitelist are ignored, and
            ``None`` values are treated as "not provided" so they are not
            written to the database as the literal text ``'None'``.

    Returns:
        The item as re-read after the update, or ``None`` if ``item_id`` does
        not exist.
    """
    existing = item_get(item_id)
    if existing is None:
        return None

    current = existing.get("description", {})
    if isinstance(patch.get("description"), dict):
        # A stored description that is not a mapping cannot be merged into;
        # start from an empty dict instead of raising TypeError.
        base = current if isinstance(current, dict) else {}
        merged = {**base, **patch["description"]}
    else:
        merged = current

    allowed = {"image_url", "category", "color", "pattern",
               "fabric", "fit", "season", "style", "type"}
    updates = {
        k: str(v)
        for k, v in patch.items()
        if k in allowed and v is not None
    }
    updates["description"] = json.dumps(merged)

    # Column names come only from the whitelist above (plus "description"),
    # so interpolating them into the statement is safe; values stay bound.
    set_clause = ", ".join(f"{k}=:{k}" for k in updates)
    updates["id"] = item_id
    with _conn() as con:
        con.execute(f"UPDATE items SET {set_clause} WHERE id=:id", updates)
    return item_get(item_id)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def item_delete(item_id: str) -> bool:
    """Delete an item by id; return ``True`` when a row was removed."""
    with _conn() as con:
        deleted = con.execute("DELETE FROM items WHERE id=?", (item_id,)).rowcount
    return deleted > 0
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
# ---------------------------------------------------------------------------
|
| 200 |
+
# Feedback
|
| 201 |
+
# ---------------------------------------------------------------------------
|
| 202 |
+
|
| 203 |
+
def feedback_record(
    top_id: str,
    bottom_id: str,
    action: str,
    occasion: str = "casual",
    score: int | None = None,
) -> dict[str, Any]:
    """Store one outfit-feedback event and return the inserted record.

    The row gets a fresh UUID4 ``id`` and the current UTC ``created_at``.
    Values are passed to SQLite unchanged; the table's CHECK and FOREIGN KEY
    constraints are what reject invalid ``action`` values or unknown item ids.
    """
    rec: dict[str, Any] = dict(
        id=str(uuid.uuid4()),
        top_id=top_id,
        bottom_id=bottom_id,
        occasion=occasion,
        action=action,
        score=score,
        created_at=_now(),
    )
    statement = """INSERT INTO outfit_feedback
               (id,top_id,bottom_id,occasion,action,score,created_at)
               VALUES(:id,:top_id,:bottom_id,:occasion,:action,:score,:created_at)"""
    with _conn() as con:
        con.execute(statement, rec)
    return rec
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
# ---------------------------------------------------------------------------
|
| 230 |
+
# Search cache
|
| 231 |
+
# ---------------------------------------------------------------------------
|
| 232 |
+
|
| 233 |
+
def cache_get(key: str) -> Any | None:
    """Return the cached payload for ``key``, or ``None`` on a miss.

    A miss is any of: no row for ``key``, an expired row (which is deleted as
    a side effect), or a stored payload that is not valid JSON.

    Expiry is decided by comparing the stored ISO-8601 ``expires_at`` text
    with the current ``_now()`` text.
    """
    with _conn() as con:
        row = con.execute(
            "SELECT payload, expires_at FROM search_cache WHERE cache_key=?",
            (key,),
        ).fetchone()
        if not row:
            return None
        now = _now()
        if row["expires_at"] < now:
            # Clean up on the connection that is already open instead of
            # opening a second one mid-read. The expires_at guard keeps a
            # fresh entry written concurrently from being deleted.
            con.execute(
                "DELETE FROM search_cache WHERE cache_key=? AND expires_at<?",
                (key, now),
            )
            return None
        payload = row["payload"]
    try:
        return json.loads(payload)
    except (json.JSONDecodeError, TypeError):
        return None
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def cache_set(key: str, payload: Any, ttl_seconds: int = 86_400) -> None:
    """Insert or overwrite the cache entry for ``key``.

    ``payload`` is stored as JSON text. The entry expires ``ttl_seconds``
    after the moment of the call; an existing row for the same key has its
    payload and both timestamps replaced.
    """
    lifetime = timedelta(seconds=ttl_seconds)
    expires = (datetime.now(timezone.utc) + lifetime).isoformat()
    upsert = """INSERT INTO search_cache(cache_key,payload,created_at,expires_at)
               VALUES(?,?,?,?)
               ON CONFLICT(cache_key) DO UPDATE SET
                 payload=excluded.payload,
                 created_at=excluded.created_at,
                 expires_at=excluded.expires_at"""
    with _conn() as con:
        params = (key, json.dumps(payload), _now(), expires)
        con.execute(upsert, params)
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def cache_delete(key: str) -> None:
    """Remove the cache entry for ``key``; a missing key is a no-op."""
    statement = "DELETE FROM search_cache WHERE cache_key=?"
    with _conn() as con:
        con.execute(statement, (key,))
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
def cache_purge_expired() -> int:
    """Delete every cache entry whose ``expires_at`` is in the past.

    Returns:
        The number of rows removed.
    """
    with _conn() as con:
        cutoff = _now()
        removed = con.execute(
            "DELETE FROM search_cache WHERE expires_at<?", (cutoff,)
        ).rowcount
    return removed
|
fashion_ai/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .encoder import FashionItemEncoder
|
| 2 |
+
from .ranker import OutfitCompatibilityRanker
|
| 3 |
+
from .retriever import OutfitCandidateRetriever
|
| 4 |
+
from .service import MultimodalOutfitRecommendationService, get_recommendation_service
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
"FashionItemEncoder",
|
| 8 |
+
"MultimodalOutfitRecommendationService",
|
| 9 |
+
"OutfitCandidateRetriever",
|
| 10 |
+
"OutfitCompatibilityRanker",
|
| 11 |
+
"get_recommendation_service",
|
| 12 |
+
]
|
fashion_ai/encoder.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import hashlib
|
| 4 |
+
import io
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
from collections import OrderedDict
|
| 8 |
+
from typing import Any
|
| 9 |
+
from urllib.parse import urlparse
|
| 10 |
+
from urllib.request import Request, urlopen
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
import torch
|
| 14 |
+
from PIL import Image
|
| 15 |
+
from transformers import AutoModel, AutoProcessor
|
| 16 |
+
|
| 17 |
+
from .schemas import EncodedWardrobeItem, RecommendationContext, SlotName
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
import open_clip
|
| 21 |
+
except ImportError:
|
| 22 |
+
open_clip = None
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# Defaults for FashionItemEncoder's constructor arguments. Each one is read
# from an environment variable once, at import time; a non-integer value in
# any of the int(...) settings raises ValueError during import.

# Model identifier used when the encoder is created without an explicit model_id.
DEFAULT_ENCODER_MODEL_ID = os.getenv("FASHION_ENCODER_MODEL_ID", "patrickjohncyh/fashion-clip")
# Default for the encoder's embedding_dim argument.
DEFAULT_EMBEDDING_DIM = int(os.getenv("FASHION_EMBEDDING_DIM", "512"))
# Default for image_timeout_seconds.
DEFAULT_IMAGE_TIMEOUT_SECONDS = int(os.getenv("FASHION_IMAGE_TIMEOUT_SECONDS", "8"))
# Default for cache_size.
DEFAULT_CACHE_SIZE = int(os.getenv("FASHION_EMBEDDING_CACHE_SIZE", "2048"))
|
| 29 |
+
|
| 30 |
+
# Slot -> substrings that identify it. Dict order is the matching priority:
# the first slot with any keyword contained in the item text wins.
_SLOT_KEYWORDS = {
    "top": [
        "topwear", "shirt", "t-shirt", "tee", "blouse",
        "hoodie", "jacket", "blazer", "sweater", "polo", "coat",
    ],
    "bottom": [
        "bottomwear", "jeans", "trouser", "trousers", "pant", "pants",
        "shorts", "skirt", "jogger", "palazzo", "leggings", "chinos",
    ],
    "shoes": [
        "footwear", "shoe", "shoes", "sneaker", "boot", "loafer",
        "sandal", "heel",
    ],
    "accessory": [
        "accessory", "accessories", "bag", "belt", "watch", "cap",
        "hat", "scarf", "sunglasses", "jewelry",
    ],
}

# Substrings checked before any slot keyword; a match yields "unknown".
_STANDALONE_OUTFIT_KEYWORDS = [
    "others", "kurta", "dress", "jumpsuit", "romper", "gown",
    "saree", "lehenga", "co-ord", "coord", "one-piece", "one piece",
]


def infer_slot_name(item: dict[str, Any]) -> SlotName:
    """Classify a wardrobe item into an outfit slot by keyword matching.

    The item's ``type`` and ``category`` - both top-level and inside a dict
    ``description`` - are joined into one lowercase string. Matching is by
    plain substring containment: any standalone-outfit keyword returns
    ``"unknown"`` immediately; otherwise the first slot in ``_SLOT_KEYWORDS``
    with a matching keyword is returned, and ``"unknown"`` if none match.
    """
    nested = item.get("description")
    if not isinstance(nested, dict):
        nested = {}

    fields = (
        item.get("type"),
        item.get("category"),
        nested.get("type"),
        nested.get("category"),
    )
    haystack = " ".join(str(value or "") for value in fields).lower()

    for marker in _STANDALONE_OUTFIT_KEYWORDS:
        if marker in haystack:
            return "unknown"

    for slot, keywords in _SLOT_KEYWORDS.items():
        for keyword in keywords:
            if keyword in haystack:
                return slot  # type: ignore[return-value]
    return "unknown"
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class FashionItemEncoder:
|
| 76 |
+
"""
|
| 77 |
+
Multimodal garment encoder.
|
| 78 |
+
|
| 79 |
+
Output shape:
|
| 80 |
+
encode_item(...).vector -> [D]
|
| 81 |
+
encode_context(...) -> [D]
|
| 82 |
+
"""
|
| 83 |
+
|
| 84 |
+
def __init__(
    self,
    model_id: str = DEFAULT_ENCODER_MODEL_ID,
    embedding_dim: int = DEFAULT_EMBEDDING_DIM,
    device: str | None = None,
    image_timeout_seconds: int = DEFAULT_IMAGE_TIMEOUT_SECONDS,
    cache_size: int = DEFAULT_CACHE_SIZE,
) -> None:
    """Store the configuration; no model is loaded here.

    The encoder starts on the ``"fallback-hash"`` backend with all model
    handles unset and an empty embedding cache.

    Args:
        model_id: Identifier of the encoder model.
        embedding_dim: Stored as ``self.embedding_dim``.
        device: Torch device string; when falsy, ``"cuda"`` is chosen if
            ``torch.cuda.is_available()`` and ``"cpu"`` otherwise.
        image_timeout_seconds: Stored as ``self.image_timeout_seconds``.
        cache_size: Stored as ``self.cache_size``.
    """
    # Caller-supplied configuration.
    self.model_id = model_id
    self.embedding_dim = embedding_dim
    self.image_timeout_seconds = image_timeout_seconds
    self.cache_size = cache_size

    if device:
        self.device = device
    elif torch.cuda.is_available():
        self.device = "cuda"
    else:
        self.device = "cpu"

    # Model state, populated later by _ensure_model_loaded().
    self._load_attempted = False
    self._backend = "fallback-hash"
    self._model = None
    self._processor = None
    self._preprocess = None
    self._tokenizer = None

    # Ordered so entries can be moved to the end when they are reused.
    self._embedding_cache: OrderedDict[str, np.ndarray] = OrderedDict()
|
| 105 |
+
|
| 106 |
+
@property
|
| 107 |
+
def backend_name(self) -> str:
|
| 108 |
+
self._ensure_model_loaded()
|
| 109 |
+
return self._backend
|
| 110 |
+
|
| 111 |
+
def encode_item(self, item: dict[str, Any]) -> EncodedWardrobeItem:
    """Encode a wardrobe item into an ``EncodedWardrobeItem``.

    The vector is taken from the embedding cache when present. Otherwise it
    is the text embedding of the item prompt, averaged with the image
    embedding and re-normalized when the item's image yields one, and the
    result is remembered in the cache. The returned item always carries a
    copy of the vector.
    """
    metadata_text = self._build_item_prompt(item)
    cache_key = self._cache_key(item, metadata_text)

    vector = self._embedding_cache.get(cache_key)
    if vector is not None:
        # Cache hit: mark the entry as most recently used.
        self._embedding_cache.move_to_end(cache_key)
    else:
        text_vec = self.encode_text(metadata_text)
        image_vec = self.encode_image_url(str(item.get("image_url") or ""), metadata_text)
        if image_vec is None:
            vector = text_vec
        else:
            vector = self._normalize_vector((text_vec + image_vec) / 2.0)
        self._remember_embedding(cache_key, vector)

    return EncodedWardrobeItem(
        item=item,
        vector=vector.copy(),
        slot=infer_slot_name(item),
        metadata_text=metadata_text,
    )
|
| 136 |
+
|
| 137 |
+
def encode_text(self, text: str) -> np.ndarray:
    """Embed ``text`` with the active backend.

    Uses the open_clip model and tokenizer, or the transformers model and
    processor, when that backend is loaded; otherwise falls back to
    ``_fallback_embedding``. Model outputs go through
    ``_resize_and_normalize`` before being returned.
    """
    self._ensure_model_loaded()

    if self._backend == "open_clip" and self._model is not None and self._tokenizer is not None:
        with torch.inference_mode():
            tokens = self._tokenizer([text]).to(self.device)
            features = self._model.encode_text(tokens)
        return self._resize_and_normalize(features[0].detach().float().cpu().numpy())

    if self._backend == "transformers" and self._model is not None and self._processor is not None:
        with torch.inference_mode():
            inputs = self._processor(
                text=[text],
                return_tensors="pt",
                padding=True,
                truncation=True,
            )
            inputs = {name: value.to(self.device) for name, value in inputs.items()}
            if hasattr(self._model, "get_text_features"):
                features = self._model.get_text_features(**inputs)
            else:
                outputs = self._model(**inputs)
                # pooler_output can exist but be None, so test for None
                # explicitly and only then read the first-token hidden state.
                features = getattr(outputs, "pooler_output", None)
                if features is None:
                    features = outputs.last_hidden_state[:, 0, :]
        return self._resize_and_normalize(features[0].detach().float().cpu().numpy())

    return self._fallback_embedding(text)
|
| 163 |
+
|
| 164 |
+
def encode_image_url(self, image_url: str, fallback_text: str) -> np.ndarray | None:
    """Load the image at ``image_url`` and embed it.

    Returns ``None`` when ``_load_image`` yields no image; otherwise the
    result of ``encode_image`` for the loaded image and ``fallback_text``.
    """
    image = self._load_image(image_url)
    return None if image is None else self.encode_image(image, fallback_text)
|
| 169 |
+
|
| 170 |
+
def encode_image(self, image: Image.Image, fallback_text: str = "") -> np.ndarray:
    """Embed a PIL image with the active backend.

    Uses the open_clip model and preprocess transform, or the transformers
    model and processor, when that backend is loaded. With no model
    backend, the embedding is derived from ``fallback_text`` via
    ``_fallback_embedding`` (prefixed with ``image::``).
    """
    self._ensure_model_loaded()

    if self._backend == "open_clip" and self._model is not None and self._preprocess is not None:
        with torch.inference_mode():
            tensor = self._preprocess(image).unsqueeze(0).to(self.device)
            features = self._model.encode_image(tensor)
        return self._resize_and_normalize(features[0].detach().float().cpu().numpy())

    if self._backend == "transformers" and self._model is not None and self._processor is not None:
        with torch.inference_mode():
            inputs = self._processor(images=[image], return_tensors="pt")
            inputs = {name: value.to(self.device) for name, value in inputs.items()}
            if hasattr(self._model, "get_image_features"):
                features = self._model.get_image_features(**inputs)
            else:
                outputs = self._model(**inputs)
                # pooler_output can exist but be None, so test for None
                # explicitly and only then read the first-token hidden state.
                features = getattr(outputs, "pooler_output", None)
                if features is None:
                    features = outputs.last_hidden_state[:, 0, :]
        return self._resize_and_normalize(features[0].detach().float().cpu().numpy())

    return self._fallback_embedding(f"image::{fallback_text}")
|
| 191 |
+
|
| 192 |
+
def encode_context(self, context: RecommendationContext) -> np.ndarray:
    """Turn a recommendation context into a text prompt and embed it.

    The prompt always names the occasion, season and region (with defaults
    for empty values) and adds temperature, rain, preferred style, favorite
    colors and disliked styles only when the context provides them. The
    space-joined prompt is passed to ``self.encode_text``.
    """
    profile = context.user_profile or {}
    weather = context.weather

    fragments = [
        f"An outfit for {context.occasion or 'casual'}",
        f"in {weather.season or 'all-season'} weather",
    ]
    if weather.temperature_c is not None:
        fragments.append(f"temperature {weather.temperature_c}C")
    if weather.is_rainy:
        fragments.append("rainy conditions")
    fragments.append(f"region {context.region or 'global'}")

    style_profile = profile.get("style_profile")
    if style_profile:
        fragments.append(f"preferred style {style_profile}")

    # List-valued preferences are only mentioned when they are non-empty lists.
    favorite_colors = profile.get("favorite_colors")
    if isinstance(favorite_colors, list) and favorite_colors:
        fragments.append(f"favorite colors {', '.join(favorite_colors)}")
    disliked_styles = profile.get("disliked_styles")
    if isinstance(disliked_styles, list) and disliked_styles:
        fragments.append(f"disliked styles {', '.join(disliked_styles)}")

    return self.encode_text(" ".join(fragments))
|
| 213 |
+
|
| 214 |
+
def _ensure_model_loaded(self) -> None:
|
| 215 |
+
if self._load_attempted:
|
| 216 |
+
return
|
| 217 |
+
self._load_attempted = True
|
| 218 |
+
|
| 219 |
+
if open_clip is not None and self.model_id.lower().startswith("marqo/"):
|
| 220 |
+
try:
|
| 221 |
+
self._model, _, self._preprocess = open_clip.create_model_and_transforms(
|
| 222 |
+
f"hf-hub:{self.model_id}",
|
| 223 |
+
device=self.device,
|
| 224 |
+
)
|
| 225 |
+
self._tokenizer = open_clip.get_tokenizer(f"hf-hub:{self.model_id}")
|
| 226 |
+
self._model.eval()
|
| 227 |
+
self._backend = "open_clip"
|
| 228 |
+
return
|
| 229 |
+
except Exception:
|
| 230 |
+
self._model = None
|
| 231 |
+
self._preprocess = None
|
| 232 |
+
self._tokenizer = None
|
| 233 |
+
|
| 234 |
+
try:
|
| 235 |
+
self._processor = AutoProcessor.from_pretrained(self.model_id)
|
| 236 |
+
self._model = AutoModel.from_pretrained(self.model_id).to(self.device).eval()
|
| 237 |
+
self._backend = "transformers"
|
| 238 |
+
except Exception:
|
| 239 |
+
self._model = None
|
| 240 |
+
self._processor = None
|
| 241 |
+
self._backend = "fallback-hash"
|
| 242 |
+
|
| 243 |
+
def _load_image(self, image_url: str) -> Image.Image | None:
|
| 244 |
+
if not image_url or image_url.startswith("memory://") or image_url.startswith("data:"):
|
| 245 |
+
return None
|
| 246 |
+
|
| 247 |
+
parsed = urlparse(image_url)
|
| 248 |
+
try:
|
| 249 |
+
if parsed.scheme in {"http", "https"}:
|
| 250 |
+
request = Request(
|
| 251 |
+
image_url,
|
| 252 |
+
headers={"User-Agent": "Mozilla/5.0", "Accept": "image/*,*/*;q=0.8"},
|
| 253 |
+
)
|
| 254 |
+
with urlopen(request, timeout=self.image_timeout_seconds) as response:
|
| 255 |
+
raw = response.read()
|
| 256 |
+
return Image.open(io.BytesIO(raw)).convert("RGB")
|
| 257 |
+
|
| 258 |
+
if os.path.isfile(image_url):
|
| 259 |
+
return Image.open(image_url).convert("RGB")
|
| 260 |
+
except Exception:
|
| 261 |
+
return None
|
| 262 |
+
return None
|
| 263 |
+
|
| 264 |
+
def _build_item_prompt(self, item: dict[str, Any]) -> str:
    """Compose the descriptive sentence used as an item's text prompt.

    Each attribute is taken from the item itself, then from its
    ``description`` dict (when that value is a dict), then from a fixed
    default. The slot name comes from ``infer_slot_name(item)``.
    """
    raw_description = item.get("description")
    details = raw_description if isinstance(raw_description, dict) else {}

    def resolve(item_key: str, detail_keys: tuple[str, ...], default: str):
        # First truthy value wins: item field, then description fields in order.
        candidates = [item.get(item_key)]
        candidates.extend(details.get(key) for key in detail_keys)
        return next((value for value in candidates if value), default)

    category = resolve("category", ("category", "type"), "garment")
    color = resolve("color", ("color",), "unknown color")
    pattern = resolve("pattern", ("pattern",), "solid")
    fabric = resolve("fabric", ("fabric",), "unknown fabric")
    fit = resolve("fit", ("fit",), "regular")
    season = resolve("season", ("season",), "all-season")
    # For style, the description's "occasion" takes precedence over its "style".
    style = resolve("style", ("occasion", "style"), "casual")
    slot = infer_slot_name(item)

    first_clause = f"Fashion product photo of a {color} {pattern} {fabric} {category}, "
    second_clause = f"{fit} fit, {style} style, suitable for {season}, worn as {slot}."
    return first_clause + second_clause
|
| 279 |
+
|
| 280 |
+
def _cache_key(self, item: dict[str, Any], metadata_text: str) -> str:
|
| 281 |
+
payload = {
|
| 282 |
+
"id": str(item.get("id") or ""),
|
| 283 |
+
"image_url": str(item.get("image_url") or ""),
|
| 284 |
+
"metadata_text": metadata_text,
|
| 285 |
+
}
|
| 286 |
+
raw = json.dumps(payload, sort_keys=True, ensure_ascii=True).encode("utf-8")
|
| 287 |
+
return hashlib.sha256(raw).hexdigest()
|
| 288 |
+
|
| 289 |
+
def _remember_embedding(self, cache_key: str, vector: np.ndarray) -> None:
|
| 290 |
+
self._embedding_cache[cache_key] = vector.copy()
|
| 291 |
+
self._embedding_cache.move_to_end(cache_key)
|
| 292 |
+
while len(self._embedding_cache) > self.cache_size:
|
| 293 |
+
self._embedding_cache.popitem(last=False)
|
| 294 |
+
|
| 295 |
+
def _fallback_embedding(self, seed_text: str) -> np.ndarray:
|
| 296 |
+
digest = hashlib.sha256(seed_text.encode("utf-8", errors="ignore")).digest()
|
| 297 |
+
seed = int.from_bytes(digest[:8], "big", signed=False)
|
| 298 |
+
rng = np.random.default_rng(seed)
|
| 299 |
+
return self._normalize_vector(rng.standard_normal(self.embedding_dim).astype(np.float32))
|
| 300 |
+
|
| 301 |
+
def _resize_and_normalize(self, vector: np.ndarray) -> np.ndarray:
|
| 302 |
+
arr = np.asarray(vector, dtype=np.float32).reshape(-1)
|
| 303 |
+
if arr.shape[0] == self.embedding_dim:
|
| 304 |
+
return self._normalize_vector(arr)
|
| 305 |
+
if arr.shape[0] < 2:
|
| 306 |
+
return self._fallback_embedding(str(arr.tolist()))
|
| 307 |
+
|
| 308 |
+
src_x = np.linspace(0.0, 1.0, num=arr.shape[0], dtype=np.float32)
|
| 309 |
+
dst_x = np.linspace(0.0, 1.0, num=self.embedding_dim, dtype=np.float32)
|
| 310 |
+
resized = np.interp(dst_x, src_x, arr).astype(np.float32)
|
| 311 |
+
return self._normalize_vector(resized)
|
| 312 |
+
|
| 313 |
+
@staticmethod
|
| 314 |
+
def _normalize_vector(vector: np.ndarray) -> np.ndarray:
|
| 315 |
+
arr = np.asarray(vector, dtype=np.float32).reshape(-1)
|
| 316 |
+
norm = float(np.linalg.norm(arr))
|
| 317 |
+
if norm < 1e-8:
|
| 318 |
+
return np.zeros_like(arr, dtype=np.float32)
|
| 319 |
+
return arr / norm
|
fashion_ai/ranker.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
|
| 10 |
+
from .schemas import OutfitCandidate, RecommendationContext
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class OutfitCompatibilityRanker(nn.Module):
    """
    Transformer ranker.

    Input:
        outfit_tokens  [B, 6, D] = [CONTEXT, USER, TOP, BOTTOM, SHOES, ACCESSORY]
        attention_mask [B, 6]

    Output:
        logits [B, 1]
    """

    def __init__(
        self,
        d_model: int = 512,
        n_layers: int = 4,
        n_heads: int = 8,
        dropout: float = 0.1,
    ) -> None:
        """Build the CLS token, slot embedding, encoder stack and scoring head."""
        super().__init__()
        self.d_model = d_model
        # Learned token prepended to every sequence; its output feeds the head.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))
        # One embedding per sequence position: CLS plus the six outfit tokens.
        self.slot_embedding = nn.Embedding(7, d_model)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dim_feedforward=d_model * 4,
                dropout=dropout,
                batch_first=True,
                activation="gelu",
                norm_first=True,
            ),
            num_layers=n_layers,
        )
        hidden_dim = d_model // 2
        self.head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
        )
        nn.init.trunc_normal_(self.cls_token, std=0.02)

    def forward(
        self,
        outfit_tokens: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Score each outfit in the batch.

        Args:
            outfit_tokens: Rank-3 tensor ``[B, S, D]`` of token embeddings.
            attention_mask: Rank-2 tensor ``[B, S]``; positions equal to 0 are
                treated as padding.

        Returns:
            The head's output for the CLS position of every sequence.

        Raises:
            ValueError: If either input has the wrong number of dimensions.
        """
        if outfit_tokens.dim() != 3:
            raise ValueError("outfit_tokens must have shape [B, S, D]")
        if attention_mask.dim() != 2:
            raise ValueError("attention_mask must have shape [B, S]")

        batch_size, seq_len = outfit_tokens.shape[:2]
        cls_tokens = self.cls_token.expand(batch_size, 1, self.d_model)
        sequence = torch.cat((cls_tokens, outfit_tokens), dim=1)
        positions = torch.arange(seq_len + 1, device=sequence.device).unsqueeze(0)
        sequence = sequence + self.slot_embedding(positions)

        # The CLS position is always attended to.
        keep = torch.cat((attention_mask.new_ones((batch_size, 1)), attention_mask), dim=1)
        encoded = self.encoder(sequence, src_key_padding_mask=keep.eq(0))
        return self.head(encoded[:, 0, :])
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class NeuralOutfitScorer:
|
| 78 |
+
"""
|
| 79 |
+
Uses a trained transformer checkpoint when available.
|
| 80 |
+
Otherwise falls back to zero-shot geometric scoring over multimodal
|
| 81 |
+
embeddings so the endpoint stays usable before fine-tuning.
|
| 82 |
+
"""
|
| 83 |
+
|
| 84 |
+
def __init__(
|
| 85 |
+
self,
|
| 86 |
+
d_model: int = 512,
|
| 87 |
+
checkpoint_path: str | None = None,
|
| 88 |
+
device: str | None = None,
|
| 89 |
+
) -> None:
|
| 90 |
+
self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
|
| 91 |
+
self.model = OutfitCompatibilityRanker(d_model=d_model).to(self.device).eval()
|
| 92 |
+
self.checkpoint_path = checkpoint_path or os.getenv("FASHION_RANKER_CHECKPOINT")
|
| 93 |
+
self.is_trained = False
|
| 94 |
+
self._load_checkpoint_if_available()
|
| 95 |
+
|
| 96 |
+
def score_candidates(
|
| 97 |
+
self,
|
| 98 |
+
candidates: list[OutfitCandidate],
|
| 99 |
+
context_vector: np.ndarray,
|
| 100 |
+
user_vector: np.ndarray,
|
| 101 |
+
context: RecommendationContext,
|
| 102 |
+
) -> list[OutfitCandidate]:
|
| 103 |
+
if not candidates:
|
| 104 |
+
return []
|
| 105 |
+
if self.is_trained:
|
| 106 |
+
return self._score_with_transformer(candidates, context_vector, user_vector, context)
|
| 107 |
+
return self._score_zero_shot(candidates, context_vector, user_vector, context)
|
| 108 |
+
|
| 109 |
+
def _score_with_transformer(
|
| 110 |
+
self,
|
| 111 |
+
candidates: list[OutfitCandidate],
|
| 112 |
+
context_vector: np.ndarray,
|
| 113 |
+
user_vector: np.ndarray,
|
| 114 |
+
context: RecommendationContext,
|
| 115 |
+
) -> list[OutfitCandidate]:
|
| 116 |
+
token_batch = []
|
| 117 |
+
mask_batch = []
|
| 118 |
+
for candidate in candidates:
|
| 119 |
+
vectors = [
|
| 120 |
+
context_vector,
|
| 121 |
+
user_vector,
|
| 122 |
+
candidate.top.vector,
|
| 123 |
+
candidate.bottom.vector,
|
| 124 |
+
candidate.shoes.vector if candidate.shoes is not None else np.zeros_like(context_vector),
|
| 125 |
+
candidate.accessory.vector if candidate.accessory is not None else np.zeros_like(context_vector),
|
| 126 |
+
]
|
| 127 |
+
mask = [
|
| 128 |
+
1,
|
| 129 |
+
1,
|
| 130 |
+
1,
|
| 131 |
+
1,
|
| 132 |
+
1 if candidate.shoes is not None else 0,
|
| 133 |
+
1 if candidate.accessory is not None else 0,
|
| 134 |
+
]
|
| 135 |
+
token_batch.append(np.stack(vectors, axis=0))
|
| 136 |
+
mask_batch.append(mask)
|
| 137 |
+
|
| 138 |
+
with torch.inference_mode():
|
| 139 |
+
logits = self.model(
|
| 140 |
+
torch.tensor(np.stack(token_batch), dtype=torch.float32, device=self.device),
|
| 141 |
+
torch.tensor(np.asarray(mask_batch), dtype=torch.long, device=self.device),
|
| 142 |
+
).squeeze(-1)
|
| 143 |
+
probs = torch.sigmoid(logits).detach().cpu().numpy()
|
| 144 |
+
|
| 145 |
+
return self._finalize_candidates(candidates, probs, context_vector, user_vector, context)
|
| 146 |
+
|
| 147 |
+
def _score_zero_shot(
|
| 148 |
+
self,
|
| 149 |
+
candidates: list[OutfitCandidate],
|
| 150 |
+
context_vector: np.ndarray,
|
| 151 |
+
user_vector: np.ndarray,
|
| 152 |
+
context: RecommendationContext,
|
| 153 |
+
) -> list[OutfitCandidate]:
|
| 154 |
+
scores = []
|
| 155 |
+
for candidate in candidates:
|
| 156 |
+
item_vectors = [slot_item.vector for slot_item in candidate.slot_items()]
|
| 157 |
+
outfit_centroid = self._normalize(np.mean(np.stack(item_vectors, axis=0), axis=0))
|
| 158 |
+
context_alignment = self._cosine(outfit_centroid, context_vector)
|
| 159 |
+
user_alignment = self._cosine(outfit_centroid, user_vector)
|
| 160 |
+
pairwise_cohesion = self._pairwise_cohesion(item_vectors)
|
| 161 |
+
slot_coverage = math.log1p(len(item_vectors)) / math.log1p(4)
|
| 162 |
+
|
| 163 |
+
score = np.mean(
|
| 164 |
+
np.asarray(
|
| 165 |
+
[
|
| 166 |
+
self._to_unit_interval(context_alignment),
|
| 167 |
+
self._to_unit_interval(user_alignment),
|
| 168 |
+
self._to_unit_interval(pairwise_cohesion),
|
| 169 |
+
slot_coverage,
|
| 170 |
+
],
|
| 171 |
+
dtype=np.float32,
|
| 172 |
+
)
|
| 173 |
+
)
|
| 174 |
+
scores.append(float(np.clip(score, 0.0, 1.0)))
|
| 175 |
+
|
| 176 |
+
return self._finalize_candidates(candidates, scores, context_vector, user_vector, context)
|
| 177 |
+
|
| 178 |
+
def _finalize_candidates(
|
| 179 |
+
self,
|
| 180 |
+
candidates: list[OutfitCandidate],
|
| 181 |
+
probs: list[float] | np.ndarray,
|
| 182 |
+
context_vector: np.ndarray,
|
| 183 |
+
user_vector: np.ndarray,
|
| 184 |
+
context: RecommendationContext,
|
| 185 |
+
) -> list[OutfitCandidate]:
|
| 186 |
+
scored = []
|
| 187 |
+
for candidate, prob in zip(candidates, probs, strict=False):
|
| 188 |
+
candidate.score = round(float(prob) * 100.0, 2)
|
| 189 |
+
candidate.breakdown = self._build_breakdown(candidate, context_vector, user_vector, context)
|
| 190 |
+
candidate.reason = self._build_reason(candidate, context)
|
| 191 |
+
candidate.tip = self._build_tip(candidate)
|
| 192 |
+
scored.append(candidate)
|
| 193 |
+
scored.sort(key=lambda item: item.score, reverse=True)
|
| 194 |
+
return scored
|
| 195 |
+
|
| 196 |
+
def _load_checkpoint_if_available(self) -> None:
|
| 197 |
+
if not self.checkpoint_path or not os.path.isfile(self.checkpoint_path):
|
| 198 |
+
return
|
| 199 |
+
try:
|
| 200 |
+
payload = torch.load(self.checkpoint_path, map_location=self.device)
|
| 201 |
+
state_dict = payload.get("model_state_dict", payload) if isinstance(payload, dict) else payload
|
| 202 |
+
self.model.load_state_dict(state_dict, strict=False)
|
| 203 |
+
self.model.eval()
|
| 204 |
+
self.is_trained = True
|
| 205 |
+
except Exception:
|
| 206 |
+
self.is_trained = False
|
| 207 |
+
|
| 208 |
+
def _build_breakdown(
|
| 209 |
+
self,
|
| 210 |
+
candidate: OutfitCandidate,
|
| 211 |
+
context_vector: np.ndarray,
|
| 212 |
+
user_vector: np.ndarray,
|
| 213 |
+
context: RecommendationContext,
|
| 214 |
+
) -> dict[str, float]:
|
| 215 |
+
item_vectors = [slot_item.vector for slot_item in candidate.slot_items()]
|
| 216 |
+
outfit_centroid = self._normalize(np.mean(np.stack(item_vectors, axis=0), axis=0))
|
| 217 |
+
context_alignment = self._to_score(self._cosine(outfit_centroid, context_vector))
|
| 218 |
+
user_affinity = self._to_score(self._cosine(outfit_centroid, user_vector))
|
| 219 |
+
visual_cohesion = self._to_score(self._pairwise_cohesion(item_vectors))
|
| 220 |
+
top_bottom_compat = self._to_score(self._cosine(candidate.top.vector, candidate.bottom.vector))
|
| 221 |
+
occasion_fit = self._occasion_slot_fit(candidate, context.occasion)
|
| 222 |
+
|
| 223 |
+
return {
|
| 224 |
+
"color": round((visual_cohesion + top_bottom_compat) / 2.0, 2),
|
| 225 |
+
"style": round((context_alignment + visual_cohesion) / 2.0, 2),
|
| 226 |
+
"occasion": round((occasion_fit + context_alignment) / 2.0, 2),
|
| 227 |
+
"context_alignment": round(context_alignment, 2),
|
| 228 |
+
"user_affinity": round(user_affinity, 2),
|
| 229 |
+
"visual_cohesion": round(visual_cohesion, 2),
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
def _build_reason(self, candidate: OutfitCandidate, context: RecommendationContext) -> str:
|
| 233 |
+
parts = [
|
| 234 |
+
f"{candidate.top.item.get('color', 'Unknown')} {candidate.top.item.get('category', 'Topwear')}",
|
| 235 |
+
f"{candidate.bottom.item.get('color', 'Unknown')} {candidate.bottom.item.get('category', 'Bottomwear')}",
|
| 236 |
+
]
|
| 237 |
+
if candidate.shoes is not None:
|
| 238 |
+
parts.append(
|
| 239 |
+
f"{candidate.shoes.item.get('color', 'Unknown')} {candidate.shoes.item.get('category', 'Footwear')}"
|
| 240 |
+
)
|
| 241 |
+
if candidate.accessory is not None:
|
| 242 |
+
parts.append(
|
| 243 |
+
f"{candidate.accessory.item.get('color', 'Unknown')} {candidate.accessory.item.get('category', 'Accessory')}"
|
| 244 |
+
)
|
| 245 |
+
return (
|
| 246 |
+
f"Learned multimodal embeddings rate {' + '.join(parts)} as a coherent "
|
| 247 |
+
f"combination for {context.occasion or 'casual'} context."
|
| 248 |
+
)
|
| 249 |
+
|
| 250 |
+
@staticmethod
|
| 251 |
+
def _build_tip(candidate: OutfitCandidate) -> str:
|
| 252 |
+
if candidate.score >= 85:
|
| 253 |
+
return "Strong outfit match. Keep accessories minimal so the silhouette stays clean."
|
| 254 |
+
if candidate.score >= 70:
|
| 255 |
+
return "Good base outfit. Add one tonal accessory to reinforce the palette."
|
| 256 |
+
return "This outfit is acceptable, but one slot can be swapped for stronger style alignment."
|
| 257 |
+
|
| 258 |
+
@staticmethod
|
| 259 |
+
def _occasion_slot_fit(candidate: OutfitCandidate, occasion: str) -> float:
|
| 260 |
+
occ = str(occasion or "casual").lower()
|
| 261 |
+
texts = " ".join(slot.metadata_text.lower() for slot in candidate.slot_items())
|
| 262 |
+
if occ in texts:
|
| 263 |
+
return 95.0
|
| 264 |
+
if occ in {"formal", "interview", "business", "office"} and any(
|
| 265 |
+
token in texts for token in ["shirt", "blazer", "trouser", "loafer"]
|
| 266 |
+
):
|
| 267 |
+
return 88.0
|
| 268 |
+
if occ in {"party", "festive", "wedding"} and any(
|
| 269 |
+
token in texts for token in ["silk", "embroidered", "dress", "kurta"]
|
| 270 |
+
):
|
| 271 |
+
return 88.0
|
| 272 |
+
if occ in {"sports", "gym", "active"} and any(
|
| 273 |
+
token in texts for token in ["sneaker", "jogger", "tee", "hoodie"]
|
| 274 |
+
):
|
| 275 |
+
return 85.0
|
| 276 |
+
return 72.0
|
| 277 |
+
|
| 278 |
+
@staticmethod
|
| 279 |
+
def _pairwise_cohesion(vectors: list[np.ndarray]) -> float:
|
| 280 |
+
if len(vectors) < 2:
|
| 281 |
+
return 0.0
|
| 282 |
+
scores = []
|
| 283 |
+
for left_index in range(len(vectors)):
|
| 284 |
+
for right_index in range(left_index + 1, len(vectors)):
|
| 285 |
+
scores.append(NeuralOutfitScorer._cosine(vectors[left_index], vectors[right_index]))
|
| 286 |
+
return float(np.mean(np.asarray(scores, dtype=np.float32)))
|
| 287 |
+
|
| 288 |
+
@staticmethod
|
| 289 |
+
def _cosine(left: np.ndarray, right: np.ndarray) -> float:
|
| 290 |
+
left_vec = NeuralOutfitScorer._normalize(left)
|
| 291 |
+
right_vec = NeuralOutfitScorer._normalize(right)
|
| 292 |
+
return float(np.dot(left_vec, right_vec))
|
| 293 |
+
|
| 294 |
+
@staticmethod
|
| 295 |
+
def _to_score(value: float) -> float:
|
| 296 |
+
return round(100.0 * NeuralOutfitScorer._to_unit_interval(value), 2)
|
| 297 |
+
|
| 298 |
+
@staticmethod
|
| 299 |
+
def _to_unit_interval(value: float) -> float:
|
| 300 |
+
return float(np.clip((value + 1.0) / 2.0, 0.0, 1.0))
|
| 301 |
+
|
| 302 |
+
@staticmethod
|
| 303 |
+
def _normalize(vector: np.ndarray) -> np.ndarray:
|
| 304 |
+
arr = np.asarray(vector, dtype=np.float32).reshape(-1)
|
| 305 |
+
norm = float(np.linalg.norm(arr))
|
| 306 |
+
if norm < 1e-8:
|
| 307 |
+
return np.zeros_like(arr, dtype=np.float32)
|
| 308 |
+
return arr / norm
|
fashion_ai/retriever.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
from .encoder import FashionItemEncoder
|
| 8 |
+
from .schemas import EncodedWardrobeItem, RecommendationContext, SlotName
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class OutfitCandidateRetriever:
    """Slot-aware embedding retrieval with MMR diversification."""

    def __init__(
        self,
        encoder: FashionItemEncoder,
        slot_pool_size: int = 24,
        mmr_lambda: float = 0.72,
    ) -> None:
        """Store the encoder and retrieval settings.

        Args:
            encoder: Object providing ``encode_item`` and ``encode_context``.
            slot_pool_size: Default number of candidates kept per slot.
            mmr_lambda: Weight of query relevance versus novelty in MMR.
        """
        self.encoder = encoder
        self.slot_pool_size = slot_pool_size
        self.mmr_lambda = mmr_lambda

    def encode_wardrobe(self, wardrobe_items: list[dict[str, Any]]) -> list[EncodedWardrobeItem]:
        """Encode every wardrobe item with the configured encoder."""
        return [self.encoder.encode_item(item) for item in wardrobe_items]

    def split_by_slot(
        self,
        encoded_items: list[EncodedWardrobeItem],
    ) -> dict[SlotName, list[EncodedWardrobeItem]]:
        """Group encoded items by their ``slot`` attribute."""
        buckets: dict[SlotName, list[EncodedWardrobeItem]] = {
            "top": [],
            "bottom": [],
            "shoes": [],
            "accessory": [],
            "unknown": [],
        }
        for item in encoded_items:
            buckets[item.slot].append(item)
        return buckets

    def retrieve(
        self,
        encoded_items: list[EncodedWardrobeItem],
        context: RecommendationContext,
        locked_top: dict[str, Any] | None = None,
        locked_bottom: dict[str, Any] | None = None,
        locked_other: dict[str, Any] | None = None,
        candidate_pool: int | None = None,
    ) -> dict[SlotName, list[EncodedWardrobeItem]]:
        """Return the candidate pool for every slot.

        A locked item becomes the only candidate for its slot. The remaining
        slots are ranked against the context embedding merged with the locked
        top/bottom vectors. ``locked_other`` fills the shoes slot when its
        encoded slot is ``"shoes"`` and the accessory slot otherwise.

        Args:
            encoded_items: Wardrobe items already encoded.
            context: Recommendation context passed to ``encode_context``.
            locked_top: Raw item the caller fixed as the top, if any.
            locked_bottom: Raw item the caller fixed as the bottom, if any.
            locked_other: Raw item fixed as shoes or accessory, if any.
            candidate_pool: Per-slot pool size; a falsy value selects
                ``slot_pool_size``. Shoes and accessories use at most 12.

        Returns:
            Mapping from slot name to ranked candidates; ``"unknown"`` is
            always empty because unknown items are offered as accessories.
        """
        buckets = self.split_by_slot(encoded_items)
        query_vec = self.encoder.encode_context(context)
        pool_size = candidate_pool or self.slot_pool_size
        secondary_pool = min(pool_size, 12)

        # Encode each locked item exactly once and reuse both the encoded item
        # and its vector (previously top and bottom were encoded twice).
        top_locked = self.encoder.encode_item(locked_top) if locked_top else None
        bottom_locked = self.encoder.encode_item(locked_bottom) if locked_bottom else None
        other_locked = self.encoder.encode_item(locked_other) if locked_other else None
        top_vec = top_locked.vector if top_locked is not None else None
        bottom_vec = bottom_locked.vector if bottom_locked is not None else None
        other_is_shoes = other_locked is not None and other_locked.slot == "shoes"
        accessory_bucket = buckets["accessory"] + buckets["unknown"]

        if top_locked is not None:
            tops = [top_locked]
        else:
            tops = self._rank_bucket(
                buckets["top"],
                query_vec=self._merge_query(query_vec, bottom_vec),
                top_k=pool_size,
            )

        if bottom_locked is not None:
            bottoms = [bottom_locked]
        else:
            bottoms = self._rank_bucket(
                buckets["bottom"],
                query_vec=self._merge_query(query_vec, top_vec),
                top_k=pool_size,
            )

        if other_is_shoes:
            shoes = [other_locked]
        else:
            # The locked shoes vector was never part of this query: this
            # branch only runs when no shoes are locked.
            shoes = self._rank_bucket(
                buckets["shoes"],
                query_vec=self._merge_query(query_vec, top_vec, bottom_vec),
                top_k=secondary_pool,
            )

        if other_locked is not None and not other_is_shoes:
            accessories = [other_locked]
        else:
            accessories = self._rank_bucket(
                accessory_bucket,
                query_vec=self._merge_query(query_vec, top_vec, bottom_vec),
                top_k=secondary_pool,
            )

        return {
            "top": tops,
            "bottom": bottoms,
            "shoes": shoes,
            "accessory": accessories,
            "unknown": [],
        }

    def _rank_bucket(
        self,
        bucket: list[EncodedWardrobeItem],
        query_vec: np.ndarray,
        top_k: int,
    ) -> list[EncodedWardrobeItem]:
        """Order a bucket by query similarity, then diversify with MMR."""
        if not bucket:
            return []

        ranked = sorted(
            bucket,
            key=lambda item: float(np.dot(item.vector, query_vec)),
            reverse=True,
        )
        return self._mmr_diversify(ranked, query_vec=query_vec, top_k=top_k)

    def _mmr_diversify(
        self,
        ranked_items: list[EncodedWardrobeItem],
        query_vec: np.ndarray,
        top_k: int,
    ) -> list[EncodedWardrobeItem]:
        """Greedily pick up to ``top_k`` items by maximal marginal relevance.

        Each step takes the item maximising
        ``lambda * sim(item, query) - (1 - lambda) * max sim(item, selected)``;
        ties keep the earliest item in ``ranked_items``.
        """
        selected: list[EncodedWardrobeItem] = []
        remaining = list(ranked_items)

        while remaining and len(selected) < top_k:
            best_index = 0
            best_score = -1e9
            for index, item in enumerate(remaining):
                query_score = float(np.dot(item.vector, query_vec))
                novelty_penalty = 0.0
                if selected:
                    novelty_penalty = max(float(np.dot(item.vector, prev.vector)) for prev in selected)
                mmr_score = self.mmr_lambda * query_score - (1.0 - self.mmr_lambda) * novelty_penalty
                if mmr_score > best_score:
                    best_index = index
                    best_score = mmr_score
            selected.append(remaining.pop(best_index))

        return selected

    @staticmethod
    def _merge_query(*vectors: np.ndarray | None) -> np.ndarray:
        """Average the non-``None`` vectors and scale the mean to unit norm.

        With no vectors a single-element zero array is returned; a mean with
        near-zero norm is returned unscaled.
        """
        valid = [np.asarray(vec, dtype=np.float32) for vec in vectors if vec is not None]
        if not valid:
            return np.zeros((1,), dtype=np.float32)
        merged = np.mean(np.stack(valid, axis=0), axis=0)
        norm = float(np.linalg.norm(merged))
        if norm < 1e-8:
            return merged
        return merged / norm
|
fashion_ai/schemas.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass, field
|
| 4 |
+
from typing import Any, Literal
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
SlotName = Literal["top", "bottom", "shoes", "accessory", "unknown"]
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass(slots=True)
|
| 12 |
+
class WeatherContext:
|
| 13 |
+
season: str = "all-season"
|
| 14 |
+
temperature_c: float | None = None
|
| 15 |
+
is_rainy: bool | None = None
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@dataclass(slots=True)
class RecommendationContext:
    """Everything known about the situation an outfit is requested for."""

    # Occasion label; defaults to "casual".
    occasion: str = "casual"
    # Weather signals; a fresh default WeatherContext per instance.
    weather: WeatherContext = field(default_factory=WeatherContext)
    # Region label; defaults to "global".
    region: str = "global"
    # Free-form user preferences; a fresh empty dict per instance.
    user_profile: dict[str, Any] = field(default_factory=dict)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass(slots=True)
|
| 27 |
+
class EncodedWardrobeItem:
|
| 28 |
+
item: dict[str, Any]
|
| 29 |
+
vector: np.ndarray
|
| 30 |
+
slot: SlotName
|
| 31 |
+
metadata_text: str
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@dataclass(slots=True)
|
| 35 |
+
class OutfitCandidate:
|
| 36 |
+
top: EncodedWardrobeItem
|
| 37 |
+
bottom: EncodedWardrobeItem
|
| 38 |
+
shoes: EncodedWardrobeItem | None = None
|
| 39 |
+
accessory: EncodedWardrobeItem | None = None
|
| 40 |
+
score: float = 0.0
|
| 41 |
+
breakdown: dict[str, float] = field(default_factory=dict)
|
| 42 |
+
reason: str = ""
|
| 43 |
+
tip: str = ""
|
| 44 |
+
|
| 45 |
+
def slot_items(self) -> list[EncodedWardrobeItem]:
|
| 46 |
+
return [
|
| 47 |
+
slot_item
|
| 48 |
+
for slot_item in [self.top, self.bottom, self.shoes, self.accessory]
|
| 49 |
+
if slot_item is not None
|
| 50 |
+
]
|
fashion_ai/service.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
from .encoder import FashionItemEncoder
|
| 9 |
+
from .ranker import NeuralOutfitScorer
|
| 10 |
+
from .retriever import OutfitCandidateRetriever
|
| 11 |
+
from .schemas import (
|
| 12 |
+
EncodedWardrobeItem,
|
| 13 |
+
OutfitCandidate,
|
| 14 |
+
RecommendationContext,
|
| 15 |
+
WeatherContext,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Service-wide defaults; each one can be overridden with an environment variable.
DEFAULT_TOP_K = int(os.getenv("FASHION_RECOMMEND_TOP_K", "5"))
DEFAULT_CANDIDATE_POOL = int(os.getenv("FASHION_CANDIDATE_POOL", "24"))
DEFAULT_MAX_BEAM = int(os.getenv("FASHION_MAX_BEAM", "64"))
DEFAULT_DIVERSITY_LAMBDA = float(os.getenv("FASHION_DIVERSITY_LAMBDA", "0.28"))

# Lazily populated process-wide instance (see get_recommendation_service).
_SERVICE_SINGLETON: MultimodalOutfitRecommendationService | None = None


class MultimodalOutfitRecommendationService:
    """Multimodal retrieval + ranking service for outfit recommendations.

    ``recommend`` encodes a wardrobe, retrieves per-slot candidates, pairs
    tops with bottoms, scores the pairs and returns a diversified top-k.
    ``score_outfit`` scores a single explicit outfit.
    """

    def __init__(
        self,
        encoder: FashionItemEncoder | None = None,
        retriever: OutfitCandidateRetriever | None = None,
        scorer: NeuralOutfitScorer | None = None,
        top_k: int = DEFAULT_TOP_K,
        candidate_pool: int = DEFAULT_CANDIDATE_POOL,
        max_beam: int = DEFAULT_MAX_BEAM,
        diversity_lambda: float = DEFAULT_DIVERSITY_LAMBDA,
    ) -> None:
        """Store the collaborators, building default ones for any left out.

        Args:
            encoder: Item/context/text encoder; a default one is built if omitted.
            retriever: Candidate retriever; defaults to one sharing ``encoder``.
            scorer: Outfit scorer; defaults to one sized to the encoder's
                ``embedding_dim``.
            top_k: Default number of recommendations returned.
            candidate_pool: Default per-slot cap on items considered.
            max_beam: Hard cap on top/bottom pairs handed to the scorer.
            diversity_lambda: Default redundancy weight used by ``_diversify``.
        """
        self.encoder = encoder or FashionItemEncoder()
        self.retriever = retriever or OutfitCandidateRetriever(
            self.encoder,
            slot_pool_size=candidate_pool,
        )
        self.scorer = scorer or NeuralOutfitScorer(d_model=self.encoder.embedding_dim)
        self.top_k = top_k
        self.candidate_pool = candidate_pool
        self.max_beam = max_beam
        self.diversity_lambda = diversity_lambda

    def recommend(
        self,
        wardrobe_items: list[dict[str, Any]],
        occasion: str = "casual",
        top_selected: dict[str, Any] | None = None,
        bottom_selected: dict[str, Any] | None = None,
        other_selected: dict[str, Any] | None = None,
        weather: dict[str, Any] | None = None,
        user_profile: dict[str, Any] | None = None,
        region: str = "global",
        top_k: int | None = None,
        candidate_pool: int | None = None,
        diversity_lambda: float | None = None,
    ) -> dict[str, Any]:
        """Recommend top/bottom outfits from a wardrobe.

        Args:
            wardrobe_items: Raw wardrobe item dicts.
            occasion: Target occasion label.
            top_selected: Optional locked top item.
            bottom_selected: Optional locked bottom item.
            other_selected: Optional locked extra item (forwarded to the retriever).
            weather: Optional dict with ``season``, ``temperature_c``, ``is_rainy``.
            user_profile: Optional profile dict (e.g. liked/disliked item ids).
            region: Region label.
            top_k: Per-call override of the number of results (falsy -> default).
            candidate_pool: Per-call override of the per-slot cap (falsy -> default).
            diversity_lambda: Per-call override of the redundancy weight.

        Returns:
            A payload dict with ``occasion``, ``case`` ("A" nothing locked,
            "B" top locked, "C" bottom locked, "D" both locked),
            ``selected_outfit_score``, ``recommendations``,
            ``improved_recommendations``, ``total_combinations_checked``,
            ``notice`` and ``engine_version``. In case "D" the ranked list is
            returned under ``improved_recommendations`` and ``recommendations``
            is empty.
        """
        context = self._build_context(occasion, weather, user_profile, region)
        top_k = top_k or self.top_k
        candidate_pool = candidate_pool or self.candidate_pool
        if diversity_lambda is None:
            diversity_lambda = self.diversity_lambda

        encoded_items = self.retriever.encode_wardrobe(wardrobe_items)
        slot_buckets = self.retriever.split_by_slot(encoded_items)
        if not encoded_items:
            return self._empty_payload(
                occasion,
                "A",
                "Your wardrobe is empty. Add garments to get outfit recommendations.",
            )
        if not slot_buckets["top"] or not slot_buckets["bottom"]:
            return self._empty_payload(
                occasion,
                "A",
                "You need at least one topwear and one bottomwear item to generate outfits.",
            )

        case_name = self._resolve_case_name(top_selected, bottom_selected)
        retrieved = self.retriever.retrieve(
            encoded_items=encoded_items,
            context=context,
            locked_top=top_selected,
            locked_bottom=bottom_selected,
            locked_other=other_selected,
            candidate_pool=candidate_pool,
        )
        context_vector = self.encoder.encode_context(context)
        user_vector = self._build_user_vector(context, encoded_items)

        # Honour the per-request pool size here too, not only during retrieval.
        candidates = self._assemble_candidates(retrieved, candidate_pool=candidate_pool)
        scored = self.scorer.score_candidates(candidates, context_vector, user_vector, context)
        diversified = self._diversify(scored, top_k=top_k, diversity_lambda=diversity_lambda)
        payloads = [
            self._candidate_to_payload(candidate, rank=position)
            for position, candidate in enumerate(diversified, start=1)
        ]

        selected_outfit_score = None
        improved_recommendations: list[dict[str, Any]] = []
        recommendations = payloads

        if case_name == "D" and top_selected and bottom_selected:
            # Score the user's own pick on its own so it can be shown next to
            # the alternatives.
            # NOTE(review): `other_selected` is not part of this score, whereas
            # score_outfit() does include its `other` item -- confirm intent.
            selected_candidates = self._assemble_candidates(
                {
                    "top": [self.encoder.encode_item(top_selected)],
                    "bottom": [self.encoder.encode_item(bottom_selected)],
                    "shoes": [],
                    "accessory": [],
                    "unknown": [],
                }
            )
            selected_scored = self.scorer.score_candidates(
                selected_candidates,
                context_vector,
                user_vector,
                context,
            )
            best_selected = self._diversify(selected_scored, top_k=1, diversity_lambda=0.0)
            if best_selected:
                selected_outfit_score = self._candidate_to_payload(best_selected[0], rank=1)
            improved_recommendations = payloads[:top_k]
            recommendations = []

        notice = None
        if selected_outfit_score and selected_outfit_score.get("score", 0) < 45:
            notice = "Selected outfit may not be the best fit for this occasion."

        return {
            "occasion": occasion,
            "case": case_name,
            "selected_outfit_score": selected_outfit_score,
            "recommendations": recommendations,
            "improved_recommendations": improved_recommendations,
            "total_combinations_checked": len(candidates),
            "notice": notice,
            "engine_version": self._engine_version(),
        }

    def _engine_version(self) -> str:
        """Describe the encoder backend and whether the scorer is trained."""
        ranker_kind = "trained-transformer" if self.scorer.is_trained else "zero-shot-ranker"
        return f"fashion-mm-v1::{self.encoder.backend_name}::{ranker_kind}"

    @staticmethod
    def _is_same_item(left: EncodedWardrobeItem, right: EncodedWardrobeItem) -> bool:
        """Return True only when both items carry the same non-missing ``id``.

        Items without an ``id`` are never considered identical; otherwise two
        id-less garments would compare ``None == None`` and could not be paired.
        """
        left_id = left.item.get("id")
        return left_id is not None and left_id == right.item.get("id")

    def _assemble_candidates(
        self,
        slot_candidates: dict[str, list[EncodedWardrobeItem]],
        candidate_pool: int | None = None,
    ) -> list[OutfitCandidate]:
        """Pair tops with bottoms, capped per slot and by ``max_beam`` overall.

        Args:
            slot_candidates: Mapping of slot name to encoded items; only the
                ``"top"`` and ``"bottom"`` entries are used.
            candidate_pool: Per-slot cap; falls back to ``self.candidate_pool``.

        Returns:
            At most ``self.max_beam`` candidates with shoes/accessory unset.
        """
        pool_limit = self.candidate_pool if candidate_pool is None else candidate_pool
        tops = (slot_candidates.get("top") or [])[:pool_limit]
        bottoms = (slot_candidates.get("bottom") or [])[:pool_limit]

        candidates: list[OutfitCandidate] = []
        # NOTE(review): pairs are enumerated top-major, so when
        # len(tops) * len(bottoms) > max_beam only the first few tops ever
        # reach the scorer. Confirm this bias is acceptable.
        for top in tops:
            for bottom in bottoms:
                if self._is_same_item(top, bottom):
                    continue
                candidates.append(
                    OutfitCandidate(
                        top=top,
                        bottom=bottom,
                        shoes=None,
                        accessory=None,
                    )
                )
                if len(candidates) >= self.max_beam:
                    return candidates
        return candidates

    def _diversify(
        self,
        scored: list[OutfitCandidate],
        top_k: int,
        diversity_lambda: float,
    ) -> list[OutfitCandidate]:
        """Greedily pick up to ``top_k`` candidates, trading score for novelty.

        Each round selects the candidate maximising
        ``(1 - lambda) * score / 100 - lambda * max_similarity_to_selected``.
        """
        selected: list[OutfitCandidate] = []
        remaining = list(scored)
        while remaining and len(selected) < top_k:
            best_index = 0
            best_score = -1e9
            for index, candidate in enumerate(remaining):
                relevance = candidate.score / 100.0
                redundancy = 0.0
                if selected:
                    redundancy = max(
                        self._candidate_similarity(candidate, previous) for previous in selected
                    )
                mmr_score = (1.0 - diversity_lambda) * relevance - diversity_lambda * redundancy
                if mmr_score > best_score:
                    best_index = index
                    best_score = mmr_score
            selected.append(remaining.pop(best_index))
        return selected

    def _build_user_vector(
        self,
        context: RecommendationContext,
        encoded_items: list[EncodedWardrobeItem],
    ) -> np.ndarray:
        """Build a unit preference vector from liked/disliked wardrobe items.

        Liked item vectors are added, disliked ones subtracted, then the mean
        is normalised. When nothing usable is found, a generic text prompt is
        encoded instead.
        """
        profile = context.user_profile or {}
        # `or []` tolerates profiles that carry the key with an explicit None.
        liked_ids = {str(item_id) for item_id in (profile.get("liked_item_ids") or []) if item_id}
        disliked_ids = {
            str(item_id) for item_id in (profile.get("disliked_item_ids") or []) if item_id
        }

        vectors = []
        for encoded_item in encoded_items:
            item_id = str(encoded_item.item.get("id") or "")
            if item_id in liked_ids:
                vectors.append(encoded_item.vector)
            if item_id in disliked_ids:
                vectors.append(-encoded_item.vector)

        if vectors:
            merged = np.mean(np.stack(vectors, axis=0), axis=0)
            norm = float(np.linalg.norm(merged))
            if norm > 1e-8:
                return merged / norm
        return self.encoder.encode_text(
            "User prefers versatile, well coordinated, context-appropriate outfits"
        )

    @staticmethod
    def _candidate_to_payload(candidate: OutfitCandidate, rank: int) -> dict[str, Any]:
        """Convert a scored candidate into the JSON-friendly response shape."""
        payload = {
            "rank": rank,
            "score": candidate.score,
            "breakdown": candidate.breakdown,
            "reason": candidate.reason,
            "tip": candidate.tip,
            "top": MultimodalOutfitRecommendationService._item_payload(candidate.top),
            "bottom": MultimodalOutfitRecommendationService._item_payload(candidate.bottom),
        }
        return payload

    @staticmethod
    def _item_payload(encoded_item: EncodedWardrobeItem) -> dict[str, Any]:
        """Expose the public subset of an item's fields."""
        return {
            "id": encoded_item.item.get("id"),
            "category": encoded_item.item.get("category"),
            "color": encoded_item.item.get("color"),
            "image_url": encoded_item.item.get("image_url", ""),
        }

    @staticmethod
    def _build_context(
        occasion: str,
        weather: dict[str, Any] | None,
        user_profile: dict[str, Any] | None,
        region: str,
    ) -> RecommendationContext:
        """Normalise loose request inputs into a ``RecommendationContext``."""
        weather = weather or {}
        return RecommendationContext(
            occasion=occasion or "casual",
            weather=WeatherContext(
                season=str(weather.get("season") or "all-season"),
                temperature_c=weather.get("temperature_c"),
                is_rainy=weather.get("is_rainy"),
            ),
            region=region or "global",
            user_profile=user_profile or {},
        )

    @staticmethod
    def _resolve_case_name(
        top_selected: dict[str, Any] | None,
        bottom_selected: dict[str, Any] | None,
    ) -> str:
        """Map the locked-item combination to its case letter (A/B/C/D)."""
        if top_selected and bottom_selected:
            return "D"
        if top_selected:
            return "B"
        if bottom_selected:
            return "C"
        return "A"

    @staticmethod
    def _empty_payload(occasion: str, case_name: str, notice: str) -> dict[str, Any]:
        """Build the response used when no outfit can be generated."""
        return {
            "occasion": occasion,
            "case": case_name,
            "selected_outfit_score": None,
            "recommendations": [],
            "improved_recommendations": [],
            "total_combinations_checked": 0,
            "notice": notice,
            "engine_version": "fashion-mm-v1::empty",
        }

    def score_outfit(
        self,
        top: dict[str, Any],
        bottom: dict[str, Any],
        other: dict[str, Any] | None = None,
        occasion: str = "casual",
        weather: dict[str, Any] | None = None,
        user_profile: dict[str, Any] | None = None,
        region: str = "global",
    ) -> dict[str, Any]:
        """Score one explicit outfit.

        ``other`` is placed in the shoes slot when its encoded slot is
        ``"shoes"`` and in the accessory slot otherwise.

        Returns:
            Dict with ``score``, ``breakdown``, ``reason``, ``tip`` and
            ``engine_version``.
        """
        context = self._build_context(occasion, weather, user_profile, region)
        other_encoded = self.encoder.encode_item(other) if other else None
        candidate = OutfitCandidate(
            top=self.encoder.encode_item(top),
            bottom=self.encoder.encode_item(bottom),
            shoes=other_encoded if other_encoded and other_encoded.slot == "shoes" else None,
            accessory=other_encoded if other_encoded and other_encoded.slot != "shoes" else None,
        )
        scored = self.scorer.score_candidates(
            candidates=[candidate],
            context_vector=self.encoder.encode_context(context),
            user_vector=self._build_user_vector(context, candidate.slot_items()),
            context=context,
        )
        best = scored[0]
        return {
            "score": best.score,
            "breakdown": best.breakdown,
            "reason": best.reason,
            "tip": best.tip,
            "engine_version": self._engine_version(),
        }

    @staticmethod
    def _candidate_similarity(left: OutfitCandidate, right: OutfitCandidate) -> float:
        """Cosine similarity between the mean item vectors of two outfits.

        Returns 0.0 when either mean vector is (near) zero.
        """
        left_vec = np.mean(np.stack([item.vector for item in left.slot_items()], axis=0), axis=0)
        right_vec = np.mean(np.stack([item.vector for item in right.slot_items()], axis=0), axis=0)
        left_norm = float(np.linalg.norm(left_vec))
        right_norm = float(np.linalg.norm(right_vec))
        if left_norm < 1e-8 or right_norm < 1e-8:
            return 0.0
        return float(np.dot(left_vec / left_norm, right_vec / right_norm))
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def get_recommendation_service() -> MultimodalOutfitRecommendationService:
    """Return the shared service instance, constructing it on first use."""
    global _SERVICE_SINGLETON
    service = _SERVICE_SINGLETON
    if service is None:
        service = MultimodalOutfitRecommendationService()
        _SERVICE_SINGLETON = service
    return service
|
fashion_ai/training.py
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import random
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any, Iterable
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
import torch
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
from torch.utils.data import DataLoader, Dataset
|
| 13 |
+
|
| 14 |
+
from .encoder import FashionItemEncoder, infer_slot_name
|
| 15 |
+
from .ranker import OutfitCompatibilityRanker
|
| 16 |
+
from .schemas import RecommendationContext, WeatherContext
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass(slots=True)
|
| 20 |
+
class TrainingSample:
|
| 21 |
+
outfit: dict[str, Any]
|
| 22 |
+
label: float
|
| 23 |
+
occasion: str = "casual"
|
| 24 |
+
weather: dict[str, Any] | None = None
|
| 25 |
+
user_profile: dict[str, Any] | None = None
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class OutfitRankingDataset(Dataset[TrainingSample]):
    """In-memory dataset of labelled outfits loaded from a JSONL file.

    Every non-blank line is one JSON object shaped like:
    {
        "outfit": {"top": {...}, "bottom": {...}, "shoes": {...}, "accessory": {...}},
        "label": 1,
        "occasion": "formal",
        "weather": {"season": "summer", "temperature_c": 30},
        "user_profile": {"style_profile": "minimal", "favorite_colors": ["navy"]}
    }
    """

    def __init__(self, path: str | Path) -> None:
        """Eagerly read every sample from ``path``."""
        self.samples = self._load_samples(path)

    def __len__(self) -> int:
        """Number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, index: int) -> TrainingSample:
        """Return the sample stored at ``index``."""
        return self.samples[index]

    @staticmethod
    def _load_samples(path: str | Path) -> list[TrainingSample]:
        """Parse the JSONL file at ``path`` into ``TrainingSample`` objects.

        Missing fields fall back to: empty outfit, label 1.0, occasion
        "casual". ``weather`` and ``user_profile`` are kept only when they
        are dicts; anything else becomes None.
        """

        def dict_or_none(value: Any) -> dict[str, Any] | None:
            return value if isinstance(value, dict) else None

        samples: list[TrainingSample] = []
        with open(path, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                text = raw_line.strip()
                if not text:
                    continue
                record = json.loads(text)
                samples.append(
                    TrainingSample(
                        outfit=record.get("outfit", {}),
                        label=float(record.get("label", 1.0)),
                        occasion=str(record.get("occasion") or "casual"),
                        weather=dict_or_none(record.get("weather")),
                        user_profile=dict_or_none(record.get("user_profile")),
                    )
                )
        return samples
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class NegativeSampler:
    """Hard negative sampler using nearest same-slot replacements."""

    def __init__(
        self,
        catalog_items: list[dict[str, Any]],
        encoder: FashionItemEncoder,
        hard_negative_top_k: int = 20,
    ) -> None:
        """Encode the catalog once and bucket the encoded items by slot.

        Args:
            catalog_items: Raw catalog item dicts.
            encoder: Encoder used for both the catalog and, later, anchors.
            hard_negative_top_k: How many nearest catalog items form the
                replacement pool for a sampled negative.
        """
        self.encoder = encoder
        self.hard_negative_top_k = hard_negative_top_k
        self.slot_catalog = {
            slot_name: [] for slot_name in ("top", "bottom", "shoes", "accessory", "unknown")
        }
        for catalog_item in catalog_items:
            encoded_entry = self.encoder.encode_item(catalog_item)
            self.slot_catalog[encoded_entry.slot].append(encoded_entry)

    def sample(self, outfit: dict[str, Any], occasion: str = "casual") -> dict[str, Any]:
        """Return a shallow copy of ``outfit`` with one slot swapped for a look-alike.

        A random dict-valued slot is chosen and replaced by a random pick among
        the ``hard_negative_top_k`` catalog items of the same inferred slot
        with the highest dot-product similarity to it (excluding the same id).
        On success a ``"negative_source"`` entry is added. When no swap is
        possible the unmodified copy is returned.
        """
        negative = dict(outfit)
        swappable = [
            name
            for name in ("top", "bottom", "shoes", "accessory")
            if isinstance(outfit.get(name), dict)
        ]
        if not swappable:
            return negative

        chosen_slot = random.choice(swappable)
        anchor = outfit[chosen_slot]
        pool = self.slot_catalog.get(infer_slot_name(anchor), [])
        if not pool:
            return negative

        anchor_vec = self.encoder.encode_item(anchor).vector

        def similarity(entry: Any) -> float:
            return float(np.dot(entry.vector, anchor_vec))

        nearest = sorted(pool, key=similarity, reverse=True)[: self.hard_negative_top_k]
        replacements = [
            entry.item
            for entry in nearest
            if str(entry.item.get("id")) != str(anchor.get("id"))
        ]
        if not replacements:
            return negative

        negative[chosen_slot] = random.choice(replacements)
        negative["negative_source"] = {
            "strategy": "hard_same_slot_replacement",
            "slot": chosen_slot,
            "occasion": occasion,
        }
        return negative
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class OutfitRankerCollator:
    """Turn a batch of ``TrainingSample`` objects into padded model tensors."""

    def __init__(self, encoder: FashionItemEncoder) -> None:
        """Keep the encoder used to embed context, profile text and items."""
        self.encoder = encoder

    def __call__(self, samples: Iterable[TrainingSample]) -> dict[str, torch.Tensor]:
        """Encode ``samples`` into ``outfit_tokens``, ``attention_mask`` and ``labels``.

        Each sample becomes six token vectors: context, user profile, then
        top / bottom / shoes / accessory. A slot whose value is not a dict is
        filled with a zero vector and masked out with 0.
        """
        slot_order = ("top", "bottom", "shoes", "accessory")
        batch_tokens = []
        batch_masks = []
        batch_labels = []

        for sample in samples:
            weather = sample.weather or {}
            context = RecommendationContext(
                occasion=sample.occasion,
                weather=WeatherContext(
                    season=str(weather.get("season") or "all-season"),
                    temperature_c=weather.get("temperature_c"),
                    is_rainy=weather.get("is_rainy"),
                ),
                user_profile=sample.user_profile or {},
            )
            profile_text = json.dumps(
                sample.user_profile or {"style_profile": "general"},
                sort_keys=True,
            )
            # The context and profile tokens are always present.
            tokens = [
                self.encoder.encode_context(context),
                self.encoder.encode_text(profile_text),
            ]
            mask = [1, 1]

            for slot_name in slot_order:
                slot_item = sample.outfit.get(slot_name)
                if isinstance(slot_item, dict):
                    tokens.append(self.encoder.encode_item(slot_item).vector)
                    mask.append(1)
                    continue
                tokens.append(np.zeros(self.encoder.embedding_dim, dtype=np.float32))
                mask.append(0)

            batch_tokens.append(np.stack(tokens, axis=0))
            batch_masks.append(mask)
            batch_labels.append(sample.label)

        return {
            "outfit_tokens": torch.tensor(np.stack(batch_tokens), dtype=torch.float32),
            "attention_mask": torch.tensor(np.asarray(batch_masks), dtype=torch.long),
            "labels": torch.tensor(np.asarray(batch_labels), dtype=torch.float32),
        }
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def bpr_pairwise_loss(pos_logits: torch.Tensor, neg_logits: torch.Tensor) -> torch.Tensor:
    """Mean of ``-log(sigmoid(pos - neg))`` over element-wise logit pairs."""
    margin = pos_logits - neg_logits
    log_likelihood = F.logsigmoid(margin)
    return -log_likelihood.mean()
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def train_ranker(
    train_jsonl: str | Path,
    catalog_jsonl: str | Path,
    output_checkpoint: str | Path,
    encoder_model_id: str = "patrickjohncyh/fashion-clip",
    epochs: int = 5,
    batch_size: int = 16,
    lr: float = 2e-4,
    device: str | None = None,
) -> None:
    """Train the outfit compatibility ranker and write a checkpoint.

    Every epoch each dataset sample is paired with a freshly sampled hard
    negative (label 0.0). The loss is binary cross-entropy plus 0.4 times a
    BPR pairwise term.

    Args:
        train_jsonl: JSONL file of labelled outfits (see OutfitRankingDataset).
        catalog_jsonl: JSONL file of catalog items used for negative sampling.
        output_checkpoint: Destination path; parent directories are created.
        encoder_model_id: Model id passed to ``FashionItemEncoder``.
        epochs: Number of passes over the augmented data.
        batch_size: Mini-batch size.
        lr: AdamW learning rate.
        device: Torch device string; defaults to CUDA when available, else CPU.
    """
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    encoder = FashionItemEncoder(model_id=encoder_model_id, device=device)
    dataset = OutfitRankingDataset(train_jsonl)
    negative_sampler = NegativeSampler(_load_catalog(catalog_jsonl), encoder)
    collator = OutfitRankerCollator(encoder)
    model = OutfitCompatibilityRanker(d_model=encoder.embedding_dim).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

    for _epoch in range(epochs):
        model.train()
        augmented_samples = []
        for sample in dataset.samples:
            augmented_samples.append(sample)
            negative_outfit = negative_sampler.sample(sample.outfit, sample.occasion)
            # The sampler returns the outfit unchanged (without the
            # "negative_source" marker) when it finds no replacement. Adding
            # that copy with label 0.0 would label the very same outfit both
            # positive and negative, so skip it.
            if "negative_source" not in negative_outfit:
                continue
            augmented_samples.append(
                TrainingSample(
                    outfit=negative_outfit,
                    label=0.0,
                    occasion=sample.occasion,
                    weather=sample.weather,
                    user_profile=sample.user_profile,
                )
            )

        loader = DataLoader(
            augmented_samples,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=collator,
        )

        for batch in loader:
            logits = model(
                batch["outfit_tokens"].to(device),
                batch["attention_mask"].to(device),
            ).squeeze(-1)
            labels = batch["labels"].to(device)

            bce_loss = F.binary_cross_entropy_with_logits(logits, labels)
            pos_logits = logits[labels > 0.5]
            neg_logits = logits[labels <= 0.5]
            if len(pos_logits) > 0 and len(neg_logits) > 0:
                # NOTE(review): positives and negatives are paired by position
                # within the shuffled batch, not by the outfit a negative was
                # derived from -- confirm this is the intended BPR pairing.
                limit = min(len(pos_logits), len(neg_logits))
                pairwise = bpr_pairwise_loss(pos_logits[:limit], neg_logits[:limit])
            else:
                pairwise = torch.zeros((), device=device)

            loss = bce_loss + 0.4 * pairwise
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

    Path(output_checkpoint).parent.mkdir(parents=True, exist_ok=True)
    torch.save(
        {
            "model_state_dict": model.state_dict(),
            "encoder_model_id": encoder_model_id,
            "embedding_dim": encoder.embedding_dim,
        },
        output_checkpoint,
    )
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def _load_catalog(path: str | Path) -> list[dict[str, Any]]:
|
| 252 |
+
records = []
|
| 253 |
+
with open(path, "r", encoding="utf-8") as file_obj:
|
| 254 |
+
for line in file_obj:
|
| 255 |
+
line = line.strip()
|
| 256 |
+
if line:
|
| 257 |
+
records.append(json.loads(line))
|
| 258 |
+
return records
|
packages.txt
ADDED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn[standard]
|
| 3 |
+
python-multipart
|
| 4 |
+
numpy
|
| 5 |
+
pillow
|
| 6 |
+
requests
|
| 7 |
+
beautifulsoup4
|
| 8 |
+
lxml
|
| 9 |
+
playwright
|
| 10 |
+
torch
|
| 11 |
+
torchvision
|
| 12 |
+
transformers
|
| 13 |
+
accelerate
|
| 14 |
+
gradio
|
| 15 |
+
open_clip_torch
|
| 16 |
+
apify-client
|
scoring.py
ADDED
|
@@ -0,0 +1,553 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
scoring.py — strategic outfit scoring model.
|
| 3 |
+
|
| 4 |
+
Replaces all scoring logic previously inline in app.py.
|
| 5 |
+
Import compute_score() and recommend_outfits() from here.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import copy
|
| 11 |
+
from typing import Any
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# ---------------------------------------------------------------------------
|
| 15 |
+
# Weights
|
| 16 |
+
# ---------------------------------------------------------------------------
|
| 17 |
+
|
| 18 |
+
# Relative weight of each scoring component; the five values sum to 1.0.
WEIGHTS: dict[str, float] = {
    "color": 0.30,
    "style": 0.25,
    "occasion": 0.20,
    "fit": 0.13,
    "pattern": 0.12,
}

# Default number of outfits to return.
TOP_K = 5
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# ---------------------------------------------------------------------------
|
| 30 |
+
# Normalisation helpers
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
|
| 33 |
+
_BASE_COLORS = [
|
| 34 |
+
"black", "white", "grey", "gray", "beige", "cream", "tan",
|
| 35 |
+
"navy", "blue", "olive", "green", "brown", "maroon", "burgundy",
|
| 36 |
+
"red", "pink", "purple", "orange", "yellow", "gold", "silver",
|
| 37 |
+
"khaki", "coral", "teal", "indigo", "lavender", "mustard",
|
| 38 |
+
]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _norm(value: Any) -> str:
|
| 42 |
+
return str(value or "").strip().lower()
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def extract_base_color(raw: Any) -> str:
    """Reduce a colour label to a base colour: 'Navy Blue' -> 'navy', 'Olive Green' -> 'olive'.

    The first entry of ``_BASE_COLORS`` contained in the normalised label is
    returned; when none matches, the normalised label itself is returned.
    """
    label = _norm(raw)
    # Plain substring test, so list order decides between overlapping names.
    return next((base for base in _BASE_COLORS if base in label), label)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def extract_style(item: dict[str, Any]) -> str:
    """Return the item's canonical style, read from 'style' or else 'occasion'.

    Known synonyms are folded into "formal", "party" or "sports"; any value
    outside the canonical set falls back to "casual".
    """
    aliases = {
        "work": "formal",
        "business": "formal",
        "office": "formal",
        "festive": "party",
        "ethnic": "party",
        "sport": "sports",
        "gym": "sports",
        "active": "sports",
    }
    canonical_styles = {"casual", "formal", "streetwear", "party", "sports"}

    raw = _norm(item.get("style") or item.get("occasion") or "")
    style = aliases.get(raw, raw)
    if style in canonical_styles:
        return style
    return "casual"  # safe default
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def extract_fit(item: dict[str, Any]) -> str:
    """Classify the item's 'fit' text as 'slim', 'oversized' or 'regular'.

    Keywords are matched as substrings; slim keywords take precedence over
    oversized ones, and everything else (including no value) is 'regular'.
    """
    text = _norm(item.get("fit") or "")
    if any(word in text for word in ("slim", "fitted")):
        return "slim"
    if any(word in text for word in ("over", "baggy", "loose")):
        return "oversized"
    return "regular"
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def extract_pattern(item: dict[str, Any]) -> str:
    """Return 'solid' for solid/plain/unspecified items, otherwise 'pattern'."""
    label = _norm(item.get("pattern") or "")
    if label in ("solid", "plain", ""):
        return "solid"
    return "pattern"
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def extract_season(item: dict[str, Any]) -> str:
    """Map the item's 'season' text to 'summer', 'winter', 'monsoon' or 'all'.

    Keywords are matched as substrings in that priority order. 'all' covers
    "All-Season" as well as unknown or missing values (no restriction).
    """
    text = _norm(item.get("season") or "")
    keyword_table = (
        (("summer",), "summer"),
        (("winter",), "winter"),
        (("monsoon", "rainy"), "monsoon"),
    )
    for keywords, season in keyword_table:
        if any(keyword in text for keyword in keywords):
            return season
    return "all"
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def extract_fabric(item: dict[str, Any]) -> str:
    """Return the item's 'fabric' value normalised to lower case ('' if absent)."""
    fabric = item.get("fabric") or ""
    return _norm(fabric)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# ---------------------------------------------------------------------------
|
| 100 |
+
# Color scoring
|
| 101 |
+
# ---------------------------------------------------------------------------
|
| 102 |
+
|
| 103 |
+
# Unordered colour pairs that are scored as complementary (90).
_COMPLEMENTARY: set[frozenset] = {
    frozenset(pair)
    for pair in (
        ("blue", "beige"),
        ("blue", "khaki"),
        ("black", "white"),
        ("navy", "khaki"),
        ("navy", "beige"),
        ("navy", "white"),
        ("green", "brown"),
        ("olive", "tan"),
        ("olive", "cream"),
        ("burgundy", "grey"),
        ("maroon", "white"),
        ("grey", "navy"),
        ("teal", "white"),
        ("coral", "navy"),
        ("black", "beige"),
        ("black", "khaki"),
        ("brown", "cream"),
        ("mustard", "navy"),
        ("mustard", "black"),
    )
}

# Colours treated as neutrals by the scorer.
_NEUTRALS: set[str] = {
    "black", "white", "grey", "gray", "beige",
    "cream", "tan", "navy", "khaki",
}

# Unordered colour pairs treated as analogous (60).
_ANALOGOUS: set[frozenset] = {
    frozenset(pair)
    for pair in (
        ("blue", "green"),
        ("blue", "teal"),
        ("red", "orange"),
        ("yellow", "orange"),
        ("red", "maroon"),
        ("purple", "pink"),
        ("green", "teal"),
        ("orange", "coral"),
    )
}


def _color_score(top: dict[str, Any], bottom: dict[str, Any]) -> int:
    """Score the colour pairing of two items.

    Rules are checked in order: missing colour (60), complementary pair (90),
    two neutrals (82, or 50 when identical), exactly one neutral (80),
    analogous pair (60), identical non-neutral colours (45), anything else
    (60).
    """
    first = extract_base_color(top.get("color") or "")
    second = extract_base_color(bottom.get("color") or "")
    if not (first and second):
        return 60

    combo = frozenset((first, second))
    if combo in _COMPLEMENTARY:
        return 90

    neutral_count = (first in _NEUTRALS) + (second in _NEUTRALS)
    if neutral_count == 2:
        return 82 if first != second else 50
    if neutral_count == 1:
        return 80

    if combo in _ANALOGOUS:
        return 60
    return 45 if first == second else 60
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# ---------------------------------------------------------------------------
|
| 163 |
+
# Style scoring
|
| 164 |
+
# ---------------------------------------------------------------------------
|
| 165 |
+
|
| 166 |
+
# Compatibility score for every (top style, bottom style) combination that has
# an explicit rating. Mixed-style ratings are symmetric, so each is listed once
# and mirrored. Combinations not listed here fall back to 68 in _style_score().
_STYLE_MATRIX: dict[tuple[str, str], int] = {
    **{
        (style, style): points
        for style, points in (
            ("casual", 85),
            ("formal", 90),
            ("streetwear", 88),
            ("party", 85),
            ("sports", 88),
        )
    },
    **{
        key: points
        for left, right, points in (
            ("casual", "streetwear", 80),
            ("casual", "party", 72),
            ("casual", "formal", 62),
            ("formal", "party", 70),
            ("formal", "streetwear", 48),
            ("sports", "casual", 72),
            ("sports", "formal", 28),
            ("sports", "party", 40),
        )
        for key in ((left, right), (right, left))
    },
}


def _style_score(top: dict[str, Any], bottom: dict[str, Any]) -> int:
    """Look up the style compatibility of two items (68 when the pair is unrated)."""
    key = (extract_style(top), extract_style(bottom))
    return _STYLE_MATRIX.get(key, 68)
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
# ---------------------------------------------------------------------------
|
| 198 |
+
# Occasion scoring
|
| 199 |
+
# ---------------------------------------------------------------------------
|
| 200 |
+
|
| 201 |
+
# Occasions each canonical style is considered suitable for.
_STYLE_TO_OCCASIONS: dict[str, set[str]] = {
    "casual": {"casual", "everyday", "weekend", "college", "brunch"},
    "formal": {"formal", "work", "interview", "business", "office", "wedding", "meeting"},
    "party": {"party", "festive", "ethnic", "diwali", "celebration", "date"},
    "sports": {"sports", "gym", "active", "outdoor", "trekking"},
    "streetwear": {"casual", "streetwear", "everyday", "college"},
}


def _occasion_score(occasion: str, top: dict[str, Any], bottom: dict[str, Any]) -> int:
    """Score how well two items suit *occasion*.

    Returns 70 when no occasion is given, 90 when both items' styles list the
    occasion, 70 when exactly one does, and 35 when neither does.
    """
    wanted = _norm(occasion)
    if not wanted:
        return 70
    suitable = sum(
        wanted in _STYLE_TO_OCCASIONS.get(extract_style(piece), set())
        for piece in (top, bottom)
    )
    return {2: 90, 1: 70}.get(suitable, 35)
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
# ---------------------------------------------------------------------------
|
| 226 |
+
# Fit scoring
|
| 227 |
+
# ---------------------------------------------------------------------------
|
| 228 |
+
|
| 229 |
+
# Silhouette score keyed by (top fit, bottom fit). The table is deliberately
# not symmetric: e.g. an oversized top over a slim bottom rates higher than
# the reverse.
_FIT_MATRIX: dict[tuple[str, str], int] = {
    # slim top
    ("slim", "slim"): 82,
    ("slim", "regular"): 82,
    ("slim", "oversized"): 78,
    # regular top
    ("regular", "slim"): 82,
    ("regular", "regular"): 80,
    ("regular", "oversized"): 75,
    # oversized top
    ("oversized", "slim"): 92,
    ("oversized", "regular"): 85,
    ("oversized", "oversized"): 55,
}


def _fit_score(top: dict[str, Any], bottom: dict[str, Any]) -> int:
    """Look up the fit compatibility of a top/bottom pair (70 if unlisted)."""
    key = (extract_fit(top), extract_fit(bottom))
    return _FIT_MATRIX.get(key, 70)
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
# ---------------------------------------------------------------------------
|
| 249 |
+
# Pattern scoring
|
| 250 |
+
# ---------------------------------------------------------------------------
|
| 251 |
+
|
| 252 |
+
def _pattern_score(top: dict[str, Any], bottom: dict[str, Any]) -> int:
    """Score the pattern mix: both patterned 55, exactly one 88, both solid 75."""
    patterned = [extract_pattern(piece) == "pattern" for piece in (top, bottom)]
    if all(patterned):
        return 55
    if any(patterned):
        return 88
    return 75
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
# ---------------------------------------------------------------------------
|
| 263 |
+
# Season / fabric penalty
|
| 264 |
+
# ---------------------------------------------------------------------------
|
| 265 |
+
|
| 266 |
+
_HEAVY_FABRICS = {"wool", "leather", "velvet", "tweed", "corduroy", "fleece"}
_LIGHT_FABRICS = {"linen", "cotton", "silk", "chiffon", "georgette"}
_SUMMER_PENALTY = 18   # heavy fabric in summer
_WINTER_PENALTY = 12   # very light fabric in winter


def _season_penalty(top: dict[str, Any], bottom: dict[str, Any]) -> int:
    """Return the points to subtract for season/fabric mismatches.

    Each of the two items is checked independently: a summer item whose
    fabric text contains a heavy fabric adds ``_SUMMER_PENALTY``; a winter
    item whose fabric text contains a light fabric adds ``_WINTER_PENALTY``.
    """
    total = 0
    for piece in (top, bottom):
        season = extract_season(piece)
        fabric = extract_fabric(piece)
        if season == "summer":
            if any(name in fabric for name in _HEAVY_FABRICS):
                total += _SUMMER_PENALTY
        elif season == "winter":
            if any(name in fabric for name in _LIGHT_FABRICS):
                total += _WINTER_PENALTY
    return total
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def _blend_breakdowns(primary: dict[str, int], extras: list[dict[str, int]]) -> dict[str, int]:
|
| 286 |
+
if not extras:
|
| 287 |
+
return dict(primary)
|
| 288 |
+
|
| 289 |
+
blended: dict[str, int] = {}
|
| 290 |
+
for key, value in primary.items():
|
| 291 |
+
extra_avg = sum(extra.get(key, value) for extra in extras) / len(extras)
|
| 292 |
+
blended[key] = round((value * 0.65) + (extra_avg * 0.35))
|
| 293 |
+
return blended
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def _other_item_label(other: dict[str, Any] | None) -> str:
    """Build a short "<colour> <category>" label for the optional third item.

    Falls back to "other item" when no item is given, to "neutral" when the
    item has no colour, and to "other item" when it has neither a 'category'
    nor a 'type'.
    """
    if not other:
        return "other item"
    raw_color = other.get("color") or ""
    color = extract_base_color(raw_color) or _norm(raw_color) or "neutral"
    raw_category = other.get("category") or other.get("type") or "other item"
    category = str(raw_category).strip() or "other item"
    return f"{color} {category}".strip()
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
# ---------------------------------------------------------------------------
|
| 305 |
+
# Human-readable explanation
|
| 306 |
+
# ---------------------------------------------------------------------------
|
| 307 |
+
|
| 308 |
+
def build_reason(
    breakdown: dict[str, int],
    top: dict[str, Any],
    bottom: dict[str, Any],
    occasion: str,
    season_pen: int,
    other: dict[str, Any] | None = None,
) -> str:
    """Turn a score breakdown into a human-readable explanation.

    Args:
        breakdown: Per-dimension scores; must contain 'color', 'style',
            'occasion' and 'fit'.
        top: The top item.
        bottom: The bottom item.
        occasion: Requested occasion; occasion commentary is skipped if empty.
        season_pen: Season/fabric penalty that was subtracted from the score.
        other: Optional third item included in the scoring.

    Returns:
        The applicable sentences joined by spaces, or "Decent pairing
        overall." when no rule produced a sentence.
    """
    lines: list[str] = []

    c = breakdown["color"]
    c1 = extract_base_color(top.get("color") or "")
    c2 = extract_base_color(bottom.get("color") or "")
    # Only comment on colour when both colours are known; otherwise the
    # sentence would be rendered with empty colour names.
    if c1 and c2:
        if c >= 88:
            lines.append(f"Great color contrast — {c1} and {c2} complement each other well.")
        elif c >= 78:
            neutral = next((name for name in (c1, c2) if name in _NEUTRALS), None)
            if neutral is not None:
                lines.append(f"Clean color pairing — one neutral ({neutral}) anchors the look.")
            else:
                # A blended score can land in this band without any neutral
                # involved, so do not claim one.
                lines.append(f"Clean color pairing — {c1} and {c2} work well together.")
        elif c <= 60:
            lines.append(f"Weak color pairing — {c1} and {c2} lack contrast or clash.")

    s = breakdown["style"]
    s1, s2 = extract_style(top), extract_style(bottom)
    if s >= 85:
        lines.append(f"Consistent style ({s1}).")
    elif s <= 55:
        lines.append(f"Style mismatch: {s1} top with {s2} bottom doesn't work for most occasions.")

    o = breakdown["occasion"]
    if occasion:
        if o >= 88:
            lines.append(f"Both pieces suit {occasion}.")
        elif o >= 68:
            lines.append(f"One piece suits {occasion}, the other is borderline.")
        else:
            lines.append(f"Neither piece is suited to {occasion}.")

    f = breakdown["fit"]
    f1, f2 = extract_fit(top), extract_fit(bottom)
    if f >= 90:
        lines.append(f"Excellent fit contrast — {f1} top with {f2} bottom is a strong silhouette.")
    elif f <= 58:
        lines.append(f"Both pieces are {f1} — too much volume in one direction.")

    if season_pen > 0:
        lines.append(f"Season/fabric mismatch reduced the score by {season_pen} pts.")

    if other:
        other_label = _other_item_label(other)
        if breakdown["style"] >= 72 and breakdown["color"] >= 72:
            lines.append(f"The {other_label} strengthens the finishing-layer/accessory coordination.")
        else:
            lines.append(f"The {other_label} was included in scoring, but it is not the strongest finishing piece here.")

    return " ".join(lines) if lines else "Decent pairing overall."
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
def build_tip(
    score: int,
    top: dict[str, Any],
    bottom: dict[str, Any],
    other: dict[str, Any] | None = None,
) -> str:
    """Return a one-sentence styling tip for an outfit with the given score.

    Scores below 70 get a generic "needs work" tip, scores of 85 and above a
    finishing-touch tip, and scores in between the first applicable
    improvement suggestion (style cohesion, neutral layer, third item, colour).
    """
    if score < 70:
        return "This combination needs work — consider changing at least one piece."

    if score >= 85:
        if other:
            return f"Strong outfit. Keep the {_other_item_label(other)} as the main finishing accent."
        return "Solid outfit. Add a belt or watch to sharpen the look."

    top_style = extract_style(top)
    bottom_style = extract_style(bottom)
    if top_style != bottom_style:
        return f"Swap the {bottom_style} bottom for something more {top_style} to improve cohesion."

    top_color = extract_base_color(top.get("color") or "")
    if top_color not in _NEUTRALS:
        return "Add a neutral layer (jacket or shoes) to tie the colours together."

    if other:
        return f"If possible, swap the {_other_item_label(other)} for a cleaner neutral accent."
    return "Try a different bottom colour for more visual interest."
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
# ---------------------------------------------------------------------------
|
| 388 |
+
# Main scoring entry point
|
| 389 |
+
# ---------------------------------------------------------------------------
|
| 390 |
+
|
| 391 |
+
def compute_score(
    top: dict[str, Any],
    bottom: dict[str, Any],
    occasion: str = "casual",
    other: dict[str, Any] | None = None,
) -> tuple[int, dict[str, int]]:
    """
    Score an outfit and return (final_score, breakdown_dict).

    breakdown keys: color, style, occasion, fit, pattern

    When *other* is given, the top/bottom breakdown and season penalty are
    blended 65/35 with the results of pairing each piece with *other*.

    Veto caps (applied to the possibly blended breakdown):
      - color <= 50  → final capped at 68  (monochrome / clash)
      - style <= 48  → final capped at 58  (hard style mismatch)
      - pattern == 55 (both patterned) AND color <= 80 → cap at 72
    """

    def _pair_breakdown(first: dict[str, Any], second: dict[str, Any]) -> dict[str, int]:
        # Sub-scores for one ordered pair of items.
        return {
            "color": _color_score(first, second),
            "style": _style_score(first, second),
            "occasion": _occasion_score(occasion, first, second),
            "fit": _fit_score(first, second),
            "pattern": _pattern_score(first, second),
        }

    breakdown = _pair_breakdown(top, bottom)
    penalty = _season_penalty(top, bottom)

    if other:
        breakdown = _blend_breakdowns(
            breakdown,
            [_pair_breakdown(top, other), _pair_breakdown(bottom, other)],
        )
        accessory_penalty = round(
            (_season_penalty(top, other) + _season_penalty(bottom, other)) / 2
        )
        penalty = round((penalty * 0.65) + (accessory_penalty * 0.35))

    weighted = sum(breakdown[name] * WEIGHTS[name] for name in WEIGHTS)
    final = max(0, min(100, round(weighted - penalty)))

    # Veto caps — a fatal flaw in one dimension overrides a good weighted average
    color_pts = breakdown["color"]
    if color_pts <= 50:
        final = min(final, 68)
    if breakdown["style"] <= 48:
        final = min(final, 58)
    if breakdown["pattern"] == 55 and color_pts <= 80:
        final = min(final, 72)

    return final, breakdown
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
def score_pair_full(
    top: dict[str, Any],
    bottom: dict[str, Any],
    occasion: str = "casual",
    other: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """
    Score an outfit and package the result as a dict with the keys:
        score, breakdown, reason, tip, engine_version

    The season penalty is recomputed here (with the same 65/35 blend that
    compute_score() uses when *other* is present) so it can be reported in
    the reason text.
    """
    score, breakdown = compute_score(top, bottom, occasion, other=other)

    season_pen = _season_penalty(top, bottom)
    if other:
        with_top = _season_penalty(top, other)
        with_bottom = _season_penalty(bottom, other)
        accessory_pen = round((with_top + with_bottom) / 2)
        season_pen = round((season_pen * 0.65) + (accessory_pen * 0.35))

    reason = build_reason(breakdown, top, bottom, occasion, season_pen, other=other)
    tip = build_tip(score, top, bottom, other=other)
    return {
        "score": score,
        "breakdown": breakdown,
        "reason": reason,
        "tip": tip,
        "engine_version": "scoring-v2",
    }
|
| 482 |
+
|
| 483 |
+
|
| 484 |
+
# ---------------------------------------------------------------------------
|
| 485 |
+
# Diversity penalty (non-mutating)
|
| 486 |
+
# ---------------------------------------------------------------------------
|
| 487 |
+
|
| 488 |
+
def _is_similar(a: dict[str, Any], b: dict[str, Any]) -> bool:
    """Return True when two outfits share the same base colours.

    Top, bottom and optional 'other' colours are compared in that order; a
    missing 'other' item counts as having no colour.
    """

    def _slot_color(outfit: dict[str, Any], slot: str) -> str:
        # 'other' is optional; 'top' and 'bottom' are required keys.
        piece = (outfit.get("other") or {}) if slot == "other" else outfit[slot]
        return extract_base_color(piece.get("color") or "")

    return all(
        _slot_color(a, slot) == _slot_color(b, slot)
        for slot in ("top", "bottom", "other")
    )
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
def _apply_diversity_penalty(pairs: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return copies of *pairs* with scores reduced for repeated colour combos.

    Entries are processed in order; each loses 10 points for every earlier
    entry with the same colour combination (never dropping below 0). The
    input dicts are shallow-copied and left unmodified.
    """
    adjusted_pairs: list[dict[str, Any]] = []
    for candidate in pairs:
        overlaps = sum(1 for kept in adjusted_pairs if _is_similar(candidate, kept))
        clone = copy.copy(candidate)
        clone["score"] = max(0, candidate["score"] - 10 * overlaps)
        adjusted_pairs.append(clone)
    return adjusted_pairs
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
# ---------------------------------------------------------------------------
|
| 510 |
+
# Recommender
|
| 511 |
+
# ---------------------------------------------------------------------------
|
| 512 |
+
|
| 513 |
+
def recommend_outfits(
    tops: list[dict[str, Any]],
    bottoms: list[dict[str, Any]],
    occasion: str = "casual",
    others: list[dict[str, Any]] | None = None,
    locked_top: dict[str, Any] | None = None,
    locked_bottom: dict[str, Any] | None = None,
    locked_other: dict[str, Any] | None = None,
) -> list[dict[str, Any]]:
    """
    Returns up to TOP_K scored outfits, sorted best-first.
    Each entry: {top, bottom, other, score, breakdown, reason, tip}

    A locked item replaces its whole candidate pool. Without a locked
    'other', every outfit is tried with no third item and with each of
    *others*. Scores include the diversity penalty for repeated colour
    combinations.
    """
    if locked_other:
        other_pool = [locked_other]
    else:
        other_pool = [None, *(others or [])]
    top_pool = [locked_top] if locked_top else tops
    bottom_pool = [locked_bottom] if locked_bottom else bottoms

    scored: list[dict[str, Any]] = []
    for top in top_pool:
        for bottom in bottom_pool:
            for other in other_pool:
                result = score_pair_full(top, bottom, occasion, other=other)
                entry = {"top": top, "bottom": bottom, "other": other}
                for field in ("score", "breakdown", "reason", "tip"):
                    entry[field] = result[field]
                scored.append(entry)

    def by_score(entry: dict[str, Any]) -> Any:
        return entry["score"]

    # Rank first so the penalty hits the lower-scored duplicates, then rank
    # again because the penalty may have changed the order.
    ranked = sorted(scored, key=by_score, reverse=True)
    ranked = _apply_diversity_penalty(ranked)
    ranked.sort(key=by_score, reverse=True)
    return ranked[:TOP_K]
|
scraped_json/product_urls_20260413T214331Z.json
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"product_urls": [
|
| 3 |
+
"https://www.nike.com/t/sportswear-club-mens-fleece-hoodie-cjm38J/HV1216-893",
|
| 4 |
+
"https://www.nike.com/t/247-impossiblysoft-mens-dri-fit-1-2-zip-top-s9Hx9b/HQ6953-010",
|
| 5 |
+
"https://www.nike.com/t/caitlin-clark-basketball-fleece-pullover-hoodie-tRRBDb6P/IQ5572-133",
|
| 6 |
+
"https://www.nike.com/t/club-mens-pullover-fleece-hoodie-00eeWNwD/FN3859-010",
|
| 7 |
+
"https://www.nike.com/t/club-mens-fleece-crew-RU1Il4yl/FN3886-010",
|
| 8 |
+
"https://www.nike.com/t/sportswear-club-fleece-pullover-hoodie-Gw4Nwq/BV2654-010",
|
| 9 |
+
"https://www.nike.com/t/sportswear-club-fleece-mens-full-zip-hoodie-nR8tst/BV2645-010",
|
| 10 |
+
"https://www.nike.com/t/club-fleece-mens-football-pullover-hoodie-wklPOsTs/M31233FB25-BLK",
|
| 11 |
+
"https://www.nike.com/t/therma-mens-therma-fit-hooded-fitness-pullover-X9fVm2/DQ4834-010",
|
| 12 |
+
"https://www.nike.com/t/solo-swoosh-mens-pullover-hoodie-bDPPFV/HV1082-010",
|
| 13 |
+
"https://www.nike.com/t/tech-mens-fleece-pullover-hoodie-2eYhFKJ7/IO9941-010",
|
| 14 |
+
"https://www.nike.com/t/club-fleece-mens-baseball-pullover-hoodie-gvB0EgFc/M31233BS25-ANT",
|
| 15 |
+
"https://www.nike.com/t/standard-issue-mens-therma-fit-brushed-basketball-pullover-hoodie-88aWq9IL/IM5918-010",
|
| 16 |
+
"https://www.nike.com/t/acg-tuff-fleece-pullover-hoodie-njCV8f/DZ3392-011",
|
| 17 |
+
"https://www.nike.com/t/tech-mens-fleece-windrunner-full-zip-jacket-HqUykgjE/HV0949-010",
|
| 18 |
+
"https://www.nike.com/t/mens-therma-fit-fleece-hoodie-nMuSUUk1/HV4062-010",
|
| 19 |
+
"https://www.nike.com/t/therma-mens-therma-fit-full-zip-fitness-top-vsR4Rm/DQ4830-010",
|
| 20 |
+
"https://www.nike.com/t/jordan-brooklyn-mens-oversized-pullover-hoodie-kkkKymFl/IM7943-010",
|
| 21 |
+
"https://www.nike.com/t/nocta-fleece-cs-hoodie-2-5DG03b/HM5762-010",
|
| 22 |
+
"https://www.nike.com/t/nikecourt-heritage-mens-dri-fit-french-terry-tennis-pullover-hoodie-Y9kjc6iO/HM6759-010",
|
| 23 |
+
"https://www.nike.com/t/jordan-brooklyn-fleece-mens-crew-neck-sweatshirt-X93KLz/FV7293-010",
|
| 24 |
+
"https://www.nike.com/t/jordan-brooklyn-mens-hike-mike-full-zip-hoodie-11ir96Q1/IF1887-010",
|
| 25 |
+
"https://www.nike.com/t/jordan-sport-crossover-mens-fleece-pullover-hoodie-VXzVnjEL/HQ8694-010",
|
| 26 |
+
"https://www.nike.com/t/primary-fleece-mens-dri-fit-uv-pullover-performance-hoodie-B10qz3/FZ0969-010"
|
| 27 |
+
],
|
| 28 |
+
"products": [
|
| 29 |
+
{
|
| 30 |
+
"item_link": "https://www.nike.com/t/sportswear-club-mens-fleece-hoodie-cjm38J/HV1216-893",
|
| 31 |
+
"name": "Nike Sportswear Club Men's Fleece Hoodie",
|
| 32 |
+
"price": "$54.97",
|
| 33 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/92557b1d-2396-4844-aac2-bdb1b09962ef/M+NK+CLUB+BB+HDY+RUN+REISSUE.png"
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"item_link": "https://www.nike.com/t/247-impossiblysoft-mens-dri-fit-1-2-zip-top-s9Hx9b/HQ6953-010",
|
| 37 |
+
"name": "Nike 24.7 ImpossiblySoft Men's Dri-FIT 1/2-Zip Top",
|
| 38 |
+
"price": "$125.00",
|
| 39 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/u_9ddf04c7-2a9a-4d76-add1-d15af8f0263d,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/692f9518-fc3b-4270-8f4a-d072e74ca17e/M+NK+DF+24.7+IS+HZ.png"
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"item_link": "https://www.nike.com/t/caitlin-clark-basketball-fleece-pullover-hoodie-tRRBDb6P/IQ5572-133",
|
| 43 |
+
"name": "Caitlin Clark Nike Basketball Fleece Pullover Hoodie",
|
| 44 |
+
"price": "$80.97",
|
| 45 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/a0b5441e-9126-49f4-9210-6f00b25cd673/CC+U+NK+FLC+PO+HOODIE+PREM.png"
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"item_link": "https://www.nike.com/t/club-mens-pullover-fleece-hoodie-00eeWNwD/FN3859-010",
|
| 49 |
+
"name": "Nike Club Men's Pullover Fleece Hoodie",
|
| 50 |
+
"price": "$70.00",
|
| 51 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/u_9ddf04c7-2a9a-4d76-add1-d15af8f0263d,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/058ea263-4c74-41f8-b9dc-83a9e03fa569/M+NK+CLUB+BB+PO+HOODIE.png"
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"item_link": "https://www.nike.com/t/club-mens-fleece-crew-RU1Il4yl/FN3886-010",
|
| 55 |
+
"name": "Nike Club Men's Fleece Crew",
|
| 56 |
+
"price": "$60.00",
|
| 57 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/u_9ddf04c7-2a9a-4d76-add1-d15af8f0263d,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/2bc7f3c9-8dbe-4016-8705-0ebfd5695b51/M+NK+CLUB+BB+CREW.png"
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"item_link": "https://www.nike.com/t/sportswear-club-fleece-pullover-hoodie-Gw4Nwq/BV2654-010",
|
| 61 |
+
"name": "Nike Sportswear Club Fleece Pullover Hoodie",
|
| 62 |
+
"price": "$65.00",
|
| 63 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/136c17a4-16c0-47ce-b922-679098771a7d/M+NSW+CLUB+HOODIE+PO+BB.png"
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"item_link": "https://www.nike.com/t/sportswear-club-fleece-mens-full-zip-hoodie-nR8tst/BV2645-010",
|
| 67 |
+
"name": "Nike Sportswear Club Fleece Men's Full-Zip Hoodie",
|
| 68 |
+
"price": "$70.00",
|
| 69 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/0133bce8-1d47-45f0-98af-bffd2738641b/M+NSW+CLUB+HOODIE+FZ+BB.png"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"item_link": "https://www.nike.com/t/club-fleece-mens-football-pullover-hoodie-wklPOsTs/M31233FB25-BLK",
|
| 73 |
+
"name": "Nike Club Fleece Men's Football Pullover Hoodie",
|
| 74 |
+
"price": "$48.97",
|
| 75 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/u_9ddf04c7-2a9a-4d76-add1-d15af8f0263d,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/7da73ac3-1373-4ce6-a60f-94931a5feb1b/FA25+SWH+SPT+FB+CLUB+FLC+POH.png"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"item_link": "https://www.nike.com/t/therma-mens-therma-fit-hooded-fitness-pullover-X9fVm2/DQ4834-010",
|
| 79 |
+
"name": "Nike Therma Men's Therma-FIT Hooded Fitness Pullover",
|
| 80 |
+
"price": "$47.97",
|
| 81 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/5d0316e2-55c8-45fe-a164-c1c59575679b/M+NK+TF+HD+PO.png"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"item_link": "https://www.nike.com/t/solo-swoosh-mens-pullover-hoodie-bDPPFV/HV1082-010",
|
| 85 |
+
"name": "Nike Solo Swoosh Men's Pullover Hoodie",
|
| 86 |
+
"price": "$90.00",
|
| 87 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/u_9ddf04c7-2a9a-4d76-add1-d15af8f0263d,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/b222e3d0-d44a-488e-9723-519d1526ae54/M+NL+SOLO+SWSH+BB+PO+HOODIE.png"
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"item_link": "https://www.nike.com/t/tech-mens-fleece-pullover-hoodie-2eYhFKJ7/IO9941-010",
|
| 91 |
+
"name": "Nike Tech Men's Fleece Pullover Hoodie",
|
| 92 |
+
"price": "$145.00",
|
| 93 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/14e6e662-674d-41f7-a28d-41ff10580723/M+NK+TCH+ERGO+PO+HOODIE.png"
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"item_link": "https://www.nike.com/t/club-fleece-mens-baseball-pullover-hoodie-gvB0EgFc/M31233BS25-ANT",
|
| 97 |
+
"name": "Nike Club Fleece Men's Baseball Pullover Hoodie",
|
| 98 |
+
"price": "$52.97",
|
| 99 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/u_9ddf04c7-2a9a-4d76-add1-d15af8f0263d,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/535f8f32-b453-4d4e-9b96-d316f628cca7/FA25+SWH+SPT+BSBL+CLUB+FLC+POH.png"
|
| 100 |
+
},
|
| 101 |
+
{
|
| 102 |
+
"item_link": "https://www.nike.com/t/standard-issue-mens-therma-fit-brushed-basketball-pullover-hoodie-88aWq9IL/IM5918-010",
|
| 103 |
+
"name": "Nike Standard Issue Men's Therma-FIT Brushed Basketball Pullover Hoodie",
|
| 104 |
+
"price": "$59.97",
|
| 105 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/u_9ddf04c7-2a9a-4d76-add1-d15af8f0263d,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/d26703fe-117e-438b-8174-4e7d69a71aa9/M+NK+TF+SI+BRSH+PO+HD.png"
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"item_link": "https://www.nike.com/t/acg-tuff-fleece-pullover-hoodie-njCV8f/DZ3392-011",
|
| 109 |
+
"name": "Nike ACG \"Tuff Fleece\" Pullover Hoodie",
|
| 110 |
+
"price": "$115.00",
|
| 111 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/0cc08282-0c99-4e7d-a714-c6541eab9204/U+ACG+TUFF+FLC+HOODIE+PO.png"
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"item_link": "https://www.nike.com/t/tech-mens-fleece-windrunner-full-zip-jacket-HqUykgjE/HV0949-010",
|
| 115 |
+
"name": "Nike Tech Men's Fleece Windrunner Full-Zip Jacket",
|
| 116 |
+
"price": "$140.00",
|
| 117 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/u_9ddf04c7-2a9a-4d76-add1-d15af8f0263d,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/58bf6658-12e4-4407-8427-b6f9f5c5e20f/M+NK+TCH+FLC+FZ+WR+HOODIE.png"
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
"item_link": "https://www.nike.com/t/mens-therma-fit-fleece-hoodie-nMuSUUk1/HV4062-010",
|
| 121 |
+
"name": "Nike Men's Therma-FIT Fleece Hoodie",
|
| 122 |
+
"price": "$54.97",
|
| 123 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/34bc662d-f272-4cd5-a738-bc718228947d/M+NK+TF+FLEECE+PO+HD+GFX.png"
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"item_link": "https://www.nike.com/t/therma-mens-therma-fit-full-zip-fitness-top-vsR4Rm/DQ4830-010",
|
| 127 |
+
"name": "Nike Therma Men's Therma-FIT Full-Zip Fitness Top",
|
| 128 |
+
"price": "$75.00",
|
| 129 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/b75e28b1-1acd-4756-bb8a-c3af777a080e/M+NK+TF+HD+FZ.png"
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"item_link": "https://www.nike.com/t/jordan-brooklyn-mens-oversized-pullover-hoodie-kkkKymFl/IM7943-010",
|
| 133 |
+
"name": "Jordan Brooklyn Men's Oversized Pullover Hoodie",
|
| 134 |
+
"price": "$75.00",
|
| 135 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/u_126ab356-44d8-4a06-89b4-fcdcc8df0245,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/dbd2c4e6-3aea-41a0-926e-5e75e7e80664/M+J+BRK+OVS+PO+HD+CLD.png"
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"item_link": "https://www.nike.com/t/nocta-fleece-cs-hoodie-2-5DG03b/HM5762-010",
|
| 139 |
+
"name": "NOCTA Fleece CS Hoodie 2",
|
| 140 |
+
"price": "$120.00",
|
| 141 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/f7e7b317-22c1-4c77-9231-73086563141f/M+NRG+CS+NOCTA+HOODIE+FLC+2.png"
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"item_link": "https://www.nike.com/t/nikecourt-heritage-mens-dri-fit-french-terry-tennis-pullover-hoodie-Y9kjc6iO/HM6759-010",
|
| 145 |
+
"name": "NikeCourt Heritage Men's Dri-FIT French Terry Tennis Pullover Hoodie",
|
| 146 |
+
"price": "$100.00",
|
| 147 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/u_9ddf04c7-2a9a-4d76-add1-d15af8f0263d,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/4bc4896e-fbec-4be9-b2ec-8f7c19bfc6e5/M+NKCT+DF+HRTGE+FLC+PO+HD.png"
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"item_link": "https://www.nike.com/t/jordan-brooklyn-fleece-mens-crew-neck-sweatshirt-X93KLz/FV7293-010",
|
| 151 |
+
"name": "Jordan Brooklyn Fleece Men's Crew-Neck Sweatshirt",
|
| 152 |
+
"price": "$60.00",
|
| 153 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/u_126ab356-44d8-4a06-89b4-fcdcc8df0245,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/79b185ef-79d4-463c-b12b-92a1b355c1be/M+J+BRK+FLC+CREW.png"
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"item_link": "https://www.nike.com/t/jordan-brooklyn-mens-hike-mike-full-zip-hoodie-11ir96Q1/IF1887-010",
|
| 157 |
+
"name": "Jordan Brooklyn Men's Hike Mike Full-Zip Hoodie",
|
| 158 |
+
"price": "$60.97",
|
| 159 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/u_126ab356-44d8-4a06-89b4-fcdcc8df0245,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/8c8e63b0-1530-4c4e-94d0-be7d27f982b0/M+J+BRK+HIKE+MIKE+ZIP+HD.png"
|
| 160 |
+
},
|
| 161 |
+
{
|
| 162 |
+
"item_link": "https://www.nike.com/t/jordan-sport-crossover-mens-fleece-pullover-hoodie-VXzVnjEL/HQ8694-010",
|
| 163 |
+
"name": "Jordan Sport Crossover Men's Fleece Pullover Hoodie",
|
| 164 |
+
"price": "$50.97",
|
| 165 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/u_126ab356-44d8-4a06-89b4-fcdcc8df0245,c_scale,fl_relative,w_1.0,h_1.0,fl_layer_apply/27619f5d-c9c4-4f07-96b2-fa7ca6274663/M+J+SPRT+CSVR+GFX+FLC+PO.png"
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"item_link": "https://www.nike.com/t/primary-fleece-mens-dri-fit-uv-pullover-performance-hoodie-B10qz3/FZ0969-010",
|
| 169 |
+
"name": "Nike Primary Fleece Men's Dri-FIT UV Pullover Performance Hoodie",
|
| 170 |
+
"price": "$55.97",
|
| 171 |
+
"image_url": "https://static.nike.com/a/images/t_web_pw_592_v2/f_auto/9bff5b2c-7b9c-4a22-9d63-eaca3808c540/M+NK+DF+UV+PRIMARY+PO+HOODIE.png"
|
| 172 |
+
}
|
| 173 |
+
]
|
| 174 |
+
}
|
scraper.py
ADDED
|
@@ -0,0 +1,512 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from io import StringIO
|
| 4 |
+
import csv
|
| 5 |
+
import json
|
| 6 |
+
from datetime import datetime, timezone
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
import requests
|
| 11 |
+
from bs4 import BeautifulSoup
|
| 12 |
+
from fastapi import FastAPI, HTTPException
|
| 13 |
+
from fastapi.responses import StreamingResponse
|
| 14 |
+
from pydantic import BaseModel, Field
|
| 15 |
+
from urllib.parse import urlencode
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
app = FastAPI(title="Nike Scraper API", version="1.0.0")

# Search page that receives the query string, and the bare host used to turn
# root-relative product links into absolute URLs.
NIKE_BASE_SEARCH = "https://www.nike.com/w"
NIKE_BASE_URL = "https://www.nike.com"

# Maps a lower-cased category word to the term placed in the search query.
CATEGORY_ALIASES = {
    # tops
    "t-shirt": "t-shirt", "tee": "t-shirt", "shirt": "shirt",
    "hoodie": "hoodie", "sweatshirt": "sweatshirt", "jacket": "jacket",
    "gilet": "gilet", "top": "top", "tank": "tank top",
    "polo": "polo", "jersey": "jersey", "bra": "sports bra",
    # bottoms
    "pant": "pants", "pants": "pants", "trousers": "trousers",
    "shorts": "shorts", "short": "shorts", "leggings": "leggings",
    "tights": "tights", "joggers": "joggers", "sweatpants": "sweatpants",
    "skirt": "skirt",
    # one-piece garments
    "dress": "dress", "tracksuit": "tracksuit", "jumpsuit": "jumpsuit",
    # accessories
    "socks": "socks", "sock": "socks", "hat": "hat",
    "cap": "cap", "bag": "bag", "backpack": "backpack",
}

# Browser-like User-Agent sent with every request made through _get_soup.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/123.0.0.0 Safari/537.36"
)
HEADERS = {"User-Agent": _USER_AGENT}

# Plural category words that get_subcategory looks for in product titles.
CATEGORIES = [
    "sweaters", "hoodies", "t-shirts", "jackets", "shirts", "crews",
    "jerseys", "tops", "polos", "tanks", "compression", "baselayer",
    "jeans", "shorts", "skirts", "tights", "parkas", "gilets",
    "pants", "leggings", "trousers", "joggers", "sweatpants", "dresses",
    "rompers", "jumpsuits", "onesies", "overalls", "tracksuits", "sneakers",
    "slippers", "sunglasses", "bras", "socks", "hats", "bags",
    "backpacks",
]

# JSON snapshots are written to a "scraped_json" folder next to this module.
SCRAPE_OUTPUT_DIR = Path(__file__).resolve().parent / "scraped_json"
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
class Recommendation(BaseModel):
    """Request model naming the garment to search for."""

    # Required, non-empty search terms.
    color: str = Field(
        ...,
        min_length=1,
    )
    category: str = Field(
        ...,
        min_length=1,
    )
    # Optional audience filter; defaults to None when omitted.
    gender: Optional[str] = Field(
        default=None,
        description="men or women",
    )
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class ScrapeRequest(BaseModel):
    """Request body for the scrape endpoints: what to search and how much."""

    recommendation: Recommendation
    # Upper bound on returned products; validated to lie in 1..300.
    max_products: int = Field(
        default=30,
        ge=1,
        le=300,
    )
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _ensure_full_url(href: str) -> str:
    """Return *href* as an absolute URL on the Nike host.

    A root-relative link (leading ``/``) gets ``NIKE_BASE_URL`` prepended;
    any other value is returned unchanged.
    """
    is_root_relative = href.startswith("/")
    return f"{NIKE_BASE_URL}{href}" if is_root_relative else href
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def build_nike_search_url(color: str, category: str, gender: Optional[str] = None) -> str:
    """Build a Nike search URL for a colour/category pair.

    The category is lower-cased and mapped through ``CATEGORY_ALIASES``
    (unknown categories are used as-is). When *gender* is given it is
    lower-cased, suffixed with ``"s"`` and placed first in the query. The
    same query text is sent as both the ``q`` and ``vst`` parameters.
    """
    lowered_category = category.lower()
    terms = [color.lower(), CATEGORY_ALIASES.get(lowered_category, lowered_category)]
    if gender:
        terms.insert(0, f"{gender.lower()}s")
    search_text = " ".join(terms)
    encoded = urlencode({"q": search_text, "vst": search_text})
    return f"{NIKE_BASE_SEARCH}?{encoded}"
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def build_nike_urls_from_recommendation(recommendation: Recommendation) -> list[str]:
    """Return the Nike search URLs for *recommendation*.

    With an explicit gender a single URL is produced. Without one, three
    URLs are returned in order: men, women, and an ungendered search.
    """
    if recommendation.gender:
        audiences = [recommendation.gender]
    else:
        audiences = ["men", "women", None]
    return [
        build_nike_search_url(recommendation.color, recommendation.category, audience)
        for audience in audiences
    ]
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def build_search_urls_from_recommendation(recommendation: Recommendation, store: str = "nike") -> list[str]:
    """Return search URLs for *recommendation*.

    *store* is accepted but not consulted; Nike URLs are always returned.
    """
    nike_urls = build_nike_urls_from_recommendation(recommendation)
    return nike_urls
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def build_search_urls_from_query(query: str, store: str = "nike", gender: Optional[str] = None) -> list[str]:
    """Build Nike search URLs from free-text *query*.

    Returns an empty list for a blank query. With *gender*, one URL is
    returned whose query is prefixed by the gender unless it already starts
    with it (case-insensitive). Without *gender*, three URLs are returned:
    "men ...", "women ..." and the bare query. *store* is not consulted.
    """
    search_text = str(query or "").strip()
    if not search_text:
        return []

    def _to_url(text: str) -> str:
        # The same text is sent as both the "q" and "vst" parameters.
        return f"{NIKE_BASE_SEARCH}?{urlencode({'q': text, 'vst': text})}"

    if not gender:
        return [
            _to_url(f"men {search_text}".strip()),
            _to_url(f"women {search_text}".strip()),
            _to_url(search_text),
        ]

    # Avoid doubling the prefix when the query already begins with the gender.
    if search_text.lower().startswith(f"{gender.strip().lower()} "):
        gendered = search_text
    else:
        gendered = f"{gender} {search_text}".strip()
    return [_to_url(gendered)]
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def _get_soup(url: str) -> BeautifulSoup:
    """Download *url* and return the page parsed with the ``lxml`` parser.

    The request uses the module ``HEADERS`` and a 20 second timeout; HTTP
    error statuses raise via ``raise_for_status``.
    """
    page = requests.get(url, headers=HEADERS, timeout=20)
    page.raise_for_status()
    markup = page.content
    return BeautifulSoup(markup, "lxml")
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def _ensure_store_url(href: str, base_url: str) -> str:
    """Turn *href* into an absolute URL relative to *base_url*.

    Falsy input yields ``""``; protocol-relative links (``//host/...``) get
    an ``https:`` scheme; root-relative links get *base_url* prepended;
    everything else is returned unchanged.
    """
    if not href:
        return ""
    if href[:2] == "//":
        return f"https:{href}"
    return f"{base_url}{href}" if href[:1] == "/" else href
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def extract_product_urls(search_url: str) -> list[str]:
    """Return the unique product URLs found on a Nike search page.

    Links are taken from anchors with class ``product-card__link-overlay``.
    If none are found, every anchor whose ``href`` contains ``/t/`` is used
    instead. Relative links are made absolute, and first-seen order is
    preserved.

    Raises:
        requests.RequestException: propagated from fetching *search_url*.
    """
    soup = _get_soup(search_url)

    # dict.fromkeys de-duplicates in O(n) while keeping insertion order.
    overlay_hrefs = (
        anchor.get("href")
        for anchor in soup.find_all("a", {"class": "product-card__link-overlay"})
    )
    product_links = list(
        dict.fromkeys(_ensure_full_url(href) for href in overlay_hrefs if href)
    )
    if product_links:
        return product_links

    # Fallback: any link that looks like a product page path.
    fallback_hrefs = (anchor.get("href") for anchor in soup.find_all("a", href=True))
    return list(
        dict.fromkeys(
            _ensure_full_url(href) for href in fallback_hrefs if href and "/t/" in href
        )
    )
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def _extract_image_from_container(container: BeautifulSoup) -> str:
    """Return the URL of the first ``<img>`` inside *container*, or ``""``.

    ``src`` is preferred, then ``data-src``. As a last resort the first
    candidate URL of ``srcset`` is used; the raw ``srcset`` value is a list
    of "URL descriptor" candidates and is not itself a usable URL.
    """
    img = container.find("img")
    if not img:
        return ""

    direct = str(img.get("src") or img.get("data-src") or "").strip()
    if direct:
        return direct

    srcset = str(img.get("srcset") or "")
    # A candidate URL runs up to the first whitespace, so commas inside the
    # URL (as in Nike's image URLs) are kept; only separator commas around
    # the URL are stripped.
    tokens = srcset.lstrip(", \t\r\n\f").split()
    return tokens[0].rstrip(",") if tokens else ""
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def extract_product_summaries(search_url: str, store: str = "nike") -> list[dict[str, str]]:
    """Return one summary dict per unique product on a Nike search page.

    Each dict has the keys ``item_link``, ``name``, ``price`` and
    ``image_url``. Products are read from ``product-card__body`` containers.
    If that yields nothing, the links from ``extract_product_urls`` are
    returned with placeholder name/price and an empty image URL. *store* is
    not consulted.
    """
    soup = _get_soup(search_url)
    visited: set[str] = set()
    results: list[dict[str, str]] = []

    for card in soup.find_all("div", {"class": "product-card__body"}):
        overlay = card.find("a", {"class": "product-card__link-overlay"})
        href = overlay.get("href") if overlay else None
        if not href:
            continue

        link = _ensure_full_url(href)
        if link in visited:
            continue
        visited.add(link)

        name = get_title(card)
        price = get_prices(card)[0]
        # The image is looked up from the card's parent element when it has one.
        image_scope = card.parent if card.parent else card
        results.append(
            {
                "item_link": link,
                "name": name,
                "price": price,
                "image_url": _extract_image_from_container(image_scope),
            }
        )

    if results:
        return results

    # Fallback path when Nike card markup changes.
    for link in extract_product_urls(search_url):
        if link not in visited:
            visited.add(link)
            results.append(
                {"item_link": link, "name": "N/A", "price": "N/A", "image_url": ""}
            )
    return results
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def get_title(container: BeautifulSoup) -> str:
    """Return "<title> <subtitle>" for a product card, or ``"N/A"``.

    Both the title and subtitle ``div`` must be present; if either lookup
    fails the placeholder is returned.
    """
    try:
        pieces = [
            container.find_all("div", {"class": css_class})[0].text
            for css_class in ("product-card__title", "product-card__subtitle")
        ]
    except (IndexError, AttributeError):
        return "N/A"
    return " ".join(pieces).strip()
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def get_target_gender(title: str) -> str:
    """Classify a product title as "Men", "Women" or "Unisex".

    The check is case-sensitive and looks for the possessive markers
    ``Men's`` then ``Women's``; the first match wins.
    """
    for marker, label in (("Men's", "Men"), ("Women's", "Women")):
        if marker in title:
            return label
    return "Unisex"
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
def get_subcategory(title: str) -> str:
    """Return the first word of *title* that names a known category.

    Each space-separated word is lower-cased and stripped of surrounding
    commas and periods, then compared against ``CATEGORIES``. Because that
    list holds plural forms, the word is tried as-is and with the regular
    plural suffixes ``s`` and ``es`` (so "Dress" matches "dresses"). The
    word is returned exactly as it appears in the title; ``""`` is returned
    when nothing matches.
    """
    for word in title.split(" "):
        candidate = word.lower().strip(",.")
        forms = (candidate, candidate + "s", candidate + "es")
        if any(form in CATEGORIES for form in forms):
            return word
    return ""
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
def get_prices(container: BeautifulSoup) -> tuple[str, str]:
    """Return ``(current_price, old_price)`` read from a product card.

    The text of the first ``product-price__wrapper`` div is used. When it
    holds exactly two ``$`` amounts, the first is the current price and the
    second the old price; otherwise the whole text is the current price and
    the old price is ``"N/A"``. Amounts without a decimal point get ``.00``
    appended. ``("N/A", "N/A")`` is returned when the wrapper is missing.
    """

    def _with_cents(amount: str) -> str:
        return amount if "." in amount else amount + ".00"

    try:
        raw = container.find_all("div", {"class": "product-price__wrapper"})[0].text
        if raw.count("$") != 2:
            return _with_cents(raw), "N/A"
        # Two "$" signs split the text into: prefix, current, previous.
        _, current, previous = raw.split("$")
        return "$" + _with_cents(current), "$" + _with_cents(previous)
    except (IndexError, AttributeError):
        return "N/A", "N/A"
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
def get_item_image_link(item_soup: BeautifulSoup) -> str:
    """Return the ``src`` of the main product image on a product page.

    The image is located by its exact class string; a placeholder message
    is returned when it cannot be found.
    """
    placeholder = "Click on item link for pictures."
    try:
        hero = item_soup.find("img", {"class": "css-viwop1 u-full-width u-full-height css-m5dkrx"})
        if not hero:
            return placeholder
        return hero.get("src")
    except (IndexError, AttributeError):
        return placeholder
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def get_colors(item_soup: BeautifulSoup) -> str:
    """Return the colourways listed on a product page, joined by `` || ``.

    The selected colourway comes first, followed by the ``alt`` text of each
    other colourway overlay (the "Nike By You" entry is skipped). When no
    selected overlay exists, the text after ``": "`` in the colour
    description list item is used. A placeholder message is returned when
    any lookup fails.
    """
    selected_class = (
        "colorway-product-overlay colorway-product-overlay--active "
        "colorway-product-overlay--selected css-sa2cc9"
    )
    try:
        selected = item_soup.find_all("div", {"class": selected_class})
        if not selected:
            listing = item_soup.find_all("li", {"class": "description-preview__color-description ncss-li"})
            # Operates on the list's string form, hence the trailing "</li>]".
            return str(listing).split(": ")[1].replace("</li>]", "")

        names = [selected[0].find_all("img", alt=True)[0].get("alt")]
        for swatch in item_soup.find_all("div", {"class": "colorway-product-overlay css-sa2cc9"}):
            alt = swatch.find_all("img", alt=True)[0].get("alt")
            if alt != "Design your own Nike By You product":
                names.append(alt)
        return " || ".join(names)
    except (IndexError, AttributeError):
        return "Click on item link for available colors."
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
def scrape_products(search_urls: list[str], max_products: int) -> list[dict[str, str]]:
    """Scrape up to *max_products* unique products from the given search pages.

    Each search page is fetched and its ``product-card__body`` containers
    are walked. For every new product link the card supplies title, gender,
    prices and subcategory, and the product page is fetched for the image
    and colours. If the product page request fails, placeholder texts are
    kept for those two fields.

    Raises:
        requests.RequestException: if fetching a search page fails.
    """
    collected: list[dict[str, str]] = []
    visited: set[str] = set()

    for search_url in search_urls:
        listing = _get_soup(search_url)

        for card in listing.find_all("div", {"class": "product-card__body"}):
            if len(collected) >= max_products:
                return collected

            overlay = card.find("a", {"class": "product-card__link-overlay"})
            href = overlay.get("href") if overlay else None
            if not href:
                continue

            link = _ensure_full_url(href)
            if link in visited:
                continue
            visited.add(link)

            title = get_title(card)
            gender = get_target_gender(title)
            current_price, old_price = get_prices(card)
            record = {
                "name": title,
                "gender": gender,
                "price": current_price,
                # The "sale_price" column carries the second (old) price.
                "sale_price": old_price,
                "colors": "Click on item link for available colors.",
                "item_link": link,
                "image_link": "Click on item link for pictures.",
                "subcategory": get_subcategory(title),
                "brand": "Nike",
            }

            # Best effort: a failed product-page fetch keeps the placeholders.
            try:
                detail = _get_soup(link)
                record["image_link"] = get_item_image_link(detail)
                record["colors"] = get_colors(detail)
            except requests.RequestException:
                pass

            collected.append(record)

    return collected
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
def _build_csv(products: list[dict[str, str]]) -> str:
    """Serialise *products* to CSV text with a header row.

    Columns are written in a fixed order; every product dict must contain
    only these keys.
    """
    columns = [
        "name", "gender", "price", "sale_price", "colors",
        "item_link", "image_link", "subcategory", "brand",
    ]
    buffer = StringIO()
    sheet = csv.DictWriter(buffer, fieldnames=columns)
    sheet.writeheader()
    for row in products:
        sheet.writerow(row)
    return buffer.getvalue()
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
def _save_json_payload(prefix: str, payload: dict[str, object]) -> str:
    """Write *payload* as indented JSON under ``SCRAPE_OUTPUT_DIR``.

    The output directory is created if needed. The file is named
    ``<prefix>_<UTC timestamp>.json`` and its path is returned as a string.
    """
    SCRAPE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    target = SCRAPE_OUTPUT_DIR / f"{prefix}_{stamp}.json"
    with target.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=True, indent=2)
    return str(target)
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe: always reports an "ok" status."""
    status_report = {"status": "ok"}
    return status_report
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
@app.get("/")
def root() -> dict[str, str]:
    """Landing endpoint pointing callers at the docs and health routes."""
    return dict(
        message="Nike Scraper API is running.",
        docs="/docs",
        health="/health",
    )
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
@app.post("/search-urls")
def search_urls(payload: Recommendation) -> dict[str, list[str]]:
    """Return the Nike search URLs built for the posted recommendation."""
    built_urls = build_nike_urls_from_recommendation(payload)
    return {"search_urls": built_urls}
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
@app.post("/product-urls")
def product_urls(payload: Recommendation) -> dict[str, object]:
    """Collect unique product summaries for the posted recommendation.

    Summaries from all search URLs are merged, keeping the first occurrence
    of each ``item_link``. The result is saved to disk before the saved
    path is added to the response, so the file itself does not contain it.
    Network failures are reported as HTTP 502.
    """
    try:
        by_link: dict[str, dict[str, str]] = {}
        for search_url in build_nike_urls_from_recommendation(payload):
            for summary in extract_product_summaries(search_url):
                link = summary.get("item_link", "")
                if link and link not in by_link:
                    by_link[link] = summary

        products = list(by_link.values())
        body: dict[str, object] = {
            "product_urls": [entry["item_link"] for entry in products],
            "products": products,
        }
        body["saved_json_path"] = _save_json_payload("product_urls", body)
        return body
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
@app.post("/scrape")
def scrape(payload: ScrapeRequest) -> dict[str, object]:
    """Scrape products for the posted request and return them as JSON.

    The response lists the search URLs used, the number of products and the
    product records. Network failures are reported as HTTP 502.
    """
    try:
        urls = build_nike_urls_from_recommendation(payload.recommendation)
        found = scrape_products(urls, max_products=payload.max_products)
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc

    return {
        "search_urls": urls,
        "count": len(found),
        "products": found,
    }
|
| 493 |
+
|
| 494 |
+
|
| 495 |
+
@app.post("/scrape.csv")
def scrape_csv(payload: ScrapeRequest) -> StreamingResponse:
    """Scrape products for the posted request and return them as a CSV file.

    The download is named ``nike_<gender>_<color>_<category>.csv``, with
    ``unisex`` standing in for a missing gender. The name parts come from
    the request body, so each is reduced to ASCII letters, digits, ``-`` and
    ``_`` before being placed in the ``Content-Disposition`` header, and the
    filename is quoted. Network failures are reported as HTTP 502.
    """
    recommendation = payload.recommendation
    try:
        search_urls = build_nike_urls_from_recommendation(recommendation)
        products = scrape_products(search_urls, max_products=payload.max_products)
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Failed to fetch Nike pages: {exc}") from exc

    def _filename_part(text: str) -> str:
        # Spaces, quotes, control or non-ASCII characters would corrupt the
        # header value, so anything outside the safe set becomes "_".
        cleaned = "".join(
            ch if (ch.isascii() and ch.isalnum()) or ch in "-_" else "_"
            for ch in text.strip()
        )
        return cleaned or "unknown"

    filename = "nike_{}_{}_{}.csv".format(
        _filename_part(recommendation.gender or "unisex"),
        _filename_part(recommendation.color),
        _filename_part(recommendation.category),
    )
    csv_content = _build_csv(products)
    return StreamingResponse(
        iter([csv_content]),
        media_type="text/csv",
        headers={"Content-Disposition": f'attachment; filename="{filename}"'},
    )
|
zalando_scraper.py
ADDED
|
@@ -0,0 +1,1073 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
import re
|
| 7 |
+
from typing import Any, Callable, Optional
|
| 8 |
+
from urllib.parse import urlencode, urlparse
|
| 9 |
+
|
| 10 |
+
import requests
|
| 11 |
+
from bs4 import BeautifulSoup
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _env_int(name: str, default: int) -> int:
    """Read environment variable *name* as an int.

    Returns *default* when the variable is unset, blank, or not a valid
    integer. Surrounding whitespace is ignored.
    """
    text = (os.getenv(name) or "").strip()
    if not text:
        return default
    try:
        return int(text)
    except (TypeError, ValueError):
        return default
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Host used to absolutize relative Zalando links.
ZALANDO_BASE_URL = "https://www.zalando.co.uk"

# Apify settings; the endpoint and token can be overridden via environment.
_DEFAULT_APIFY_ENDPOINT = (
    "https://api.apify.com/v2/acts/vistics~zalando-scraper/run-sync-get-dataset-items"
)
APIFY_ACTOR_ENDPOINT = os.getenv("APIFY_ACTOR_ENDPOINT", _DEFAULT_APIFY_ENDPOINT)
APIFY_TOKEN = os.getenv("APIFY_API_TOKEN", "").strip()
APIFY_MAX_RESULTS = 20

# Timeouts in seconds. Each is read from the environment and clamped to a
# floor so that a too-small override cannot go below it.
APIFY_MIN_TIMEOUT_SECONDS = max(60, _env_int("APIFY_MIN_TIMEOUT_SECONDS", 180))
APIFY_WAIT_FOR_FINISH_SECONDS = max(60, _env_int("APIFY_WAIT_FOR_FINISH_SECONDS", 300))
HTML_FALLBACK_TIMEOUT_SECONDS = max(20, _env_int("ZALANDO_HTML_TIMEOUT_SECONDS", 45))

# Browser-like User-Agent header.
_REQUEST_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)
REQUEST_HEADERS = {"User-Agent": _REQUEST_USER_AGENT}
|
| 42 |
+
|
| 43 |
+
# Resolve LOG_LEVEL once. An unrecognised name falls back to INFO instead of
# making logging.basicConfig raise at import time; the isinstance check
# rejects names that exist on the logging module but are not levels.
_requested_level = getattr(logging, os.getenv("LOG_LEVEL", "INFO").strip().upper(), None)
_LOG_LEVEL = _requested_level if isinstance(_requested_level, int) else logging.INFO

# Only install a root handler when the host application has not done so.
if not logging.getLogger().handlers:
    logging.basicConfig(
        level=_LOG_LEVEL,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

logger = logging.getLogger(__name__)
logger.setLevel(_LOG_LEVEL)
|
| 51 |
+
|
| 52 |
+
# Maps a category name to a per-audience path segment; each entry has
# "women", "men" and "unisex" keys.
CATEGORY_PATH_MAP = {
    # general clothing
    "topwear": {"women": "womens-clothing", "men": "mens-clothing", "unisex": "clothing"},
    "bottomwear": {"women": "womens-clothing", "men": "mens-clothing", "unisex": "clothing"},
    "layers": {"women": "womens-clothing", "men": "mens-clothing", "unisex": "clothing"},
    # dresses: only the women's entry has a dedicated segment
    "dress": {"women": "womens-clothing-dresses", "men": "mens-clothing", "unisex": "clothing"},
    "dresses": {"women": "womens-clothing-dresses", "men": "mens-clothing", "unisex": "clothing"},
    # shoes
    "shoes": {"women": "womens-shoes", "men": "mens-shoes", "unisex": "shoes"},
    "footwear": {"women": "womens-shoes", "men": "mens-shoes", "unisex": "shoes"},
    # sport
    "sportswear": {"women": "womens-sports", "men": "mens-sports", "unisex": "sports"},
}

# Colour words recognised by this module.
_COLOR_TERMS = [
    "black", "white", "navy", "blue", "grey",
    "gray", "beige", "olive", "green", "brown",
    "khaki", "cream", "maroon", "charcoal", "tan",
    "red", "pink", "purple", "yellow", "orange",
]

# Type aliases for callables and payloads used in signatures.
ScrapePostprocessFn = Callable[
    [list[dict[str, str]]],
    list[dict[str, str]],
]
WardrobeSummary = dict[str, Any]
TextCompletionFn = Callable[[str, int], str]
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def _norm(value: Any) -> str:
|
| 93 |
+
return str(value or "").strip().lower()
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _normalize_target_category(value: Any) -> str:
    """Map a free-form category label to "topwear", "bottomwear" or "both".

    Any label that is not a recognised top/bottom alias yields "both".
    """
    aliases = {
        "topwear": ("topwear", "top", "upper", "tops"),
        "bottomwear": ("bottomwear", "bottom", "lower", "bottoms"),
    }
    label = _norm(value)
    for canonical, names in aliases.items():
        if label in names:
            return canonical
    return "both"
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _extract_price_text(value: Any) -> str:
|
| 106 |
+
text = str(value or "").strip()
|
| 107 |
+
if not text:
|
| 108 |
+
return "N/A"
|
| 109 |
+
match = re.search(r"([\u00a3$€]\s?\d+[\d,]*(?:\.\d{2})?)", text)
|
| 110 |
+
if match:
|
| 111 |
+
return match.group(1).replace(" ", "")
|
| 112 |
+
return text
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _extract_src_from_srcset(srcset: str) -> str:
|
| 116 |
+
if not srcset:
|
| 117 |
+
return ""
|
| 118 |
+
first = srcset.split(",")[0].strip()
|
| 119 |
+
return first.split(" ")[0].strip()
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def _ensure_zalando_url(value: str) -> str:
|
| 123 |
+
href = str(value or "").strip()
|
| 124 |
+
if not href:
|
| 125 |
+
return ""
|
| 126 |
+
if href.startswith("//"):
|
| 127 |
+
return f"https:{href}"
|
| 128 |
+
if href.startswith("/"):
|
| 129 |
+
return f"{ZALANDO_BASE_URL}{href}"
|
| 130 |
+
return href
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def _format_apify_money(raw_value: Any, currency_symbol: str) -> str:
|
| 134 |
+
text = str(raw_value or "").strip()
|
| 135 |
+
if not text:
|
| 136 |
+
return ""
|
| 137 |
+
|
| 138 |
+
normalized = text.replace(",", "")
|
| 139 |
+
# Apify commonly returns minor units like 5999 => 59.99
|
| 140 |
+
if re.fullmatch(r"\d+", normalized):
|
| 141 |
+
major = int(normalized) // 100
|
| 142 |
+
minor = int(normalized) % 100
|
| 143 |
+
return f"{currency_symbol}{major}.{minor:02d}" if currency_symbol else f"{major}.{minor:02d}"
|
| 144 |
+
|
| 145 |
+
match = re.search(r"\d+(?:\.\d{1,2})?", normalized)
|
| 146 |
+
if not match:
|
| 147 |
+
return ""
|
| 148 |
+
return f"{currency_symbol}{match.group(0)}" if currency_symbol else match.group(0)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def summarize_wardrobe_metadata(wardrobe_items: list[dict[str, Any]]) -> WardrobeSummary:
    """Count attribute values across wardrobe items.

    Non-dict entries are ignored. Each attribute is read from the item itself,
    falling back to the item's "description" dict when that is a dict; for
    "occasion" the description's "style" is a further fallback. Values are
    trimmed and lower-cased before counting.

    Returns a dict with "total_items" plus, for colors, types, categories,
    fabrics, fits and occasions, up to eight {"value", "count"} entries
    ordered by descending count (ties keep first-seen order).
    """
    entries = [entry for entry in wardrobe_items if isinstance(entry, dict)]

    # (summary key, attribute name) pairs, in the order they appear in the result.
    tracked = (
        ("colors", "color"),
        ("types", "type"),
        ("categories", "category"),
        ("fabrics", "fabric"),
        ("fits", "fit"),
        ("occasions", "occasion"),
    )
    tallies: dict[str, dict[str, int]] = {summary_key: {} for summary_key, _ in tracked}

    for entry in entries:
        details = entry.get("description")
        if not isinstance(details, dict):
            details = {}
        for summary_key, attribute in tracked:
            raw = entry.get(attribute) or details.get(attribute)
            if attribute == "occasion":
                raw = raw or details.get("style")
            label = str(raw or "").strip().lower()
            if label:
                tally = tallies[summary_key]
                tally[label] = tally.get(label, 0) + 1

    def ranked(tally: dict[str, int], limit: int = 8) -> list[dict[str, Any]]:
        ordered = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
        return [{"value": label, "count": count} for label, count in ordered[:limit]]

    summary: dict[str, Any] = {"total_items": len(entries)}
    for summary_key, _ in tracked:
        summary[summary_key] = ranked(tallies[summary_key])
    return summary
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def _count_query_signals(query: str, requested_category: str | None = None) -> dict[str, bool]:
    """Report which kinds of detail a shopping query already carries.

    Returns a dict with three flags:
      * "has_color" - a term from _COLOR_TERMS appears as a whole word;
      * "has_type"  - a specific category was requested (anything other than
        "both"/"all") or the query contains a garment keyword;
      * "has_style" - the query contains a style/occasion or fit keyword.
    """
    normalized = _norm(query)

    def contains_any(fragments: list[str]) -> bool:
        return any(fragment in normalized for fragment in fragments)

    # Colours are matched as whole words: a plain substring test reports a
    # colour for "tailored", "flared" or "structured" (all contain "red") and
    # for "tank" (contains "tan").
    color_alternatives = "|".join(re.escape(color) for color in _COLOR_TERMS)
    has_color = bool(color_alternatives) and (
        re.search(rf"(?<![a-z0-9])(?:{color_alternatives})(?![a-z0-9])", normalized) is not None
    )

    requested = _norm(requested_category)
    # Garment and style keywords stay substring-based on purpose so compound
    # words such as "sweatshirt" or "workwear" still count.
    has_type = bool(requested and requested not in {"both", "all"}) or contains_any([
        "trouser", "trousers", "pants", "jeans", "shorts", "joggers", "skirt", "dress",
        "topwear", "bottomwear", "shirt", "tee", "blouse", "polo", "hoodie", "jacket",
        "sweater", "blazer", "t-shirt", "tank", "leggings",
    ])
    has_style = contains_any([
        "slim", "regular", "relaxed", "oversized", "tailored", "smart", "casual", "formal",
        "party", "work", "interview", "weekend", "minimal", "structured", "clean",
    ])
    has_fit = contains_any([
        "slim-fit", "slim fit", "regular-fit", "regular fit", "relaxed-fit", "relaxed fit",
    ])
    return {
        "has_color": has_color,
        "has_type": has_type,
        "has_style": has_style or has_fit,
    }
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def is_underspecified_query(query: str, requested_category: str | None = None) -> bool:
    """Return True when the query lacks detail or uses vague wording.

    A query is underspecified when fewer than three of the colour/type/style
    signals are present, or when it contains any vague word (substring match).
    """
    signals = _count_query_signals(query, requested_category=requested_category)
    if sum(1 for present in signals.values() if present) < 3:
        return True

    vague_words = (
        "some", "something", "stuff", "nice", "good",
        "recommend", "suggest", "maybe", "outfit", "look",
    )
    lowered = _norm(query)
    return any(word in lowered for word in vague_words)
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def _build_enrichment_prompt(
|
| 243 |
+
query: str,
|
| 244 |
+
wardrobe_summary: WardrobeSummary,
|
| 245 |
+
requested_category: str | None,
|
| 246 |
+
gender: str | None,
|
| 247 |
+
) -> str:
|
| 248 |
+
return (
|
| 249 |
+
"You are helping enrich an underspecified Zalando shopping request. "
|
| 250 |
+
"Return ONLY valid JSON and no prose.\n\n"
|
| 251 |
+
"Output schema:\n"
|
| 252 |
+
'{"suggested_types":[],"suggested_colours":[],"occasion":"","style_notes":""}\n\n'
|
| 253 |
+
f"User query: {query}\n"
|
| 254 |
+
f"Requested category: {requested_category or ''}\n"
|
| 255 |
+
f"Gender: {gender or ''}\n"
|
| 256 |
+
f"Wardrobe metadata summary: {json.dumps(wardrobe_summary, ensure_ascii=True)}\n\n"
|
| 257 |
+
"Rules:\n"
|
| 258 |
+
"- Keep suggested_types to product/search terms that fit the requested category.\n"
|
| 259 |
+
"- Keep suggested_colours complementary to the wardrobe summary.\n"
|
| 260 |
+
"- Occasion must be a single short lowercase label when possible.\n"
|
| 261 |
+
"- style_notes must be concise and search-friendly.\n"
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def _parse_json_object(text: str) -> dict[str, Any]:
|
| 266 |
+
raw = str(text or "").strip()
|
| 267 |
+
if not raw:
|
| 268 |
+
return {}
|
| 269 |
+
try:
|
| 270 |
+
parsed = json.loads(raw)
|
| 271 |
+
return parsed if isinstance(parsed, dict) else {}
|
| 272 |
+
except json.JSONDecodeError:
|
| 273 |
+
start = raw.find("{")
|
| 274 |
+
end = raw.rfind("}")
|
| 275 |
+
if start == -1 or end == -1 or end <= start:
|
| 276 |
+
return {}
|
| 277 |
+
try:
|
| 278 |
+
parsed = json.loads(raw[start : end + 1])
|
| 279 |
+
return parsed if isinstance(parsed, dict) else {}
|
| 280 |
+
except json.JSONDecodeError:
|
| 281 |
+
return {}
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def _normalize_enrichment_payload(payload: dict[str, Any], requested_category: str | None) -> dict[str, Any]:
    """Clean a parsed model response into the enrichment structure.

    Lists are reduced to unique, non-empty, stripped strings (non-list values
    become []); "suggested_colors" is accepted as an alternative spelling.
    When no types were suggested and a specific category was requested, that
    category ("topwear"/"bottomwear") is used as the only suggested type.
    When no colours were suggested, the default is ["black"].
    """

    def unique_strings(value: Any) -> list[str]:
        if not isinstance(value, list):
            return []
        stripped = (str(entry or "").strip() for entry in value)
        # dict.fromkeys keeps the first occurrence of each text, in order.
        return list(dict.fromkeys(text for text in stripped if text))

    suggested_types = unique_strings(payload.get("suggested_types"))
    suggested_colours = unique_strings(payload.get("suggested_colours") or payload.get("suggested_colors"))
    occasion = str(payload.get("occasion") or "").strip().lower()
    style_notes = str(payload.get("style_notes") or "").strip()

    requested = _norm(requested_category)
    if requested and requested not in {"both", "all", "topwear", "bottomwear"}:
        # Any other label is coerced: bottomwear if it contains a bottom hint,
        # topwear otherwise.
        bottom_hints = ("bottom", "trouser", "pant", "jean", "skirt", "short")
        requested = "bottomwear" if any(hint in requested for hint in bottom_hints) else "topwear"

    if not suggested_types and requested in {"topwear", "bottomwear"}:
        suggested_types = [requested]

    return {
        "suggested_types": suggested_types,
        "suggested_colours": suggested_colours or ["black"],
        "occasion": occasion,
        "style_notes": style_notes,
    }
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
def enrich_underspecified_query(
    query: str,
    wardrobe_items: list[dict[str, Any]] | None = None,
    requested_category: str | None = None,
    gender: str | None = None,
    completion_fn: TextCompletionFn | None = None,
    max_tokens: int = 500,
) -> dict[str, Any]:
    """Ask the completion callback for enrichment when the query is vague.

    Returns a dict with:
      * "used": whether the query was judged underspecified;
      * "query": the stripped input query;
      * "wardrobe_summary": output of summarize_wardrobe_metadata;
      * "enrichment": suggested types/colours, occasion and style notes. These
        are empty when the query is specific enough or no completion_fn was
        supplied.
    """
    wardrobe_summary = summarize_wardrobe_metadata(wardrobe_items or [])
    needs_enrichment = is_underspecified_query(query, requested_category=requested_category)

    result: dict[str, Any] = {
        "used": needs_enrichment,
        "query": str(query or "").strip(),
        "wardrobe_summary": wardrobe_summary,
        "enrichment": {
            "suggested_types": [],
            "suggested_colours": [],
            "occasion": "",
            "style_notes": "",
        },
    }

    if needs_enrichment and completion_fn:
        prompt = _build_enrichment_prompt(query, wardrobe_summary, requested_category, gender)
        model_text = completion_fn(prompt, max_tokens)
        parsed = _parse_json_object(model_text)
        result["enrichment"] = _normalize_enrichment_payload(parsed, requested_category=requested_category)

    return result
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
def compose_search_query_from_enrichment(
    query: str,
    enrichment: dict[str, Any] | None,
    gender: str | None = None,
    requested_category: str | None = None,
) -> str:
    """Merge the user's query with enrichment hints into one search string.

    Tokens are added in this order, skipping any already present (compared
    case-insensitively): the words of the query, the normalised gender, the
    first suggested colour, the first suggested type (or a top/bottom label
    derived from *requested_category*), the occasion, and up to three words
    from the style notes. If the result mentions only garments from the
    opposite category to the requested one, the first such word is replaced
    with "trousers" (bottomwear) or "shirt" (topwear).
    """
    base_query = str(query or "").strip()
    enrichment = enrichment or {}
    target_category = _normalize_target_category(requested_category)

    def cleaned_list(key: str) -> list[str]:
        return [str(value).strip() for value in (enrichment.get(key) or []) if str(value).strip()]

    suggested_types = cleaned_list("suggested_types")
    suggested_colours = cleaned_list("suggested_colours")
    style_notes = str(enrichment.get("style_notes") or "").strip()
    occasion = str(enrichment.get("occasion") or "").strip()

    tokens: list[str] = [piece for piece in re.split(r"\s+", base_query) if piece]
    # Lower-cased view of `tokens`, so "Black" in the query suppresses a
    # suggested "black" instead of repeating it.
    seen = {token.lower() for token in tokens}

    def append_unique(token: str) -> None:
        cleaned = str(token or "").strip()
        if cleaned and cleaned.lower() not in seen:
            tokens.append(cleaned)
            seen.add(cleaned.lower())

    if gender:
        append_unique(_normalize_gender(gender, base_query))

    if suggested_colours:
        append_unique(suggested_colours[0])

    if suggested_types:
        append_unique(suggested_types[0])
    elif requested_category:
        requested = _norm(requested_category)
        if requested in {"topwear", "bottomwear"}:
            append_unique(requested)
        elif any(hint in requested for hint in ["bottom", "trouser", "pant", "jean", "skirt", "short"]):
            append_unique("bottomwear")
        elif any(hint in requested for hint in ["top", "shirt", "tee", "blouse", "polo", "jacket"]):
            append_unique("topwear")

    if occasion:
        append_unique(occasion)

    if style_notes:
        style_tokens = [piece for piece in re.split(r"[^a-zA-Z0-9-]+", style_notes.lower()) if piece]
        for style_token in style_tokens[:3]:
            append_unique(style_token)

    if not tokens:
        tokens = [base_query or _normalize_gender(gender, base_query)]

    topwear_terms = {"shirt", "shirts", "tee", "t-shirt", "tshirt", "topwear", "blazer", "jacket", "polo", "hoodie", "kurta"}
    bottomwear_terms = {"trouser", "trousers", "pants", "jeans", "shorts", "joggers", "bottomwear"}

    lowered_tokens = [str(token).strip().lower() for token in tokens]
    has_topwear_term = any(token in topwear_terms for token in lowered_tokens)
    has_bottomwear_term = any(token in bottomwear_terms for token in lowered_tokens)

    def replace_first(wrong_terms: set[str], replacement: str) -> None:
        # Callers only invoke this when a wrong-category term is present, so
        # the loop always finds a token to replace.
        for index, token in enumerate(lowered_tokens):
            if token in wrong_terms:
                tokens[index] = replacement
                return

    if target_category == "bottomwear" and has_topwear_term and not has_bottomwear_term:
        replace_first(topwear_terms, "trousers")
    elif target_category == "topwear" and has_bottomwear_term and not has_topwear_term:
        replace_first(bottomwear_terms, "shirt")

    return " ".join(part for part in tokens if part).strip()
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
def _normalize_gender(gender: str | None, query: str) -> str:
|
| 449 |
+
g = _norm(gender)
|
| 450 |
+
if g in {"men", "male", "man", "mens"}:
|
| 451 |
+
return "men"
|
| 452 |
+
if g in {"women", "female", "woman", "womens"}:
|
| 453 |
+
return "women"
|
| 454 |
+
if g == "unisex":
|
| 455 |
+
return "unisex"
|
| 456 |
+
|
| 457 |
+
query_hint = _norm(query)
|
| 458 |
+
if any(token in query_hint for token in [" men ", "male", "man", "mens"]):
|
| 459 |
+
return "men"
|
| 460 |
+
if any(token in query_hint for token in [" women ", "female", "woman", "womens"]):
|
| 461 |
+
return "women"
|
| 462 |
+
return "unisex"
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
def _pick_category_path(query: str, audience: str) -> str:
    """Choose the listing path segment for *query* and *audience*.

    The first CATEGORY_PATH_MAP keyword found in the query decides the path
    (falling back to that keyword's "unisex" entry). With no keyword match the
    audience's generic clothing path is used. The result is then aligned with
    the audience prefix ("mens-" / "womens-" / none).
    """
    haystack = _norm(query)

    matched_paths = next(
        (path_map for keyword, path_map in CATEGORY_PATH_MAP.items() if keyword in haystack),
        None,
    )
    selected = ""
    if matched_paths is not None:
        selected = matched_paths.get(audience) or matched_paths.get("unisex") or ""

    if not selected:
        generic_paths = {"men": "mens-clothing", "women": "womens-clothing"}
        selected = generic_paths.get(audience, "clothing")

    # Make the path prefix agree with the audience.
    if audience == "men" and selected.startswith("womens-"):
        selected = selected.replace("womens-", "mens-", 1)
    if audience == "women" and selected.startswith("mens-"):
        selected = selected.replace("mens-", "womens-", 1)
    if audience == "unisex" and selected.startswith(("mens-", "womens-")):
        _, _, selected = selected.partition("-")

    return selected or "clothing"
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
def build_zalando_search_url(query: str, gender: str | None = None) -> str:
    """Build a search URL for *query* under the audience's category path.

    Raises:
        ValueError: if *query* is empty after stripping.
    """
    search_terms = str(query or "").strip()
    if not search_terms:
        raise ValueError("query is required")

    audience = _normalize_gender(gender, search_terms)
    category_path = _pick_category_path(search_terms, audience)
    query_string = urlencode({"q": search_terms})
    return f"{ZALANDO_BASE_URL}/{category_path}?{query_string}"
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
def build_zalando_search_urls_from_query(query: str, gender: str | None = None) -> list[str]:
    """Return the search URLs to crawl for *query*.

    With a gender, one URL for that audience. Without one, the distinct URLs
    for women, men and unisex, in that order. An empty query gives [].
    """
    search_terms = str(query or "").strip()
    if not search_terms:
        return []

    audiences = [gender] if gender else ["women", "men", "unisex"]
    candidate_urls = (build_zalando_search_url(search_terms, gender=audience) for audience in audiences)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(candidate_urls))
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
def build_zalando_search_urls_from_request(
    query: str,
    gender: str | None = None,
    wardrobe_items: list[dict[str, Any]] | None = None,
    requested_category: str | None = None,
    completion_fn: TextCompletionFn | None = None,
    max_tokens: int = 500,
) -> tuple[list[str], dict[str, Any]]:
    """Enrich the request if needed and turn it into search URLs.

    Returns a (search_urls, details) pair, where details is the result of
    enrich_underspecified_query plus a "final_query" key holding the composed
    search string.
    """
    enrichment_result = enrich_underspecified_query(
        query=query,
        wardrobe_items=wardrobe_items,
        requested_category=requested_category,
        gender=gender,
        completion_fn=completion_fn,
        max_tokens=max_tokens,
    )

    enrichment_payload = enrichment_result.get("enrichment")
    if not isinstance(enrichment_payload, dict):
        enrichment_payload = None

    final_query = compose_search_query_from_enrichment(
        query=enrichment_result.get("query") or query,
        enrichment=enrichment_payload,
        gender=gender,
        requested_category=requested_category,
    )

    details = dict(enrichment_result)
    details["final_query"] = final_query
    return build_zalando_search_urls_from_query(final_query, gender=gender), details
|
| 542 |
+
|
| 543 |
+
|
| 544 |
+
def _apify_request_url() -> str:
    """Return the actor endpoint, with ``?token=...`` appended when a token is set."""
    if not APIFY_TOKEN:
        return APIFY_ACTOR_ENDPOINT
    return f"{APIFY_ACTOR_ENDPOINT}?token={APIFY_TOKEN}"
|
| 548 |
+
|
| 549 |
+
|
| 550 |
+
def _apify_actor_id_from_endpoint(endpoint: str) -> str:
|
| 551 |
+
parsed = urlparse(str(endpoint or "").strip())
|
| 552 |
+
segments = [segment for segment in parsed.path.split("/") if segment]
|
| 553 |
+
if "acts" in segments:
|
| 554 |
+
index = segments.index("acts")
|
| 555 |
+
if index + 1 < len(segments):
|
| 556 |
+
return segments[index + 1]
|
| 557 |
+
return "vistics~zalando-scraper"
|
| 558 |
+
|
| 559 |
+
|
| 560 |
+
def _build_apify_payload(search_url: str, max_results: int, start_urls_as_objects: bool) -> dict[str, Any]:
|
| 561 |
+
start_urls: list[Any]
|
| 562 |
+
if start_urls_as_objects:
|
| 563 |
+
start_urls = [{"url": search_url}]
|
| 564 |
+
else:
|
| 565 |
+
start_urls = [search_url]
|
| 566 |
+
|
| 567 |
+
return {
|
| 568 |
+
"startUrls": start_urls,
|
| 569 |
+
"maxResults": int(max_results),
|
| 570 |
+
}
|
| 571 |
+
|
| 572 |
+
|
| 573 |
+
def _http_error_detail(exc: requests.RequestException, limit: int = 800) -> str:
|
| 574 |
+
response = getattr(exc, "response", None)
|
| 575 |
+
if response is None:
|
| 576 |
+
return ""
|
| 577 |
+
|
| 578 |
+
status = getattr(response, "status_code", None)
|
| 579 |
+
body = ""
|
| 580 |
+
try:
|
| 581 |
+
body = str(response.text or "").strip().replace("\n", " ")
|
| 582 |
+
except Exception:
|
| 583 |
+
body = ""
|
| 584 |
+
if body:
|
| 585 |
+
body = body[:limit]
|
| 586 |
+
if status is None and not body:
|
| 587 |
+
return ""
|
| 588 |
+
return f"status={status} body={body}".strip()
|
| 589 |
+
|
| 590 |
+
|
| 591 |
+
def _extract_apify_items(raw_payload: Any) -> list[dict[str, Any]]:
|
| 592 |
+
if isinstance(raw_payload, list):
|
| 593 |
+
return [item for item in raw_payload if isinstance(item, dict)]
|
| 594 |
+
|
| 595 |
+
if isinstance(raw_payload, dict):
|
| 596 |
+
for key in ("items", "data"):
|
| 597 |
+
value = raw_payload.get(key)
|
| 598 |
+
if isinstance(value, list):
|
| 599 |
+
return [item for item in value if isinstance(item, dict)]
|
| 600 |
+
|
| 601 |
+
return []
|
| 602 |
+
|
| 603 |
+
|
| 604 |
+
def _normalize_apify_items(raw_items: list[dict[str, Any]], effective_limit: int) -> list[dict[str, str]]:
    """Normalise raw items, dropping those without a link and duplicate links.

    Stops as soon as *effective_limit* products have been collected.
    """
    products: list[dict[str, str]] = []
    seen_links: set[str] = set()
    for raw_item in raw_items:
        product = _normalize_product(raw_item)
        link = product["item_link"]
        if not link or link in seen_links:
            continue
        seen_links.add(link)
        products.append(product)
        if len(products) >= effective_limit:
            break
    return products
|
| 616 |
+
|
| 617 |
+
|
| 618 |
+
def _scrape_with_apify_run_dataset_fallback(
    search_url: str,
    effective_limit: int,
    timeout_seconds: int,
) -> list[dict[str, str]]:
    """Fallback crawl: start an actor run, then read its default dataset.

    Two input variants are tried in turn (start URLs as plain strings, then as
    {"url": ...} objects). For each one the run is started with
    ``waitForFinish`` and the run's ``defaultDatasetId`` is fetched. The first
    variant that yields normalised items wins. Request failures are logged and
    recorded, never raised; [] is returned when every variant fails.
    """
    actor_id = _apify_actor_id_from_endpoint(APIFY_ACTOR_ENDPOINT)
    run_url = f"https://api.apify.com/v2/acts/{actor_id}/runs"
    # Keep the wait between 60 and 300 seconds.
    wait_for_finish = min(max(60, APIFY_WAIT_FOR_FINISH_SECONDS), 300)
    variant_errors: list[str] = []

    logger.info(
        "zalando crawl retry source=apify-run search_url=%s actor_id=%s wait_for_finish=%s",
        search_url,
        actor_id,
        wait_for_finish,
    )

    for variant_name, use_object_start_urls in (("string", False), ("object", True)):
        run_payload = _build_apify_payload(search_url, effective_limit, start_urls_as_objects=use_object_start_urls)
        run_id = ""
        dataset_id = ""

        # Step 1: start the run and wait for it to finish.
        try:
            run_response = requests.post(
                run_url,
                params={"token": APIFY_TOKEN, "waitForFinish": wait_for_finish},
                json=run_payload,
                timeout=timeout_seconds,
            )
            run_response.raise_for_status()
            run_json = run_response.json()

            run_data = run_json.get("data") if isinstance(run_json, dict) else None
            if not isinstance(run_data, dict):
                variant_errors.append(f"{variant_name}: invalid run payload")
                continue

            run_id = str(run_data.get("id") or "").strip()
            run_status = str(run_data.get("status") or "").strip()
            dataset_id = str(run_data.get("defaultDatasetId") or "").strip()
            logger.info(
                "zalando crawl retry source=apify-run completed variant=%s run_id=%s status=%s dataset_id=%s",
                variant_name,
                run_id,
                run_status,
                dataset_id,
            )
        except requests.RequestException as exc:
            detail = _http_error_detail(exc)
            variant_errors.append(f"{variant_name}: {exc} {detail}".strip())
            logger.warning(
                "zalando crawl failed source=apify-run variant=%s search_url=%s error=%s detail=%s",
                variant_name,
                search_url,
                exc,
                detail,
            )
            continue

        if not dataset_id:
            variant_errors.append(f"{variant_name}: missing defaultDatasetId")
            continue

        # Step 2: download the items the run wrote to its default dataset.
        dataset_url = f"https://api.apify.com/v2/datasets/{dataset_id}/items"
        dataset_params = {
            "token": APIFY_TOKEN,
            "clean": "true",
            "format": "json",
            "limit": effective_limit,
        }
        try:
            dataset_response = requests.get(dataset_url, params=dataset_params, timeout=timeout_seconds)
            dataset_response.raise_for_status()
            dataset_items = _extract_apify_items(dataset_response.json())
            items = _normalize_apify_items(dataset_items, effective_limit)
            logger.info(
                "zalando crawl retry source=apify-dataset variant=%s run_id=%s dataset_id=%s raw_items=%s items=%s",
                variant_name,
                run_id,
                dataset_id,
                len(dataset_items),
                len(items),
            )
            if items:
                return items
            variant_errors.append(f"{variant_name}: empty dataset")
        except requests.RequestException as exc:
            detail = _http_error_detail(exc)
            variant_errors.append(f"{variant_name}: {exc} {detail}".strip())
            logger.warning(
                "zalando crawl failed source=apify-dataset variant=%s run_id=%s dataset_id=%s error=%s detail=%s",
                variant_name,
                run_id,
                dataset_id,
                exc,
                detail,
            )

    if variant_errors:
        logger.warning(
            "zalando crawl retry source=apify-run exhausted search_url=%s errors=%s",
            search_url,
            "; ".join(variant_errors),
        )
    return []
|
| 730 |
+
|
| 731 |
+
|
| 732 |
+
def _normalize_product(item: dict[str, Any]) -> dict[str, str]:
    """Convert one raw scraped item into the product dict used by callers.

    Each output field takes the first non-empty value among several
    alternative source keys. The displayed price prefers the promotional price
    (with the discount in parentheses when present), then the original price,
    then a price parsed from the generic price fields. Image and product links
    are made absolute.
    """

    def first_of(*keys: str, default: Any = "") -> Any:
        # First truthy value among the given keys, else the default.
        for key in keys:
            candidate = item.get(key)
            if candidate:
                return candidate
        return default

    name = str(first_of("name", "title", "productName", "product_name", default="N/A")).strip()
    fallback_price = _extract_price_text(
        first_of("price", "currentPrice", "displayPrice", "priceLabel", default="N/A")
    )
    currency_symbol = str(first_of("currencySymbol")).strip()
    promotional_price = _format_apify_money(item.get("promotionalPrice"), currency_symbol)
    original_price = _format_apify_money(item.get("originalPrice"), currency_symbol)
    discount_percent = str(first_of("discountPercent")).strip()
    brand = str(first_of("brand", "brandName")).strip()

    if promotional_price:
        price = f"{promotional_price} ({discount_percent})" if discount_percent else promotional_price
    else:
        price = original_price or fallback_price

    image_url = _ensure_zalando_url(str(first_of("image", "imageUrl", "image_url", "thumbnail")))
    item_link = _ensure_zalando_url(str(first_of("url", "productUrl", "item_link", "link")))

    return {
        "name": name or "N/A",
        "price": price or "N/A",
        "brand": brand,
        "currency_symbol": currency_symbol,
        "promotional_price": promotional_price,
        "original_price": original_price,
        "discount_percent": discount_percent,
        "image_url": image_url,
        "item_link": item_link,
    }
|
| 791 |
+
|
| 792 |
+
|
| 793 |
+
def _scrape_with_apify(search_url: str, max_products: int | None, timeout_seconds: int) -> list[dict[str, str]]:
    """Crawl *search_url* through the Apify actor and return normalised products.

    The synchronous actor endpoint is tried with two input variants (start
    URLs as strings, then as objects). If neither yields items, the
    run-then-read-dataset fallback is used. Request failures are logged rather
    than raised; [] is returned when nothing could be crawled.

    The result count is capped at APIFY_MAX_RESULTS, and the request timeout
    is never lower than APIFY_MIN_TIMEOUT_SECONDS.
    """
    if isinstance(max_products, int) and max_products > 0:
        requested_limit = int(max_products)
    else:
        requested_limit = APIFY_MAX_RESULTS
    effective_limit = min(requested_limit, APIFY_MAX_RESULTS)
    apify_timeout = max(int(timeout_seconds), APIFY_MIN_TIMEOUT_SECONDS)
    actor_id = _apify_actor_id_from_endpoint(APIFY_ACTOR_ENDPOINT)
    logger.info(
        "zalando crawl start source=apify search_url=%s requested_max=%s effective_max=%s timeout=%s actor_id=%s",
        search_url,
        max_products,
        effective_limit,
        apify_timeout,
        actor_id,
    )

    variant_errors: list[str] = []
    for variant_name, use_object_start_urls in (("string", False), ("object", True)):
        try:
            payload = _build_apify_payload(search_url, effective_limit, start_urls_as_objects=use_object_start_urls)
            response = requests.post(_apify_request_url(), json=payload, timeout=apify_timeout)
            response.raise_for_status()

            raw_items = _extract_apify_items(response.json())
            items = _normalize_apify_items(raw_items, effective_limit)
            logger.info(
                "zalando crawl end source=apify variant=%s search_url=%s crawled=%s raw_items=%s items=%s",
                variant_name,
                search_url,
                bool(items),
                len(raw_items),
                len(items),
            )
            if items:
                return items
            variant_errors.append(f"{variant_name}: empty result")
        except requests.RequestException as exc:
            detail = _http_error_detail(exc)
            variant_errors.append(f"{variant_name}: {exc} {detail}".strip())
            logger.warning(
                "zalando crawl failed source=apify variant=%s search_url=%s error=%s detail=%s",
                variant_name,
                search_url,
                exc,
                detail,
            )
            continue

    # Both synchronous variants came back empty or failed: try the
    # run + dataset path before giving up.
    try:
        fallback_items = _scrape_with_apify_run_dataset_fallback(
            search_url=search_url,
            effective_limit=effective_limit,
            timeout_seconds=apify_timeout,
        )
        logger.info(
            "zalando crawl end source=apify-run search_url=%s crawled=%s items=%s",
            search_url,
            bool(fallback_items),
            len(fallback_items),
        )
        if fallback_items:
            return fallback_items
    except requests.RequestException as exc:
        detail = _http_error_detail(exc)
        variant_errors.append(f"run_dataset: {exc} {detail}".strip())
        logger.warning("zalando crawl failed source=apify-run search_url=%s error=%s detail=%s", search_url, exc, detail)

    if variant_errors:
        logger.warning(
            "zalando crawl source=apify exhausted search_url=%s errors=%s",
            search_url,
            "; ".join(variant_errors),
        )

    logger.warning(
        "zalando crawl end source=apify search_url=%s crawled=False items=0 reason=no_items_from_sync_or_run_dataset",
        search_url,
    )
    return []
|
| 874 |
+
|
| 875 |
+
|
| 876 |
+
def _scrape_with_html(search_url: str, max_products: int | None, timeout_seconds: int) -> list[dict[str, str]]:
    """Scrape product summaries directly from the search page HTML.

    Downloads ``search_url`` with ``REQUEST_HEADERS``, parses the body with
    BeautifulSoup (``lxml`` parser) and emits one summary per distinct
    product link found in the candidate card elements.

    Args:
        search_url: Page to download and parse.
        max_products: Stop collecting once this many items were gathered when
            it is a positive ``int``; any other value means "no cap".
        timeout_seconds: Requested HTTP timeout. The effective timeout is never
            lower than ``HTML_FALLBACK_TIMEOUT_SECONDS``.

    Returns:
        Dicts with the keys ``name``, ``price``, ``image_url`` and
        ``item_link``, in document order and de-duplicated by ``item_link``.

    Raises:
        requests.RequestException: Propagated from the GET request or from
            ``raise_for_status()``.
    """
    effective_timeout = max(int(timeout_seconds), HTML_FALLBACK_TIMEOUT_SECONDS)
    logger.info("zalando crawl start source=html search_url=%s max_products=%s timeout=%s", search_url, max_products, effective_timeout)
    page = requests.get(search_url, headers=REQUEST_HEADERS, timeout=effective_timeout)
    page.raise_for_status()
    document = BeautifulSoup(page.content, "lxml")

    # Loop invariants: the product cap and the price-class matcher.
    cap = max_products if isinstance(max_products, int) and max_products > 0 else None
    price_class_pattern = re.compile(r"price|money|amount", re.I)

    collected: list[dict[str, str]] = []
    visited_links: set[str] = set()

    for node in document.select('article, div[data-testid*="product"], li[data-testid*="product"]'):
        # Prefer an anchor whose href contains "/p/", else any anchor with an href.
        anchor = node.select_one('a[href*="/p/"]')
        if not anchor:
            anchor = node.find("a", href=True)
        if not anchor:
            continue

        product_url = _ensure_zalando_url(str(anchor.get("href") or ""))
        if not product_url:
            continue
        if product_url in visited_links or "zalando" not in product_url:
            continue

        # Name lookup falls back from dedicated test ids to headings to the anchor text.
        label_node = (
            node.select_one('[data-testid*="product-name"]')
            or node.select_one('[data-testid*="name"]')
            or node.find("h3")
            or node.find("h2")
            or anchor
        )
        raw_label = label_node.get_text(" ", strip=True) if label_node else "N/A"
        title = str(raw_label).strip() or "N/A"

        price_node = node.select_one('[data-testid*="price"]')
        if not price_node:
            price_node = node.find(attrs={"class": price_class_pattern})
        price_source = str(price_node.get_text(" ", strip=True) if price_node else "")
        cost = _extract_price_text(price_source)

        picture = ""
        thumb = node.find("img")
        if thumb:
            # src wins, then data-src, then the URL extracted from srcset.
            candidate = (
                thumb.get("src")
                or thumb.get("data-src")
                or _extract_src_from_srcset(str(thumb.get("srcset") or ""))
            )
            picture = _ensure_zalando_url(str(candidate))

        visited_links.add(product_url)
        collected.append(
            {
                "name": title,
                "price": cost,
                "image_url": picture,
                "item_link": product_url,
            }
        )
        if cap is not None and len(collected) >= cap:
            break

    logger.info("zalando crawl end source=html search_url=%s crawled=%s items=%s", search_url, bool(collected), len(collected))
    return collected
|
| 937 |
+
|
| 938 |
+
|
| 939 |
+
def _requires_postprocess(items: list[dict[str, str]]) -> bool:
|
| 940 |
+
if not items:
|
| 941 |
+
return False
|
| 942 |
+
missing = 0
|
| 943 |
+
for item in items:
|
| 944 |
+
if item.get("name") in {"", "N/A"} or item.get("price") in {"", "N/A"}:
|
| 945 |
+
missing += 1
|
| 946 |
+
return missing > 0
|
| 947 |
+
|
| 948 |
+
|
| 949 |
+
def extract_product_summaries(
    search_url: str,
    max_products: int | None = None,
    request_timeout_seconds: int = 35,
    use_apify: bool = True,
    postprocess: Optional[ScrapePostprocessFn] = None,
) -> list[dict[str, str]]:
    """Collect product summaries for one search URL.

    Tries the Apify scraper first (only when ``use_apify`` is true and
    ``APIFY_TOKEN`` is set) and falls back to plain HTML scraping when that
    yields nothing. An optional ``postprocess`` callback is applied when
    ``_requires_postprocess`` reports incomplete items.

    Args:
        search_url: Search page to crawl; must be non-blank.
        max_products: Positive ``int`` caps the number of returned items;
            anything else means "no cap".
        request_timeout_seconds: Timeout handed to both scrapers.
        use_apify: Set to ``False`` to skip the Apify attempt.
        postprocess: Best-effort callback receiving the scraped list and
            returning the list to use instead.

    Returns:
        The scraped (and possibly post-processed) items, truncated to the cap.

    Raises:
        ValueError: ``search_url`` is empty or blank.
        requests.RequestException: No items were found and at least one
            source reported an error or an empty result.
    """
    if not str(search_url or "").strip():
        raise ValueError("search_url is required")

    max_count = int(max_products) if isinstance(max_products, int) and max_products > 0 else None
    # Apify is only attempted when requested AND a token is configured.
    apify_enabled = bool(use_apify and APIFY_TOKEN)
    logger.info(
        "zalando crawl requested search_url=%s max_products=%s capped_to=%s use_apify=%s actor_id=%s",
        search_url,
        max_products,
        max_count,
        apify_enabled,
        _apify_actor_id_from_endpoint(APIFY_ACTOR_ENDPOINT),
    )
    products: list[dict[str, str]] = []
    errors: list[str] = []

    if apify_enabled:
        try:
            products = _scrape_with_apify(search_url, max_count, request_timeout_seconds)
        except requests.RequestException as exc:
            errors.append(f"apify: {exc}")
            logger.warning("zalando crawl failed source=apify search_url=%s error=%s", search_url, exc)
        else:
            if not products:
                errors.append("apify: empty result set")
                logger.warning("zalando crawl source=apify returned zero items search_url=%s", search_url)

    if not products:
        try:
            if apify_enabled:
                logger.info("zalando crawl fallback source=html search_url=%s", search_url)
            products = _scrape_with_html(search_url, max_count, request_timeout_seconds)
        except requests.RequestException as exc:
            errors.append(f"html: {exc}")
            logger.warning("zalando crawl failed source=html search_url=%s error=%s", search_url, exc)

    if postprocess and _requires_postprocess(products):
        try:
            processed = postprocess(products)
        except Exception:
            # Never fail scraping because post-processing failed, but leave a
            # trace so the failure is diagnosable instead of silently dropped.
            logger.warning(
                "zalando crawl postprocess failed search_url=%s; keeping raw items",
                search_url,
                exc_info=True,
            )
        else:
            # A callback that returns nothing must not wipe the scraped items.
            if processed is not None:
                products = processed

    if not products and errors:
        logger.warning("zalando crawl completed with no results search_url=%s errors=%s", search_url, "; ".join(errors))
        raise requests.RequestException("; ".join(errors))

    logger.info("zalando crawl completed search_url=%s crawled=%s items=%s", search_url, bool(products), len(products))
    if max_count is not None:
        return products[:max_count]
    return products
|
| 1005 |
+
|
| 1006 |
+
|
| 1007 |
+
def search_products(
    query: str,
    gender: str | None = None,
    max_products: int | None = None,
    use_apify: bool = True,
    request_timeout_seconds: int = 35,
    postprocess: Optional[ScrapePostprocessFn] = None,
    wardrobe_items: list[dict[str, Any]] | None = None,
    requested_category: str | None = None,
    completion_fn: TextCompletionFn | None = None,
    enrichment_max_tokens: int = 500,
) -> dict[str, Any]:
    """Plan search URLs for a request and crawl them into one product list.

    The URLs come from ``build_zalando_search_urls_from_request``; each one is
    crawled with ``extract_product_summaries`` and the results are merged,
    de-duplicated by ``item_link``, until the optional cap is reached.

    A URL that fails or yields nothing no longer aborts the whole search: its
    error is logged and the remaining URLs are still tried. The error is only
    raised when no product was collected from any URL.

    Args:
        query: Search text, forwarded to the URL builder.
        gender: Forwarded to the URL builder.
        max_products: Positive ``int`` caps the merged result; anything else
            means "no cap".
        use_apify: Forwarded to ``extract_product_summaries``.
        request_timeout_seconds: Forwarded to ``extract_product_summaries``.
        postprocess: Forwarded to ``extract_product_summaries``.
        wardrobe_items: Forwarded to the URL builder.
        requested_category: Forwarded to the URL builder.
        completion_fn: Forwarded to the URL builder.
        enrichment_max_tokens: Forwarded to the URL builder as ``max_tokens``.

    Returns:
        A dict with ``search_urls``, ``products``, ``count`` and
        ``enrichment`` (the builder's second return value).

    Raises:
        ValueError: The URL builder produced no search URLs.
        requests.RequestException: Every crawl attempt failed or came back
            empty, so there is nothing to return.
    """
    max_count = int(max_products) if isinstance(max_products, int) and max_products > 0 else None
    search_urls, enrichment_result = build_zalando_search_urls_from_request(
        query=query,
        gender=gender,
        wardrobe_items=wardrobe_items,
        requested_category=requested_category,
        completion_fn=completion_fn,
        max_tokens=enrichment_max_tokens,
    )
    if not search_urls:
        raise ValueError("query is required")

    logger.info(
        "zalando search plan query=%s search_urls=%s max_products=%s",
        query,
        len(search_urls),
        max_count,
    )

    products: list[dict[str, str]] = []
    seen: set[str] = set()
    errors: list[str] = []

    for search_url in search_urls:
        try:
            summaries = extract_product_summaries(
                search_url=search_url,
                max_products=max_count,
                request_timeout_seconds=request_timeout_seconds,
                use_apify=use_apify,
                postprocess=postprocess,
            )
        except requests.RequestException as exc:
            # One bad URL must not discard what other URLs already delivered
            # (or could still deliver); remember the error and move on.
            errors.append(f"{search_url}: {exc}")
            logger.warning(
                "zalando search url failed query=%s search_url=%s error=%s",
                query,
                search_url,
                exc,
            )
            continue

        for item in summaries:
            item_link = str(item.get("item_link") or "").strip()
            if not item_link or item_link in seen:
                continue
            seen.add(item_link)
            products.append(item)
            if max_count is not None and len(products) >= max_count:
                break
        if max_count is not None and len(products) >= max_count:
            break

    if not products and errors:
        # Nothing was found anywhere: surface the failures the same way a
        # single failing crawl would.
        raise requests.RequestException("; ".join(errors))

    logger.info(
        "zalando search completed query=%s crawled=%s items=%s search_urls=%s",
        query,
        bool(products),
        len(products),
        len(search_urls),
    )

    return {
        "search_urls": search_urls,
        "products": products,
        "count": len(products),
        "enrichment": enrichment_result,
    }
|