# File size: 5,063 Bytes
# d992912
import os
import sys
import logging
from pathlib import Path
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from pydantic_settings import BaseSettings
# Module-level logger registered under the fixed name "asos_search".
logger = logging.getLogger("asos_search")
def _detect_environment() -> str:
    """Identify which runtime the process is executing in.

    Returns:
        ``"colab"`` when the ``google.colab`` module has already been loaded,
        ``"kaggle"`` when ``KAGGLE_KERNEL_RUN_TYPE`` is present in the
        environment, and ``"local"`` otherwise. Colab takes precedence when
        both markers are present.
    """
    running_in_colab = "google.colab" in sys.modules
    running_in_kaggle = "KAGGLE_KERNEL_RUN_TYPE" in os.environ

    if running_in_colab:
        return "colab"
    return "kaggle" if running_in_kaggle else "local"
class Settings(BaseSettings):
    """Server-level settings loaded from environment variables.

    ``model_config`` sets the ``ASOS_`` environment-variable prefix for every
    field declared below.
    """

    # Network binding and CORS.
    host: str = "0.0.0.0"
    port: int = 8000
    cors_origins: list[str] = ["*"]

    # Path overrides. They default to empty strings; SearchConfig.from_settings
    # forwards only the non-empty ones.
    data_dir: str = ""
    data_path: str = ""
    persistent_dir: str = ""
    image_cache_dir: str = ""

    # Logging verbosity and optional Hugging Face token.
    log_level: str = "INFO"
    hf_token: Optional[str] = None

    model_config = {"env_prefix": "ASOS_"}
@dataclass
class SearchConfig:
    """Central configuration for the search engine.

    Fields left at their empty defaults (``device``, ``hf_token`` and the path
    fields) are filled in by ``__post_init__``. The four derived index and
    embedding paths are always recomputed from ``persistent_dir``.
    """

    # --- Model ---
    primary_model: str = "patrickjohncyh/fashion-clip"
    fallback_model: str = "openai/clip-vit-base-patch32"
    embedding_dim: int = 512
    device: str = ""
    hf_token: Optional[str] = None

    # --- FAISS index ---
    n_clusters: int = 256
    n_probe: int = 20

    # --- Search pipeline ---
    retrieval_top_k: int = 300
    final_top_n: int = 20

    # --- Dual-index fusion ---
    rrf_k: int = 60
    image_index_weight: float = 0.55
    text_index_weight: float = 0.45

    # --- Re-ranking weights ---
    alpha_clip: float = 0.55
    beta_tags: float = 0.25
    gamma_text: float = 0.15
    delta_freshness: float = 0.05

    # --- CLIP prompt ensembling ---
    prompt_templates: Tuple[str, ...] = (
        "a photo of {}, a fashion product",
        "a product photo of {}",
        "a fashion item: {}",
        "{}, studio product photography",
        "an e-commerce photo of {}",
    )

    # --- Embedding computation ---
    embed_batch_size: int = 32
    embed_checkpoint_interval: int = 2000

    # --- Feature toggles ---
    enable_multilingual: bool = True
    enable_spell_correction: bool = True

    # --- Paths (auto-detected when left empty) ---
    data_dir: str = ""
    data_path: str = ""
    persistent_dir: str = ""
    image_cache_dir: str = ""

    # --- Derived paths (overwritten in __post_init__) ---
    image_index_path: str = ""
    text_index_path: str = ""
    image_embeddings_path: str = ""
    text_embeddings_path: str = ""

    def __post_init__(self):
        """Fill in empty fields, create ``persistent_dir`` and derive paths."""
        # Device and token: keep explicit values, otherwise auto-detect.
        self.device = self.device or (
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        self.hf_token = self.hf_token or os.environ.get("HF_TOKEN", None)

        # Per-environment fallback locations for the three base directories.
        runtime = _detect_environment()
        if runtime == "colab":
            drive_base = "/content/drive/MyDrive/Colab Notebooks"
            fallbacks = {
                "data_dir": drive_base,
                "persistent_dir": os.path.join(drive_base, "asos_engine"),
                "image_cache_dir": "/content/asos_image_cache",
            }
        elif runtime == "kaggle":
            fallbacks = {
                "data_dir": "/kaggle/input",
                "persistent_dir": "/kaggle/working/asos_engine",
                "image_cache_dir": "/kaggle/working/asos_image_cache",
            }
        else:
            # Local run: anchor everything three levels above this file.
            project_root = str(Path(__file__).resolve().parent.parent.parent)
            fallbacks = {
                "data_dir": project_root,
                "persistent_dir": os.path.join(project_root, "asos_engine"),
                "image_cache_dir": os.path.join(
                    project_root, "asos_image_cache"
                ),
            }

        # Only fields that are still empty receive the fallback.
        for field_name, fallback in fallbacks.items():
            if not getattr(self, field_name):
                setattr(self, field_name, fallback)

        # Dataset file: use the parquet file when it exists; otherwise point at
        # the CSV path, whether or not that file exists.
        if not self.data_path:
            data_root = Path(self.data_dir)
            parquet_file = data_root / "asos_clean.parquet"
            csv_file = data_root / "asos_clean.csv"
            self.data_path = str(
                parquet_file if parquet_file.exists() else csv_file
            )

        # Make sure the persistent directory exists, then derive the index and
        # embedding file locations inside it.
        store = Path(self.persistent_dir)
        store.mkdir(parents=True, exist_ok=True)
        derived_files = (
            ("image_index_path", "faiss_image_index.bin"),
            ("text_index_path", "faiss_text_index.bin"),
            ("image_embeddings_path", "image_embeddings.npy"),
            ("text_embeddings_path", "text_embeddings.npy"),
        )
        for field_name, file_name in derived_files:
            setattr(self, field_name, str(store / file_name))

    @classmethod
    def from_settings(cls, settings: Settings) -> "SearchConfig":
        """Create SearchConfig from server Settings, allowing env overrides.

        Only non-empty values on ``settings`` are forwarded; every other field
        keeps its dataclass default and is resolved by ``__post_init__``.
        """
        forwarded = (
            "data_dir",
            "data_path",
            "persistent_dir",
            "image_cache_dir",
            "hf_token",
        )
        candidates = ((name, getattr(settings, name)) for name in forwarded)
        overrides = {name: value for name, value in candidates if value}
        return cls(**overrides)
|