# Source: Hugging Face Space "akiroussama/rakuten-classifier" (status: Running).
# File size: 5,216 bytes.

"""
Voting Predictor — Ensemble inference engine for image classification.

Combines three models via weighted soft voting:
  - M1: DINOv3 ViT-Large (weight 4/7) — best single model, self-supervised features
  - M2: XGBoost on ResNet50 features (weight 1/7) — uncorrelated with neural nets
  - M3: EfficientNet-B0 (weight 2/7) — lightweight CNN for diversity

When the XGBoost model file is absent, only M1 and M3 are combined
(weights 4/6 and 2/6).

XGBoost probabilities are "sharpened" (p^3 / sum(p^3)) to prevent
its flat distributions from diluting confident predictions from DINOv3.
"""
import torch
import torch.nn.functional as F
import xgboost as xgb
import timm, joblib, numpy as np
from pathlib import Path
from torchvision import models
import torch.nn as nn
from src.features.build_features import preprocess_image


class VotingPredictor:
    """Weighted soft-voting ensemble over up to three image classifiers.

    Models (all loaded lazily from ``models_dir`` by :meth:`load_models`):

    * M1 - a timm ViT-Large classifier restored from
      ``M1_IMAGE_DeepLearning_DINOv3.pth``.
    * M2 - an XGBoost classifier fed with features from a headless ResNet50.
      Optional: only used when ``M2_IMAGE_Classic_XGBoost.json`` exists.
    * M3 - a torchvision EfficientNet-B0 restored from
      ``M3_IMAGE_Classic_EfficientNetB0.pth``.

    Args:
        models_dir: Directory containing the model artefacts listed above.
        resnet_weights: Forwarded as ``weights=`` to
            ``torchvision.models.resnet50`` when building the M2 feature
            extractor. Defaults to ``None``, which keeps the historical
            behaviour (see the review note in :meth:`load_models`).
    """

    # Size of the classification heads built for M1 and M3.
    NUM_CLASSES = 27
    # Soft-vote weights; the fused vector is divided by the sum of the
    # weights of the models that actually voted.
    DINO_WEIGHT = 4.0
    XGB_WEIGHT = 1.0
    EFFNET_WEIGHT = 2.0
    # Exponent applied to XGBoost probabilities before renormalising.
    XGB_SHARPEN_POWER = 3
    # Number of predictions returned by predict() when top_k is not given.
    TOP_K = 5

    # Class-level defaults so these attributes exist even before
    # load_models() has run (or on instances created without __init__).
    has_xgboost = False
    resnet_weights = None
    _fallback_codes = None  # cached category codes for the no-XGBoost path

    def __init__(self, models_dir, resnet_weights=None):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.mdir = Path(models_dir)
        self.resnet_weights = resnet_weights
        self.loaded = False

    def _load_state_dict(self, filename):
        """Read a torch checkpoint from the models directory onto ``self.device``.

        ``torch.load`` unpickles the file, so only trusted checkpoints should
        be placed in the models directory.
        """
        return torch.load(self.mdir / filename, map_location=self.device)

    def load_models(self):
        """Load the models (and the label encoder, if present) once.

        Subsequent calls are no-ops. ``self.loaded`` is only set after every
        step succeeded, so a failed load is retried on the next call.
        """
        if self.loaded:
            return

        # M1 - ViT-Large with a NUM_CLASSES-way head.
        # NOTE(review): the timm architecture id names a DINOv2 (registers)
        # model while the docs and checkpoint file say "DINOv3" - confirm.
        self.m1 = timm.create_model(
            'vit_large_patch14_reg4_dinov2.lvd142m',
            pretrained=False,
            num_classes=self.NUM_CLASSES,
        )
        self.m1.load_state_dict(
            self._load_state_dict("M1_IMAGE_DeepLearning_DINOv3.pth")
        )

        # M2 - XGBoost classifier on ResNet50 features (optional).
        self.has_xgboost = False
        xgb_path = self.mdir / "M2_IMAGE_Classic_XGBoost.json"
        if xgb_path.exists():
            self.m2 = xgb.XGBClassifier()
            self.m2.load_model(str(xgb_path))
            # joblib.load unpickles as well: trusted files only.
            self.le = joblib.load(self.mdir / "M2_IMAGE_XGBoost_Encoder.pkl")

            # Headless ResNet50 (final fc layer dropped) feeding M2.
            # NOTE(review): with the default resnet_weights=None the network
            # is randomly initialised and no checkpoint is loaded for it
            # here. If M2 was trained on pretrained ResNet50 features, pass
            # the matching weights via ``resnet_weights`` - confirm against
            # the training pipeline.
            res = models.resnet50(weights=self.resnet_weights)
            self.ext = nn.Sequential(*list(res.children())[:-1])
            self.ext.to(self.device).eval()
            # Flip the flag last so a partial M2 load never enables voting.
            self.has_xgboost = True

        # M3 - EfficientNet-B0 with its classifier head replaced.
        self.m3 = models.efficientnet_b0(weights=None)
        self.m3.classifier[1] = nn.Linear(1280, self.NUM_CLASSES)
        self.m3.load_state_dict(
            self._load_state_dict("M3_IMAGE_Classic_EfficientNetB0.pth")
        )

        # Move the torch classifiers to the device and switch to eval mode.
        for m in (self.m1, self.m3):
            m.to(self.device).eval()
        self.loaded = True

    @staticmethod
    def _class_probs(model, batch):
        """Return softmax probabilities of the first item of ``batch``."""
        return F.softmax(model(batch), dim=1).cpu().numpy()[0]

    @staticmethod
    def _sharpen(probs, power):
        """Raise ``probs`` to ``power`` and renormalise so they sum to 1."""
        sharpened = np.power(probs, power)
        return sharpened / sharpened.sum()

    def _fuse(self, p_dino, p_effnet, p_xgb=None):
        """Weighted average of the per-model probability vectors.

        ``p_xgb`` is left out of both the sum and the normaliser when it is
        ``None`` (XGBoost artefacts not available).
        """
        if p_xgb is None:
            total = self.DINO_WEIGHT + self.EFFNET_WEIGHT
            return (self.DINO_WEIGHT * p_dino + self.EFFNET_WEIGHT * p_effnet) / total
        total = self.DINO_WEIGHT + self.XGB_WEIGHT + self.EFFNET_WEIGHT
        return (
            self.DINO_WEIGHT * p_dino
            + self.XGB_WEIGHT * p_xgb
            + self.EFFNET_WEIGHT * p_effnet
        ) / total

    @staticmethod
    def _top_indices(probs, k):
        """Indices of the ``k`` largest entries of ``probs``, best first."""
        return np.argsort(probs)[-k:][::-1]

    def _load_fallback_codes(self):
        """Return category codes from ``category_mapping.json``, sorted numerically.

        The file is parsed once and cached; an empty list is returned (and
        not cached) while the file does not exist.
        """
        if self._fallback_codes is None:
            import json  # local import keeps this block self-contained

            mapping_path = self.mdir / "category_mapping.json"
            if not mapping_path.exists():
                return []
            with open(mapping_path, "r", encoding="utf-8") as mf:
                cat_map = json.load(mf)
            self._fallback_codes = sorted(cat_map, key=int)
        return self._fallback_codes

    def _labels_for(self, ids):
        """Translate class indices into label strings.

        Uses the label encoder when XGBoost is loaded; otherwise the keys of
        ``category_mapping.json``; otherwise the index itself as a string.
        """
        if self.has_xgboost:
            # One vectorised call instead of one call per index.
            return [str(code) for code in self.le.inverse_transform(ids)]
        codes = self._load_fallback_codes()
        return [codes[i] if i < len(codes) else str(i) for i in ids]

    def predict(self, img_p, top_k=None):
        """Run voting inference on a single image.

        Args:
            img_p: Path to the image file (passed to ``preprocess_image``).
            top_k: Number of predictions to return; defaults to ``TOP_K`` (5).

        Returns:
            List of dicts ``{"label": str, "confidence": float}`` ordered from
            most to least confident.

        Raises:
            ValueError: If ``top_k`` is smaller than 1.
        """
        if top_k is None:
            top_k = self.TOP_K
        if top_k < 1:
            # A slice of [-0:] would silently return every class.
            raise ValueError("top_k must be a positive integer")

        if not self.loaded:
            self.load_models()

        with torch.no_grad():
            # M1 uses its own preprocessing profile ("dino").
            dino_input = preprocess_image(img_p, "dino").to(self.device)
            p_dino = self._class_probs(self.m1, dino_input)

            # M2 and M3 share the "standard" preprocessing profile.
            std_input = preprocess_image(img_p, "standard").to(self.device)
            p_effnet = self._class_probs(self.m3, std_input)

            p_xgb = None
            if self.has_xgboost:
                # Flatten the pooled ResNet50 output to a (1, n_features) row.
                feats = self.ext(std_input).squeeze().cpu().numpy().reshape(1, -1)
                # Sharpening makes XGBoost commit to a class instead of
                # spreading probability mass across all classes.
                p_xgb = self._sharpen(
                    self.m2.predict_proba(feats)[0], self.XGB_SHARPEN_POWER
                )

        fused = self._fuse(p_dino, p_effnet, p_xgb)
        ids = self._top_indices(fused, top_k)
        labels = self._labels_for(ids)

        return [
            {"label": label, "confidence": float(fused[i])}
            for label, i in zip(labels, ids)
        ]