import pickle
import torch
import timm
from dataclasses import dataclass
from typing import Optional, Tuple, Dict, Any

# Import VisionTransformer directly so we can build models with custom architectures
try:
    from timm.models.vision_transformer import VisionTransformer
except ImportError:
    VisionTransformer = None

try:
    from transformers import AutoModelForImageClassification
except Exception:  # pragma: no cover
    AutoModelForImageClassification = None

# safetensors support (the modern HuggingFace format)
try:
    from safetensors.torch import load_file as load_safetensors
except ImportError:
    load_safetensors = None

DEVICE_DEFAULT = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@dataclass
class ViTConfig:
    """Configuração de arquitetura ViT extraída dinamicamente do modelo."""
    embed_dim: int = 768
    num_heads: int = 12
    num_layers: int = 12
    patch_size: int = 16
    img_size: int = 224
    num_classes: int = 1000
    mlp_ratio: float = 4.0
    qkv_bias: bool = True
    
    @property
    def grid_size(self) -> int:
        """Tamanho do grid de patches (ex: 224/16 = 14)."""
        return self.img_size // self.patch_size
    
    @property
    def num_patches(self) -> int:
        """Número total de patches (ex: 14*14 = 196)."""
        return self.grid_size ** 2
    
    @property
    def timm_model_name(self) -> str:
        """Retorna o nome do modelo timm correspondente (para fins informativos)."""
        # Mapeamento baseado em embed_dim e num_heads
        size_map = {
            (192, 3): 'tiny',
            (384, 6): 'small', 
            (768, 12): 'base',
            (1024, 16): 'large',
            (1280, 16): 'huge',
        }
        size = size_map.get((self.embed_dim, self.num_heads), 'custom')
        return f"vit_{size}_patch{self.patch_size}_{self.img_size}"

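# Illustrative ViTConfig example (the defaults correspond to ViT-Base):
#   cfg = ViTConfig()       # embed_dim=768, num_heads=12, patch_size=16, img_size=224
#   cfg.grid_size           # -> 14   (224 // 16)
#   cfg.num_patches         # -> 196  (14 * 14)
#   cfg.timm_model_name     # -> 'vit_base_patch16_224'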

def create_vit_from_config(config: ViTConfig, device: Optional[torch.device] = None) -> torch.nn.Module:
    """Cria um modelo ViT diretamente a partir da configuração inferida.
    
    Isso permite criar modelos com arquiteturas arbitrárias, não limitadas
    aos nomes predefinidos do timm (vit_base_patch16_224, etc.).
    """
    device = device or DEVICE_DEFAULT
    
    if VisionTransformer is None:
        raise RuntimeError("VisionTransformer não disponível. Verifique a instalação do timm.")
    
    model = VisionTransformer(
        img_size=config.img_size,
        patch_size=config.patch_size,
        in_chans=3,
        num_classes=config.num_classes,
        embed_dim=config.embed_dim,
        depth=config.num_layers,
        num_heads=config.num_heads,
        mlp_ratio=config.mlp_ratio,
        qkv_bias=config.qkv_bias,
        class_token=True,
        global_pool='token',
    )
    
    return model.to(device)


def _strip_state_dict_prefix(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
    """Remove prefixos comuns de frameworks (Lightning, DDP, etc.) das keys do state_dict.
    
    Prefixos tratados:
    - 'model.' (PyTorch Lightning)
    - 'module.' (DataParallel/DistributedDataParallel)
    - 'encoder.' (alguns frameworks de self-supervised learning)
    - 'backbone.' (alguns frameworks de detecção)
    
    Returns:
        state_dict com keys sem prefixo
    """
    prefixes = ['model.', 'module.', 'encoder.', 'backbone.']
    
    # Check whether any key carries one of the prefixes
    has_prefix = False
    detected_prefix = None
    for key in state_dict.keys():
        for prefix in prefixes:
            if key.startswith(prefix):
                has_prefix = True
                detected_prefix = prefix
                break
        if has_prefix:
            break
    
    if not has_prefix:
        return state_dict
    
    print(f"[ViTViz] Detectado prefixo '{detected_prefix}' nas keys do state_dict (Lightning/DDP). Removendo...")
    
    new_sd: Dict[str, torch.Tensor] = {}
    for key, value in state_dict.items():
        new_key = key
        for prefix in prefixes:
            if key.startswith(prefix):
                new_key = key[len(prefix):]
                break
        new_sd[new_key] = value
    
    return new_sd

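# Example (illustrative): a PyTorch Lightning checkpoint typically prefixes every key,
#   {'model.cls_token': t0, 'model.blocks.0.attn.qkv.weight': t1, ...}
# which this helper rewrites to
#   {'cls_token': t0, 'blocks.0.attn.qkv.weight': t1, ...}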

def validate_vit_structure(model: torch.nn.Module) -> Tuple[bool, str]:
    """Valida se o modelo tem a estrutura esperada de um ViT timm-compatível.
    
    Returns:
        (is_valid, error_message) - se inválido, error_message descreve o problema
    """
    if not hasattr(model, 'blocks'):
        return False, "Modelo não tem atributo 'blocks'. Não é um ViT compatível."
    
    if len(model.blocks) == 0:
        return False, "Modelo tem 'blocks' vazio."
    
    block = model.blocks[0]
    if not hasattr(block, 'attn'):
        return False, "Bloco não tem atributo 'attn'. Estrutura incompatível."
    
    attn = block.attn
    if not hasattr(attn, 'qkv'):
        return False, "Módulo de atenção não tem 'qkv'. Estrutura incompatível."
    
    if not hasattr(attn, 'num_heads'):
        return False, "Módulo de atenção não tem 'num_heads'. Estrutura incompatível."
    
    return True, ""


def infer_config_from_model(model: torch.nn.Module) -> ViTConfig:
    """Infere configuração ViT a partir de um modelo timm carregado."""
    config = ViTConfig()
    
    # Extract img_size and patch_size from patch_embed
    if hasattr(model, 'patch_embed'):
        pe = model.patch_embed
        if hasattr(pe, 'img_size'):
            img_size = pe.img_size
            config.img_size = img_size[0] if isinstance(img_size, (tuple, list)) else img_size
        if hasattr(pe, 'patch_size'):
            patch_size = pe.patch_size
            config.patch_size = patch_size[0] if isinstance(patch_size, (tuple, list)) else patch_size
    
    # Extract num_layers, embed_dim and num_heads from the blocks
    if hasattr(model, 'blocks') and len(model.blocks) > 0:
        config.num_layers = len(model.blocks)
        block = model.blocks[0]
        if hasattr(block, 'attn'):
            attn = block.attn
            if hasattr(attn, 'num_heads'):
                config.num_heads = attn.num_heads
            if hasattr(attn, 'qkv') and hasattr(attn.qkv, 'in_features'):
                config.embed_dim = attn.qkv.in_features
    
    # Extract num_classes from the head
    if hasattr(model, 'head') and hasattr(model.head, 'out_features'):
        config.num_classes = model.head.out_features
    elif hasattr(model, 'head') and hasattr(model.head, 'weight'):
        config.num_classes = model.head.weight.shape[0]
    
    return config


def infer_config_from_state_dict(state_dict: Dict[str, torch.Tensor]) -> ViTConfig:
    """Infere configuração ViT a partir de um state_dict."""
    config = ViTConfig()
    
    # Infer num_layers by counting blocks
    layer_indices = set()
    for key in state_dict.keys():
        if key.startswith('blocks.') and '.attn.' in key:
            # blocks.0.attn.qkv.weight -> extract the 0
            idx = int(key.split('.')[1])
            layer_indices.add(idx)
    if layer_indices:
        config.num_layers = max(layer_indices) + 1
    
    # Infer embed_dim from the first block
    qkv_key = 'blocks.0.attn.qkv.weight'
    if qkv_key in state_dict:
        qkv_weight = state_dict[qkv_key]
        # qkv.weight shape: [3*embed_dim, embed_dim]
        config.embed_dim = qkv_weight.shape[1]
        # num_heads cannot be read directly from the qkv shape: the output is
        # 3*embed_dim = 3*num_heads*head_dim, so num_heads = (qkv_out // 3) // head_dim,
        # but head_dim varies across models. We infer num_heads by another route below.
    
    # Infer num_heads: try several methods
    proj_key = 'blocks.0.attn.proj.weight'
    if proj_key in state_dict and qkv_key in state_dict:
        embed_dim = state_dict[proj_key].shape[0]
        qkv_out = state_dict[qkv_key].shape[0]  # 3*embed_dim
        
        # Method 1: if qkv_out == 3*embed_dim, try common head_dims (64, 32, 96)
        if qkv_out == 3 * embed_dim:
            # Try common head_dims in order of preference
            for head_dim in [64, 32, 96, 48, 128]:
                if embed_dim % head_dim == 0:
                    config.num_heads = embed_dim // head_dim
                    break
            else:
                # Fallback: assume num_heads divides embed_dim evenly
                # and try common num_heads values
                for nh in [12, 16, 8, 6, 24, 4, 3]:
                    if embed_dim % nh == 0:
                        config.num_heads = nh
                        break
    
    # Infer qkv_bias
    qkv_bias_key = 'blocks.0.attn.qkv.bias'
    config.qkv_bias = qkv_bias_key in state_dict
    
    # Infer mlp_ratio from the MLP
    mlp_fc1_key = 'blocks.0.mlp.fc1.weight'
    if mlp_fc1_key in state_dict and config.embed_dim > 0:
        mlp_hidden = state_dict[mlp_fc1_key].shape[0]
        config.mlp_ratio = mlp_hidden / config.embed_dim
    
    # Infer num_classes from the head
    head_key = 'head.weight'
    if head_key in state_dict:
        config.num_classes = state_dict[head_key].shape[0]
    
    # Infer patch_size from patch_embed
    patch_proj_key = 'patch_embed.proj.weight'
    if patch_proj_key in state_dict:
        # shape: [embed_dim, 3, patch_size, patch_size]
        patch_weight = state_dict[patch_proj_key]
        config.patch_size = patch_weight.shape[2]
    
    # Infer img_size from pos_embed
    pos_embed_key = 'pos_embed'
    if pos_embed_key in state_dict:
        # shape: [1, num_patches+1, embed_dim]
        num_tokens = state_dict[pos_embed_key].shape[1]
        num_patches = num_tokens - 1  # -1 for the CLS token
        grid_size = int(num_patches ** 0.5)
        config.img_size = grid_size * config.patch_size
    
    return config

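# Round-trip sanity check (illustrative): a config inferred from a freshly built
# model's state_dict should match the ViTConfig used to create it.
#   cfg = ViTConfig(embed_dim=384, num_heads=6, num_layers=12)
#   m = create_vit_from_config(cfg, device=torch.device("cpu"))
#   inferred = infer_config_from_state_dict(m.state_dict())
#   assert (inferred.embed_dim, inferred.num_heads, inferred.num_layers) == (384, 6, 12)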

def _hf_id2label_to_class_names(id2label: Any) -> Optional[Dict[int, str]]:
    if not isinstance(id2label, dict):
        return None
    out: Dict[int, str] = {}
    for k, v in id2label.items():
        try:
            out[int(k)] = str(v)
        except Exception:
            continue
    return out or None


def _convert_hf_vit_to_timm_state_dict(hf_sd: Dict[str, torch.Tensor], num_layers: int) -> Dict[str, torch.Tensor]:
    """Converte state_dict de ViT (Hugging Face Transformers) para chaves do timm ViT.

    Alvo: timm "vit_base_patch16_224".
    """
    out: Dict[str, torch.Tensor] = {}

    def get(key: str) -> torch.Tensor:
        if key not in hf_sd:
            raise KeyError(f"Missing key in HF state_dict: {key}")
        return hf_sd[key]

    # Embeddings
    out["cls_token"] = get("vit.embeddings.cls_token")
    out["pos_embed"] = get("vit.embeddings.position_embeddings")
    out["patch_embed.proj.weight"] = get("vit.embeddings.patch_embeddings.projection.weight")
    out["patch_embed.proj.bias"] = get("vit.embeddings.patch_embeddings.projection.bias")

    # Encoder blocks
    for i in range(num_layers):
        prefix = f"vit.encoder.layer.{i}"
        out[f"blocks.{i}.norm1.weight"] = get(f"{prefix}.layernorm_before.weight")
        out[f"blocks.{i}.norm1.bias"] = get(f"{prefix}.layernorm_before.bias")
        out[f"blocks.{i}.norm2.weight"] = get(f"{prefix}.layernorm_after.weight")
        out[f"blocks.{i}.norm2.bias"] = get(f"{prefix}.layernorm_after.bias")

        qw = get(f"{prefix}.attention.attention.query.weight")
        kw = get(f"{prefix}.attention.attention.key.weight")
        vw = get(f"{prefix}.attention.attention.value.weight")
        qb = get(f"{prefix}.attention.attention.query.bias")
        kb = get(f"{prefix}.attention.attention.key.bias")
        vb = get(f"{prefix}.attention.attention.value.bias")
        out[f"blocks.{i}.attn.qkv.weight"] = torch.cat([qw, kw, vw], dim=0)
        out[f"blocks.{i}.attn.qkv.bias"] = torch.cat([qb, kb, vb], dim=0)

        out[f"blocks.{i}.attn.proj.weight"] = get(f"{prefix}.attention.output.dense.weight")
        out[f"blocks.{i}.attn.proj.bias"] = get(f"{prefix}.attention.output.dense.bias")

        out[f"blocks.{i}.mlp.fc1.weight"] = get(f"{prefix}.intermediate.dense.weight")
        out[f"blocks.{i}.mlp.fc1.bias"] = get(f"{prefix}.intermediate.dense.bias")
        out[f"blocks.{i}.mlp.fc2.weight"] = get(f"{prefix}.output.dense.weight")
        out[f"blocks.{i}.mlp.fc2.bias"] = get(f"{prefix}.output.dense.bias")

    out["norm.weight"] = get("vit.layernorm.weight")
    out["norm.bias"] = get("vit.layernorm.bias")

    # Classifier
    if "classifier.weight" in hf_sd and "classifier.bias" in hf_sd:
        out["head.weight"] = get("classifier.weight")
        out["head.bias"] = get("classifier.bias")

    return out

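# Key-mapping example (illustrative, ViT-Base, D = 768): HF stores separate
# query/key/value projections, which the converter fuses into timm's single qkv.
#   HF:   vit.encoder.layer.0.attention.attention.query.weight   [768, 768]
#         ...key.weight / ...value.weight                        [768, 768] each
#   timm: blocks.0.attn.qkv.weight = cat([q, k, v], dim=0)       [2304, 768]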

def _convert_hf_timm_wrapper_to_timm_state_dict(hf_sd: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
    """Converte state_dict de TimmWrapper (Transformers) para formato timm ViT.

    Exemplo de origem: chaves com prefixo ``timm_model.``.
    """
    out: Dict[str, torch.Tensor] = {}

    for key, value in hf_sd.items():
        if key.startswith("timm_model."):
            out[key[len("timm_model."):]] = value
        elif key.startswith("classifier."):
            # Some wrappers keep a separate head under 'classifier.'
            out[f"head.{key[len('classifier.'):]}"] = value

    if not out:
        raise ValueError("State_dict de TimmWrapper sem chaves reconhecidas (timm_model.* / classifier.*).")

    return out


def load_vit_from_huggingface(model_id: str, device: Optional[torch.device] = None) -> Tuple[torch.nn.Module, Optional[Dict[int, str]], ViTConfig]:
    """Carrega ViT do Hugging Face Hub e retorna um modelo timm equivalente.
    
    Returns:
        (model, class_names, config)
    """
    if AutoModelForImageClassification is None:
        raise RuntimeError("transformers não está instalado; instale 'transformers' para carregar do Hugging Face.")

    device = device or DEVICE_DEFAULT
    hf_model = AutoModelForImageClassification.from_pretrained(model_id)
    hf_model.eval()
    cfg = getattr(hf_model, "config", None)
    class_names = _hf_id2label_to_class_names(getattr(cfg, "id2label", None)) if cfg is not None else None

    hf_sd = hf_model.state_dict()
    if any(key.startswith("timm_model.") for key in hf_sd.keys()):
        timm_sd = _convert_hf_timm_wrapper_to_timm_state_dict(hf_sd)
    else:
        num_layers = int(getattr(cfg, "num_hidden_layers", 12)) if cfg is not None else 12
        timm_sd = _convert_hf_vit_to_timm_state_dict(hf_sd, num_layers=num_layers)

    vit_config = infer_config_from_state_dict(timm_sd)
    if cfg is not None and hasattr(cfg, "num_labels"):
        try:
            vit_config.num_classes = int(getattr(cfg, "num_labels"))
        except Exception:
            pass

    print(f"[ViTViz] Carregando do HuggingFace: {vit_config.timm_model_name} "
          f"(embed_dim={vit_config.embed_dim}, heads={vit_config.num_heads}, "
          f"layers={vit_config.num_layers}, patch={vit_config.patch_size}, img={vit_config.img_size})")

    timm_model = create_vit_from_config(vit_config, device=device)
    timm_model.load_state_dict(timm_sd, strict=False)
    timm_model.eval()
    
    return timm_model, class_names, vit_config

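# Usage sketch (illustrative; "google/vit-base-patch16-224" is a public HF model id):
#   model, names, cfg = load_vit_from_huggingface("google/vit-base-patch16-224")
#   # names maps class indices to labels taken from the HF config's id2label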

class CustomUnpickler(pickle.Unpickler):
    """Unpickler que ignora classes customizadas ausentes criando dummies dinamicamente."""

    def find_class(self, module, name):
        try:
            return super().find_class(module, name)
        except Exception:
            # Create a dummy class with the same name so unpickling can proceed
            return type(name, (), {})


def load_checkpoint(model_path: str, device: Optional[torch.device] = None) -> Any:
    """Carrega um checkpoint/modelo do caminho informado.
    
    Suporta formatos:
    - .pth / .pt: PyTorch checkpoint (torch.load)
    - .safetensors: Formato moderno do HuggingFace (mais seguro e rápido)

    Retorna o objeto carregado (modelo completo, state_dict ou dict de checkpoint).
    """
    device = device or DEVICE_DEFAULT
    
    # Detect the safetensors format
    if model_path.endswith('.safetensors'):
        if load_safetensors is None:
            raise ImportError(
                "safetensors não está instalado. Instale com: pip install safetensors"
            )
        # safetensors always returns a state_dict (it cannot store a full model)
        state_dict = load_safetensors(model_path, device=str(device))
        return state_dict
    
    # Standard PyTorch format (.pth, .pt, .ckpt, etc.)
    try:
        return torch.load(model_path, map_location=device, weights_only=False)
    except (AttributeError, ModuleNotFoundError, RuntimeError):
        # Fallback for missing classes or version conflicts
        with open(model_path, 'rb') as f:
            return CustomUnpickler(f).load()


def infer_num_classes(state_dict: Dict[str, torch.Tensor]) -> int:
    """Infere o número de classes a partir do state_dict (camada de head).

    Caso não encontre, retorna 1000 (padrão ImageNet).
    """
    for key, tensor in state_dict.items():
        if 'head' in key and 'weight' in key and hasattr(tensor, 'shape'):
            return tensor.shape[0]
    return 1000


def extract_class_names(checkpoint: Any) -> Optional[Dict[int, str]]:
    """Tenta extrair nomes de classes de um checkpoint (se presente)."""
    if not isinstance(checkpoint, dict):
        return None

    possible_keys = [
        'class_names', 'classes', 'class_to_idx', 'idx_to_class',
        'label_names', 'labels', 'class_labels'
    ]

    for key in possible_keys:
        if key in checkpoint:
            labels = checkpoint[key]
            if isinstance(labels, list):
                return {i: name for i, name in enumerate(labels)}
            if isinstance(labels, dict):
                # Already idx -> name
                if all(isinstance(k, int) for k in labels.keys()):
                    return labels  # type: ignore[return-value]
                # name -> idx: invert the mapping
                if all(isinstance(v, int) for v in labels.values()):
                    return {v: k for k, v in labels.items()}
            return labels  # type: ignore[return-value]
    return None


def load_class_names_from_file(labels_file: Optional[str]) -> Optional[Dict[int, str]]:
    """Carrega nomes de classes de um arquivo .txt (um por linha) ou .json (lista ou dict)."""
    if not labels_file:
        return None
    import json
    try:
        if labels_file.endswith('.json'):
            with open(labels_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                if isinstance(data, list):
                    return {i: name for i, name in enumerate(data)}
                if isinstance(data, dict):
                    out: Dict[int, str] = {}
                    for k, v in data.items():
                        try:
                            out[int(k)] = v
                        except Exception:
                            # Skip non-numeric keys
                            pass
                    if out:
                        return out
                    # fallback when the mapping is name -> idx
                    if all(isinstance(v, int) for v in data.values()):
                        return {v: k for k, v in data.items()}
                    return None
        else:
            with open(labels_file, 'r', encoding='utf-8') as f:
                lines = [line.strip() for line in f if line.strip()]
                return {i: name for i, name in enumerate(lines)}
    except Exception:
        return None


def build_model_from_checkpoint(checkpoint: Any, device: Optional[torch.device] = None) -> Tuple[torch.nn.Module, ViTConfig]:
    """Constroi um modelo a partir de um checkpoint que pode ser um dict, state_dict ou o próprio modelo.
    
    Suporta arquiteturas ViT arbitrárias, não limitadas aos nomes predefinidos do timm.
    
    Returns:
        (model, config) - modelo carregado e configuração inferida
    """
    device = device or DEVICE_DEFAULT
    config: Optional[ViTConfig] = None
    
    # Detect and log PyTorch Lightning checkpoints
    if isinstance(checkpoint, dict) and 'pytorch-lightning_version' in checkpoint:
        print(f"[ViTViz] Detectado checkpoint PyTorch Lightning (v{checkpoint.get('pytorch-lightning_version', '?')})")
    
    if isinstance(checkpoint, dict):
        if 'model' in checkpoint:
            # Full model stored inside the dict
            model = checkpoint['model']
            config = infer_config_from_model(model)
            # Validate structure
            is_valid, error_msg = validate_vit_structure(model)
            if not is_valid:
                raise ValueError(f"Invalid model: {error_msg}")
        else:
            # 'state_dict' (PyTorch Lightning), 'model_state_dict' (newer format
            # with embedded class_names), or the dict itself as a plain state_dict
            state_dict = checkpoint
            for key in ('state_dict', 'model_state_dict'):
                if key in checkpoint:
                    state_dict = checkpoint[key]
                    break
            # Strip framework prefixes (Lightning, DDP, etc.)
            state_dict = _strip_state_dict_prefix(state_dict)
            config = infer_config_from_state_dict(state_dict)
            print(f"[ViTViz] Inferred architecture: {config.timm_model_name} "
                  f"(embed_dim={config.embed_dim}, heads={config.num_heads}, "
                  f"layers={config.num_layers}, patch={config.patch_size}, img={config.img_size})")
            # Build a model with the custom architecture
            model = create_vit_from_config(config, device=device)
            # strict=False tolerates variants such as CLIP (norm_pre, etc.)
            model.load_state_dict(state_dict, strict=False)
    else:
        # full model saved via torch.save(model, ...)
        model = checkpoint
        # Validate structure
        is_valid, error_msg = validate_vit_structure(model)
        if not is_valid:
            raise ValueError(f"Invalid model: {error_msg}")
        config = infer_config_from_model(model)

    model = model.to(device)
    model.eval()
    
    # Make sure config is populated
    if config is None:
        config = infer_config_from_model(model)
    
    return model, config

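# Accepted checkpoint shapes (illustrative):
#   torch.save(model, p)                                  -> full nn.Module
#   torch.save(model.state_dict(), p)                     -> plain state_dict
#   torch.save({'state_dict': sd, ...}, p)                -> Lightning-style dict
#   torch.save({'model_state_dict': sd, 'class_names': [...]}, p)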

def load_model_and_labels(
    model_path: str,
    labels_file: Optional[str] = None,
    device: Optional[torch.device] = None,
) -> Tuple[torch.nn.Module, Optional[Dict[int, str]], Optional[str], ViTConfig]:
    """
    ** Main entry point **
    Load the model and, when available, the class names.

    Returns: (model, class_names, labels_source, config) where labels_source ∈ {"file", "checkpoint", "hf", None}
        None when no class names are available.
        config holds the ViT architecture configuration (embed_dim, num_heads, grid_size, etc.)
    """
    device = device or DEVICE_DEFAULT

    # Load directly from the Hugging Face Hub (Transformers -> timm)
    if isinstance(model_path, str) and model_path.startswith("hf-model://"):
        model_id = model_path[len("hf-model://"):].strip("/")
        model, class_names, config = load_vit_from_huggingface(model_id, device=device)
        return model, class_names, 'hf', config

    checkpoint = load_checkpoint(model_path, device=device)
    class_names_ckpt = extract_class_names(checkpoint)
    class_names_file = load_class_names_from_file(labels_file)

    # An explicit labels file takes precedence over names embedded in the checkpoint.
    class_names = class_names_file or class_names_ckpt
    source: Optional[str] = None
    if class_names_file:
        source = 'file'
    elif class_names_ckpt:
        source = 'checkpoint'

    model, config = build_model_from_checkpoint(checkpoint, device=device)
    return model, class_names, source, config
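

if __name__ == "__main__":
    # Minimal usage sketch (illustrative; "model.pth" below is a placeholder,
    # not a file shipped with this module): load a checkpoint, report the
    # inferred architecture, and run a dummy forward pass.
    import sys

    path = sys.argv[1] if len(sys.argv) > 1 else "model.pth"
    model, class_names, source, config = load_model_and_labels(path)
    print(f"Loaded {config.timm_model_name} (labels source: {source})")

    dummy = torch.randn(1, 3, config.img_size, config.img_size, device=DEVICE_DEFAULT)
    with torch.no_grad():
        logits = model(dummy)
    print(f"Logits shape: {tuple(logits.shape)}")  # expected (1, config.num_classes)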