from transformers import PretrainedConfig


class TRENConfig(PretrainedConfig):
    """Hyper-parameter container for T-REN (Text-aligned Region Encoder Network).

    Only the trainable T-REN head (RegionEncoder) weights are stored in this
    HF repo. The DINOv3 ViT-L/16 backbone weights are not included and must be
    downloaded separately from Facebook Research (see ``load_backbone()`` in
    ``TRENModel``).

    Every constructor argument is stored unchanged as an instance attribute of
    the same name; no validation is performed here. The field meanings below
    are inferred from the names only -- NOTE(review): confirm against
    ``TRENModel``, which is not part of this file.

    Args:
        patch_size: Presumably the backbone patch edge length in pixels.
        hidden_dim: Presumably the feature width used by the region encoder.
        text_embed_dim: Presumably the width of the text embedding space.
        num_decoder_layers: Presumably the number of decoder layers in the head.
        num_attention_heads: Presumably the attention head count per layer.
        image_resolution: Presumably the input image side length in pixels.
        num_multiscale_regions: Presumably the number of region scales.
        merging_iou_threshold: Presumably an IoU cut-off used when merging
            regions.
        merging_similarity_threshold: Presumably a similarity cut-off used
            when merging regions.
        **kwargs: Forwarded untouched to ``PretrainedConfig.__init__``.
    """

    model_type = "tren"

    # Maps the Auto* entry points to the custom modules that implement them.
    auto_map = {
        "AutoConfig": "configuration_tren.TRENConfig",
        "AutoModel": "modeling_tren.TRENModel",
    }

    def __init__(
        self,
        patch_size: int = 16,
        hidden_dim: int = 1024,
        text_embed_dim: int = 1024,
        num_decoder_layers: int = 2,
        num_attention_heads: int = 8,
        image_resolution: int = 512,
        num_multiscale_regions: int = 3,
        merging_iou_threshold: float = 0.8,
        merging_similarity_threshold: float = 0.975,
        **kwargs,
    ):
        # Insertion order mirrors the signature so attributes are assigned in
        # the same sequence as the parameters are declared.
        own_fields = {
            "patch_size": patch_size,
            "hidden_dim": hidden_dim,
            "text_embed_dim": text_embed_dim,
            "num_decoder_layers": num_decoder_layers,
            "num_attention_heads": num_attention_heads,
            "image_resolution": image_resolution,
            "num_multiscale_regions": num_multiscale_regions,
            "merging_iou_threshold": merging_iou_threshold,
            "merging_similarity_threshold": merging_similarity_threshold,
        }
        for field_name, field_value in own_fields.items():
            setattr(self, field_name, field_value)

        # The base-class initialiser runs last, after the T-REN fields are set.
        super().__init__(**kwargs)