"""Convert the ``nyu-visionx/siglip2_decoder`` checkpoint to a Hugging Face artifact."""

import argparse
from pathlib import Path

import torch
from huggingface_hub import hf_hub_download

from configuration_f2p_decoder import F2PDecoderConfig
from modeling_f2p_decoder import F2PDecoderModel


def convert(output_dir: str) -> None:
    """Fetch the upstream checkpoint, load it into ``F2PDecoderModel`` and save it.

    The raw ``model.pt`` state dict is fetched from the
    ``nyu-visionx/siglip2_decoder`` Hub repository, every key is prefixed with
    ``decoder.``, and the result is loaded into a default-configured
    ``F2PDecoderModel``. The model is then written to ``output_dir`` via
    ``save_pretrained(..., safe_serialization=True)``.

    Args:
        output_dir: Directory the converted model is saved to.

    Raises:
        RuntimeError: If, after ignoring ``image_mean`` and ``image_std``, the
            checkpoint and the model disagree on any parameter name.
    """
    output_path = Path(output_dir)

    checkpoint_path = hf_hub_download("nyu-visionx/siglip2_decoder", "model.pt")
    # SECURITY: ``torch.load`` unpickles the downloaded file. ``weights_only=True``
    # restricts unpickling to tensors and plain containers so a tampered
    # checkpoint cannot execute arbitrary code. This assumes ``model.pt`` is a
    # plain state dict (the code below already treats it as a mapping).
    state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)

    # The checkpoint keys are re-rooted under ``decoder.`` before loading;
    # presumably F2PDecoderModel holds the network in a ``decoder`` attribute
    # (verify against modeling_f2p_decoder).
    state_dict = {f"decoder.{key}": value for key, value in state_dict.items()}

    config = F2PDecoderConfig()
    model = F2PDecoderModel(config)

    # Load non-strictly so mismatches can be inspected and reported together
    # instead of failing on the first one.
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

    # These two entries are allowed to be absent from the checkpoint
    # (presumably they are initialised by the model itself -- TODO confirm).
    ignorable_missing = {"image_mean", "image_std"}
    missing_keys = [key for key in missing_keys if key not in ignorable_missing]

    if missing_keys or unexpected_keys:
        raise RuntimeError(
            "Checkpoint conversion mismatch: "
            f"missing={missing_keys}, unexpected={unexpected_keys}"
        )

    model.save_pretrained(output_path, safe_serialization=True)
    print(f"Saved Hugging Face artifact to {output_path}")


def main() -> None:
    """Parse command-line arguments and run the conversion."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", default="hf_artifacts/f2p_decoder")
    args = parser.parse_args()
    convert(args.output_dir)


if __name__ == "__main__":
    main()