| import argparse |
| from pathlib import Path |
|
|
| import torch |
| from huggingface_hub import hf_hub_download |
|
|
| from configuration_f2p_decoder import F2PDecoderConfig |
| from modeling_f2p_decoder import F2PDecoderModel |
|
|
|
|
def convert(output_dir: str) -> None:
    """Convert the original decoder checkpoint into a Hugging Face artifact.

    Downloads ``model.pt`` from the ``nyu-visionx/siglip2_decoder`` Hub
    repository, prefixes every parameter name with ``decoder.``, loads the
    result into a default-configured ``F2PDecoderModel`` and writes the model
    to ``output_dir`` via ``save_pretrained``.

    Args:
        output_dir: Directory the converted model is saved to.

    Raises:
        RuntimeError: If, after ignoring ``image_mean`` / ``image_std``, the
            model reports any missing keys or any non-empty unexpected keys.
    """
    destination = Path(output_dir)

    # NOTE(review): torch.load unpickles the downloaded file. On PyTorch
    # versions where ``weights_only`` defaults to False this can execute
    # arbitrary code embedded in the checkpoint -- consider passing
    # ``weights_only=True`` once it is confirmed the file is a plain
    # tensor state dict.
    weights_file = hf_hub_download("nyu-visionx/siglip2_decoder", "model.pt")
    raw_weights = torch.load(weights_file, map_location="cpu")

    # Re-root every checkpoint entry under the "decoder." prefix
    # (presumably the wrapper model holds the network in a ``decoder``
    # attribute -- TODO confirm against modeling_f2p_decoder).
    prefixed_weights = {}
    for name, tensor in raw_weights.items():
        prefixed_weights[f"decoder.{name}"] = tensor

    model = F2PDecoderModel(F2PDecoderConfig())

    # Load non-strictly so mismatches are reported back instead of raising
    # immediately; they are validated explicitly below.
    absent, surplus = model.load_state_dict(prefixed_weights, strict=False)

    # Drop falsy (e.g. empty-string) entries from the unexpected list.
    surplus = list(filter(None, surplus))

    # These two names are allowed to be absent from the checkpoint
    # (presumably they are provided by the model itself -- TODO confirm).
    tolerated_absent = {"image_mean", "image_std"}
    absent = [name for name in absent if name not in tolerated_absent]

    if surplus or absent:
        raise RuntimeError(
            "Checkpoint conversion mismatch: "
            f"missing={absent}, unexpected={surplus}"
        )

    model.save_pretrained(destination, safe_serialization=True)
    print(f"Saved Hugging Face artifact to {destination}")
|
|
|
|
def main() -> None:
    """Command-line entry point: parse ``--output_dir`` and run ``convert``.

    ``--output_dir`` defaults to ``hf_artifacts/f2p_decoder`` when omitted.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--output_dir", default="hf_artifacts/f2p_decoder")
    target_dir = cli.parse_args().output_dir
    convert(target_dir)
|
|
|
|
# Run the conversion only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or a download.
if __name__ == "__main__":
    main()
|
|