Image-Text-to-Text
Transformers
Safetensors
isaac
text-generation
perceptron
issac-0.1
conversational
custom_code
Instructions to use PerceptronAI/Isaac-0.1 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use PerceptronAI/Isaac-0.1 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="PerceptronAI/Isaac-0.1", trust_remote_code=True)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("PerceptronAI/Isaac-0.1", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use PerceptronAI/Isaac-0.1 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "PerceptronAI/Isaac-0.1"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "PerceptronAI/Isaac-0.1",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
Use Docker
docker model run hf.co/PerceptronAI/Isaac-0.1
- SGLang
How to use PerceptronAI/Isaac-0.1 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "PerceptronAI/Isaac-0.1" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "PerceptronAI/Isaac-0.1",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "PerceptronAI/Isaac-0.1" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "PerceptronAI/Isaac-0.1",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
- Docker Model Runner
How to use PerceptronAI/Isaac-0.1 with Docker Model Runner:
docker model run hf.co/PerceptronAI/Isaac-0.1
Commit ·
3566174
1
Parent(s): c00ac74
checkpoint restructure
Browse files
model-00002-of-00003.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:b73a606d306a09519e3fbe7bfd29077d39db48fee47ce19521b6b5c398cdcc32
|
| 3 |
-
size 4054187824
|
|
|
|
|
|
|
|
|
|
|
|
model-00003-of-00003.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:6941d35ff1feae1603946f8746a71205bb86343b57968402df2e737faf9258a2
|
| 3 |
-
size 1244659840
|
|
|
|
|
|
|
|
|
|
|
|
model-00001-of-00003.safetensors → model.safetensors
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b84184ec79aa409631e68dc76c3133bc1bbae76d61842fdcd4fe553dd6a3b579
|
| 3 |
+
size 10268388224
|
modular_isaac.py
CHANGED
|
@@ -1579,8 +1579,9 @@ class IsaacModel(Qwen3PreTrainedModel):
|
|
| 1579 |
raise ValueError("IsaacConfig should always have vision_config")
|
| 1580 |
|
| 1581 |
|
| 1582 |
-
|
| 1583 |
-
self.
|
|
|
|
| 1584 |
|
| 1585 |
# Dispatch table for TensorStream balanced embedding (text + vision)
|
| 1586 |
self.embed_fns = {
|
|
@@ -1632,10 +1633,6 @@ class IsaacModel(Qwen3PreTrainedModel):
|
|
| 1632 |
def vision_model(self) -> nn.Module:
|
| 1633 |
return self.vision_embedding.vision_tower
|
| 1634 |
|
| 1635 |
-
@property
|
| 1636 |
-
def vision_tower(self) -> nn.Module:
|
| 1637 |
-
return self.vision_embedding.vision_tower
|
| 1638 |
-
|
| 1639 |
def embed_text_tokens(self, token_ids: torch.Tensor) -> torch.Tensor:
|
| 1640 |
"""Embed text tokens, squeezing singleton dimensions."""
|
| 1641 |
# Text events are shaped as (..., 1); squeeze the singleton index dim
|
|
@@ -1647,7 +1644,7 @@ class IsaacModel(Qwen3PreTrainedModel):
|
|
| 1647 |
def embed_vision(self, vision_tokens: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
|
| 1648 |
"""Embed vision tokens using the vision encoder."""
|
| 1649 |
# vision tokens is (seq_patches, token_grids)
|
| 1650 |
-
return self.
|
| 1651 |
|
| 1652 |
def embed_stream(self, tensor_stream: TensorStream) -> torch.Tensor:
|
| 1653 |
"""
|
|
@@ -2110,4 +2107,4 @@ __all__ = [
|
|
| 2110 |
"IsaacForConditionalGeneration",
|
| 2111 |
"IsaacImageProcessorFast",
|
| 2112 |
"IsaacProcessor",
|
| 2113 |
-
]
|
|
|
|
| 1579 |
raise ValueError("IsaacConfig should always have vision_config")
|
| 1580 |
|
| 1581 |
|
| 1582 |
+
|
| 1583 |
+
self.vision_tower = IsaacVisionTransformer(config.vision_config)
|
| 1584 |
+
self.multimodal_projector = IsaacMultiModalProjector(config)
|
| 1585 |
|
| 1586 |
# Dispatch table for TensorStream balanced embedding (text + vision)
|
| 1587 |
self.embed_fns = {
|
|
|
|
| 1633 |
def vision_model(self) -> nn.Module:
|
| 1634 |
return self.vision_embedding.vision_tower
|
| 1635 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1636 |
def embed_text_tokens(self, token_ids: torch.Tensor) -> torch.Tensor:
|
| 1637 |
"""Embed text tokens, squeezing singleton dimensions."""
|
| 1638 |
# Text events are shaped as (..., 1); squeeze the singleton index dim
|
|
|
|
| 1644 |
def embed_vision(self, vision_tokens: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
|
| 1645 |
"""Embed vision tokens using the vision encoder."""
|
| 1646 |
# vision tokens is (seq_patches, token_grids)
|
| 1647 |
+
return self.multimodal_projector(self.vision_tower(vision_tokens))
|
| 1648 |
|
| 1649 |
def embed_stream(self, tensor_stream: TensorStream) -> torch.Tensor:
|
| 1650 |
"""
|
|
|
|
| 2107 |
"IsaacForConditionalGeneration",
|
| 2108 |
"IsaacImageProcessorFast",
|
| 2109 |
"IsaacProcessor",
|
| 2110 |
+
]
|