Instructions to use finalpandas/CASA-Helium1-VL-2B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use finalpandas/CASA-Helium1-VL-2B with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="finalpandas/CASA-Helium1-VL-2B", trust_remote_code=True)

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("finalpandas/CASA-Helium1-VL-2B", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use finalpandas/CASA-Helium1-VL-2B with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "finalpandas/CASA-Helium1-VL-2B"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{ "model": "finalpandas/CASA-Helium1-VL-2B", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
```
Use Docker
docker model run hf.co/finalpandas/CASA-Helium1-VL-2B
- SGLang
How to use finalpandas/CASA-Helium1-VL-2B with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "finalpandas/CASA-Helium1-VL-2B" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{ "model": "finalpandas/CASA-Helium1-VL-2B", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "finalpandas/CASA-Helium1-VL-2B" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{ "model": "finalpandas/CASA-Helium1-VL-2B", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
```
- Docker Model Runner
How to use finalpandas/CASA-Helium1-VL-2B with Docker Model Runner:
docker model run hf.co/finalpandas/CASA-Helium1-VL-2B
| # pylint: disable=protected-access | |
| """Utils to handle CASA layers construction""" | |
| from contextlib import contextmanager | |
| from dataclasses import dataclass, fields | |
| from typing import Any, Callable, Generic, TypeVar | |
| import torch | |
def delta_w_factory(
    org_lin: torch.nn.Linear, new_lin: torch.nn.Linear
) -> Callable[[torch.Tensor], torch.Tensor]:
    """Factory for building a linear op whose weights are the sum of two layers' weights.

    Args:
        org_lin: The original linear layer.
        new_lin: The linear layer whose parameters are added on top of ``org_lin``.

    Returns:
        A callable applying ``linear(input, org_lin.weight + new_lin.weight, bias)``.
        The bias is ``None`` when ``org_lin`` has no bias (``new_lin``'s bias is
        then ignored), ``org_lin.bias`` when only ``org_lin`` has one, and the
        sum of both biases otherwise.
    """

    def _delta_w_fwd(input: torch.Tensor) -> torch.Tensor:  # pylint: disable=redefined-builtin
        # Parameters are looked up at call time, so later updates to either
        # layer are reflected without rebuilding the closure.
        if org_lin.bias is None:
            bias = None
        elif new_lin.bias is None:
            # Avoid `Tensor + None` when the delta layer was built without bias.
            bias = org_lin.bias
        else:
            bias = org_lin.bias + new_lin.bias
        return torch.nn.functional.linear(input, org_lin.weight + new_lin.weight, bias)

    return _delta_w_fwd
@dataclass
class StreamingState:
    """Streaming State used by CASA layers at inference to save
    e.g. the offset, the KV Cache and other persistent states.

    The class must be a dataclass: every helper below relies on
    ``dataclasses.fields`` to enumerate the state's fields.
    """

    offset: int = 0

    def _is_valid_field(self, key: str) -> bool:
        """Return True if ``key`` names one of this state's dataclass fields."""
        return any(f.name == key for f in fields(self))

    def _init_field(self, key: str) -> None:
        """Init function for non-argument dependent defaults.

        ``offset`` is set to 0; every other field is set to ``None``.
        """
        assert self._is_valid_field(key)
        if key == "offset":
            self.offset = 0
        else:
            # for fields which should be set explicitly and cannot be auto-initialized
            setattr(self, key, None)

    def init(self) -> None:
        """Initialize every field of the state."""
        for f in fields(self):
            self._init_field(f.name)

    def _reset_field(self, name: str) -> None:
        """Resets the given field"""
        self._init_field(name)

    def reset(self) -> None:
        """Reset every field of the state."""
        for f in fields(self):
            self._reset_field(f.name)

    def _get_field(self, f: str) -> Any:
        """Get field and init if not"""
        assert self._is_valid_field(f)
        if getattr(self, f) is None:
            self._init_field(f)
        return getattr(self, f)

    def _set_field(self, f: str, value: Any) -> None:
        """Assign ``value`` to the field named ``f``."""
        assert self._is_valid_field(f)
        setattr(self, f, value)
StreamingStateT = TypeVar("StreamingStateT", bound=StreamingState)


class StreamingModule(torch.nn.Module, Generic[StreamingStateT]):  # pylint: disable=abstract-method
    """Overrides Audiocraft's Streaming modules with additional small utils"""

    def __init__(self, state_class: type) -> None:
        """Create the module with a fresh ``state_class()`` as its streaming state."""
        torch.nn.Module.__init__(self)
        self.is_streaming: bool = False
        self.enable_viz: tuple[str, ...] = ()
        self._streaming_state: StreamingStateT = state_class()

    @property
    def streaming_state(self) -> StreamingStateT:
        """The streaming state owned by this module."""
        return self._streaming_state

    def _apply_named_streaming(self, fn: Callable):
        """Apply function to all streaming modules"""
        # `named_modules()` also yields `self`, so the root module is included.
        for name, module in self.named_modules():
            if isinstance(module, StreamingModule):
                fn(name, module)

    def reset_streaming(self):
        """Reset the streaming state."""

        def _reset(_: str, module: StreamingModule):
            module._streaming_state.reset()

        self._apply_named_streaming(_reset)

    def _set_streaming(self, streaming: bool, viz: tuple[str, ...] = ()):
        """Set all streaming modules in streaming mode"""

        def _apply_flags(_: str, module: StreamingModule) -> None:
            module.is_streaming = streaming
            module.enable_viz = viz
            if streaming:
                # Entering streaming mode (re)initializes the state.
                module.streaming_state.init()

        self._apply_named_streaming(_apply_flags)

    @contextmanager
    def streaming(self, stream: bool = True, viz: tuple[str, ...] = ()):
        """Context manager to enter streaming mode. Reset streaming state on exit."""
        self._set_streaming(stream, viz)
        try:
            yield
        finally:
            # Always leave streaming mode and clear the state, even on error.
            self._set_streaming(False, ())
            self.reset_streaming()