Instructions to use FreedomIntelligence/ALLaVA-3B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use FreedomIntelligence/ALLaVA-3B with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="FreedomIntelligence/ALLaVA-3B", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("FreedomIntelligence/ALLaVA-3B", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use FreedomIntelligence/ALLaVA-3B with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "FreedomIntelligence/ALLaVA-3B"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "FreedomIntelligence/ALLaVA-3B", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
Use Docker
docker model run hf.co/FreedomIntelligence/ALLaVA-3B
- SGLang
How to use FreedomIntelligence/ALLaVA-3B with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "FreedomIntelligence/ALLaVA-3B" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "FreedomIntelligence/ALLaVA-3B", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
  --model-path "FreedomIntelligence/ALLaVA-3B" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "FreedomIntelligence/ALLaVA-3B", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
- Docker Model Runner
How to use FreedomIntelligence/ALLaVA-3B with Docker Model Runner:
docker model run hf.co/FreedomIntelligence/ALLaVA-3B
upload modeling_llava_phi.py
Browse files
- modeling_llava_phi.py +0 -37
modeling_llava_phi.py
CHANGED
|
@@ -193,44 +193,7 @@ class LlavaPhiForCausalLM(PhiForCausalLM, LlavaMetaForCausalLM):
|
|
| 193 |
return model_inputs
|
| 194 |
|
| 195 |
|
| 196 |
-
# def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
|
| 197 |
-
# '''
|
| 198 |
-
# This function is called for each token at inference
|
| 199 |
-
# '''
|
| 200 |
-
# pdb.set_trace()
|
| 201 |
-
# images = kwargs.pop("images", None)
|
| 202 |
|
| 203 |
-
|
| 204 |
-
# _inputs = super().prepare_inputs_for_generation(
|
| 205 |
-
# input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
|
| 206 |
-
# )
|
| 207 |
-
# if images is not None:
|
| 208 |
-
# _inputs['images'] = images
|
| 209 |
-
# return _inputs
|
| 210 |
-
|
| 211 |
-
# def build_chat_input(self, text, images):
|
| 212 |
-
|
| 213 |
-
# return inputs
|
| 214 |
-
|
| 215 |
-
# def chat(self, tokenizer, messages: List[dict], stream=False,
|
| 216 |
-
# generation_config: Optional[GenerationConfig]=None):
|
| 217 |
-
# generation_config = generation_config or self.generation_config
|
| 218 |
-
# input_ids = build_chat_input(self, tokenizer, messages, generation_config.max_new_tokens)
|
| 219 |
-
# if stream:
|
| 220 |
-
# streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
|
| 221 |
-
# Thread(target=self.generate, kwargs=dict(
|
| 222 |
-
# inputs=input_ids, streamer=streamer,
|
| 223 |
-
# generation_config=generation_config,
|
| 224 |
-
# )).start()
|
| 225 |
-
# return streamer
|
| 226 |
-
# else:
|
| 227 |
-
# outputs = self.generate(input_ids, generation_config=generation_config)
|
| 228 |
-
# response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
|
| 229 |
-
# return response
|
| 230 |
-
|
| 231 |
-
# def collate_text_input(self, ):
|
| 232 |
-
# pass
|
| 233 |
-
|
| 234 |
|
| 235 |
def chat(
|
| 236 |
self,
|
|
|
|
| 193 |
return model_inputs
|
| 194 |
|
| 195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
def chat(
|
| 199 |
self,
|