clean up #16
by pawlowskipawel - opened

modeling_florence2.py  CHANGED  (+15 -7)
```diff
@@ -2643,7 +2643,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         return x
 
     def _merge_input_ids_with_image_features(
-        self, image_features, inputs_embeds
+        self, image_features, inputs_embeds, task_prefix_attention_mask=None
     ):
         batch_size, image_token_length = image_features.size()[:-1]
         device = image_features.device
@@ -2655,10 +2655,12 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
             return image_features, image_attention_mask
 
         task_prefix_embeds = inputs_embeds
-
+
+        if task_prefix_attention_mask is None:
+            task_prefix_attention_mask = torch.ones(batch_size, task_prefix_embeds.size(1), device=device)
 
-
-
+        if len(task_prefix_attention_mask.shape) == 3:
+            task_prefix_attention_mask = task_prefix_attention_mask[:, 0]
 
         # concat [image embeds, task prefix embeds]
         inputs_embeds = torch.cat([image_features, task_prefix_embeds], dim=1)
@@ -2719,12 +2721,14 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "A green car parked in front of a yellow building."
         ```"""
+        print("asdasdasdasdasdasdasdasda")
+
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
+        print("asdasdasdasdasdasdasdasda")
         image_features = None
         if inputs_embeds is None:
             # 1. Extra the input embeddings
@@ -2735,7 +2739,9 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
                 # (batch_size, num_image_tokens, hidden_size)
                 image_features = self._encode_image(pixel_values)
                 inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
-
+
+        print(attention_mask)
+
         if inputs_embeds is not None:
             attention_mask = attention_mask.to(inputs_embeds.dtype)
         outputs = self.language_model(
@@ -2781,6 +2787,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         input_ids,
         inputs_embeds=None,
         pixel_values=None,
+        attention_mask=None,
         **kwargs
     ):
 
@@ -2791,11 +2798,12 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         # 2. Merge text and images
         if pixel_values is not None:
             image_features = self._encode_image(pixel_values)
-            inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
+            inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds, task_prefix_attention_mask=attention_mask)
 
         return self.language_model.generate(
             input_ids=None,
             inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
             **kwargs
         )
 
```
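For context, here is a minimal, self-contained sketch of the masking behaviour the new `task_prefix_attention_mask` argument enables. Only the all-ones default, the 3-D squeeze, and the embedding concatenation come from the hunks above; the helper name `merge_image_and_text_masks`, the shape constants, and the final concatenation of `image_attention_mask` with the text mask are illustrative assumptions (that last step is outside the changed lines).

```python
import torch


def merge_image_and_text_masks(image_features, task_prefix_embeds,
                               task_prefix_attention_mask=None):
    # Image tokens are always attended to: build an all-ones mask for them.
    batch_size, image_token_length = image_features.size()[:-1]
    device = image_features.device
    image_attention_mask = torch.ones(batch_size, image_token_length, device=device)

    # Mirror the patched default: with no explicit text mask, assume no padding.
    if task_prefix_attention_mask is None:
        task_prefix_attention_mask = torch.ones(
            batch_size, task_prefix_embeds.size(1), device=device
        )

    # Collapse an accidental 3-D mask back to (batch, seq), as in the patch.
    if len(task_prefix_attention_mask.shape) == 3:
        task_prefix_attention_mask = task_prefix_attention_mask[:, 0]

    # Concatenate along the sequence dimension: [image tokens | text tokens].
    inputs_embeds = torch.cat([image_features, task_prefix_embeds], dim=1)
    attention_mask = torch.cat([image_attention_mask, task_prefix_attention_mask], dim=1)
    return inputs_embeds, attention_mask


# Shape check only, no model weights needed:
img = torch.randn(2, 577, 768)   # (batch, image tokens, hidden)
txt = torch.randn(2, 12, 768)    # (batch, prompt tokens, hidden)
mask = torch.ones(2, 12)         # 0 where the prompt is padded
embeds, full_mask = merge_image_and_text_masks(img, txt, mask)
print(embeds.shape, full_mask.shape)  # torch.Size([2, 589, 768]) torch.Size([2, 589])
```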
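And a sketch of how a padded batch might be driven through the patched `generate`, which is what forwarding `attention_mask` into `_merge_input_ids_with_image_features` is for. The checkpoint name, prompts, and image path are placeholders, and passing `attention_mask=` to `generate` assumes this patch is applied to the remote modeling code.

```python
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Placeholder checkpoint; any Florence-2 checkpoint using this modeling file would do.
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)

image = Image.open("car.jpg")                   # placeholder image
prompts = ["<CAPTION>", "<DETAILED_CAPTION>"]   # batched prompts of different lengths

# padding=True makes the tokenizer pad the shorter prompt and emit an attention mask.
inputs = processor(text=prompts, images=[image, image], return_tensors="pt", padding=True)

generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    attention_mask=inputs["attention_mask"],  # forwarded as task_prefix_attention_mask
    max_new_tokens=128,
)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))
```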