# Provenance: uploaded to the Hugging Face Hub by dn6 (HF Staff) using
# huggingface_hub — commit 3b2978d (verified).
import json
import base64
from io import BytesIO
from typing import List, Union
from PIL import Image
from urllib.parse import urlparse
from diffusers.modular_pipelines.modular_pipeline_utils import ConfigSpec
from huggingface_hub import InferenceClient
from diffusers import ModularPipeline, ModularPipelineBlocks
from diffusers.modular_pipelines import InputParam, OutputParam, PipelineState
from diffusers.utils import logger
FRAME_MULTIPLE = 4
def _encode_image_to_base64(img: Image.Image, max_size_mb: float = 3.5) -> str:
"""Helper function to encode a PIL Image to base64 data URI.
Args:
img: PIL Image object
max_size_mb: Maximum size in MB for data URIs
Returns:
str: Base64 encoded data URI
"""
buffer = BytesIO()
img.save(buffer, format="PNG")
size_mb = len(buffer.getvalue()) / (1024 * 1024)
if size_mb <= max_size_mb:
img_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
return f"data:image/png;base64,{img_str}"
if img.mode not in ("RGB", "L"):
img = img.convert("RGB")
if size_mb > max_size_mb * 2:
scale = (max_size_mb / size_mb) ** 0.5
new_size = (int(img.width * scale), int(img.height * scale))
img = img.resize(new_size, Image.Resampling.LANCZOS)
buffer = BytesIO()
img.save(buffer, format="JPEG", quality=85, optimize=True)
img_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
return f"data:image/jpeg;base64,{img_str}"
def image_to_uri(image: Union[str, Image.Image], max_size_mb: float = 3.5) -> str:
"""Convert an image to a URI.
Args:
image: URL string, local file path string, or PIL Image object
max_size_mb: Maximum size in MB for data URIs (default 3.5MB)
Returns:
str: URL if input is a URL, data URI otherwise
"""
if isinstance(image, Image.Image):
return _encode_image_to_base64(image, max_size_mb)
parsed = urlparse(image)
if parsed.scheme in ("http", "https") and parsed.netloc:
return image
with Image.open(image) as img:
return _encode_image_to_base64(img, max_size_mb)
class ImageToMatrixGameAction(ModularPipelineBlocks):
model_name = "MatrixGameWan"
@property
def inputs(self):
return [
InputParam("image"),
InputParam("num_frames"),
InputParam("prompt"),
]
@property
def intermediate_outputs(self):
return [OutputParam("actions")]
@property
def expected_configs(self) -> List[ConfigSpec]:
return [ConfigSpec("model_id", default="Qwen/Qwen2.5-VL-72B-Instruct")]
def __call__(self, components: ModularPipeline, state: PipelineState):
client = InferenceClient()
instructions = """
You will be provided an image and you have to interpret how you would move inside it
if the image was in 3D space.
Here are the available actions you can take:
Movement Actions: forward, left, right
Camera Actions: camera_l, camera_r
You can also combine actions with an _ to create compound actions. e.g. forward_left_camera_l
Each action is rendered for 12 frames, so make sure the number of actions suggested fits into the total number of frames available: {num_frames}
e.g ["forward", "forward_left", "camera_l"]
Here are additional instructions for you to follow:
{prompt}
Only respond with the list of actions you have to take and nothing else.
"""
block_state = self.get_block_state(state)
image = block_state.image
prompt = block_state.prompt or ""
num_frames = block_state.num_frames
instructions = instructions.format(prompt=prompt, num_frames=num_frames)
try:
user_message = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": image_to_uri(image)},
},
{"type": "text", "text": instructions},
],
}
]
completion = client.chat.completions.create(
model=components.model_id,
messages=user_message,
temperature=0.2,
max_tokens=1000,
)
content = completion.choices[0].message.content
block_state.actions = json.loads(content)
self.set_block_state(state, block_state)
return components, state
except Exception as e:
logger.warning("Unable to generate actions. Defaulting to random actions")
return components, state