# Provenance: uploaded to the Hugging Face Hub by dn6 (HF Staff) using
# huggingface_hub — commit 3b2978d (verified).
import json
import base64
from io import BytesIO
from typing import List, Union
from PIL import Image
from urllib.parse import urlparse
from diffusers.modular_pipelines.modular_pipeline_utils import ConfigSpec
from huggingface_hub import InferenceClient
from diffusers import ModularPipeline, ModularPipelineBlocks
from diffusers.modular_pipelines import InputParam, OutputParam, PipelineState
from diffusers.utils import logger
FRAME_MULTIPLE = 4
def _encode_image_to_base64(img: Image.Image, max_size_mb: float = 3.5) -> str:
"""Helper function to encode a PIL Image to base64 data URI.
Args:
img: PIL Image object
max_size_mb: Maximum size in MB for data URIs
Returns:
str: Base64 encoded data URI
"""
buffer = BytesIO()
img.save(buffer, format="PNG")
size_mb = len(buffer.getvalue()) / (1024 * 1024)
if size_mb <= max_size_mb:
img_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
return f"data:image/png;base64,{img_str}"
if img.mode not in ("RGB", "L"):
img = img.convert("RGB")
if size_mb > max_size_mb * 2:
scale = (max_size_mb / size_mb) ** 0.5
new_size = (int(img.width * scale), int(img.height * scale))
img = img.resize(new_size, Image.Resampling.LANCZOS)
buffer = BytesIO()
img.save(buffer, format="JPEG", quality=85, optimize=True)
img_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
return f"data:image/jpeg;base64,{img_str}"
def image_to_uri(image: Union[str, Image.Image], max_size_mb: float = 3.5) -> str:
"""Convert an image to a URI.
Args:
image: URL string, local file path string, or PIL Image object
max_size_mb: Maximum size in MB for data URIs (default 3.5MB)
Returns:
str: URL if input is a URL, data URI otherwise
"""
if isinstance(image, Image.Image):
return _encode_image_to_base64(image, max_size_mb)
parsed = urlparse(image)
if parsed.scheme in ("http", "https") and parsed.netloc:
return image
with Image.open(image) as img:
return _encode_image_to_base64(img, max_size_mb)
class ImageToMatrixGameAction(ModularPipelineBlocks):
model_name = "MatrixGameWan"
@property
def inputs(self):
return [
InputParam("image"),
InputParam("num_frames"),
InputParam("prompt"),
]
@property
def intermediate_outputs(self):
return [OutputParam("actions")]
@property
def expected_configs(self) -> List[ConfigSpec]:
return [ConfigSpec("model_id", default="Qwen/Qwen2.5-VL-72B-Instruct")]
def __call__(self, components: ModularPipeline, state: PipelineState):
client = InferenceClient()
instructions = """
You will be provided an image and you have to interpret how you would move inside it
if the image was in 3D space.
Here are the available actions you can take:
Movement Actions: forward, left, right
Camera Actions: camera_l, camera_r
You can also combine actions with an _ to create compound actions. e.g. forward_left_camera_l
Each action is rendered for 12 frames, so make sure the number of actions suggested fits into the total number of frames available: {num_frames}
e.g ["forward", "forward_left", "camera_l"]
Here are additional instructions for you to follow:
{prompt}
Only respond with the list of actions you have to take and nothing else.
"""
block_state = self.get_block_state(state)
image = block_state.image
prompt = block_state.prompt or ""
num_frames = block_state.num_frames
instructions = instructions.format(prompt=prompt, num_frames=num_frames)
try:
user_message = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": image_to_uri(image)},
},
{"type": "text", "text": instructions},
],
}
]
completion = client.chat.completions.create(
model=components.model_id,
messages=user_message,
temperature=0.2,
max_tokens=1000,
)
content = completion.choices[0].message.content
block_state.actions = json.loads(content)
self.set_block_state(state, block_state)
return components, state
except Exception as e:
logger.warning("Unable to generate actions. Defaulting to random actions")
return components, state