Upload folder using huggingface_hub

1efcb3c verified 4 months ago

7.2 kB

	# Copyright 2024 Bytedance Ltd. and/or its affiliates
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import math
	import os
	from collections import defaultdict
	from io import BytesIO
	from typing import Any, Dict, List, Optional, Union

	import numpy as np
	import torch
	from datasets import load_dataset
	from PIL import Image
	from PIL.Image import Image as ImageObject
	from torch.utils.data import Dataset
	from transformers import PreTrainedTokenizer, ProcessorMixin

	from ..models.transformers.qwen2_vl import get_rope_index
	from . import torch_functional as VF
	import pdb


	def collate_fn(features: List[Dict[str, Any]]) -> Dict[str, Any]:
	    """Merge per-sample feature dicts into a single batch dict.

	    Values that are ``torch.Tensor`` instances are grouped by key and stacked
	    along a new leading dimension; every other value is grouped by key and
	    packed into a NumPy array with ``dtype=object``.

	    Args:
	        features: One dict per sample, mapping feature name to value.

	    Returns:
	        A flat dict mapping each feature name to its batched value. If a key
	        holds tensors in some samples and non-tensors in others, the
	        non-tensor array takes precedence.

	    Raises:
	        RuntimeError: Propagated from ``torch.stack`` when tensors collected
	            under the same key do not share a shape.
	    """
	    tensors = defaultdict(list)
	    non_tensors = defaultdict(list)
	    for feature in features:
	        for key, value in feature.items():
	            if isinstance(value, torch.Tensor):
	                tensors[key].append(value)
	            else:
	                non_tensors[key].append(value)

	    # Build one flat mapping. The previous ``{tensors, non_tensors}`` was a
	    # set literal of two dicts and raised ``TypeError`` (dicts are unhashable).
	    batch: Dict[str, Any] = {key: torch.stack(values, dim=0) for key, values in tensors.items()}
	    for key, values in non_tensors.items():
	        batch[key] = np.array(values, dtype=object)

	    return batch


	class ImageProcessMixin:
	    """Mixin that decodes an image and bounds its pixel count.

	    The host class is expected to set ``max_pixels`` and ``min_pixels``.
	    Either may be ``None``, in which case that bound is not enforced.
	    """

	    # Upper bound on width * height; larger images are scaled down.
	    max_pixels: int
	    # Lower bound on width * height; smaller images are scaled up.
	    min_pixels: int

	    @staticmethod
	    def _scaled_size(width: int, height: int, target_pixels: int):
	        """Return (width, height) scaled by one factor so the area is about ``target_pixels``."""
	        factor = math.sqrt(target_pixels / (width * height))
	        # int() truncates, so a very thin side could become 0, which is not a
	        # usable resize target; keep at least one pixel per side.
	        return max(1, int(width * factor)), max(1, int(height * factor))

	    def process_image(self, image: Union[Dict[str, Any], bytes, "ImageObject"]) -> "ImageObject":
	        """Decode ``image`` if needed, enforce the pixel bounds, and convert to RGB.

	        Args:
	            image: Either a dict holding encoded image data under ``"bytes"``,
	                raw encoded bytes, or an already-decoded image object.

	        Returns:
	            The image in ``"RGB"`` mode, resized when its area fell outside
	            the configured ``[min_pixels, max_pixels]`` range.
	        """
	        if isinstance(image, dict):
	            image = Image.open(BytesIO(image["bytes"]))
	        elif isinstance(image, bytes):
	            image = Image.open(BytesIO(image))

	        # A bound left as None (the default in RLHFDataset) is skipped instead
	        # of being compared against an int, which would raise TypeError.
	        if self.max_pixels is not None and (image.width * image.height) > self.max_pixels:
	            image = image.resize(self._scaled_size(image.width, image.height, self.max_pixels))

	        if self.min_pixels is not None and (image.width * image.height) < self.min_pixels:
	            image = image.resize(self._scaled_size(image.width, image.height, self.min_pixels))

	        if image.mode != "RGB":
	            image = image.convert("RGB")

	        return image


	class RLHFDataset(Dataset, ImageProcessMixin):
	    """Prompt dataset that yields tokenized model inputs for each row.

	    Each row must provide a prompt string under ``prompt_key`` and a reference
	    answer under ``answer_key``. Rows that also contain ``image_key`` are
	    treated as multimodal: the prompt is split on ``<image>`` placeholders and
	    run through ``processor`` together with the row's images.
	    """

	    def __init__(
	        self,
	        data_path: str,
	        tokenizer: PreTrainedTokenizer,
	        processor: Optional[ProcessorMixin],
	        prompt_key: str = "prompt",
	        answer_key: str = "answer",
	        image_key: str = "images",
	        max_prompt_length: int = 1024,
	        truncation: str = "error",
	        system_prompt: Optional[str] = None,
	        max_pixels: Optional[int] = None,
	        min_pixels: Optional[int] = None,
	    ):
	        """Store the configuration and load the dataset.

	        Args:
	            data_path: A directory or file of parquet data, or the name of a
	                remote dataset. A trailing ``@split`` selects the split and is
	                only applied to remote datasets; local parquet data is always
	                loaded with ``split="train"``.
	            tokenizer: Tokenizer used for text-only rows and for
	                ``raw_prompt_ids``.
	            processor: Multimodal processor; required for rows that contain
	                ``image_key``.
	            prompt_key: Column holding the prompt string.
	            answer_key: Column holding the reference answer.
	            image_key: Column holding the list of images.
	            max_prompt_length: Forwarded to ``VF.postprocess_data`` as
	                ``max_length``.
	            truncation: Forwarded to ``VF.postprocess_data``.
	            system_prompt: Optional system message prepended to every prompt.
	            max_pixels: Upper pixel bound read by ``process_image``.
	            min_pixels: Lower pixel bound read by ``process_image``.
	        """
	        self.tokenizer = tokenizer
	        self.processor = processor
	        self.prompt_key = prompt_key
	        self.answer_key = answer_key
	        self.image_key = image_key
	        self.max_prompt_length = max_prompt_length
	        self.truncation = truncation
	        self.system_prompt = system_prompt
	        self.max_pixels = max_pixels
	        self.min_pixels = min_pixels

	        if "@" in data_path:
	            # Split on the last "@" only, so a path that itself contains "@"
	            # no longer fails with "too many values to unpack".
	            data_path, data_split = data_path.rsplit("@", 1)
	        else:
	            data_split = "train"

	        if os.path.isdir(data_path):
	            self.dataset = load_dataset("parquet", data_dir=data_path, split="train")
	        elif os.path.isfile(data_path):
	            self.dataset = load_dataset("parquet", data_files=data_path, split="train")
	        else:  # remote dataset
	            self.dataset = load_dataset(data_path, split=data_split)

	    def __len__(self):
	        """Return the number of rows in the loaded dataset."""
	        return len(self.dataset)

	    def __getitem__(self, index):
	        """Build the model inputs for the row at ``index``.

	        Returns:
	            The row dict with ``input_ids``, ``attention_mask``,
	            ``position_ids``, ``raw_prompt_ids`` and ``ground_truth`` added.
	            ``answer_key`` is removed (moved to ``ground_truth``). For rows
	            with images, ``image_key`` is replaced by ``multi_modal_data``
	            and ``multi_modal_inputs``.

	        Raises:
	            ValueError: If the row contains images but no processor was given.
	        """
	        row_dict: dict = self.dataset[index]
	        prompt_str: str = row_dict[self.prompt_key]
	        messages = []
	        if self.system_prompt:
	            # The system prompt is sent as its own chat message rather than
	            # being concatenated in front of the user prompt.
	            messages.append({"role": "system", "content": self.system_prompt.strip()})

	        if self.image_key in row_dict:
	            if self.processor is None:
	                raise ValueError(
	                    f"Row {index} contains '{self.image_key}' but no processor was provided."
	                )

	            # https://huggingface.co/docs/transformers/en/tasks/image_text_to_text
	            # Every "<image>" placeholder becomes an image entry; the text
	            # between placeholders becomes text entries (empty pieces dropped).
	            content_list = []
	            for i, content in enumerate(prompt_str.split("<image>")):
	                if i != 0:
	                    content_list.append({"type": "image"})

	                if content:
	                    content_list.append({"type": "text", "text": content})

	            messages.append({"role": "user", "content": content_list})
	            prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
	            images = [self.process_image(image) for image in row_dict.pop(self.image_key)]
	            model_inputs = self.processor(images, [prompt], add_special_tokens=False, return_tensors="pt")
	            input_ids = model_inputs.pop("input_ids")[0]
	            attention_mask = model_inputs.pop("attention_mask")[0]
	            row_dict["multi_modal_data"] = {"image": images}
	            # Whatever the processor returned besides input_ids/attention_mask.
	            # NOTE(review): the original comment here read
	            # "num of <|image_pad|>: hw //(merge_size2)" - presumably
	            # h * w // merge_size ** 2 image-pad tokens per image; confirm.
	            row_dict["multi_modal_inputs"] = dict(model_inputs)

	            # qwen2vl mrope
	            position_ids = get_rope_index(
	                self.processor,
	                input_ids=input_ids,
	                image_grid_thw=model_inputs["image_grid_thw"],
	                attention_mask=attention_mask,
	            )  # (3, seq_length)
	        else:
	            messages.append({"role": "user", "content": prompt_str})
	            prompt = self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
	            model_inputs = self.tokenizer([prompt], add_special_tokens=False, return_tensors="pt")
	            input_ids = model_inputs.pop("input_ids")[0]
	            attention_mask = model_inputs.pop("attention_mask")[0]
	            # Running count of attended tokens, zero-based and clipped at 0.
	            position_ids = torch.clip(attention_mask.cumsum(dim=0) - 1, min=0, max=None)  # (seq_length,)

	        # NOTE(review): presumably pads on the left / truncates to
	        # max_prompt_length; the exact behavior lives in torch_functional.
	        input_ids, attention_mask, position_ids = VF.postprocess_data(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            max_length=self.max_prompt_length,
	            pad_token_id=self.tokenizer.pad_token_id,
	            left_pad=True,
	            truncation=self.truncation,
	        )
	        row_dict["input_ids"] = input_ids
	        row_dict["attention_mask"] = attention_mask
	        row_dict["position_ids"] = position_ids
	        # Token ids of the rendered prompt string, encoded by the tokenizer alone.
	        row_dict["raw_prompt_ids"] = self.tokenizer.encode(prompt, add_special_tokens=False)
	        row_dict["ground_truth"] = row_dict.pop(self.answer_key)
	        return row_dict