Upload files with `vila-upload`.
auto_processor.py (+41 −5)
```diff
@@ -18,8 +18,35 @@ from .media import Image, Video, extract_media
 from .mm_utils import process_image, process_images
 from .tokenizer_utils import tokenize_conversation
 
-
-
+def fetch_image_url_or_fpath(url_or_fpath):
+    if url_or_fpath.startswith("http") or url_or_fpath.startswith("https"):
+        import tempfile
+        import requests
+
+        # Download the image to a temporary file
+        temp_dir = tempfile.mkdtemp()
+        temp_file = os.path.join(temp_dir, os.path.basename(url_or_fpath))
+
+        response = requests.get(url_or_fpath, stream=True)
+        response.raise_for_status()
+
+        with open(temp_file, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+
+        return temp_file
+    elif url_or_fpath.startswith("file://"):
+        fpath = url_or_fpath.replace("file://", "")
+        assert osp.exists(fpath), f"File {fpath} does not exist"
+        return fpath
+    elif osp.exists(url_or_fpath):
+        assert osp.isfile(url_or_fpath), f"File {url_or_fpath} is not a file"
+        return url_or_fpath
+    else:
+        raise ValueError(f"Unsupported image path: {url_or_fpath}")
+
+
+def __pad_fn(input_ids_list, padding_value=0, target_len=None, padding_side="left"):
     # tensor shape is (batch_size, seq_len)
     max_len = max([ids.shape[1] for ids in input_ids_list])
     if target_len is not None:
@@ -47,6 +74,8 @@ class VILAProcessorKwargs(ProcessingKwargs, total=False):
     }
 
 
+
+
 class VILAProcessor(ProcessorMixin):
     # attributes = ["image_processor", "tokenizer"]
     attributes = []
@@ -115,7 +144,7 @@ class VILAProcessor(ProcessorMixin):
         return BatchFeature(
            data={
                 # "input_ids": torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id),
-                "input_ids":
+                "input_ids": __pad_fn(
                     input_ids,
                     padding_value=self.tokenizer.pad_token_id,
                     padding_side="left",
@@ -216,9 +245,17 @@ class VILAProcessor(ProcessorMixin):
                 vila_chat["from"] = "human"
                 for content in chat["content"]:
                     if content["type"] == "image":
-
+                        if "path" in content:
+                            # VILA style
+                            vila_chat["value"].append(Image(fetch_image_url_or_fpath(content["path"])))
+                        elif "image" in content:
+                            # Qwen style
+                            vila_chat["value"].append(Image(fetch_image_url_or_fpath(content["image"])))
+                        else:
+                            raise ValueError(f"Unsupported content type `image`: {content}, `image` and `path` are required")
                     elif content["type"] == "text":
                         vila_chat["value"].append(content["text"])
+                    # NOTE(ligeng): video supports are needed here
                     else:
                         raise ValueError(f"Unsupported content type: {content['type']}")
             elif chat["role"] == "assistant":
@@ -228,7 +265,6 @@ class VILAProcessor(ProcessorMixin):
                 vila_chat["value"].append(content["text"])
             vila_conv.append(vila_chat)
 
-        # return self(vila_conv)
         return vila_conv
 
 
```
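The new `fetch_image_url_or_fpath` helper normalizes every accepted image source to a local file path. A quick usage sketch with placeholder URLs and paths (note the `startswith("https")` branch never fires on its own, since `startswith("http")` already matches `https://` URLs):

```python
# Placeholder inputs; the helper is defined in auto_processor.py above.
fetch_image_url_or_fpath("https://example.com/cat.jpg")
# -> downloads to a temp dir, returns e.g. "/tmp/tmpXXXXXX/cat.jpg"

fetch_image_url_or_fpath("file:///data/images/cat.jpg")
# -> strips the scheme, returns "/data/images/cat.jpg" (must exist)

fetch_image_url_or_fpath("/data/images/cat.jpg")
# -> returned unchanged if it is an existing regular file

fetch_image_url_or_fpath("not a path")
# -> ValueError: Unsupported image path: not a path
```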
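`__pad_fn` left-pads each `(1, seq_len)` `input_ids` tensor to a common length, so batched sequences stay right-aligned for generation (which is why `padding_side="left"` is passed in the `BatchFeature` hunk). The diff only shows the first lines of the helper, so this is a minimal sketch of that behavior; the `target_len` handling is an assumption:

```python
import torch

def pad_left(input_ids_list, padding_value=0, target_len=None):
    # Each entry is shaped (1, seq_len); pad on the left so the last token
    # of every sequence lines up at the right edge of the batch.
    max_len = max(ids.shape[1] for ids in input_ids_list)
    if target_len is not None:
        max_len = max(max_len, target_len)  # assumption: target_len sets a floor
    padded = [
        torch.nn.functional.pad(ids, (max_len - ids.shape[1], 0), value=padding_value)
        for ids in input_ids_list
    ]
    return torch.cat(padded, dim=0)  # (batch_size, max_len)

batch = pad_left([torch.tensor([[1, 2, 3]]), torch.tensor([[7]])], padding_value=0)
# tensor([[1, 2, 3],
#         [0, 0, 7]])
```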
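With the conversation-conversion hunk above, image content may carry either a VILA-style `path` key or a Qwen-style `image` key; both are resolved through `fetch_image_url_or_fpath`. A sketch of the two accepted message shapes (placeholder image locations):

```python
vila_style_msgs = [
    {
        "role": "user",
        "content": [
            {"type": "image", "path": "demo.jpg"},  # VILA style: "path"
            {"type": "text", "text": "Describe the image."},
        ],
    }
]

qwen_style_msgs = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://example.com/demo.jpg"},  # Qwen style: "image"
            {"type": "text", "text": "Describe the image."},
        ],
    }
]

# Either form converts to the same internal conversation:
# [{"from": "human", "value": [Image(<local path>), "Describe the image."]}]
```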