import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from typing import List, Optional, Tuple, Union

from tqdm import tqdm
from transformers import AutoModel

from lmms_eval.api.instance import Instance
from lmms_eval.api.registry import register_model
from lmms_eval.models.model_utils.gen_metrics import log_metrics
from lmms_eval.models.simple.vllm import VLLM as VLLMSimple
from lmms_eval.protocol import ChatMessages

try:
    from vllm import LLM, SamplingParams
except ImportError:
    # vLLM is optional at import time; bind the names so the module still loads,
    # and fail loudly only when generation is attempted.
    LLM = None
    SamplingParams = None

# Worker-thread count, overridable via the WORKERS environment variable.
WORKERS = int(os.getenv("WORKERS", "32"))


@register_model("longvila")
class LongVila(VLLMSimple):
    is_simple = False

    def __init__(
        self,
        model="Efficient-Large-Model/LongVILA-R1-7B",
        tensor_parallel_size=1,
        data_parallel_size=1,
        gpu_memory_utilization=0.5,
        batch_size=1,
        max_frame_num=32,
        trust_remote_code=True,
        chat_template=None,
        max_pixels: int = 1605632,
        min_image_pixels=28,
        fps: Optional[int] = None,
        device_map: Optional[str] = "cuda",
        **kwargs,
    ):
        model_root = model
        llm_path = os.path.join(model_root, "llm")

        # The vLLM engine consumes precomputed prompt embeddings instead of token ids.
        kwargs["enable_prompt_embeds"] = True
        self.fps = fps
        self.max_pixels = max_pixels

        # The remote_code utilities ship inside the checkpoint; when `model` points
        # at a local checkout, put it on sys.path so `remote_code` is importable.
        if model_root not in sys.path:
            sys.path.insert(0, model_root)
        try:
            from remote_code.media import extract_media as _extract_media
            from remote_code.mm_utils import process_images as _process_images
            from remote_code.tokenizer_utils import (
                tokenize_conversation as _tokenize_conversation,
            )
        except Exception as e:
            raise ImportError(f"Failed to import LongVILA remote_code utilities from '{model_root}'. Ensure the model path contains remote_code. Original error: {e}") from e

        self.extract_media = _extract_media
        self.process_images = _process_images
        self.tokenize_conversation = _tokenize_conversation

        # The HF model is loaded only to embed multimodal inputs
        # (llm_only_need_embed=True); the vLLM engine created by the parent class
        # handles generation from those embeddings.
        self.model_encoder = AutoModel.from_pretrained(
            model_root,
            trust_remote_code=True,
            device_map=device_map,
            llm_only_need_embed=True,
        )
        super().__init__(llm_path, tensor_parallel_size, data_parallel_size, gpu_memory_utilization, batch_size, max_frame_num, trust_remote_code, chat_template, min_image_pixels, **kwargs)

    def _to_remote_conversation(self, chat_messages: ChatMessages) -> list:
        """
        Convert ChatMessages to the LongVILA remote_code conversation format:
        [{"from": "human"|"gpt", "value": [str | {"path": media_path}, ...]}, ...]
        """
        role_map = {"user": "human", "assistant": "gpt", "system": "human"}
        conversation = []
        for msg in chat_messages.messages:
            from_role = role_map.get(msg.role, "human")
            value_parts = []
            for content in msg.content:
                if getattr(content, "type", None) == "text":
                    value_parts.append(content.text)
                elif getattr(content, "type", None) in ("image", "video", "audio"):
                    value_parts.append({"path": content.url})
            if value_parts:
                conversation.append({"from": from_role, "value": value_parts})
        return conversation

    def make_one_request(self, request: Instance) -> Tuple["object", dict]:
        """
        Build prompt embeddings and per-request sampling params from an Instance.
        Returns (inputs_embeds, params_dict) without mutating the caller's gen_kwargs.
        """
        ctx, doc_to_messages, gen_kwargs, doc_id, task, split = request.arguments
        raw_messages = doc_to_messages(self.task_dict[task][split][doc_id])
        chat_messages = ChatMessages(messages=raw_messages)

        # Copy gen_kwargs before filling defaults so the caller's dict is untouched.
        _gen = dict(gen_kwargs or {})
        _gen.setdefault("max_new_tokens", 4096)
        _gen.setdefault("temperature", 0)
        _gen.setdefault("top_p", 0.95)

        params = {
            "temperature": _gen["temperature"],
            "max_tokens": _gen["max_new_tokens"],
            "top_p": _gen["top_p"],
        }
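        # e.g. with empty gen_kwargs: {"temperature": 0, "max_tokens": 4096, "top_p": 0.95}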

        conversation = self._to_remote_conversation(chat_messages)

        # Frame sampling: honor an explicit fps if given; otherwise fall back to
        # a fixed frame count.
        if self.fps:
            self.model_encoder.config.fps = self.fps
        else:
            self.model_encoder.config.num_video_frames = self.max_frame_num
            self.model_encoder.config.fps = 0
        media = self.extract_media(conversation, self.model_encoder.config)
        if "video" in media and media["video"] is not None:
            # Preprocess each video's frames and cast to fp16 for the vision tower.
            media["video"] = [self.process_images(images, self.model_encoder.vision_tower.image_processor, self.model_encoder.config).half() for images in media["video"]]

        input_ids = self.tokenize_conversation(conversation, self.model_encoder.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()

        # Fuse token and media embeddings into a single prompt-embedding tensor.
        inputs_embeds, _, _ = self.model_encoder._embed(input_ids, media, {"video": {}}, None, None)

        return inputs_embeds, params

    def generate_until(self, requests) -> List[str]:
        self.load_cache()
        res, requests = self.get_response_from_cache(requests)
        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

        batch_size = self.batch_size_per_gpu
        batched_requests = [requests[i : i + batch_size] for i in range(0, len(requests), batch_size)]
        e2e_latency = 0
        for batch_requests in batched_requests:
            prompt_embeds_list = []
            params_list = []

            for req in tqdm(batch_requests, disable=(self.rank != 0), desc="Building embeddings"):
                inputs_embeds, params = self.make_one_request(req)
                # vLLM's prompt-embedding input format: one {"prompt_embeds": tensor} per prompt.
                prompt_embeds_list.append({"prompt_embeds": inputs_embeds.squeeze(0)})
                params_list.append(params)

            # One SamplingParams per request, so per-request gen_kwargs are honored
            # (vLLM accepts a list of SamplingParams aligned with the prompts).
            sampling_params = [SamplingParams(**p) for p in params_list]

            start_time = time.time()
            response = self.client.generate(prompts=prompt_embeds_list, sampling_params=sampling_params)
            end_time = time.time()

            response_text = [o.outputs[0].text for o in response]
            for req, text in zip(batch_requests, response_text):
                self.add_request_response_to_cache(req, text)

            # Accumulate end-to-end generation latency across batches.
            e2e_latency += end_time - start_time

            assert len(response_text) == len(batch_requests)
            res.extend(response_text)
            pbar.update(len(batch_requests))

        pbar.close()
        return res

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        raise NotImplementedError("loglikelihood is not supported for LongVila")

    def generate_until_multi_round(self, requests) -> List[str]:
        raise NotImplementedError("TODO: Implement multi-round generation")
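

# Minimal launch sketch (hedged: flag names follow the standard lmms_eval CLI;
# the task name below is a placeholder):
#
#   python -m lmms_eval \
#       --model longvila \
#       --model_args model=Efficient-Large-Model/LongVILA-R1-7B,max_frame_num=32 \
#       --tasks <task_name> \
#       --batch_size 1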