DeepQwenVL-Base / model.py

Upload folder using huggingface_hub

32d2edb verified 5 days ago

39.3 kB

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from typing import Optional, List, Union, Tuple
	from transformers import Qwen2VLTextModel, Qwen2VLTextConfig, Qwen2VLPreTrainedModel, PretrainedConfig
	from transformers.models.llama.modeling_llama import LlamaAttention, LlamaRotaryEmbedding
	from transformers.generation.utils import GenerationMixin
	from transformers.modeling_utils import PreTrainedModel
	from transformers.modeling_outputs import ModelOutput
	from PIL import Image, ImageOps
	from encoder import build_sam_vit_b, build_clip_l, MlpProjector
	from addict import Dict as ADict
	import os
	import math
	from data import (
	format_messages,
	load_pil_images,
	text_encode,
	BasicImageTransform,
	dynamic_preprocess,
	re_match,
	process_image_with_refs,
	NoEOSTextStreamer,
	)
	from tqdm import tqdm
	from dataclasses import dataclass


	class DeepQwenVLConfig(PretrainedConfig):
	"""
	Configuration class for DeepQwenVL model.

	This config wraps both the Qwen2VL text config and DeepSeek vision config.
	When loading from a Qwen2-VL checkpoint, it will use the checkpoint's config
	directly for the text model.
	"""
	model_type = "deepqwen_vl"

	def __init__(
	self,
	deepseek_vision_hidden_size: int = 2048,

	# Projector settings
	projector_type: str = "mlp", # "vision_projector" or "mlp"
	projector_input_dim: int = 2048,
	projector_output_dim: int = None,
	projector_hidden_dim: int = None, # If None, uses projector_output_dim

	# Learnable vision tokens
	image_newline_dim: int = None, # If None, uses hidden_size
	view_separator_dim: int = None, # If None, uses hidden_size

	hidden_size: int = 1536,
	intermediate_size: int = 8960,
	num_hidden_layers: int = 28,
	num_attention_heads: int = 12,
	num_key_value_heads: int = 2,
	hidden_act: str = "silu",
	max_position_embeddings: int = 32768,
	initializer_range: float = 0.02,
	rms_norm_eps: float = 1e-6,
	use_cache: bool = True,
	tie_word_embeddings: bool = True,
	rope_theta: float = 1000000.0,
	attention_dropout: float = 0.0,
	vocab_size: int = 151936,

	bos_token_id: int = 151643,
	eos_token_id: int = 151645,
	pad_token_id: int = 151643,
	image_token_id: int = 151655,
	video_token_id: int = 151656,
	vision_start_token_id: int = 151652,
	vision_end_token_id: int = 151653,
	vision_token_id: int = 151654,

	rope_scaling: dict = None,

	**kwargs
	):
	super().__init__(
	bos_token_id=bos_token_id,
	eos_token_id=eos_token_id,
	pad_token_id=pad_token_id,
	tie_word_embeddings=tie_word_embeddings,
	**kwargs
	)

	self.deepseek_vision_hidden_size = deepseek_vision_hidden_size

	# Projector settings
	self.projector_type = projector_type
	self.projector_input_dim = projector_input_dim
	self.projector_output_dim = projector_output_dim if projector_output_dim else hidden_size
	self.projector_hidden_dim = projector_hidden_dim if projector_hidden_dim else self.projector_output_dim

	# Learnable vision tokens
	self.image_newline_dim = image_newline_dim if image_newline_dim else hidden_size
	self.view_separator_dim = view_separator_dim if view_separator_dim else hidden_size

	# Text model settings
	self.hidden_size = hidden_size
	self.intermediate_size = intermediate_size
	self.num_hidden_layers = num_hidden_layers
	self.num_attention_heads = num_attention_heads
	self.num_key_value_heads = num_key_value_heads
	self.hidden_act = hidden_act
	self.max_position_embeddings = max_position_embeddings
	self.initializer_range = initializer_range
	self.rms_norm_eps = rms_norm_eps
	self.use_cache = use_cache
	self.rope_theta = rope_theta
	self.attention_dropout = attention_dropout
	self.vocab_size = vocab_size

	# Special tokens
	self.image_token_id = image_token_id
	self.video_token_id = video_token_id
	self.vision_start_token_id = vision_start_token_id
	self.vision_end_token_id = vision_end_token_id
	self.vision_token_id = vision_token_id

	# Rope scaling
	if rope_scaling is None:
	rope_scaling = {"type": "mrope", "mrope_section": [16, 24, 24]}
	self.rope_scaling = rope_scaling

	def to_text_config(self) -> Qwen2VLTextConfig:
	"""Convert to Qwen2VLTextConfig for the text model."""
	return Qwen2VLTextConfig(
	hidden_size=self.hidden_size,
	intermediate_size=self.intermediate_size,
	num_hidden_layers=self.num_hidden_layers,
	num_attention_heads=self.num_attention_heads,
	num_key_value_heads=self.num_key_value_heads,
	hidden_act=self.hidden_act,
	max_position_embeddings=self.max_position_embeddings,
	initializer_range=self.initializer_range,
	rms_norm_eps=self.rms_norm_eps,
	use_cache=self.use_cache,
	tie_word_embeddings=self.tie_word_embeddings,
	rope_theta=self.rope_theta,
	attention_dropout=self.attention_dropout,
	vocab_size=self.vocab_size,
	bos_token_id=self.bos_token_id,
	eos_token_id=self.eos_token_id,
	pad_token_id=self.pad_token_id,
	rope_scaling=self.rope_scaling,
	)


	@dataclass
	class DeepQwenOutputWithPast(ModelOutput):
	last_hidden_state: torch.FloatTensor = None
	past_key_values: Optional[list[torch.FloatTensor]] = None
	hidden_states: Optional[tuple[torch.FloatTensor]] = None
	attentions: Optional[tuple[torch.FloatTensor]] = None

	@dataclass
	class DeepQwenCausalLMOutputWithPast(ModelOutput):
	loss: Optional[torch.FloatTensor] = None
	logits: Optional[torch.FloatTensor] = None
	past_key_values: Optional[list[torch.FloatTensor]] = None
	hidden_states: Optional[tuple[torch.FloatTensor]] = None
	attentions: Optional[tuple[torch.FloatTensor]] = None


	class VisionProjector(nn.Module):
	"""
	Vision projector with DeepSeek's pretrained layer + trainable adapter.

	Architecture:
	deepseek_proj: Linear(2048→1280) [FROZEN - loaded from DeepSeek checkpoint]
	SiLU activation
	norm: LayerNorm(1280) [TRAINABLE]
	adapter: Linear(1280→1536) [TRAINABLE]

	This preserves DeepSeek's learned vision-text alignment while adapting to Qwen's
	embedding space. Total 2 layers like LLaVA's MLP projector.
	"""

	def __init__(self, input_dim: int = 2048, hidden_dim: int = 1280, output_dim: int = 1536):
	super().__init__()
	# DeepSeek's original projection (will be frozen after loading weights)
	self.deepseek_proj = nn.Linear(input_dim, hidden_dim)
	# Adapter for Qwen (trainable)
	self.norm = nn.LayerNorm(hidden_dim)
	self.adapter = nn.Linear(hidden_dim, output_dim)
	self._init_adapter_weights()

	def _init_adapter_weights(self):
	"""Initialize adapter weights. deepseek_proj will be loaded from checkpoint."""
	nn.init.ones_(self.norm.weight)
	nn.init.zeros_(self.norm.bias)
	nn.init.normal_(self.adapter.weight, mean=0.0, std=0.01)
	nn.init.zeros_(self.adapter.bias)

	def forward(self, x):
	x = self.deepseek_proj(x)
	x = F.silu(x)
	x = self.norm(x)
	x = self.adapter(x)
	return x

	class DeepQwenVLPreTrainedModel(PreTrainedModel):
	config_class = DeepQwenVLConfig
	base_model_prefix = "model"
	supports_gradient_checkpointing = True
	_skip_keys_device_placement = "past_key_values"
	_supports_flash_attn = True
	_supports_sdpa = True
	_supports_static_cache = True
	_supports_attention_backend = True

	_keys_to_ignore_on_load_missing = [
	"sam_model",
	"vision_model",
	"projector",
	"image_newline",
	"view_separator",
	]

	def _init_weights(self, module):
	"""Initialize the weights."""
	std = self.config.initializer_range if hasattr(self.config, 'initializer_range') else 0.02
	if isinstance(module, nn.Linear):
	module.weight.data.normal_(mean=0.0, std=std)
	if module.bias is not None:
	module.bias.data.zero_()
	elif isinstance(module, nn.Embedding):
	module.weight.data.normal_(mean=0.0, std=std)


	class DeepQwenVLModel(Qwen2VLTextModel):
	"""
	DeepQwenVL Model that combines DeepSeek's vision encoders with Qwen2VL's text model.

	Accepts either:
	- A DeepQwenVLConfig
	- A Qwen2VLTextConfig (for compatibility with from_pretrained from Qwen checkpoints)
	- A generic PretrainedConfig (will extract necessary fields)
	"""
	config_class = DeepQwenVLConfig

	def __init__(self, config):
	if isinstance(config, DeepQwenVLConfig):
	text_config = config.to_text_config()
	output_hidden_size = config.projector_output_dim
	vision_dim = config.deepseek_vision_hidden_size
	elif isinstance(config, Qwen2VLTextConfig):
	text_config = config
	output_hidden_size = config.hidden_size
	vision_dim = 2048
	else:
	text_config = config
	output_hidden_size = getattr(config, 'hidden_size', 1536)
	vision_dim = getattr(config, 'deepseek_vision_hidden_size', 2048)

	super(DeepQwenVLModel, self).__init__(text_config)

	self.config = config
	self.output_hidden_size = output_hidden_size

	self.sam_model = build_sam_vit_b()
	self.vision_model = build_clip_l()

	self.deepseek_vision_dim = vision_dim
	self.deepseek_hidden_dim = 1280 # DeepSeek's projector output dimension
	# New projector: DeepSeek layer (frozen) + adapter (trainable)
	self.projector = VisionProjector(
	input_dim=self.deepseek_vision_dim, # 2048
	hidden_dim=self.deepseek_hidden_dim, # 1280 (DeepSeek's output)
	output_dim=output_hidden_size # 1536 (Qwen's hidden size)
	)

	embed_std = 1 / torch.sqrt(torch.tensor(output_hidden_size, dtype=torch.float32))
	self.image_newline = nn.Parameter(torch.randn(output_hidden_size) * embed_std)
	self.view_separator = nn.Parameter(torch.randn(output_hidden_size) * embed_std)

	def forward(
	self,
	input_ids: torch.LongTensor = None,
	attention_mask: Optional[torch.Tensor] = None,
	position_ids: Optional[torch.LongTensor] = None,
	past_key_values: Optional[List[torch.FloatTensor]] = None,
	inputs_embeds: Optional[torch.FloatTensor] = None,
	use_cache: Optional[bool] = None,
	output_attentions: Optional[bool] = None,
	output_hidden_states: Optional[bool] = None,
	images: Optional[torch.FloatTensor] = None,
	images_seq_mask: Optional[torch.FloatTensor] = None,
	images_spatial_crop: Optional[torch.FloatTensor] = None,
	return_dict: Optional[bool] = None,
	cache_position: Optional[torch.LongTensor] = None,
	) -> Union[torch.Tensor, List[torch.Tensor]]:

	if inputs_embeds is None:
	inputs_embeds = self.get_input_embeddings()(input_ids)

	sam_model = getattr(self, 'sam_model', None)
	vision_model = getattr(self, 'vision_model', None)

	should_process_images = (
	sam_model is not None
	and images is not None
	and images_seq_mask is not None
	and (input_ids.shape[1] != 1 or self.training)
	and torch.sum(images[0][1]).item() != 0
	)

	if should_process_images:
	idx = 0
	for image, crop_shape in zip(images, images_spatial_crop):
	images_in_this_batch = []
	patches = image[0]
	image_ori = image[1]

	if torch.sum(patches).item() != 0:
	# Process local patches
	with torch.no_grad():
	local_features_1 = sam_model(patches)
	local_features_2 = vision_model(patches, local_features_1)
	local_features = torch.cat((local_features_2[:, 1:], local_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
	local_features = local_features.detach()
	local_features = self.projector(local_features)

	# Process global image
	with torch.no_grad():
	global_features_1 = sam_model(image_ori)
	global_features_2 = vision_model(image_ori, global_features_1)
	global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
	global_features = global_features.detach()
	global_features = self.projector(global_features)

	# Reshape and add newline tokens
	_, hw, n_dim = global_features.shape
	h = w = int(hw ** 0.5)
	_2, hw2, n_dim2 = local_features.shape
	h2 = w2 = int(hw2 ** 0.5)
	width_crop_num, height_crop_num = crop_shape[0], crop_shape[1]

	global_features = global_features.view(h, w, n_dim)
	global_features = torch.cat(
	[global_features, self.image_newline[None, None, :].expand(h, 1, n_dim)], dim=1
	)
	global_features = global_features.view(-1, n_dim)

	local_features = local_features.view(
	height_crop_num, width_crop_num, h2, w2, n_dim2
	).permute(0, 2, 1, 3, 4).reshape(height_crop_numh2, width_crop_numw2, n_dim2)
	local_features = torch.cat(
	[local_features, self.image_newline[None, None, :].expand(height_crop_num * h2, 1, n_dim2)], dim=1
	)
	local_features = local_features.view(-1, n_dim2)

	global_local_features = torch.cat([local_features, global_features, self.view_separator[None, :]], dim=0)
	images_in_this_batch.append(global_local_features)
	else:
	# Global-only branch (small images)
	with torch.no_grad():
	global_features_1 = sam_model(image_ori)
	global_features_2 = vision_model(image_ori, global_features_1)
	global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
	global_features = global_features.detach()
	global_features = self.projector(global_features)

	_, hw, n_dim = global_features.shape
	h = w = int(hw ** 0.5)
	global_features = global_features.view(h, w, n_dim)
	global_features = torch.cat(
	[global_features, self.image_newline[None, None, :].expand(h, 1, n_dim)], dim=1
	)
	global_features = global_features.view(-1, n_dim)
	global_local_features = torch.cat([global_features, self.view_separator[None, :]], dim=0)
	images_in_this_batch.append(global_local_features)

	if images_in_this_batch:
	images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
	inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
	idx += 1

	outputs = super().forward(
	input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
	inputs_embeds=inputs_embeds, use_cache=use_cache, position_ids=position_ids,
	output_attentions=output_attentions, output_hidden_states=output_hidden_states,
	return_dict=return_dict, cache_position=cache_position
	)

	return DeepQwenOutputWithPast(
	last_hidden_state=outputs.last_hidden_state,
	past_key_values=outputs.past_key_values,
	hidden_states=outputs.hidden_states,
	attentions=outputs.attentions,
	) if return_dict else outputs.to_tuple()


	class DeepQwenVLForCausalLM(DeepQwenVLModel, GenerationMixin):
	"""
	DeepQwenVL Model for causal language modeling with vision capabilities.

	Combines DeepSeek's vision encoders (SAM + CLIP) with Qwen2VL's text model.
	"""
	config_class = DeepQwenVLConfig
	_tied_weights_keys = ["lm_head.weight"]

	_keys_to_ignore_on_load_missing = [
	# "sam_model",
	# "vision_model",
	# "projector",
	# "image_newline",
	# "view_separator",
	]

	def __init__(self, config):
	"""
	Initialize the model.

	Args:
	config: Can be DeepQwenVLConfig, Qwen2VLTextConfig, or a generic config
	from a Qwen2-VL checkpoint.
	"""
	super().__init__(config)

	hidden_size = getattr(config, 'hidden_size', 1536)
	vocab_size = getattr(config, 'vocab_size', 151936)

	self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

	self.post_init()

	def get_output_embeddings(self):
	return getattr(self, 'lm_head', None)

	def set_output_embeddings(self, new_embeddings):
	self.lm_head = new_embeddings

	def forward(
	self,
	input_ids: torch.LongTensor = None,
	labels: Optional[torch.LongTensor] = None,
	attention_mask: Optional[torch.Tensor] = None,
	position_ids: Optional[torch.LongTensor] = None,
	past_key_values: Optional[List[torch.FloatTensor]] = None,
	inputs_embeds: Optional[torch.FloatTensor] = None,
	use_cache: Optional[bool] = None,
	output_attentions: Optional[bool] = None,
	output_hidden_states: Optional[bool] = None,
	images: Optional[torch.FloatTensor] = None,
	images_seq_mask: Optional[torch.FloatTensor] = None,
	images_spatial_crop: Optional[torch.FloatTensor] = None,
	return_dict: Optional[bool] = None,
	cache_position: Optional[torch.LongTensor] = None,
	) -> Union[torch.Tensor, List[torch.Tensor]]:

	outputs = super().forward(
	input_ids=input_ids,
	attention_mask=attention_mask,
	past_key_values=past_key_values,
	inputs_embeds=inputs_embeds,
	use_cache=use_cache,
	position_ids = position_ids,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	images=images,
	images_seq_mask=images_seq_mask,
	images_spatial_crop=images_spatial_crop,
	return_dict=True,
	cache_position=cache_position,
	)

	hidden_states = outputs[0]
	logits = self.lm_head(hidden_states)
	logits = logits.float()

	loss = None
	if labels is not None:
	loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)

	return DeepQwenCausalLMOutputWithPast(
	loss=loss,
	logits=logits,
	past_key_values=outputs.past_key_values,
	hidden_states=outputs.hidden_states,
	attentions=outputs.attentions,
	)

	def prepare_inputs_for_generation(
	self,
	input_ids,
	past_key_values=None,
	attention_mask=None,
	inputs_embeds=None,
	cache_position=None,
	position_ids=None,
	images=None,
	images_seq_mask=None,
	images_spatial_crop=None,
	**kwargs,
	):
	model_inputs = super().prepare_inputs_for_generation(
	input_ids,
	past_key_values=past_key_values,
	attention_mask=attention_mask,
	inputs_embeds=inputs_embeds,
	cache_position=cache_position,
	position_ids=position_ids,
	**kwargs,
	)

	model_inputs["images"] = images
	model_inputs["images_seq_mask"] = images_seq_mask
	model_inputs["images_spatial_crop"] = images_spatial_crop
	model_inputs["position_ids"] = None

	# Clear images after first forward pass (cache_position[0] != 0 means subsequent tokens)
	if cache_position is not None and cache_position[0] != 0:
	model_inputs["images"] = None
	model_inputs["images_seq_mask"] = None
	model_inputs["images_spatial_crop"] = None

	return model_inputs

	def reinitialize_projector(self, vis_mlp=None, device=None, dtype=None):
	"""
	Reinitialize the projector, image_newline, and view_separator.
	Call this after from_pretrained when loading from a Qwen checkpoint.
	"""
	if device is None:
	for param in self.parameters():
	if param.device.type != 'meta':
	device = param.device
	break
	if device is None:
	device = 'cpu'
	if dtype is None:
	dtype = torch.bfloat16

	input_dim = self.deepseek_vision_dim
	output_dim = self.output_hidden_size

	if vis_mlp is not None:
	self.projector = VisionProjector(input_dim=input_dim, output_dim=output_dim).to(device=device, dtype=dtype)

	else:
	self.projector = nn.Linear(in_features=input_dim, out_features=output_dim).to(device=device, dtype=dtype)
	nn.init.normal_(self.projector.weight, mean=0.0, std=0.01)
	if self.projector.bias is not None:
	nn.init.zeros_(self.projector.bias)

	embed_std = 1 / torch.sqrt(torch.tensor(output_dim, dtype=torch.float32))
	self.image_newline = nn.Parameter(
	torch.randn(output_dim, device=device, dtype=dtype) * embed_std.item()
	)
	self.view_separator = nn.Parameter(
	torch.randn(output_dim, device=device, dtype=dtype) * embed_std.item()
	)

	print(f"Projector reinitialized on {device} with dtype {dtype}")

	def load_pretrained_vision(self, pretrained_path: str):
	try:
	from safetensors import safe_open
	except ImportError:
	raise ImportError("Please install safetensors to load the pretrained vision model.")

	assert os.path.exists(pretrained_path), f"Pretrained path {pretrained_path} does not exist."

	vision_weights = {}
	with safe_open(f"{pretrained_path}/model-00001-of-000001.safetensors", framework="pt", device="cpu") as f:
	for k in f.keys():
	vision_weights[k] = f.get_tensor(k)

	prefixes = {
	"sam_model": "model.sam_model.",
	"vision_model": "model.vision_model.",
	}

	try:
	for p in prefixes.keys():
	state_dict = {}

	for k, v in vision_weights.items():
	if k.startswith(prefixes[p]):
	new_key = k[len(prefixes[p]):]
	state_dict[new_key] = v

	getattr(self, p).load_state_dict(state_dict, strict=False)

	print("Pretrained vision model loaded successfully.")
	except Exception as e:
	print("Error loading pretrained vision model:", e)
	raise e

	def load_deepseek_projector(self, pretrained_path: str):
	"""
	Load DeepSeek's projector weights into the deepseek_proj layer.

	DeepSeek checkpoint has:
	- projector.weight: shape (1280, 2048)
	- projector.bias: shape (1280,)

	These get loaded into self.projector.deepseek_proj
	"""
	try:
	from safetensors import safe_open
	except ImportError:
	raise ImportError("Please install safetensors to load DeepSeek projector.")

	assert os.path.exists(pretrained_path), f"Pretrained path {pretrained_path} does not exist."

	# Find safetensors file
	safetensor_files = [f for f in os.listdir(pretrained_path) if f.endswith('.safetensors')]
	if not safetensor_files:
	raise FileNotFoundError(f"No safetensors files found in {pretrained_path}")

	safetensor_path = os.path.join(pretrained_path, safetensor_files[0])

	projector_weights = {}
	with safe_open(safetensor_path, framework="pt", device="cpu") as f:
	for k in f.keys():
	if 'projector' in k:
	projector_weights[k] = f.get_tensor(k)

	# Load into deepseek_proj
	if 'projector.weight' in projector_weights:
	self.projector.deepseek_proj.weight.data = projector_weights['projector.weight']
	self.projector.deepseek_proj.bias.data = projector_weights['projector.bias']
	print(f"Loaded DeepSeek projector weights: {self.projector.deepseek_proj.weight.shape}")
	print(f" Weight mean: {self.projector.deepseek_proj.weight.mean().item():.6f}")
	print(f" Weight std: {self.projector.deepseek_proj.weight.std().item():.6f}")
	elif 'model.projector.weight' in projector_weights:
	self.projector.deepseek_proj.weight.data = projector_weights['model.projector.weight']
	self.projector.deepseek_proj.bias.data = projector_weights['model.projector.bias']
	print(f"Loaded DeepSeek projector weights (model. prefix)")
	else:
	print(f"Warning: Could not find projector weights. Available keys: {list(projector_weights.keys())}")

	def disable_torch_init(self):
	"""
	Disable the redundant torch default initialization to accelerate model creation.
	"""
	import torch
	setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
	setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)

	def infer(
	self,
	tokenizer,
	prompt='',
	image_file='',
	output_path = '',
	base_size=1024,
	image_size=640,
	crop_mode=True,
	test_compress=False,
	save_results=False,
	eval_mode=False
	):
	self.disable_torch_init()

	os.makedirs(output_path, exist_ok=True)
	os.makedirs(f'{output_path}/images', exist_ok=True)
	conversation = [
	{
	"role": "user",
	"content": [
	{
	"type": "image",
	"image": f"{image_file}",
	},
	{"type": "text", "text": f"{prompt}"},
	],
	}
	]

	formatted_prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

	patch_size = 16
	downsample_ratio = 4
	images = load_pil_images(conversation)

	valid_img_tokens = 0
	ratio = 1

	image_draw = images[0].copy()

	w,h = image_draw.size
	ratio = 1 - ((max(w, h) - min(w, h)) / (max(w, h)))


	image_transform=BasicImageTransform(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), normalize=True)
	images_seq_mask = []

	image_token = '<\|image_pad\|>'
	image_token_id = 151655
	text_splits = formatted_prompt.split(image_token)

	images_list, images_crop_list, images_seq_mask = [], [], []
	tokenized_str = []
	images_spatial_crop = []
	for text_sep, image in zip(text_splits, images):

	tokenized_sep = text_encode(tokenizer, text_sep, bos=False, eos=False)
	tokenized_str += tokenized_sep
	images_seq_mask += [False] * len(tokenized_sep)

	if crop_mode:

	if image.size[0] <= 640 and image.size[1] <= 640:
	crop_ratio = [1, 1]

	else:
	if crop_mode:
	images_crop_raw, crop_ratio = dynamic_preprocess(image)
	else:
	crop_ratio = [1, 1]

	global_view = ImageOps.pad(image, (base_size, base_size),
	color=tuple(int(x * 255) for x in image_transform.mean))

	if base_size == 1024:
	valid_img_tokens += int(256 * ratio)
	elif base_size == 1280:
	valid_img_tokens += int(400 * ratio)
	# elif base_size == 640:
	# valid_img_tokens += int(100 * ratio)


	images_list.append(image_transform(global_view).to(torch.bfloat16))

	# global_view_tensor = image_transform(global_view).to(torch.bfloat16)

	width_crop_num, height_crop_num = crop_ratio

	images_spatial_crop.append([width_crop_num, height_crop_num])


	if width_crop_num > 1 or height_crop_num > 1:
	"""process the local views"""

	for i in range(len(images_crop_raw)):
	images_crop_list.append(image_transform(images_crop_raw[i]).to(torch.bfloat16))

	if image_size == 640:
	valid_img_tokens += len(images_crop_list) * 100

	num_queries = math.ceil((image_size // patch_size) / downsample_ratio)
	num_queries_base = math.ceil((base_size // patch_size) / downsample_ratio)

	"""add image tokens"""

	tokenized_image = ([image_token_id] * num_queries_base + [image_token_id]) * num_queries_base
	tokenized_image += [image_token_id]
	if width_crop_num > 1 or height_crop_num > 1:
	tokenized_image += ([image_token_id] * (num_queries * width_crop_num) + [image_token_id]) * (
	num_queries * height_crop_num)
	tokenized_str += tokenized_image
	images_seq_mask += [True] * len(tokenized_image)
	# num_image_tokens.append(len(tokenized_image))

	else:
	"""process the global view"""
	if image_size <= 640:
	image = image.resize((image_size, image_size))
	global_view = ImageOps.pad(image, (image_size, image_size),
	color=tuple(int(x * 255) for x in image_transform.mean))
	images_list.append(image_transform(global_view).to(torch.bfloat16))

	if base_size == 1024:
	valid_img_tokens += int(256 * ratio)
	elif base_size == 1280:
	valid_img_tokens += int(400 * ratio)
	elif base_size == 640:
	valid_img_tokens += int(100 * 1)
	elif base_size == 512:
	valid_img_tokens += int(64 * 1)

	width_crop_num, height_crop_num = 1, 1

	images_spatial_crop.append([width_crop_num, height_crop_num])


	"""add image tokens"""
	num_queries = math.ceil((image_size // patch_size) / downsample_ratio)

	tokenized_image = ([image_token_id] * num_queries + [image_token_id]) * num_queries
	tokenized_image += [image_token_id]
	# tokenized_image += ([self.image_token_id] * (num_queries * width_crop_num) + [self.image_token_id]) * (
	# num_queries * height_crop_num)
	tokenized_str += tokenized_image
	images_seq_mask += [True] * len(tokenized_image)
	# num_image_tokens.append(len(tokenized_image))

	"""process the last text split"""
	tokenized_sep = text_encode(tokenizer, text_splits[-1], bos=False, eos=False)
	tokenized_str += tokenized_sep
	images_seq_mask += [False] * len(tokenized_sep)

	# Qwen2VL has NO bos_token (bos_token_id is None)
	# The chat template already handles proper formatting

	input_ids = torch.LongTensor(tokenized_str)

	images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)

	if len(images_list) == 0:
	images_ori = torch.zeros((1, 3, image_size, image_size))
	images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
	images_crop = torch.zeros((1, 3, base_size, base_size))

	else:
	images_ori = torch.stack(images_list, dim=0)
	images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
	if images_crop_list:
	images_crop = torch.stack(images_crop_list, dim=0)
	else:
	images_crop = torch.zeros((1, 3, base_size, base_size))



	if not eval_mode:
	streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
	with torch.autocast("cuda", dtype=torch.bfloat16):
	with torch.no_grad():
	output_ids = self.generate(
	input_ids.unsqueeze(0).cuda(),
	images=[(images_crop.cuda(), images_ori.cuda())],
	images_seq_mask=images_seq_mask.unsqueeze(0).cuda(),
	images_spatial_crop=images_spatial_crop,
	temperature=0.5,
	eos_token_id=tokenizer.eos_token_id,
	streamer=streamer,
	max_new_tokens=8192,
	no_repeat_ngram_size=20,
	use_cache=True
	)
	else:
	with torch.autocast("cuda", dtype=torch.bfloat16):
	with torch.no_grad():
	output_ids = self.generate(
	input_ids.unsqueeze(0).cuda(),
	images=[(images_crop.cuda(), images_ori.cuda())],
	images_seq_mask=images_seq_mask.unsqueeze(0).cuda(),
	images_spatial_crop=images_spatial_crop,
	temperature=0.5,
	eos_token_id=tokenizer.eos_token_id,
	max_new_tokens=8192,
	no_repeat_ngram_size=35,
	use_cache=True
	)

	# Check if conversation has image
	has_image = any(
	(isinstance(item, dict) and item.get('type') == 'image')
	for msg in conversation
	for item in (msg.get('content', []) if isinstance(msg.get('content'), list) else [])
	)

	if has_image and eval_mode:
	outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:], skip_special_tokens=False)
	# Qwen2VL's EOS token is <\|im_end\|>
	stop_str = tokenizer.eos_token or '<\|im_end\|>'
	if outputs.endswith(stop_str):
	outputs = outputs[:-len(stop_str)]
	outputs = outputs.strip()

	return outputs

	if has_image and test_compress:
	outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:], skip_special_tokens=False)
	pure_texts_outputs_token_length = len(text_encode(tokenizer, outputs, bos=False, eos=False))
	print('='*50)
	print('image size: ', (w, h))
	print('valid image tokens: ', int(valid_img_tokens))
	print('output texts tokens (valid): ', pure_texts_outputs_token_length)
	print('compression ratio: ', round(pure_texts_outputs_token_length/valid_img_tokens, 2))
	print('='*50)


	if has_image and save_results:
	outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:], skip_special_tokens=False)
	# Qwen2VL's EOS token
	stop_str = tokenizer.eos_token or '<\|im_end\|>'

	print('='15 + 'save results:' + '='15)

	if outputs.endswith(stop_str):
	outputs = outputs[:-len(stop_str)]
	outputs = outputs.strip()

	matches_ref, matches_images, mathes_other = re_match(outputs)
	result = process_image_with_refs(image_draw, matches_ref, output_path)

	for idx, a_match_image in enumerate(tqdm(matches_images, desc="image")):
	outputs = outputs.replace(a_match_image, '![](images/' + str(idx) + '.jpg)\n')

	for idx, a_match_other in enumerate(tqdm(mathes_other, desc="other")):
	outputs = outputs.replace(a_match_other, '').replace('\\coloneqq', ':=').replace('\\eqqcolon', '=:')

	with open(f'{output_path}/result.mmd', 'w', encoding = 'utf-8') as afile:
	afile.write(outputs)

	if 'line_type' in outputs:
	import matplotlib.pyplot as plt
	lines = eval(outputs)['Line']['line']

	line_type = eval(outputs)['Line']['line_type']
	endpoints = eval(outputs)['Line']['line_endpoint']

	fig, ax = plt.subplots(figsize=(3,3), dpi=200)
	ax.set_xlim(-15, 15)
	ax.set_ylim(-15, 15)

	for idx, line in enumerate(lines):
	try:
	p0 = eval(line.split(' -- ')[0])
	p1 = eval(line.split(' -- ')[-1])

	if line_type[idx] == '--':
	ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth=0.8, color='k')
	else:
	ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth = 0.8, color = 'k')

	ax.scatter(p0[0], p0[1], s=5, color = 'k')
	ax.scatter(p1[0], p1[1], s=5, color = 'k')
	except:
	pass

	for endpoint in endpoints:

	label = endpoint.split(': ')[0]
	(x, y) = eval(endpoint.split(': ')[1])
	ax.annotate(label, (x, y), xytext=(1, 1), textcoords='offset points',
	fontsize=5, fontweight='light')


	plt.savefig(f'{output_path}/geo.jpg')
	plt.close()

	result.save(f"{output_path}/result_with_boxes.jpg")


	## TODO

	# new training loop:
	## image -> vision encoder -> projection ->! txt_decoder -> embedding -> pool
	# => alignment(text_pooling, image_pooling)
	## text -> text encoder -> projection -> embedding -> pool

	## cant let projection layer output into text decoder