Tnt3o5 committed on Sep 23, 2025

Commit

dc8d64b

verified ·

1 Parent(s): 86f1943

Upload folder using huggingface_hub

Browse files

Files changed (22) hide show

.gitattributes +1 -0
added_tokens.json +5 -0
args.json +343 -0
config.json +56 -0
configuration_hyper_qwen2.py +123 -0
configuration_mplugowl3.py +47 -0
generation_config.json +14 -0
image_processing_mplugowl3.py +416 -0
merges.txt +0 -0
model.safetensors +3 -0
modeling_hyper_qwen2.py +1532 -0
modeling_mplugowl3.py +231 -0
preprocessor_config.json +119 -0
processing_mplugowl3.py +396 -0
processor_config.json +6 -0
special_tokens_map.json +20 -0
tokenizer.json +3 -0
tokenizer_config.json +45 -0
trainer_state.json +202 -0
training_args.bin +3 -0
vocab.json +0 -0
x_sdpa.py +61 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

args.json ADDED Viewed

	@@ -0,0 +1,343 @@

+{
+  "output_dir": "/kaggle/working/outputs/mplug/v5-20250923-083759",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "no",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 2,
+  "per_device_eval_batch_size": 2,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 64,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 4.64e-05,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.95,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 1.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.0,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/kaggle/working/outputs/mplug/v5-20250923-083759/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 20,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 20.0,
+  "save_total_limit": 1,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": true,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "seed": 42,
+  "data_seed": 42,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 8,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": "/kaggle/working/outputs/mplug/v5-20250923-083759",
+  "disable_tqdm": null,
+  "remove_unused_columns": true,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "tp_size": 0,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": null,
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": true,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": "/kaggle/working/outputs/mplug/v4-20250923-021527/checkpoint-240",
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_token": null,
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 18000000,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": false,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "tuner_backend": "peft",
+  "vit_gradient_checkpointing": null,
+  "router_aux_loss_coef": 0.0,
+  "enable_dft_loss": false,
+  "enable_channel_loss": false,
+  "check_model": true,
+  "acc_strategy": "token",
+  "train_dataloader_shuffle": true,
+  "max_epochs": null,
+  "aligner_lr": null,
+  "vit_lr": null,
+  "use_logits_to_keep": null,
+  "ds3_gather_for_generation": true,
+  "resume_only_model": false,
+  "optimizer": null,
+  "loss_type": null,
+  "metric": null,
+  "eval_use_evalscope": false,
+  "eval_dataset": [],
+  "eval_dataset_args": null,
+  "eval_limit": null,
+  "eval_generation_config": null,
+  "extra_eval_args": null,
+  "use_flash_ckpt": false,
+  "model": "/kaggle/working/outputs/mplug/v3-20250922-041102/checkpoint-100",
+  "model_type": "mplug_owl3",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": "flash_attn",
+  "new_special_tokens": [],
+  "num_labels": null,
+  "problem_type": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "max_memory": {},
+  "max_model_len": null,
+  "local_repo_path": null,
+  "init_strategy": null,
+  "template": "mplug_owl3",
+  "system": null,
+  "max_length": 4096,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "agent_template": null,
+  "norm_bbox": null,
+  "use_chat_template": false,
+  "padding_free": false,
+  "padding_side": "right",
+  "loss_scale": "all",
+  "sequence_parallel_size": 1,
+  "response_prefix": null,
+  "template_backend": "swift",
+  "dataset": [
+    "/kaggle/working/pretraining_vqa_fixed.jsonl",
+    "/kaggle/input/5cd-ds/5cd_pretraining.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.0,
+  "dataset_num_proc": 1,
+  "load_from_cache_file": true,
+  "dataset_shuffle": true,
+  "val_dataset_shuffle": false,
+  "streaming": false,
+  "interleave_prob": null,
+  "stopping_strategy": "first_exhausted",
+  "shuffle_buffer_size": 1000,
+  "download_mode": "reuse_dataset_if_exists",
+  "columns": {},
+  "strict": false,
+  "model_name": null,
+  "model_author": null,
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.0,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": "/kaggle/working/outputs/mplug/v3-20250922-041102/checkpoint-100",
+  "lora_modules": [],
+  "train_type": "full",
+  "adapters": [],
+  "external_plugins": [],
+  "model_kwargs": {},
+  "load_args": false,
+  "load_data_args": false,
+  "packing": false,
+  "packing_length": null,
+  "lazy_tokenize": true,
+  "cached_dataset": [],
+  "custom_register_path": [],
+  "use_hf": true,
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "freeze_parameters": [],
+  "freeze_parameters_regex": null,
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [
+    "vision2text_model"
+  ],
+  "trainable_parameters_regex": null,
+  "freeze_llm": false,
+  "freeze_vit": false,
+  "freeze_aligner": false,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "target_parameters": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "swanlab_token": null,
+  "swanlab_project": null,
+  "swanlab_workspace": null,
+  "swanlab_exp_name": null,
+  "swanlab_lark_webhook_url": null,
+  "swanlab_lark_secret": null,
+  "swanlab_mode": "cloud",
+  "add_version": true,
+  "create_checkpoint_symlink": false,
+  "zero_hpz_partition_size": null,
+  "deepspeed_autotp_size": null,
+  "early_stop_interval": null,
+  "rank": 0,
+  "global_world_size": 4,
+  "local_world_size": 4,
+  "model_suffix": "checkpoint-100",
+  "model_info": "ModelInfo(model_type='mplug_owl3', model_dir='/kaggle/working/outputs/mplug/v3-20250922-041102/checkpoint-100', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='mplug_owl3', model_groups=[ModelGroup(models=[Model(ms_model_id='iic/mPLUG-Owl3-1B-241014', hf_model_id='mPLUG/mPLUG-Owl3-1B-241014', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='iic/mPLUG-Owl3-2B-241014', hf_model_id='mPLUG/mPLUG-Owl3-2B-241014', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='iic/mPLUG-Owl3-7B-240728', hf_model_id='mPLUG/mPLUG-Owl3-7B-240728', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='mplug_owl3', get_function=<function get_model_tokenizer_mplug_owl3 at 0x7f7ec582dda0>, model_arch=MultiModelKeys(arch_name='mplug_owl3', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['language_model'], aligner=['vision2text_model'], vision_tower=['vision_model'], generator=[]), architectures=['mPLUGOwl3Model'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.36', 'icecream', 'decord'], tags=['vision', 'video'])",
+  "model_dir": "/kaggle/working/outputs/mplug/v3-20250922-041102/checkpoint-100",
+  "hub": "<class 'swift.hub.hub.HFHub'>",
+  "evaluation_strategy": "steps",
+  "training_args": "Seq2SeqTrainingArguments(output_dir='/kaggle/working/outputs/mplug/v5-20250923-083759', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=64, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=4.64e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.0, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/kaggle/working/outputs/mplug/v5-20250923-083759/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=20, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=20, save_total_limit=1, save_safetensors=True, save_on_each_node=False, save_only_model=True, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20.0, dataloader_num_workers=8, dataloader_prefetch_factor=10, past_index=-1, run_name='/kaggle/working/outputs/mplug/v5-20250923-083759', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 
'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=True, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint='/kaggle/working/outputs/mplug/v4-20250923-021527/checkpoint-240', hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', 
train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
+}

config.json ADDED Viewed

	@@ -0,0 +1,56 @@

+{
+  "architectures": [
+    "mPLUGOwl3Model"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_mplugowl3.mPLUGOwl3Config",
+    "AutoModel": "modeling_mplugowl3.mPLUGOwl3Model",
+    "AutoModelForCausalLM": "modeling_mplugowl3.mPLUGOwl3Model"
+  },
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "hyper_layers": [
+    6,
+    13,
+    20,
+    22
+  ],
+  "image_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 24,
+  "model_type": "mplugowl3",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "patch_size": 14,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 384,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "pad_token_id": 151643,
+    "patch_size": 14,
+    "torch_dtype": "bfloat16"
+  },
+  "vocab_size": 151851
+}

configuration_hyper_qwen2.py ADDED Viewed

	@@ -0,0 +1,123 @@

+from transformers.configuration_utils import PretrainedConfig
+class HyperQwen2Config(PretrainedConfig):
+    r"""
+    Configuration class for the HyperQwen2 language model. It accepts the Qwen2-style arguments documented
+    below plus `hyper_layers`, and stores each of them as an attribute of the same name.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling the model.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22016):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed, `num_attention_heads` is used.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        use_sliding_window (`bool`, *optional*, defaults to `False`):
+            Whether to use sliding window attention.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Sliding window attention (SWA) window size. Only stored when `use_sliding_window` is `True`;
+            otherwise the `sliding_window` attribute is set to `None`.
+        max_window_layers (`int`, *optional*, defaults to 28):
+            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        hyper_layers (`List[int]`, *optional*, defaults to `[1, 9, 17, 25]`):
+            Layer indices stored as `config.hyper_layers`. Presumably the decoder layers that carry the extra
+            cross-modal ("hyper") attention -- confirm against the modeling code. When omitted, a fresh list is
+            created for every instance.
+    ```python
+    >>> # Initializing a configuration with the default arguments
+    >>> configuration = HyperQwen2Config()
+    >>> configuration.hyper_layers
+    [1, 9, 17, 25]
+    ```"""
+    model_type = "qwen2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=28,
+        attention_dropout=0.0,
+        hyper_layers=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        # The window size is only meaningful when sliding-window attention is enabled.
+        self.sliding_window = sliding_window if use_sliding_window else None
+        self.max_window_layers = max_window_layers
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        # Build the default per instance: a list literal in the signature would be one
+        # object shared (and mutable) across every config created without the argument.
+        self.hyper_layers = [1, 9, 17, 25] if hyper_layers is None else hyper_layers
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )

configuration_mplugowl3.py ADDED Viewed

	@@ -0,0 +1,47 @@

+# coding=utf-8
+""" mPLUGOwl3 model configuration"""
+import os
+from typing import Union
+from transformers.utils import logging
+from .configuration_hyper_qwen2 import HyperQwen2Config
+from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
+logger = logging.get_logger(__name__)
+class mPLUGOwl3Config(HyperQwen2Config):
+    """
+    Configuration for mPLUG-Owl3: the `HyperQwen2Config` language-model settings plus a SigLIP vision config.
+    Args:
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Assigned to `self.use_cache` before the parent initializer runs (see the note in `__init__`).
+        vision_config (`dict` or `SiglipVisionConfig`, *optional*):
+            Vision tower settings. `None` selects `default_vision_config`, a `dict` is expanded into a
+            `SiglipVisionConfig`, and a `SiglipVisionConfig` instance is stored as-is.
+        **kwargs:
+            Forwarded unchanged to `HyperQwen2Config.__init__`.
+    Raises:
+        TypeError: If `vision_config` is not `None`, a `dict` or a `SiglipVisionConfig`.
+    """
+    model_type = "mplugowl3"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Keyword arguments used to build the vision config when none is supplied.
+    default_vision_config = {
+        "hidden_size": 1152,
+        "image_size": 384,
+        "intermediate_size": 4304,
+        "model_type": "siglip_vision_model",
+        "num_attention_heads": 16,
+        "num_hidden_layers": 27,
+        "patch_size": 14
+    }
+    def __init__(
+        self,
+        use_cache=True,
+        vision_config=None,
+        **kwargs,
+    ):
+        self.use_cache = use_cache
+        # NOTE(review): `use_cache` is not forwarded to `super().__init__()` below, so the parent's
+        # own `use_cache=True` default reassigns `self.use_cache` and the value received here is lost.
+        # Left unchanged to preserve how existing checkpoints load; confirm whether it should be forwarded.
+        # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit add tgt_sizes
+        if vision_config is None:
+            self.vision_config = SiglipVisionConfig(**self.default_vision_config)
+            logger.info("vision_config is None, using default vision config")
+        elif isinstance(vision_config, dict):
+            self.vision_config = SiglipVisionConfig(**vision_config)
+        elif isinstance(vision_config, SiglipVisionConfig):
+            self.vision_config = vision_config
+        else:
+            # Fail here with a clear message instead of an AttributeError on the lines below.
+            raise TypeError(
+                "vision_config must be None, a dict or a SiglipVisionConfig, "
+                f"got {type(vision_config).__name__}"
+            )
+        # Mirror the vision geometry on the top-level config.
+        self.image_size = self.vision_config.image_size
+        self.patch_size = self.vision_config.patch_size
+        super().__init__(**kwargs)

generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.51.3"
+}

image_processing_mplugowl3.py ADDED Viewed

	@@ -0,0 +1,416 @@

+import random
+from typing import Optional, Union, Dict, Any, List
+from einops import rearrange, repeat
+import torch
+import math
+import PIL.Image
+import PIL.ImageSequence
+import numpy as np
+import PIL
+from PIL import Image
+from transformers.utils import TensorType, requires_backends, is_torch_dtype, is_torch_device
+from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
+from transformers import AutoImageProcessor
+from transformers.image_transforms import to_channel_dimension_format
+from transformers.image_utils import (
+    ImageInput,
+    make_list_of_images,
+    valid_images,
+    is_torch_tensor,
+    is_batched,
+    to_numpy_array,
+    infer_channel_dimension_format,
+    ChannelDimension
+)
+from torchvision.ops.boxes import box_area
+from torchvision.transforms import functional as F
+from torchvision.transforms.transforms import InterpolationMode
+from torchvision import transforms
+def recursive_converter(converter, value):
+    """Apply `converter` to every non-list leaf of `value`, keeping the list nesting.
+    Only `list` instances are descended into; any other object (tuples included) is
+    handed to `converter` as a single leaf.
+    """
+    if not isinstance(value, list):
+        return converter(value)
+    return [recursive_converter(converter, item) for item in value]
+def box_iou(boxes1, area1, boxes2, eps=1e-5):
+    """Pairwise IoU between every box in `boxes1` and every box in `boxes2`.
+    Boxes are read as corner pairs: columns 0-1 are the top-left corner and columns 2-3
+    the bottom-right corner. `area1` is the caller-supplied area of `boxes1`; the area of
+    `boxes2` is computed here with `box_area`. `eps` is added to the union to avoid a
+    division by zero.
+    Returns:
+        A tuple `(iou, union)`, both of shape [N,M].
+    """
+    area2 = box_area(boxes2)
+    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
+    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
+    # Negative extents mean the boxes do not overlap along that axis.
+    overlap = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
+    inter = overlap[:, :, 0] * overlap[:, :, 1]  # [N,M]
+    union = area1[:, None] + area2 - inter
+    return inter / (union + eps), union
+# Anchor-selection strategy names accepted by `AnchorResize` (checked with an assert in its `__init__`).
+available_anchor_strategy = ['docowl', 'random', 'highest', 'last', 'llava']
+# Named collections of grid shapes, each entry a pair of positive integers.
+# Presumably (rows, cols) cut layouts that are turned into anchors -- confirm against the callers.
+# NOTE(review): 'grid_33' and 'grid_squ_9' currently hold identical lists.
+grid_dict = {
+    # Every pair whose product is at most 9.
+    'grid_33':[
+        (1,1),
+        (1,2),(2,1),
+        (1,3),(3,1),
+        (2,2),(1,4),(4,1),
+        (1,5),(5,1),
+        (1,6),(6,1),(2,3),(3,2),
+        (1,7),(7,1),
+        (4,2),(2,4),(1,8),(8,1),
+        (3,3),(1,9),(9,1)],
+    # Square grids only.
+    'grid_squ_3x3':[
+        (1,1),(2,2),(3,3)
+    ],
+    'grid_squ_4':[
+        (2,2),(1,3),(1,4),(3,1),(4,1)
+    ],
+    # 'grid_squ_4' plus the two six-cell layouts.
+    'grid_squ_6':[
+        (2,2),(1,3),(1,4),(3,1),(4,1), (2,3),(3,2)
+    ],
+    'grid_squ_2':[
+        (2,1)
+    ],
+    # Same contents as 'grid_33'.
+    'grid_squ_9':[
+        (1,1),
+        (1,2),(2,1),
+        (1,3),(3,1),
+        (2,2),(1,4),(4,1),
+        (1,5),(5,1),
+        (1,6),(6,1),(2,3),(3,2),
+        (1,7),(7,1),
+        (4,2),(2,4),(1,8),(8,1),
+        (3,3),(1,9),(9,1)],
+}
+# Prompt builders keyed by template version. Each takes the image placeholder token and the
+# grid size (h rows, w columns) and returns the text describing the cut sub-images.
+cut_prompt_template_dict = {
+    # h*w copies of img_token concatenated with no separator.
+    'v0': lambda img_token, h, w: f''.join([f"{img_token}" for i in range(h) for j in range(w)]),
+    # Header sentence, then space-separated "subimg(i,j)<img_token>" entries in row-major order.
+    'v1': lambda img_token, h, w: f'Cut to {h} rows {w} columns, '+ ' '.join([f"subimg({i},{j}){img_token}"for i in range(h) for j in range(w)]),
+    # Like 'v1' with a "global view" header and one trailing "global_view<img_token>" entry.
+    'v1_global': lambda img_token, h, w: f'Cut to {h} rows {w} columns with a global view, '+ ' '.join([f"subimg({i},{j}){img_token}"for i in range(h) for j in range(w)]+[f"global_view{img_token}"]),
+    # Header line, then one text line per grid row, then a final "global_view<img_token>" line.
+    'v2_global': lambda img_token, h, w: f'Cut to {h} rows {w} columns with a global view\n'+ '\n'.join([' '.join([f"subimg({i},{j}){img_token}" for j in range(w)]) for i in range(h)])+f"\nglobal_view{img_token}",
+    # "<|start_cut|>h*w" followed directly by h*w space-separated img_tokens, closed by "<|end_cut|>".
+    'v3': lambda img_token, h, w: f'<|start_cut|>{h}*{w}'+ ' '.join([f"{img_token}"for i in range(h) for j in range(w)])+'<|end_cut|>',
+    # "<|start_cut|>h*w" line, one line of w img_tokens per grid row, then one extra img_token before "<|end_cut|>".
+    'v3_global': lambda img_token, h, w: f'<|start_cut|>{h}*{w}\n'+ '\n'.join([' '.join([f"{img_token}" for j in range(w)]) for i in range(h)])+f'\n{img_token}<|end_cut|>',
+}
+def anchor_rank(anchors, anchors_areas, input_image_size, eps=1e-5):
+    """Return the index of the anchor that best matches the input image.
+    Each anchor gets the score `shape_iou * 100 + iou`, where `iou` is its IoU with the
+    image box `[0, 0, w, h]` and `shape_iou` is its IoU with a box of the anchor's own
+    width and the image's aspect ratio. The index of the highest score is returned as
+    the result of `torch.argmax`.
+    `eps` is accepted but not used in this function.
+    """
+    # anchors x1 y1 x2 y2
+    # image_size: (h, w)
+    # xyxy
+    input_image_bbox = torch.tensor([0, 0, input_image_size[1], input_image_size[0]]).unsqueeze(0)
+    boxes1 = anchors
+    boxes2 = input_image_bbox
+    boxes3 = anchors.clone()
+    # y2
+    boxes3[:,3] = input_image_size[0]/input_image_size[1]*anchors[:,2] # used to compute a resolution-independent IoU
+    area1 = anchors_areas
+    iou, _ = box_iou(boxes1, area1, boxes2)
+    iou = iou.squeeze(1)
+    shape_iou, _ = box_iou(boxes1, area1, boxes3)
+    # Only the diagonal pairs each anchor with its own aspect-matched box.
+    shape_iou = shape_iou.diag()
+    # Match a similar shape first, then a similar resolution
+    index = torch.argmax(shape_iou*100+iou,dim=0)
+    return index
+def select_best_resolution(anchors, anchors_areas, input_image_size): # TODO: needs a further check
+    """
+    Selects the anchor resolution that best fits the input image and returns its index.
+    The image is scaled (keeping its aspect ratio) to fit inside each anchor. The anchor with the
+    largest effective resolution wins, where the effective resolution is the scaled image area
+    capped at the original image area. Ties go to the anchor that wastes the least area.
+    Args:
+        anchors: Sequence of boxes in (x1, y1, x2, y2) format; x2 and y2 are used as the anchor
+            width and height.
+        anchors_areas: Unused; kept so the signature matches `anchor_rank`.
+        input_image_size (tuple): Size of the input image in the format (height, width).
+    Returns:
+        int: Index into `anchors` of the best fit. 0 when `anchors` is empty or no anchor yields a
+        positive effective resolution.
+    """
+    original_height, original_width = input_image_size[0], input_image_size[1]
+    possible_resolutions = [(anchor[2], anchor[3]) for anchor in anchors] # xyxy -> w,h
+    max_effective_resolution = 0
+    min_wasted_resolution = float('inf')
+    index = 0
+    for i, (width, height) in enumerate(possible_resolutions):
+        # Largest uniform scale at which the image still fits inside this anchor.
+        scale = min(width / original_width, height / original_height)
+        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
+        # Upscaling adds no information, so cap at the original pixel count.
+        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
+        wasted_resolution = (width * height) - effective_resolution
+        if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
+            max_effective_resolution = effective_resolution
+            min_wasted_resolution = wasted_resolution
+            index = i
+    return index
+def build_cut_shape_indices(cut_shape):
+    # cut_shape: a list of (nh,nw)
+    cut_shape_indices = []
+    for shape in cut_shape:
+        n=shape[0]*shape[1]
+        indices = torch.cat([
+            repeat(torch.tensor(shape),'l -> n l',n=n),
+            torch.arange(n).unsqueeze(1)
+        ], dim=1)
+        assert indices.shape[0] == n
+        assert indices.shape[1] == 3 # nh,nw,idx
+        cut_shape_indices.append(indices)
+    cut_shape_indices = torch.cat(cut_shape_indices,dim=0).long()
+    return cut_shape_indices
+class AnchorResize(torch.nn.Module):
+    def __init__(self, image_size, anchors, interpolation=InterpolationMode.BILINEAR, antialias=None, anchor_strategy='docowl'):
+        super().__init__()
+        self.image_size = image_size
+        # xyxy
+        self.anchors = torch.tensor(
+            [[0, 0, _[1]*image_size[1], _[0]*image_size[0]]
+            for _ in anchors], requires_grad=False
+        )
+        self.anchor_areas = box_area(self.anchors)
+        self.interpolation = interpolation
+        self.antialias = antialias
+        self.anchor_strategy = anchor_strategy
+        assert self.anchor_strategy in available_anchor_strategy
+    def resize_global(self, img):
+        return F.resize(img, self.image_size, self.interpolation, max_size=None, antialias=self.antialias)
+    def forward(self, img, skip_resize=False):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be scaled.
+        Returns:
+            PIL Image or Tensor: Rescaled image.
+        """
+        if self.anchor_strategy == 'docowl':
+            selected_anchor = anchor_rank(self.anchors, self.anchor_areas, (img.size[1], img.size[0]))
+        elif self.anchor_strategy == 'random':
+            selected_anchor = random.randint(0,len(self.anchors)-1)
+        elif self.anchor_strategy == 'highest':
+            # 选面积最大的 在这个基础上 尽可能选最方正的
+            selected_anchor = torch.argmax(self.anchors[:,2]*self.anchors[:,3]*100-torch.abs(self.anchors[:,2]-self.anchors[:,3]))
+        elif self.anchor_strategy == 'last':
+            selected_anchor = len(self.anchors)-1
+        elif self.anchor_strategy == 'llava':
+            selected_anchor = select_best_resolution(self.anchors, self.anchor_areas, (img.size[1], img.size[0]))
+        else:
+            selected_anchor = None
+        assert selected_anchor is not None
+        target_size = self.anchors[selected_anchor][2:].tolist() # w,h
+        if skip_resize:
+            # for debug
+            return selected_anchor
+        return F.resize(img, [target_size[1],target_size[0]], self.interpolation, max_size=None, antialias=self.antialias), selected_anchor
+    def __repr__(self) -> str:
+        detail = f"(size={self.image_size}, anchor={self.anchors}, interpolation={self.interpolation.value}, antialias={self.antialias})"
+        return f"{self.__class__.__name__}{detail}"
+class CutMixin:
+    # Mixin that adds anchor-based image cutting to a host class.
+    # The host must already define ``self.image_size`` and ``self.image_transform``
+    # (a ``transforms.Compose``) before ``CutMixin.__init__`` runs, because both are
+    # read and then overwritten here.
+    def __init__(self, cut_cfg={"anchors": "grid_squ_6", "anchor_strategy": "docowl", "cut_prompt": "v3", "add_global": True, "cut_prob": 1.0}) -> None:
+        """Configure cutting from ``cut_cfg``; ``cut_cfg=None`` disables cutting.
+
+        NOTE(review): the default is a shared dict literal. It is only read via
+        ``.get`` here, so it is not mutated, but it cannot be swapped for ``None``
+        because ``None`` already means "disabled".
+        """
+        if cut_cfg is None:
+            self.cut_enable = False
+            return
+        else:
+            self.cut_enable = True
+        image_size = self.image_size
+        anchors = cut_cfg.get('anchors','grid_33')
+        anchor_strategy = cut_cfg.get('anchor_strategy','docowl')
+        cut_prompt = cut_cfg.get('cut_prompt','v0')
+        self.cut_prob = cut_cfg.get('cut_prob', 1.0)
+        self.force_shape_cut = cut_cfg.get('force_shape_cut', False)
+        # NOTE(review): the fallback is the literal string 'force_shape_cut_anchors';
+        # presumably that is a key of grid_dict, otherwise the eval below would
+        # fail on it. Confirm against grid_dict.
+        force_shape_cut_anchors = cut_cfg.get('force_shape_cut_anchors', 'force_shape_cut_anchors')
+        self.add_global = cut_cfg.get('add_global', False)
+        # h,w
+        if isinstance(image_size, int):
+            image_size = (image_size, image_size)
+        self.image_size = image_size
+        if anchors in grid_dict:
+            anchors = grid_dict[anchors]
+        else:
+            # NOTE(review): eval on a config string; only safe for trusted configs.
+            anchors = eval(anchors)
+        self.anchors = [tuple(_) for _ in anchors]
+        self.anchor_max = max([max(_) for _ in self.anchors])
+        self.resizer = AnchorResize(image_size=image_size, anchors=anchors, interpolation=InterpolationMode.BICUBIC, anchor_strategy=anchor_strategy)
+        if force_shape_cut_anchors in grid_dict:
+            force_shape_cut_anchors = grid_dict[force_shape_cut_anchors]
+        else:
+            # NOTE(review): same eval-on-config caveat as above.
+            force_shape_cut_anchors = eval(force_shape_cut_anchors)
+        self.force_shape_cut_anchors = [tuple(_) for _ in force_shape_cut_anchors]
+        self.force_shape_cut_anchors_max = max([max(_) for _ in self.force_shape_cut_anchors])
+        self.old_resizer = transforms.Resize(image_size,interpolation=InterpolationMode.BICUBIC)
+        # Drop the image processor's own resize step (the first transform) and keep
+        # only the transforms that follow it.
+        self.image_transform = transforms.Compose(self.image_transform.transforms[1:])
+        if self.add_global:
+            self.cut_prompt_template = cut_prompt_template_dict[cut_prompt+'_global']
+        else:
+            self.cut_prompt_template = cut_prompt_template_dict[cut_prompt]
+        self.media_tokens = ["<|image|>", "<|video|>"]
+    def _process_image(self, images):
+        """Resize each image to its selected anchor and split it into tiles.
+
+        Returns ``(new_images, cut_shape, cut_shape_indices)``: the tiles of all
+        images concatenated along dim 0, the per-entry ``(cut_h, cut_w)`` list, and
+        the result of ``build_cut_shape_indices(cut_shape)``. When
+        ``self.add_global`` is set, each image additionally contributes one
+        globally resized copy with cut shape ``(1, 1)``.
+        """
+        new_images = []
+        cut_shape = []
+        for image in images:
+            raw_image = image
+            image, selected_anchor = self.resizer(image)
+            image_input = self.image_transform(image) # h,w,3 -> 3,h,w
+            cut_shape.append((image_input.shape[1]//self.image_size[0], image_input.shape[2]//self.image_size[1])) # cut_h, cut_w
+            # Split the resized image into image_size-sized tiles, row-major.
+            image_input = rearrange(image_input, 'C (num_h h) (num_w w) -> (num_h num_w) C h w', h=self.image_size[0], w=self.image_size[1])
+            new_images.append(image_input)
+            if self.add_global:
+                new_images.append(self.image_transform(self.resizer.resize_global(raw_image)).unsqueeze(0))
+                cut_shape.append((1,1))
+        new_images = torch.cat(new_images,dim=0)
+        cut_shape_indices = build_cut_shape_indices(cut_shape)
+        return new_images, cut_shape, cut_shape_indices
+class mPLUGOwl3BatchFeature(BatchFeature):
+    r"""
+    Extend from BatchFeature for supporting various image size
+    """
+    def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None):
+        super().__init__(data)
+        self.convert_to_tensors(tensor_type=tensor_type)
+    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
+        if tensor_type is None:
+            return self
+        is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type)
+        def converter(value):
+            try:
+                if not is_tensor(value):
+                    tensor = as_tensor(value)
+                    return tensor
+            except:  # noqa E722
+                if key == "overflowing_values":
+                    raise ValueError("Unable to create tensor returning overflowing values of different lengths. ")
+                raise ValueError(
+                    "Unable to create tensor, you should probably activate padding "
+                    "with 'padding=True' to have batched tensors with the same length."
+                )
+        for key, value in self.items():
+            self[key] = recursive_converter(converter, value)
+        return self
+    def to(self, *args, **kwargs) -> "mPLUGOwl3BatchFeature":
+        requires_backends(self, ["torch"])
+        import torch
+        def cast_tensor(v):
+            # check if v is a floating point
+            if torch.is_floating_point(v):
+                # cast and send to device
+                return v.to(*args, **kwargs)
+            elif device is not None:
+                return v.to(device=device)
+            else:
+                return v
+        new_data = {}
+        device = kwargs.get("device")
+        # Check if the args are a device or a dtype
+        if device is None and len(args) > 0:
+            # device should be always the first argument
+            arg = args[0]
+            if is_torch_dtype(arg):
+                # The first argument is a dtype
+                pass
+            elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
+                device = arg
+            else:
+                # it's something else
+                raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")
+        # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
+        for k, v in self.items():
+            new_data[k] = recursive_converter(cast_tensor, v)
+        self.data = new_data
+        return self
+class mPLUGOwl3ImageProcessor(BaseImageProcessor, CutMixin):
+    model_input_names = ["pixel_values"]
+    def __init__(
+            self,
+            image_size,
+            mean=[0.5, 0.5, 0.5],
+            std=[0.5, 0.5, 0.5],
+            **kwargs):
+        super().__init__(**kwargs)
+        self.image_size = image_size
+        self.image_transform = transforms.Compose([
+            transforms.Resize((image_size, image_size), interpolation=Image.BICUBIC),
+            transforms.ToTensor(),
+            transforms.Normalize(mean, std),
+        ])
+        CutMixin.__init__(self)
+    def preprocess(
+            self,
+            images: Union[Image.Image, List[Image.Image]],
+            cut_enable=True,
+            **kwargs
+        ) -> mPLUGOwl3BatchFeature:
+        if isinstance(images, Image.Image):
+            images_list = [images]
+        else:
+            images_list = images
+        if self.cut_enable and cut_enable:
+            image_data, cut_shape, cut_shape_indices = self._process_image(images_list)
+        else:
+            image_data = [self.image_transform(self.resizer.resize_global(image)) for image in images_list]
+            image_data = torch.stack(image_data, dim=0)
+            cut_shape = cut_shape_indices = None
+        return mPLUGOwl3BatchFeature(data={'pixel_values': image_data, 'cut_shape':cut_shape, 'cut_shape_indices':cut_shape_indices})
+    def to_dict(self):
+        encoder_dict = super().to_dict()
+        pop_keys = ['image_transform', 'resizer', 'old_resizer', 'cut_prompt_template']
+        for pk in pop_keys:
+            encoder_dict.pop(pk, None)
+        return encoder_dict
+AutoImageProcessor.register("mPLUGOwl3ImageProcessor", mPLUGOwl3ImageProcessor)

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f332135cfd7c39db0280e58a3a041c5fcb0e86d4773816b5ab87e37f9de1b7f
+size 1848369040

modeling_hyper_qwen2.py ADDED Viewed

	@@ -0,0 +1,1532 @@

+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Qwen2 model."""
+import inspect
+import math
+from typing import List, Optional, Tuple, Union
+from einops import rearrange, repeat
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_hyper_qwen2 import HyperQwen2Config
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
+from .x_sdpa import ScaleDotProductAttention
+# Optional fused rotary kernel from flash-attn. `use_flash_rotary` records whether
+# the import succeeded; apply_rotary_pos_emb_core checks it before using
+# `apply_rotary_emb_func`. The outcome is printed at import time.
+try:
+    from flash_attn.layers.rotary import apply_rotary_emb_func
+    from einops import rearrange
+    use_flash_rotary = True
+    print("use flash_attn rotary")
+except ImportError:
+    use_flash_rotary = False
+    print("import flash_attn rotary fail")
+logger = logging.get_logger(__name__)
+_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta"
+_CONFIG_FOR_DOC = "HyperQwen2Config"
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    """Derive unpadding metadata from ``attention_mask``.
+
+    Returns a tuple ``(indices, cu_seqlens, max_seqlen_in_batch)``: the flat
+    positions of the non-zero mask entries, the int32 cumulative sums of the
+    per-row mask totals with a leading 0, and the largest per-row total.
+    """
+    # Per-row sum of the mask along the last dimension.
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    # Flat indices of non-zero mask entries.
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    # Cumulative lengths, left-padded with one zero.
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2
+class Qwen2RMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last dimension with a learned scale."""
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Qwen2RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        # Learned per-feature scale, initialised to ones.
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        """Normalise ``hidden_states`` in float32, cast back, then apply ``self.weight``."""
+        input_dtype = hidden_states.dtype
+        # Compute the statistics in float32 regardless of the input dtype.
+        hidden_states = hidden_states.to(torch.float32)
+        # Mean of squares over the last dimension (no mean subtraction).
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Qwen2
+class Qwen2RotaryEmbedding(nn.Module):
+    """Rotary position embedding that serves cos/sin tables from a growable cache."""
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        # One inverse frequency per pair of dimensions: base ** (-2i / dim).
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+        # Non-persistent: not written to the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """(Re)build the ``cos_cached`` / ``sin_cached`` buffers for ``seq_len`` positions."""
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+        # Outer product position x inverse frequency.
+        freqs = torch.outer(t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+    def forward(self, x, seq_len=None):
+        """Return ``(cos, sin)`` for the first ``seq_len`` positions, cast to ``x.dtype``.
+
+        NOTE(review): ``seq_len`` defaults to ``None`` but is compared with an int
+        below, so callers are expected to always pass it.
+        """
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        # Grow the cache when a longer sequence than any seen so far is requested.
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype),
+            self.sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+class RotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, base=10000, use_fp32=False, use_outer_in_rope=False):
+        super().__init__()
+        self.dim = dim
+        self.base = base
+        self.use_fp32 = use_fp32
+        if use_fp32:
+            self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+        else:
+            inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+            self.register_buffer("inv_freq", inv_freq)
+        self._rotary_pos_emb_cache = None
+        self._seq_len_cached = 0
+        self.use_outer_in_rope = use_outer_in_rope
+        self._ntk_alpha_cached = 1.0
+    def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0):
+        seqlen = max_seq_len + offset
+        if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached:
+            base = self.base * ntk_alpha ** (self.dim / (self.dim - 2))
+            self.inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, device=self.inv_freq.device).float() / self.dim))
+            self._seq_len_cached = seqlen
+            self._ntk_alpha_cached = ntk_alpha
+            seq = torch.arange(seqlen, device=self.inv_freq.device)
+            # Don't do einsum, it converts fp32 to fp16            # TODO: CHECK this
+            if self.use_outer_in_rope:
+                freqs = torch.outer(seq.type_as(self.inv_freq), self.inv_freq)
+            else:
+                freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq)
+            # first part even vector components, second part odd vector components,
+            #  2 * dim in dimension size
+            emb = torch.cat((freqs, freqs), dim=-1)
+            # emb [seq_length, .., dim]
+            from einops import rearrange
+            self._rotary_pos_emb_cache = rearrange(emb, 'n d -> n 1 1 d')
+    def forward(self, max_seq_len, offset=0, ntk_alpha=1.0):
+        self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha)
+        return self._rotary_pos_emb_cache[offset:offset + max_seq_len]
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    # Split the last dimension into its first and second halves.
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    # Result is [-second_half, first_half] along the last dimension.
+    return torch.cat((-x2, x1), dim=-1)
+# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`):
+            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
+            used to pass offsetted position ids when working with a KV-cache.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    # Gather the table rows for each position, then add the broadcast axis.
+    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+    # Same rotation formula for queries and keys.
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2
+class Qwen2MLP(nn.Module):
+    """Gated feed-forward block: ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``."""
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        # All three projections are bias-free.
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        # Activation looked up by name from the config.
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        """Apply the gated projection to ``x`` and project back to ``hidden_size``."""
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    # Nothing to repeat: return the input tensor itself.
+    if n_rep == 1:
+        return hidden_states
+    # Insert a repeat axis after the head axis, expand it, then fold it into the heads.
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def make_t2v_mask(media_offset_line, num_images):
+    assert len(media_offset_line.shape) == 1
+    media_offset_line = media_offset_line.view(-1,1)
+    # print_rank_0(media_offset_line)
+    visual_arange=torch.arange(num_images, device=media_offset_line.device).view(1,-1)
+    mask = (media_offset_line<=visual_arange)
+    # print_rank_0(mask)
+    return mask
+def select_query(media_offset, num_queries=None):
+    query_indices = media_offset[:,:,1]>=0 # B L
+    assert query_indices.sum().item()%num_queries == 0, query_indices.sum().item()
+    query_indices = query_indices.nonzero()
+    ptr = 0
+    while ptr < query_indices.shape[0]:
+        first_query_index, last_query_index  = query_indices[ptr], query_indices[ptr+num_queries-1]
+        assert (last_query_index[1] - first_query_index[1] + 1).item() == num_queries
+        assert last_query_index[0].item() == first_query_index[0].item()
+        batch_id, begin_i, end_i = first_query_index[0].item(), first_query_index[1].item(), first_query_index[1].item()+num_queries
+        yield batch_id, begin_i, end_i
+        ptr += num_queries
+def _rotate_half(x):
+    """
+    change sign so the last dimension becomes [-odd, +even]
+    """
+    from einops import rearrange
+    x = rearrange(x, '... (j d) -> ... j d', j=2)
+    x1, x2 = x.unbind(dim=-2)
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb_core(t, freqs, use_fp32=False, debug=False):
+    """
+    input tensor t is of shape [seq_length, ..., dim]
+    rotary positional embeding tensor freqs is of shape [seq_length, ..., dim]
+    check https://kexue.fm/archives/8265 for detailed formulas
+
+    When the flash-attn rotary kernel was imported and ``use_fp32`` is set, the fused
+    kernel is used; otherwise the rotation is computed with plain tensor ops on the
+    first ``freqs.shape[-1]`` features and the remaining features pass through.
+    The result is cast back to the type of ``t``. ``debug`` only has an effect on
+    the fused path, where it prints shapes via ``icecream``.
+    """
+    if use_flash_rotary and use_fp32:
+        # Swap the first two axes (seq, batch -> batch, seq) for the fused kernel.
+        t_ = rearrange(t, 's b ... -> b s ...').contiguous()
+        # Always true inside this branch (the outer condition requires use_fp32).
+        if use_fp32:
+            t_ = t_.float()
+        # Drop the two singleton axes of freqs -> (seq, dim).
+        freqs = freqs.squeeze(1).squeeze(1)
+        # Only the first half of the last dimension is passed to the kernel.
+        cos = freqs[:, :freqs.shape[-1] // 2].cos()
+        sin = freqs[:, :freqs.shape[-1] // 2].sin()
+        output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
+        if debug:
+            from icecream import ic
+            ic(t_.shape, freqs.shape, cos.shape)
+        return rearrange(output, 'b s ... -> s b ...')
+    rot_dim = freqs.shape[-1]
+    # ideally t_pass is empty so rotary pos embedding is applied to all tensor t
+    t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
+    if use_fp32:
+        t_ = t_.float()
+        t_pass_ = t_pass_.float()
+    # first part is cosine component
+    # second part is sine component, need to change signs with _rotate_half method
+    t_ = (t_ * freqs.cos()) + (_rotate_half(t_) * freqs.sin())
+    return torch.cat((t_, t_pass_), dim=-1).type_as(t)
+class HyperQwen2Attention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
+    and "Generating Long Sequences with Sparse Transformers".
+    """
+    def __init__(self, config: HyperQwen2Config, layer_idx: Optional[int] = None, is_hyper_enabed=False):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+        self.attention_dropout = config.attention_dropout
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self.rotary_emb = Qwen2RotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+        self.rotary_emb_core = RotaryEmbedding(
+            self.head_dim, base=self.rope_theta, use_fp32=True, use_outer_in_rope=True
+        )
+        # Hyper Attention Modules
+        self.is_hyper_enabed = is_hyper_enabed
+        if self.is_hyper_enabed:
+            self.v_kv_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim * 2, bias=True)
+            self.gate = nn.Parameter(torch.zeros(self.hidden_size))
+            self.v_core_attention_sdpa = ScaleDotProductAttention(layer_number=-1,causal=False, attention_dropout=self.attention_dropout)
+            self.visual_cache={}
+    def apply_mi_rope(self, key_layer, media_offset_line, length_each_img):
+        """Apply rotary embedding to visual keys, using one position per image.
+
+        ``key_layer`` arrives as ``b h s d`` and is returned in the same layout.
+        The position of each image is taken from the places where
+        ``media_offset_line`` changes value; that position's rotary row is repeated
+        ``length_each_img`` times so every token of an image shares it.
+        """
+        # input shape should be [s b h d]
+        key_layer = rearrange(key_layer, 'b h s d -> s b h d')
+        # inv_freq is a plain attribute here (use_fp32=True), so move it by hand.
+        if self.rotary_emb_core.inv_freq.device!=key_layer.device:
+            self.rotary_emb_core.inv_freq = self.rotary_emb_core.inv_freq.to(key_layer.device)
+        rotary_pos_emb_max_seq_len = self.config.max_position_embeddings
+        ntk_alpha = 1
+        rotary_pos_emb = self.rotary_emb_core(rotary_pos_emb_max_seq_len, ntk_alpha=ntk_alpha)
+        assert rotary_pos_emb is not None
+        # Normalise to a (query, key) pair of embeddings.
+        if isinstance(rotary_pos_emb, tuple):
+            rotary_pos_emb = rotary_pos_emb
+        else:
+            rotary_pos_emb = ((rotary_pos_emb,) * 2)
+        if rotary_pos_emb is not None:
+            q_pos_emb, k_pos_emb = rotary_pos_emb
+            # ic(key_layer.shape, k_pos_emb.shape)
+            # Indices (shifted by one) where consecutive offsets differ.
+            image_pos = (media_offset_line[1:] - media_offset_line[:-1]).nonzero().squeeze(1)+1
+            k_pos_emb = repeat(k_pos_emb[image_pos], 'N_img b h d -> (N_img L) b h d', L=length_each_img) # N_img, dim
+            key_layer = apply_rotary_pos_emb_core(key_layer, k_pos_emb, use_fp32=True) # TODO difference
+        key_layer = rearrange(key_layer, 's b h d -> b h s d')
+        return key_layer
+    def crossattention(self, query_layer, vision_features, media_offset, context_layer):
+        '''
+        Blend text-to-vision cross attention into ``context_layer`` through a learned gate.
+
+        query_layer: [s b h d]
+        vision_features: [b' lv d]
+        context_layer: s b d
+
+        Returns ``context_layer`` unchanged when there are no vision features or the
+        hyper modules are disabled; otherwise a clone in which, for every sample
+        selected by ``select_query``, the rows from ``query_shift`` on are replaced by
+        ``context * (1 - gate) + visual_context * gate``.
+        '''
+        if vision_features is None or (self.is_hyper_enabed == False):
+            return context_layer
+        context_layer_clone = context_layer.clone()
+        # obtain dynamic gate value
+        vision_features = vision_features.contiguous()
+        # Project visual features to concatenated keys and values.
+        vision_features = self.v_kv_proj(vision_features)
+        length_each_img = vision_features.shape[1]
+        sequence_length = query_layer.shape[0]
+        if sequence_length == 1:
+            # A single query token: we are in generation (decoding) mode.
+            completion_flag=True
+            media_offset = media_offset[:,-1:]
+        else:
+            completion_flag=False
+            # Prefill: remember the inputs; the cached media_offset is read again
+            # below when applying MI-Rope.
+            self.visual_cache['media_offset'] = media_offset
+            self.visual_cache['vision_features'] = vision_features
+        query_layer = rearrange(query_layer, 'L B H D -> B H L D') # e.g. [25, 2, 32, 128]
+        assert sequence_length == media_offset.shape[1], (sequence_length, media_offset.shape)
+        gate_value = torch.sigmoid(self.gate)
+        for batch_id, begin_i, end_i in select_query(media_offset, sequence_length):
+            # media_offset should be set to -100000 for samples without images.
+            assert begin_i == 0
+            assert end_i == sequence_length, (end_i, sequence_length)
+            # The media offset held by the last token of this sequence should cover
+            # all images of the current sample.
+            curr_offset = media_offset[batch_id,end_i-1]
+            if (not completion_flag):
+                # In generation mode the query may see all visual tokens, so the
+                # text-to-vision mask is only built during the prefill stage.
+                re_to_zero_media_offset = (media_offset[batch_id,:,1]-curr_offset[0]).to(query_layer.device)
+                query_shift = re_to_zero_media_offset.nonzero()[0].item() # first non-zero position
+                curr_mask = make_t2v_mask(
+                    re_to_zero_media_offset[query_shift:], # the end value bounds how many images are visible
+                    num_images=curr_offset[1]-curr_offset[0],
+                )
+                # Expand each image column to cover all of that image's tokens.
+                curr_mask = repeat(curr_mask, 's_q s_k -> B H s_q (s_k img_l)', B=1, H=1, img_l=length_each_img)
+                # print_rank_0(query_shift)
+            else:
+                curr_mask = None
+                query_shift = 0
+            curr_query_tokens = query_layer[batch_id,:,query_shift:].unsqueeze(0).clone().contiguous()
+            assert curr_offset[0]<vision_features.shape[0]
+            assert curr_offset[1]<=vision_features.shape[0]
+            # Slice this sample's images and split the projection into keys and values.
+            curr_vision_kv: torch.Tensor = rearrange(vision_features[curr_offset[0]:curr_offset[1]].clone(), 'BL Lv (H KV D) -> KV 1 H (BL Lv) D', KV=2, H=self.num_key_value_heads)
+            key_layer = curr_vision_kv[0].contiguous() # [b h s d]
+            value_layer = curr_vision_kv[1].contiguous()
+            # Apply MI-Rope
+            key_layer = self.apply_mi_rope(key_layer, media_offset_line=self.visual_cache['media_offset'][batch_id,:,1]-curr_offset[0], length_each_img=length_each_img)
+            # Repeat key/value heads to match the number of query heads.
+            key_layer = repeat_kv(key_layer, self.num_key_value_groups)
+            value_layer = repeat_kv(value_layer, self.num_key_value_groups)
+            v_context_layer = self.v_core_attention_sdpa(curr_query_tokens, key_layer, value_layer, attn_mask=curr_mask, order='bhsd').squeeze(1)
+            # Apply dynamic gate
+            context_layer_clone[query_shift:, batch_id] = context_layer[query_shift:, batch_id].clone() * (1-gate_value) + v_context_layer * gate_value
+        return context_layer_clone
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        image_embeds=None,
+        media_offset=None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        raise NotImplementError("We do not support eager model yet. Use attn_implementation == \"flash_attention_2\" or attn_implementation == \"sdpa\".")
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        # Hyper Attention
+        attn_output = self.crossattention(query_states.permute(1,0,1,3), image_embeds, media_offset, attn_output.permute(1,0,2))
+        attn_output = attn_output.permute(1,0,2)
+        #### End of Hyper Attention
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class HyperQwen2FlashAttention2(HyperQwen2Attention):
+    """
+    Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention`
+    as the weights of the module stays untouched. The only required change would be on the forward pass
+    where it needs to correctly call the public API of flash attention and deal with padding tokens
+    in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
+    config.max_window_layers layers.
+    """
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        image_embeds=None,
+        media_offset=None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ):
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        # Because the input can be padded, the absolute sequence length depends on the max position id.
+        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        use_sliding_windows = (
+            _flash_supports_window_size
+            and getattr(self.config, "sliding_window", None) is not None
+            and kv_seq_len > self.config.sliding_window
+            and self.config.use_sliding_window
+        )
+        if not _flash_supports_window_size:
+            logger.warning_once(
+                "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
+                " make sure to upgrade flash-attn library."
+            )
+        if past_key_value is not None:
+            # Activate slicing cache only if the config has a value `sliding_windows` attribute
+            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
+            if (
+                getattr(self.config, "sliding_window", None) is not None
+                and kv_seq_len > self.config.sliding_window
+                and cache_has_contents
+            ):
+                slicing_tokens = 1 - self.config.sliding_window
+                past_key = past_key_value[self.layer_idx][0]
+                past_value = past_key_value[self.layer_idx][1]
+                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
+                past_value = past_value[:, :, slicing_tokens:, :].contiguous()
+                if past_key.shape[-2] != self.config.sliding_window - 1:
+                    raise ValueError(
+                        f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
+                        f" {past_key.shape}"
+                    )
+                if attention_mask is not None:
+                    attention_mask = attention_mask[:, slicing_tokens:]
+                    attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        dropout_rate = 0.0 if not self.training else self.attention_dropout
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in float16 just to be sure everything works as expected.
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+        # Reashape to the expected shape for Flash Attention
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        attn_output = self._flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            dropout=dropout_rate,
+            use_sliding_windows=use_sliding_windows,
+        )
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        # Hyper Attention
+        # (batch_size, seqlen, nheads, headdim) -> [s b h d]
+        attn_output = self.crossattention(query_states.permute(1,0,2,3), image_embeds, media_offset, attn_output.permute(1,0,2))
+        attn_output = attn_output.permute(1,0,2)
+        #### End of Hyper Attention
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+    def _flash_attention_forward(
+        self,
+        query_states,
+        key_states,
+        value_states,
+        attention_mask,
+        query_length,
+        dropout=0.0,
+        softmax_scale=None,
+        use_sliding_windows=False,
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+        first unpad the input, then computes the attention scores and pad the final attention scores.
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`float`):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+            use_sliding_windows (`bool`, *optional*):
+                Whether to activate sliding window attention.
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+        # Decide whether to use SWA or not by layer index.
+        if use_sliding_windows and self.layer_idx >= self.config.max_window_layers:
+            use_sliding_windows = False
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+            if not use_sliding_windows:
+                attn_output_unpad = flash_attn_varlen_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k=cu_seqlens_k,
+                    max_seqlen_q=max_seqlen_in_batch_q,
+                    max_seqlen_k=max_seqlen_in_batch_k,
+                    dropout_p=dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                )
+            else:
+                attn_output_unpad = flash_attn_varlen_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k=cu_seqlens_k,
+                    max_seqlen_q=max_seqlen_in_batch_q,
+                    max_seqlen_k=max_seqlen_in_batch_k,
+                    dropout_p=dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                    window_size=(self.config.sliding_window, self.config.sliding_window),
+                )
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            if not use_sliding_windows:
+                attn_output = flash_attn_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                )
+            else:
+                attn_output = flash_attn_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                    window_size=(self.config.sliding_window, self.config.sliding_window),
+                )
+        return attn_output
+    # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
+        # On the first iteration we need to properly re-create the padding mask
+        # by slicing it on the proper place
+        if kv_seq_len != attention_mask.shape[-1]:
+            attention_mask_num_tokens = attention_mask.shape[-1]
+            attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
+        value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+# Copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Qwen2
+class HyperQwen2SdpaAttention(HyperQwen2Attention):
+    """
+    Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    SDPA API.
+    """
+    # Adapted from Qwen2Attention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        image_embeds=None,
+        media_offset=None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+        # Hyper Attention
+        attn_output = self.crossattention(query_states.permute(2,0,1,3), image_embeds, media_offset, attn_output.permute(1,0,2))
+        attn_output = attn_output.permute(1,0,2)
+        #### End of Hyper Attention
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None, past_key_value
+# Maps `config._attn_implementation` to the attention class that HyperQwen2DecoderLayer instantiates.
+QWEN2_ATTENTION_CLASSES = {
+    "eager": HyperQwen2Attention,
+    "flash_attention_2": HyperQwen2FlashAttention2,
+    "sdpa": HyperQwen2SdpaAttention,
+}
+class HyperQwen2DecoderLayer(nn.Module):
+    def __init__(self, config: HyperQwen2Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        if config.use_sliding_window and config._attn_implementation != "flash_attention_2":
+            logger.warning_once(
+                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
+                "unexpected results may be encountered."
+            )
+        self.is_hyper_enabled = (layer_idx+1) in config.hyper_layers
+        self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx, is_hyper_enabed=self.is_hyper_enabled)
+        self.mlp = Qwen2MLP(config)
+        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        image_embeds=None,
+        media_offset=None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+        """
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Shared LayerNorm
+        if image_embeds is not None and self.is_hyper_enabled:
+            image_embeds = self.input_layernorm(image_embeds)
+        else:
+            image_embeds = media_offset = None
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            image_embeds=image_embeds,
+            media_offset=media_offset,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+# Shared docstring preamble passed to `add_start_docstrings` on the model classes below.
+QWEN2_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`HyperQwen2Config`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+@add_start_docstrings(
+    "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
+    QWEN2_START_DOCSTRING,
+)
+class Qwen2PreTrainedModel(PreTrainedModel):
+    config_class = HyperQwen2Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["HyperQwen2DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+# Shared description of the model `forward` arguments, kept as a raw string template.
+# NOTE(review): the two "head is masked / not masked" bullets at the end of the `attention_mask`
+# entry describe a head mask rather than an attention mask; they look like a copy-paste leftover.
+QWEN2_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+            Two formats are allowed:
+            - a [`~cache_utils.Cache`] instance;
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+            cache format.
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+@add_start_docstrings(
+    "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
+    QWEN2_START_DOCSTRING,
+)
+class HyperQwen2Model(Qwen2PreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`HyperQwen2DecoderLayer`].
+    Besides the usual decoder inputs, `forward` accepts `image_embeds` and `media_offset` and hands both, unchanged,
+    to every decoder layer.
+    Args:
+        config: HyperQwen2Config
+    """
+    def __init__(self, config: HyperQwen2Config):
+        # Builds the token embedding, the stack of decoder layers and the final RMS norm from `config`.
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [HyperQwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        # Cached once here; `forward` branches on it to decide which attention-mask format to build.
+        self._attn_implementation = config._attn_implementation
+        self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        # Returns the token embedding module.
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        # Replaces the token embedding module.
+        self.embed_tokens = value
+    @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        image_embeds=None,
+        media_offset=None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        # `image_embeds` and `media_offset` are not used in this method itself; they are only passed on to each
+        # decoder layer. Their exact shape/meaning is defined by the layer implementation (not visible here).
+        # Explicit arguments win; otherwise fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        # Exactly one of the two must be given; it determines batch size and sequence length.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+        # Caching is switched off while training with gradient checkpointing.
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        past_key_values_length = 0
+        if use_cache:
+            # Anything that is not a `Cache` instance (including None) is treated as the legacy tuple format and
+            # wrapped in a DynamicCache; `use_legacy_cache` remembers that so the output is converted back below.
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+        if position_ids is None:
+            # Default positions continue from the cached length: [past, past + seq_length).
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # With flash attention and caching, reject batches whose last column contains padding (right padding).
+        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
+            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
+            if is_padding_right:
+                raise ValueError(
+                    "You are attempting to perform batched generation with padding_side='right'"
+                    " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to "
+                    " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
+                )
+        # Build the mask in the format the selected attention implementation expects.
+        if self._attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            # (and dropped entirely when it contains no zeros, i.e. nothing is masked)
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._attn_implementation == "sdpa" and not output_attentions:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+                sliding_window=self.config.sliding_window,
+            )
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+                sliding_window=self.config.sliding_window,
+            )
+        hidden_states = inputs_embeds
+        # decoder layers
+        # The accumulators stay None when the corresponding output was not requested.
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+        for decoder_layer in self.layers:
+            # Record the input of each layer; the output of the last layer is appended after the final norm.
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                # Arguments are positional here, so their order has to match the decoder layer's forward signature
+                # (defined outside this chunk — TODO confirm against HyperQwen2DecoderLayer.forward).
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    image_embeds,
+                    media_offset,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    image_embeds=image_embeds,
+                    media_offset=media_offset,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+            # The cache sits after the attention weights in the layer output when those are returned.
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = None
+        if use_cache:
+            # Hand back the cache in the same format it came in.
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        if not return_dict:
+            # Tuple output omits every entry that is None, so positions shift with the requested outputs.
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class HyperQwen2ForCausalLM(Qwen2PreTrainedModel):
+    # Causal language model: a HyperQwen2Model backbone followed by a bias-free linear head over the vocabulary.
+    # `image_embeds` and `media_offset` are accepted by `forward` and passed straight to the backbone.
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config):
+        # Builds the backbone and the LM head from `config`.
+        super().__init__(config)
+        self.model = HyperQwen2Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        # The token embedding lives on the backbone, not on this wrapper.
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        # Replaces the backbone's token embedding.
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        # Returns the LM head.
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        # Replaces the LM head.
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        # Replaces the backbone.
+        self.model = decoder
+    def get_decoder(self):
+        # Returns the backbone.
+        return self.model
+    @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        image_embeds=None,
+        media_offset=None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            image_embeds (*optional*):
+                Forwarded unchanged to the backbone, which hands it to every decoder layer. Presumably the projected
+                image features — confirm against the decoder layer implementation.
+            media_offset (*optional*):
+                Forwarded unchanged to the backbone together with `image_embeds`.
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, Qwen2ForCausalLM
+        >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        # Explicit arguments win; otherwise fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            image_embeds=image_embeds,
+            media_offset=media_offset,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # Index 0 is the final hidden state for both the tuple and the ModelOutput return forms.
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        # Logits are always returned (and the loss computed) in float32.
+        logits = logits.float()
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            # Tuple form: (loss?, logits, *remaining backbone outputs).
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        # Builds the keyword arguments for one generation step: trims `input_ids` to the tokens not yet covered by
+        # the cache, derives `position_ids` from the attention mask when none are given, and forwards
+        # `image_embeds` / `media_offset` from `kwargs` on every step.
+        # Omit tokens covered by past_key_values
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
+            else:
+                # Legacy tuple cache: read the cached length from dim 2 of the first layer's first tensor.
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            # cumsum - 1 numbers the unmasked tokens from 0; masked positions are then overwritten with 1.
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                # Keep only the positions of the tokens that are actually fed in this step.
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                'image_embeds': kwargs.get('image_embeds'),
+                'media_offset': kwargs.get('media_offset'),
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        # Re-indexes dim 0 of every cached tensor by `beam_idx` and returns the result as a tuple of
+        # per-layer tuples. Iterates `past_key_values` as layers of tensors (legacy tuple layout).
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past

modeling_mplugowl3.py ADDED Viewed

	@@ -0,0 +1,231 @@

+import math
+from typing import List, Optional
+import json
+import torch
+import torchvision
+from threading import Thread
+from copy import deepcopy
+from PIL import Image
+from transformers import AutoProcessor, Qwen2PreTrainedModel, Qwen2ForCausalLM, TextIteratorStreamer
+from .processing_mplugowl3 import mPLUGOwl3Processor
+from .image_processing_mplugowl3 import mPLUGOwl3ImageProcessor
+from .configuration_mplugowl3 import mPLUGOwl3Config
+# from .modeling_navit_siglip import SiglipVisionTransformer
+from transformers.models.siglip.modeling_siglip import SiglipVisionTransformer
+from .x_sdpa import ScaleDotProductAttention
+from .modeling_hyper_qwen2 import HyperQwen2ForCausalLM
+from torch import nn
+class mPLUGOwl3PreTrainedModel(Qwen2PreTrainedModel):
+    # Base class for the mPLUG-Owl3 models: reuses Qwen2PreTrainedModel and only swaps in the mPLUG-Owl3 config class.
+    config_class = mPLUGOwl3Config
+class mPLUGOwl3Model(mPLUGOwl3PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.language_model = HyperQwen2ForCausalLM(config)
+        self.vision_model = self.init_vision_module()
+        self.vision_dim = self.vision_model.embed_dim
+        self.embed_dim = self.language_model.config.hidden_size
+        self.vision2text_model = nn.Linear(self.vision_dim, self.embed_dim)
+        self.processor = None
+        self.terminators = ['<|im_end|>', '<|endoftext|>']
+    def init_vision_module(self):
+        self.config.vision_config._attn_implementation = self.config.vision_config._attn_implementation
+        model = SiglipVisionTransformer(self.config.vision_config)
+        setattr(model, 'embed_dim', model.embeddings.embed_dim)
+        setattr(model, 'patch_size', model.embeddings.patch_size)
+        return model
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        self.language_model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.language_model.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.language_model.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+    def get_decoder(self):
+        return self.language_model
+    def forward_image(self, pixel_values):
+        if pixel_values is None:
+            return None
+        dtype = self.language_model.model.embed_tokens.weight.dtype
+        with torch.inference_mode():
+            image_embeds = self.vision_model(pixel_values.to(dtype), output_hidden_states=True).hidden_states[-2]
+        if self.vision2text_model is not None:
+            image_embeds = self.vision2text_model(image_embeds)
+        else:
+            pass
+        return image_embeds
+    def forward(self, pixel_values=None, **kwargs):
+        image_embeds = self.forward_image(pixel_values)
+        return self.language_model(
+            image_embeds=image_embeds,
+            **kwargs
+        )
+    def _decode(self, input_ids, image_embeds, media_offset, tokenizer, attention_mask, decode_text=False, **kwargs):
+        terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
+        output = self.language_model.generate(
+            input_ids=input_ids,
+            image_embeds=image_embeds,
+            media_offset=media_offset,
+            pad_token_id=0,
+            eos_token_id=terminators,
+            attention_mask=attention_mask,
+            **kwargs
+        )
+        output = output[:,input_ids.shape[1]:]
+        if decode_text:
+            return self._decode_text(output, tokenizer)
+        return output
+    def _decode_stream(self, input_ids, image_embeds, media_offset, tokenizer, **kwargs):
+        terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
+        streamer = TextIteratorStreamer(tokenizer=tokenizer)
+        generation_kwargs = {
+            'input_ids': input_ids,
+            'image_embeds': image_embeds,
+            'media_offset': media_offset,
+            'pad_token_id': 0,
+            'eos_token_id': terminators,
+            'streamer': streamer
+        }
+        generation_kwargs.update(kwargs)
+        thread = Thread(target=self.language_model.generate, kwargs=generation_kwargs)
+        thread.start()
+        return streamer
+    def _decode_text(self, result_ids, tokenizer):
+        terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
+        result_text = []
+        for result in result_ids:
+            result = result[result != 0]
+            if result[-1] in terminators:
+                result = result[:-1]
+            result_text.append(tokenizer.decode(result).strip())
+        return result_text
+    def init_processor(self, tokenizer):
+        ip = mPLUGOwl3ImageProcessor(image_size=384)
+        self.processor = mPLUGOwl3Processor(image_processor=ip, tokenizer=tokenizer)
+        processor = self.processor
+        return processor
+    def generate(
+        self,
+        input_ids=None,
+        pixel_values=None,
+        media_offset=None,
+        attention_mask=None,
+        tokenizer=None,
+        stream=False,
+        decode_text=False,
+        **kwargs
+    ):
+        assert input_ids is not None
+        with torch.inference_mode():
+            image_embeds = self.forward_image(pixel_values)
+            if stream:
+                result = self._decode_stream(input_ids=input_ids, image_embeds=image_embeds, media_offset=media_offset, tokenizer=tokenizer, **kwargs)
+            else:
+                result = self._decode(input_ids=input_ids, image_embeds=image_embeds, media_offset=media_offset, tokenizer=tokenizer, attention_mask=attention_mask, decode_text=decode_text, **kwargs)
+        return result
+    def chat(
+        self,
+        images,
+        videos,
+        messages,
+        tokenizer,
+        processor=None,
+        max_new_tokens=2048,
+        min_new_tokens=0,
+        sampling=True,
+        max_inp_length=8192,
+        system_prompt='',
+        stream=False,
+        max_slice_nums=None,
+        use_image_id=None,
+        **kwargs
+    ):
+        cut_flag = kwargs.get('kwargs', True)
+        if processor is None:
+            if self.processor is None:
+                processor = self.init_processor(tokenizer)
+            else:
+                processor = self.processor
+        inputs = processor(messages, images=images, videos=videos, cut_enable=cut_flag)
+        inputs.to('cuda')
+        inputs.update({
+            'tokenizer': tokenizer,
+            'max_new_tokens': max_new_tokens,
+            # 'stream':True,
+        })
+        if sampling:
+            generation_config = {
+                "top_p": 0.8,
+                "top_k": 100,
+                "temperature": 0.7,
+                "do_sample": True,
+                # "repetition_penalty": 1.05
+            }
+        else:
+            generation_config = {
+                "num_beams": 3,
+                # "repetition_penalty": 1.2,
+            }
+        if min_new_tokens > 0:
+            generation_config['min_new_tokens'] = min_new_tokens
+        generation_config.update(
+            (k, kwargs[k]) for k in generation_config.keys() & kwargs.keys()
+        )
+        with torch.inference_mode():
+            res = self.generate(
+                **inputs,
+                stream=stream,
+                decode_text=True,
+                **generation_config
+            )
+        if stream:
+            def stream_gen():
+                for text in res:
+                    for term in self.terminators:
+                        text = text.replace(term, '')
+                    yield text
+            return stream_gen()
+        else:
+            answer = res[0]
+            return answer

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,119 @@

+{
+  "add_global": true,
+  "anchor_max": 4,
+  "anchors": [
+    [
+      2,
+      2
+    ],
+    [
+      1,
+      3
+    ],
+    [
+      1,
+      4
+    ],
+    [
+      3,
+      1
+    ],
+    [
+      4,
+      1
+    ],
+    [
+      2,
+      3
+    ],
+    [
+      3,
+      2
+    ]
+  ],
+  "cut_enable": true,
+  "cut_prob": 1.0,
+  "force_shape_cut": false,
+  "force_shape_cut_anchors": [
+    [
+      "f"
+    ],
+    [
+      "o"
+    ],
+    [
+      "r"
+    ],
+    [
+      "c"
+    ],
+    [
+      "e"
+    ],
+    [
+      "_"
+    ],
+    [
+      "s"
+    ],
+    [
+      "h"
+    ],
+    [
+      "a"
+    ],
+    [
+      "p"
+    ],
+    [
+      "e"
+    ],
+    [
+      "_"
+    ],
+    [
+      "c"
+    ],
+    [
+      "u"
+    ],
+    [
+      "t"
+    ],
+    [
+      "_"
+    ],
+    [
+      "a"
+    ],
+    [
+      "n"
+    ],
+    [
+      "c"
+    ],
+    [
+      "h"
+    ],
+    [
+      "o"
+    ],
+    [
+      "r"
+    ],
+    [
+      "s"
+    ]
+  ],
+  "force_shape_cut_anchors_max": "u",
+  "image_processor_type": "mPLUGOwl3ImageProcessor",
+  "image_size": [
+    384,
+    384
+  ],
+  "media_tokens": [
+    "<|image|>",
+    "<|video|>"
+  ],
+  "processor_class": "mPLUGOwl3Processor"
+}

processing_mplugowl3.py ADDED Viewed

	@@ -0,0 +1,396 @@

+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for mPLUGOwl3.
+"""
+from typing import List, Optional, Union, Dict, Any
+import warnings
+import torch
+import re
+from transformers.image_processing_utils import BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from transformers.utils import TensorType, requires_backends, is_torch_dtype, is_torch_device
+from .image_processing_mplugowl3 import mPLUGOwl3BatchFeature, mPLUGOwl3ImageProcessor
+OWL_MEDIA_TOKEN=['<|image|>']
+class MediaIndicesHelper():
+    def __init__(self, tokenizer) -> None:
+        self.media_position = []
+        self.tokenizer = tokenizer
+    def has_media(self, text, media_tokens=None):
+        if media_tokens is None:
+            media_tokens = OWL_MEDIA_TOKEN
+        has_media_flag = any([media_token == text for media_token in media_tokens])
+        if any([media_token in text for media_token in media_tokens]):
+            # 不允许出现text中包含media token但是不仅仅是media token。 media token必须单独为一个chunk
+            assert has_media_flag, text
+        return has_media_flag
+    def add_media(self, text_chunk, text=None, tokenize_fn=None):
+        # cross
+        assert tokenize_fn is not None
+        assert text is not None
+        assert text in OWL_MEDIA_TOKEN
+        media_token_ids = tokenize_fn(text)
+        start = len(text_chunk)
+        end = start + len(media_token_ids)
+        self.media_position.append([start, end])
+        text_chunk.extend(media_token_ids)
+        return len(media_token_ids)
+    def cal_media_offset(self, input_ids):
+        if len(self.media_position) == 0:
+            return torch.ones_like(input_ids)*(-1000000)
+        media_starts = torch.tensor([_[0] for _ in self.media_position]).reshape(1,-1)
+        rng = torch.arange(input_ids.shape[0]).reshape(-1,1)
+        matrix = (rng > media_starts).sum(dim=1)
+        return matrix
+    def len_images(self,):
+        return len(self.media_position)
+class mPLUGOwl3Processor(ProcessorMixin):
+    r"""
+    Args:
+        image_processor ([`mPLUGOwl3ImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerWrapper`], *optional*):
+            The tokenizer is a required input.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(self, image_processor: mPLUGOwl3ImageProcessor = None, tokenizer=None, prompt_style='chatml', inference_mode=True, addition_eod="<|endoftext|>"):
+        super().__init__(image_processor, tokenizer)
+        self.image_processor: mPLUGOwl3ImageProcessor
+        self.prompt_style = prompt_style
+        self.inference_mode = inference_mode
+        self.media_tokens = ["<|image|>"]
+        self.addition_eod = addition_eod
+    def build_text_qwen(self, messages):
+        # role should be within ['system', 'user', 'assistant']
+        im_start, im_end = '<|im_start|>', '<|im_end|>'
+        text = []
+        for num_turn, message in enumerate(messages):
+            if num_turn == 0 and message['role'] != 'system':
+                if self.prompt_style != 'plain':
+                    text.append({
+                        "text": f"{im_start}system\n{im_end}",
+                        "label": 0
+                    })
+            if message['role'] == 'system':
+                if self.prompt_style != 'plain':
+                    text.append({
+                        "text": f"{im_start}system\n{message['content']}{im_end}",
+                        "label": 0
+                    })
+            elif message['role'] == 'user':
+                if self.prompt_style != 'plain':
+                    content = f"\n{im_start}user\n{message['content']}{im_end}"
+                else:
+                    content = message['content']
+                pattern = '|'.join(map(re.escape, self.media_tokens))
+                chunk_strs = re.split(f'({pattern})', content)
+                for chunk_str in chunk_strs:
+                    text.append({
+                        "text": chunk_str,
+                        "label": 0
+                    })
+            elif message['role'] == 'assistant':
+                if self.prompt_style != 'plain':
+                    text.append({"text": f"\n{im_start}assistant\n", "label": 0})
+                    text.append({"text": f"{message['content']}{im_end}", "label": 1})
+                else:
+                    text.append({"text": f"{message['content']}", "label": 1})
+                text.append({"text": self.addition_eod, "label": 1})
+            else:
+                raise NotImplementedError
+        if self.inference_mode:
+            while text and text[-1]['label']==1:  # 只要列表非空且最后一个元素满足条件
+                text.pop()  # 就移除最后一个元素
+        return text
+    def wrapped_tokenize(self, text):
+        return self.tokenizer(text).input_ids
+    def encode_text_sft(self, texts):
+        # output enc_chunk
+        enc_chunk = []
+        label_chunk = []
+        enc_length = 0
+        num_images = 0
+        media_helper = MediaIndicesHelper(tokenizer=self.tokenizer)
+        for current_ti, text_chunk in enumerate(texts):
+            text = text_chunk["text"]
+            label = text_chunk["label"]
+            if not media_helper.has_media(text):
+                curr_chunk=self.wrapped_tokenize(text)
+                if label == 1:
+                    enc_length += len(curr_chunk)
+                    enc_chunk += curr_chunk
+                    label_chunk += [label] * len(curr_chunk)
+                else:
+                    enc_length += len(curr_chunk)
+                    enc_chunk += curr_chunk
+                    label_chunk += [label] * len(curr_chunk)
+            # For media tokens
+            else:
+                add_length = media_helper.add_media(
+                    enc_chunk,
+                    text=text,
+                    tokenize_fn=self.wrapped_tokenize)
+                enc_length += add_length
+                label_chunk += [label] * add_length
+                # enc_chunk.extend([self.media_tokens[text]] * self.media_lengths[text])
+                # enc_length += self.media_lengths[text]
+                # label_chunk += [label] * self.media_lengths[text]
+                num_images += 1
+        enc_chunk = torch.tensor(enc_chunk).long()
+        media_offset = []
+        media_before = 0
+        for i,_ in enumerate([media_helper]):
+            mo = _.cal_media_offset(enc_chunk)
+            media_offset.append(torch.cat([(torch.ones(mo.shape[0],1)*media_before).long().to(mo.device), (mo+media_before).unsqueeze(1)], dim=1)) # L 2
+            media_before += _.len_images()
+        media_offset = torch.stack(media_offset, dim=0)
+        return {
+            'input_ids': enc_chunk.unsqueeze(0),
+            'media_offset': media_offset,
+        }
+    def __call__(
+        self,
+        messages,
+        images = None,
+        videos = None,
+        max_length: Optional[int] = None,
+        cut_enable=True,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+        **kwargs
+    ) -> mPLUGOwl3BatchFeature:
+        medias = []
+        if videos is not None:
+            medias.extend([{'type': 'video', 'content': video, 'use_video_span': True} for video in videos])
+        if images is not None:
+            medias.extend([{'type':'image', 'content': image}  for image in images])
+        if len(medias):
+            image_tensor_list = []
+            pattern = r"(<\|image\|>|<\|video\|>)"
+            # 存在媒体
+            image_token_ptr = 0
+            media_layout = []
+            for message in messages:
+                text_list = re.split(pattern, message['content'])
+                text = ''
+                for text_content in text_list:
+                    if text_content in ['<|image|>', '<|video|>']:
+                        media_item = medias[image_token_ptr]
+                        image_token_ptr += 1
+                        if text_content == '<|image|>':
+                            assert media_item['type'] == 'image'
+                            image = media_item['content']
+                            image_inputs = self.image_processor([image], cut_enable=cut_enable, return_tensors=return_tensors)
+                            if image_inputs.get('cut_shape',None) is not None:
+                                cut_shape = image_inputs['cut_shape']
+                                cut_text = self.image_processor.cut_prompt_template(img_token='<|image|>', h=cut_shape[0][0], w=cut_shape[0][1])
+                                text += cut_text
+                                image_tensor_list.append(image_inputs['pixel_values'])
+                            else:
+                                text += text_content
+                        elif text_content == '<|video|>':
+                            assert media_item['type'] == 'video'
+                            video = media_item['content']
+                            use_video_span = media_item['use_video_span']
+                            image_tensor = self.image_processor(video, cut_enable=False)['pixel_values']
+                            image_tensor_list.append(image_tensor)
+                            num_video_frame = image_tensor.shape[0]
+                            if use_video_span:
+                                text_content = '<|start_video_frame|>'+'<|image|>'*num_video_frame+'<|end_video_frame|>'
+                            else:
+                                text_content = '<|image|>'*num_video_frame
+                            text += text_content
+                    else:
+                        text += text_content
+                message['content'] = text
+            assert image_token_ptr == len(medias), (image_token_ptr,len(medias)) # 保证图和token数目一致
+            assert all(len(_.shape) == 4 for _ in image_tensor_list), [_.shape for _ in image_tensor_list]
+            num_image_tokens = sum([_['content'].count('<|image|>')for _ in messages])
+            num_image_shapes = sum([_.shape[0] for _ in image_tensor_list])
+            assert num_image_tokens == num_image_shapes, (messages, [_.shape for _ in image_tensor_list])
+        image_tensor_list = torch.cat(image_tensor_list, dim=0)
+        # text = ''.join([_['text'] for _ in text])
+        text = self.build_text_qwen(messages)
+        model_inputs = self.encode_text_sft(text)
+        if len(medias) is not None:
+            model_inputs.update({'pixel_values': image_tensor_list})
+            # if 'cut_shape' in model_inputs:
+            #     model_inputs.pop('cut_shape')
+            # if 'cut_shape_indices' in model_inputs:
+            #     model_inputs.pop('cut_shape_indices')
+        return mPLUGOwl3BatchFeature(model_inputs)
+    def check_media(self, images, messages):
+        media_num = 0 if images is None else len(images)
+        media_count = sum([message['content'].count('<|image|>') for message in messages])
+        assert media_num == media_count
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        output_ids = args[0]
+        result_text = []
+        for result in output_ids:
+            result = result[result != 0]
+            if result[0] == self.tokenizer.bos_id:
+                result = result[1:]
+            if result[-1] == self.tokenizer.eos_id:
+                result = result[:-1]
+            result_text.append(self.tokenizer.decode(result, *args[1:], **kwargs).strip())
+        return result_text
+        # return self.tokenizer.batch_decode(*args, **kwargs)
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        result = args[0]
+        result = result[result != 0]
+        if result[0] == self.tokenizer.bos_id:
+            result = result[1:]
+        if result[-1] == self.tokenizer.eos_id or (hasattr(self.tokenizer, "eot_id") and result[-1] == self.tokenizer.eot_id):
+            result = result[:-1]
+        return self.tokenizer.decode(result, *args[1:], **kwargs).strip()
+    def _convert(
+        self, input_str, max_inp_length: Optional[int] = None
+    ):
+        if self.version > 2.5 or not getattr(self.tokenizer, "add_bos_token", False):
+            input_ids = self.tokenizer.encode(input_str)
+        else:
+            input_ids = [self.tokenizer.bos_id] + self.tokenizer.encode(input_str)
+        if max_inp_length is not None:
+            input_ids = input_ids[:max_inp_length]
+        input_ids = torch.tensor(input_ids, dtype=torch.int32)
+        start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == self.tokenizer.slice_start_id)
+        end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == self.tokenizer.slice_end_id)
+        image_start_tokens = torch.where(start_cond)[0]
+        image_start_tokens += 1
+        image_end_tokens = torch.where(end_cond)[0]
+        valid_image_nums = max(len(image_start_tokens), len(image_end_tokens))
+        image_bounds = torch.hstack(
+            [
+                image_start_tokens[:valid_image_nums].unsqueeze(-1),
+                image_end_tokens[:valid_image_nums].unsqueeze(-1),
+            ]
+        )
+        return input_ids, image_bounds
+    @property
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+    def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"):
+        items = []
+        if isinstance(inputs[0], list):
+            assert isinstance(inputs[0][0], torch.Tensor)
+            for it in inputs:
+                for tr in it:
+                    items.append(tr)
+        else:
+            assert isinstance(inputs[0], torch.Tensor)
+            items = inputs
+        batch_size = len(items)
+        shape = items[0].shape
+        dim = len(shape)
+        assert dim <= 2
+        if max_length is None:
+            max_length = 0
+        max_length = max(max_length, max(item.shape[-1] for item in items))
+        min_length = min(item.shape[-1] for item in items)
+        dtype = items[0].dtype
+        if dim == 0:
+            return torch.stack([item for item in items], dim=0), [0]
+        elif dim == 1:
+            if max_length == min_length:
+                return torch.stack([item for item in items], dim=0), [0] * batch_size
+            tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
+        else:
+            tensor = (
+                torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype)
+                + padding_value
+            )
+        padding_length = []
+        for i, item in enumerate(items):
+            if dim == 1:
+                if padding_side == "left":
+                    tensor[i, -len(item) :] = item.clone()
+                else:
+                    tensor[i, : len(item)] = item.clone()
+            elif dim == 2:
+                if padding_side == "left":
+                    tensor[i, -len(item) :, :] = item.clone()
+                else:
+                    tensor[i, : len(item), :] = item.clone()
+            padding_length.append(tensor.shape[-1] - len(item))
+        return tensor, padding_length

processor_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "addition_eod": "<|endoftext|>",
+  "inference_mode": true,
+  "processor_class": "mPLUGOwl3Processor",
+  "prompt_style": "chatml"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bcfe42da0a4497e8b2b172c1f9f4ec423a46dc12907f4349c55025f670422ba9
+size 11418266

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "mPLUGOwl3Processor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,202 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6909581646423751,
+  "eval_steps": 20.0,
+  "global_step": 400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.001727395411605938,
+      "grad_norm": 2.296875,
+      "learning_rate": 4.6399657310349495e-05,
+      "loss": 1.484876036643982,
+      "step": 1,
+      "token_acc": 0.6462335484450566
+    },
+    {
+      "epoch": 0.03454790823211876,
+      "grad_norm": 1.90625,
+      "learning_rate": 4.626305873397369e-05,
+      "loss": 1.7690341347142269,
+      "step": 20,
+      "token_acc": 0.6112596155574777
+    },
+    {
+      "epoch": 0.06909581646423751,
+      "grad_norm": 1.4296875,
+      "learning_rate": 4.5853851566096545e-05,
+      "loss": 1.4788523674011231,
+      "step": 40,
+      "token_acc": 0.6483243713876449
+    },
+    {
+      "epoch": 0.10364372469635627,
+      "grad_norm": 1.4140625,
+      "learning_rate": 4.5177209302199275e-05,
+      "loss": 1.439206314086914,
+      "step": 60,
+      "token_acc": 0.6553296455190211
+    },
+    {
+      "epoch": 0.13819163292847503,
+      "grad_norm": 1.515625,
+      "learning_rate": 4.4241119894718515e-05,
+      "loss": 1.4094647407531737,
+      "step": 80,
+      "token_acc": 0.6602077268441662
+    },
+    {
+      "epoch": 0.17273954116059378,
+      "grad_norm": 2.421875,
+      "learning_rate": 4.305663414266929e-05,
+      "loss": 1.403604221343994,
+      "step": 100,
+      "token_acc": 0.6611030317769436
+    },
+    {
+      "epoch": 0.20728744939271254,
+      "grad_norm": 1.3203125,
+      "learning_rate": 4.1637735233851226e-05,
+      "loss": 1.441183090209961,
+      "step": 120,
+      "token_acc": 0.6547628498758228
+    },
+    {
+      "epoch": 0.2418353576248313,
+      "grad_norm": 1.4453125,
+      "learning_rate": 4.000117366937959e-05,
+      "loss": 1.4156587600708008,
+      "step": 140,
+      "token_acc": 0.6589790481773043
+    },
+    {
+      "epoch": 0.27638326585695006,
+      "grad_norm": 1.6484375,
+      "learning_rate": 3.816626951930339e-05,
+      "loss": 1.4070409774780273,
+      "step": 160,
+      "token_acc": 0.6601143735897974
+    },
+    {
+      "epoch": 0.31093117408906884,
+      "grad_norm": 1.609375,
+      "learning_rate": 3.6154684343738876e-05,
+      "loss": 1.391796875,
+      "step": 180,
+      "token_acc": 0.6627911109115973
+    },
+    {
+      "epoch": 0.34547908232118757,
+      "grad_norm": 1.3984375,
+      "learning_rate": 3.399016547205342e-05,
+      "loss": 1.371710205078125,
+      "step": 200,
+      "token_acc": 0.6663792943443839
+    },
+    {
+      "epoch": 0.38002699055330635,
+      "grad_norm": 1.7578125,
+      "learning_rate": 3.169826565895621e-05,
+      "loss": 1.3606269836425782,
+      "step": 220,
+      "token_acc": 0.6679015259153485
+    },
+    {
+      "epoch": 0.4145748987854251,
+      "grad_norm": 1.265625,
+      "learning_rate": 2.9306041427034397e-05,
+      "loss": 1.3541061401367187,
+      "step": 240,
+      "token_acc": 0.6687709899517441
+    },
+    {
+      "epoch": 0.44912280701754387,
+      "grad_norm": 1.46875,
+      "learning_rate": 4.626305873397369e-05,
+      "loss": 1.465986156463623,
+      "step": 260,
+      "token_acc": 0.6466508267300063
+    },
+    {
+      "epoch": 0.4836707152496626,
+      "grad_norm": 1.6875,
+      "learning_rate": 4.5853851566096545e-05,
+      "loss": 1.3503923416137695,
+      "step": 280,
+      "token_acc": 0.6691855650431532
+    },
+    {
+      "epoch": 0.5182186234817814,
+      "grad_norm": 1.3203125,
+      "learning_rate": 4.5177209302199275e-05,
+      "loss": 1.334010124206543,
+      "step": 300,
+      "token_acc": 0.6722883507781587
+    },
+    {
+      "epoch": 0.5527665317139001,
+      "grad_norm": 1.578125,
+      "learning_rate": 4.4241119894718515e-05,
+      "loss": 1.3245996475219726,
+      "step": 320,
+      "token_acc": 0.6737743942234781
+    },
+    {
+      "epoch": 0.5873144399460188,
+      "grad_norm": 1.46875,
+      "learning_rate": 4.305663414266929e-05,
+      "loss": 1.310002040863037,
+      "step": 340,
+      "token_acc": 0.6765112470534054
+    },
+    {
+      "epoch": 0.6218623481781377,
+      "grad_norm": 1.375,
+      "learning_rate": 4.1637735233851226e-05,
+      "loss": 1.299285888671875,
+      "step": 360,
+      "token_acc": 0.6787883284600116
+    },
+    {
+      "epoch": 0.6564102564102564,
+      "grad_norm": 1.421875,
+      "learning_rate": 4.000117366937959e-05,
+      "loss": 1.3011038780212403,
+      "step": 380,
+      "token_acc": 0.6782609286738766
+    },
+    {
+      "epoch": 0.6909581646423751,
+      "grad_norm": 1.703125,
+      "learning_rate": 3.816626951930339e-05,
+      "loss": 1.2823470115661622,
+      "step": 400,
+      "token_acc": 0.6813430169989177
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 578,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 20,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.7796747815367475e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3b3b1c8f4ed1b99416a2d6f3b33f7ee964dc29939329047f06cfcf1829490b0c
+size 6673

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

x_sdpa.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from torch import nn
+from icecream import ic
+from einops import rearrange
+class ScaleDotProductAttention(nn.Module):
+    def __init__(self, layer_number, causal=False, softmax_scale=None, attention_dropout=0.0):
+        super().__init__()
+        self.layer_number = layer_number
+        self.causal = causal
+        self.softmax_scale = softmax_scale
+        self.dropout_p = attention_dropout
+        # Qwen 不需要scale
+    def forward(self, q, k, v, attn_mask=None, order='sbhd'):
+        """Implements the multihead softmax attention.
+        Arguments
+        ---------
+            q, k, v: The tensor containing the query, key, and value. (B, S, H, D)
+        """
+        # (N,...,L,E)
+        import torch
+        import torch.nn as nn
+        import torch.nn.functional as F
+        if order == 'sbhd':
+            q, k, v = [rearrange(x, 's b h d -> b h s d').contiguous()
+                       for x in (q, k, v)]
+        elif order == 'bhsd':
+            pass
+        if attn_mask is not None:
+            attn_mask = (~attn_mask.clone().bool()).contiguous()
+        else:
+            attn_mask = None
+        # attention mask, True means it will take part in attention B H s_q s_k
+        if self.training:
+            # during training q,k,v always have same seqlen
+            if self.causal:
+                assert q.shape[-2] == k.shape[-2]
+            is_causal = self.causal
+            dropout_p = self.dropout_p
+        else:
+            # turn off FA causal mask after first inference autoregressive iteration
+            # only on first autoregressive step q,k,v have same seqlen
+            if self.causal:
+                is_causal = q.shape[-2] == k.shape[-2]
+            else:
+                is_causal = self.causal
+            dropout_p = 0.0
+        # 如果is_causal则无视输入的mask 反之会使用输入的mask
+        o = F.scaled_dot_product_attention(q, k, v,
+            attn_mask=attn_mask,
+            dropout_p=dropout_p,
+            is_causal=is_causal,
+            scale=self.softmax_scale
+            )
+        # B Head L D -> L B (Head D)
+        o = rearrange(o, 'B Head L D -> L B (Head D)').contiguous()
+        return o