FYYDCC committed on
Commit
ef286b5
·
verified ·
1 Parent(s): 796c3ac

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +115 -0
README.md CHANGED
@@ -31,4 +31,119 @@ qwen_m3cot_path = hf_hub_download("ModalityDance/IVTLR_QWEN_M3COT", "model.pth")
31
 
32
  # Download Qwen2-VL model trained on ScienceQA
33
  qwen_sqa_path = hf_hub_download("ModalityDance/IVTLR_QWEN_SQA", "model.pth")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  ```
 
31
 
32
  # Download Qwen2-VL model trained on ScienceQA
33
  qwen_sqa_path = hf_hub_download("ModalityDance/IVTLR_QWEN_SQA", "model.pth")
34
+ ```
35
+
36
+ ---
37
+
38
+ ### Quick Start
39
+
40
+ The following code shows how to load the pretrained IVT-LR model and run inference on a single image-text example. Replace `image` and `text` with your own input.
41
+
42
+
43
+ ```python
44
+ from transformers import AutoTokenizer, AutoProcessor, Qwen2VLForConditionalGeneration
45
+ from qwen_ivtlr import IVTLR
46
+ from qwen_vl_utils import process_vision_info
47
+ from peft import LoraConfig, get_peft_model
48
+ from huggingface_hub import hf_hub_download
49
+ import torch
50
+
51
+ device = "cuda" if torch.cuda.is_available() else "cpu"
52
+
53
+ # Download model
54
+ checkpoint_path = hf_hub_download("ModalityDance/IVTLR_QWEN_M3COT", "model.pth")
55
+
56
+ # Load processor and tokenizer
57
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
58
+ tokenizer = AutoTokenizer.from_pretrained(
59
+ "Qwen/Qwen2-VL-7B-Instruct",
60
+ use_fast=False,
61
+ trust_remote_code=True,
62
+ padding_side="right"
63
+ )
64
+ tokenizer.add_special_tokens({
65
+ "additional_special_tokens": ["<|start-latent|>", "<|end-latent|>", "<|latent|>"]
66
+ })
67
+
68
+ # Load base model with LoRA
69
+ base_model = Qwen2VLForConditionalGeneration.from_pretrained(
70
+ "Qwen/Qwen2-VL-7B-Instruct",
71
+ device_map="cuda",
72
+ torch_dtype=torch.bfloat16,
73
+ trust_remote_code=True,
74
+ attn_implementation="eager"
75
+ )
76
+ base_model.resize_token_embeddings(len(tokenizer))
77
+ processor.tokenizer = tokenizer
78
+
79
+ lora_config = LoraConfig(
80
+ task_type="CAUSAL_LM",
81
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
82
+ r=64, lora_alpha=16, lora_dropout=0.05, bias="none", inference_mode=False
83
+ )
84
+ base_model = get_peft_model(base_model, lora_config)
85
+
86
+ # Create IVTLR model
87
+ latent_id = tokenizer.convert_tokens_to_ids("<|latent|>")
88
+ start_id = tokenizer.convert_tokens_to_ids("<|start-latent|>")
89
+ end_id = tokenizer.convert_tokens_to_ids("<|end-latent|>")
90
+ image_token_id = tokenizer.convert_tokens_to_ids(processor.image_token)
91
+ visual_start_id = tokenizer.convert_tokens_to_ids("<|vision_start|>")
92
+ visual_end_id = tokenizer.convert_tokens_to_ids("<|vision_end|>")
93
+
94
+ model = IVTLR(
95
+ base_model,
96
+ latent_token_id=latent_id,
97
+ start_latent_id=start_id,
98
+ end_latent_id=end_id,
99
+ eos_token_id=tokenizer.eos_token_id,
100
+ image_token_id=image_token_id,
101
+ visual_start_id=visual_start_id,
102
+ visual_end_id=visual_end_id
103
+ )
104
+
105
+ # Load checkpoint
106
+ state_dict = torch.load(checkpoint_path, map_location="cpu")
107
+ if any(k.startswith("module.") for k in state_dict.keys()):
108
+ state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
109
+ model.load_state_dict(state_dict, strict=True)
110
+ model = model.to(device)
111
+ model.eval()
112
+
113
+ # ============ Inference ============
114
+ # Replace with your own image and text
115
+ image = "your_image.jpg" # PIL Image or path to image
116
+ text = "Your question here"
117
+
118
+ messages = [{
119
+ "role": "user",
120
+ "content": [
121
+ {"type": "image", "image": image, "resized_height": 280, "resized_width": 280},
122
+ {"type": "text", "text": text}
123
+ ]
124
+ }]
125
+
126
+ prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
127
+ prompt = prompt + "<|latent|>" * 3 # Add latent tokens
128
+
129
+ image_inputs, video_inputs = process_vision_info(messages)
130
+ inputs = processor(
131
+ text=[prompt],
132
+ images=image_inputs,
133
+ videos=video_inputs,
134
+ padding=True,
135
+ return_tensors="pt"
136
+ ).to(device)
137
+
138
+ with torch.no_grad():
139
+ outputs = model.generate(
140
+ input_ids=inputs["input_ids"],
141
+ attention_mask=inputs["attention_mask"],
142
+ pixel_values=inputs["pixel_values"],
143
+ image_grid_thw=inputs["image_grid_thw"],
144
+ max_new_tokens=512
145
+ )
146
+
147
+ response = processor.decode(outputs[0], skip_special_tokens=True)
148
+ print(response)
149
  ```