FYYDCC committed on
Commit
ef286b5
·
verified ·
1 Parent(s): 796c3ac

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +115 -0
README.md CHANGED
@@ -31,4 +31,119 @@ qwen_m3cot_path = hf_hub_download("ModalityDance/IVTLR_QWEN_M3COT", "model.pth")
31
 
32
  # Download Qwen2-VL model trained on ScienceQA
33
  qwen_sqa_path = hf_hub_download("ModalityDance/IVTLR_QWEN_SQA", "model.pth")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  ```
 
31
 
32
  # Download Qwen2-VL model trained on ScienceQA
33
  qwen_sqa_path = hf_hub_download("ModalityDance/IVTLR_QWEN_SQA", "model.pth")
34
+ ```
35
+
36
+ ---
37
+
38
+ ### Quick Start
39
+
40
+ The following code shows how to load the pretrained IVT-LR model and run inference on a single image-text example. Replace `image` and `text` with your own input.
41
+
42
+
43
+ ```python
44
+ from transformers import AutoTokenizer, AutoProcessor, Qwen2VLForConditionalGeneration
45
+ from qwen_ivtlr import IVTLR
46
+ from qwen_vl_utils import process_vision_info
47
+ from peft import LoraConfig, get_peft_model
48
+ from huggingface_hub import hf_hub_download
49
+ import torch
50
+
51
+ device = "cuda" if torch.cuda.is_available() else "cpu"
52
+
53
+ # Download model
54
+ checkpoint_path = hf_hub_download("ModalityDance/IVTLR_QWEN_M3COT", "model.pth")
55
+
56
+ # Load processor and tokenizer
57
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
58
+ tokenizer = AutoTokenizer.from_pretrained(
59
+ "Qwen/Qwen2-VL-7B-Instruct",
60
+ use_fast=False,
61
+ trust_remote_code=True,
62
+ padding_side="right"
63
+ )
64
+ tokenizer.add_special_tokens({
65
+ "additional_special_tokens": ["<|start-latent|>", "<|end-latent|>", "<|latent|>"]
66
+ })
67
+
68
+ # Load base model with LoRA
69
+ base_model = Qwen2VLForConditionalGeneration.from_pretrained(
70
+ "Qwen/Qwen2-VL-7B-Instruct",
71
+ device_map="cuda",
72
+ torch_dtype=torch.bfloat16,
73
+ trust_remote_code=True,
74
+ attn_implementation="eager"
75
+ )
76
+ base_model.resize_token_embeddings(len(tokenizer))
77
+ processor.tokenizer = tokenizer
78
+
79
+ lora_config = LoraConfig(
80
+ task_type="CAUSAL_LM",
81
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
82
+ r=64, lora_alpha=16, lora_dropout=0.05, bias="none", inference_mode=False
83
+ )
84
+ base_model = get_peft_model(base_model, lora_config)
85
+
86
+ # Create IVTLR model
87
+ latent_id = tokenizer.convert_tokens_to_ids("<|latent|>")
88
+ start_id = tokenizer.convert_tokens_to_ids("<|start-latent|>")
89
+ end_id = tokenizer.convert_tokens_to_ids("<|end-latent|>")
90
+ image_token_id = tokenizer.convert_tokens_to_ids(processor.image_token)
91
+ visual_start_id = tokenizer.convert_tokens_to_ids("<|vision_start|>")
92
+ visual_end_id = tokenizer.convert_tokens_to_ids("<|vision_end|>")
93
+
94
+ model = IVTLR(
95
+ base_model,
96
+ latent_token_id=latent_id,
97
+ start_latent_id=start_id,
98
+ end_latent_id=end_id,
99
+ eos_token_id=tokenizer.eos_token_id,
100
+ image_token_id=image_token_id,
101
+ visual_start_id=visual_start_id,
102
+ visual_end_id=visual_end_id
103
+ )
104
+
105
+ # Load checkpoint
106
+ state_dict = torch.load(checkpoint_path, map_location="cpu")
107
+ if any(k.startswith("module.") for k in state_dict.keys()):
108
+ state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
109
+ model.load_state_dict(state_dict, strict=True)
110
+ model = model.to(device)
111
+ model.eval()
112
+
113
+ # ============ Inference ============
114
+ # Replace with your own image and text
115
+ image = "your_image.jpg" # PIL Image or path to image
116
+ text = "Your question here"
117
+
118
+ messages = [{
119
+ "role": "user",
120
+ "content": [
121
+ {"type": "image", "image": image, "resized_height": 280, "resized_width": 280},
122
+ {"type": "text", "text": text}
123
+ ]
124
+ }]
125
+
126
+ prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
127
+ prompt = prompt + "<|latent|>" * 3 # Add latent tokens
128
+
129
+ image_inputs, video_inputs = process_vision_info(messages)
130
+ inputs = processor(
131
+ text=[prompt],
132
+ images=image_inputs,
133
+ videos=video_inputs,
134
+ padding=True,
135
+ return_tensors="pt"
136
+ ).to(device)
137
+
138
+ with torch.no_grad():
139
+ outputs = model.generate(
140
+ input_ids=inputs["input_ids"],
141
+ attention_mask=inputs["attention_mask"],
142
+ pixel_values=inputs["pixel_values"],
143
+ image_grid_thw=inputs["image_grid_thw"],
144
+ max_new_tokens=512
145
+ )
146
+
147
+ response = processor.decode(outputs[0], skip_special_tokens=True)
148
+ print(response)
149
  ```