FYYDCC committed on
Commit
aeb0e5a
·
verified ·
1 Parent(s): 765f2c5

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +94 -0
README.md CHANGED
@@ -31,4 +31,98 @@ chameleon_m3cot_path = hf_hub_download("ModalityDance/IVTLR_CHAMELEON_M3COT", "m
31
 
32
  # Download Chameleon model trained on ScienceQA
33
  chameleon_sqa_path = hf_hub_download("ModalityDance/IVTLR_CHAMELEON_SQA", "model.pth")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  ```
 
31
 
32
  # Download Chameleon model trained on ScienceQA
33
  chameleon_sqa_path = hf_hub_download("ModalityDance/IVTLR_CHAMELEON_SQA", "model.pth")
34
+ ```
35
+
36
+
37
+ ---
38
+
39
+ ### Quick Start
40
+
41
+ The following code shows how to load the pretrained IVT-LR model and run inference on a single image-text example. Replace `image` and `text` with your own input.
42
+
43
+ ```python
44
+ from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
45
+ from chameleon_ivtlr import IVTLR
46
+ from peft import LoraConfig, get_peft_model
47
+ from huggingface_hub import hf_hub_download
48
+ import torch
49
+
50
+ device = "cuda" if torch.cuda.is_available() else "cpu"
51
+
52
+ # Download model
53
+ checkpoint_path = hf_hub_download("ModalityDance/IVTLR_CHAMELEON_M3COT", "model.pth")
54
+
55
+ # Load processor and tokenizer
56
+ processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
57
+ tokenizer = processor.tokenizer
58
+ tokenizer.padding_side = "right"
59
+ tokenizer.pad_token = tokenizer.eos_token
60
+ tokenizer.add_special_tokens({
61
+ "additional_special_tokens": ["<|start-latent|>", "<|end-latent|>", "<|latent|>"]
62
+ })
63
+
64
+ # Load base model with LoRA
65
+ base_model = ChameleonForConditionalGeneration.from_pretrained(
66
+ "facebook/chameleon-7b",
67
+ device_map="cuda",
68
+ torch_dtype=torch.bfloat16,
69
+ trust_remote_code=True,
70
+ attn_implementation="eager"
71
+ )
72
+ base_model.resize_token_embeddings(len(tokenizer))
73
+ processor.tokenizer = tokenizer
74
+
75
+ lora_config = LoraConfig(
76
+ task_type="CAUSAL_LM",
77
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
78
+ r=64, lora_alpha=16, lora_dropout=0.05, bias="none", inference_mode=False
79
+ )
80
+ base_model = get_peft_model(base_model, lora_config)
81
+
82
+ # Create IVTLR model
83
+ latent_id = tokenizer.convert_tokens_to_ids("<|latent|>")
84
+ start_id = tokenizer.convert_tokens_to_ids("<|start-latent|>")
85
+ end_id = tokenizer.convert_tokens_to_ids("<|end-latent|>")
86
+ image_token_id = tokenizer.convert_tokens_to_ids(processor.image_token)
87
+
88
+ model = IVTLR(
89
+ base_model,
90
+ latent_token_id=latent_id,
91
+ start_latent_id=start_id,
92
+ end_latent_id=end_id,
93
+ eos_token_id=tokenizer.eos_token_id,
94
+ image_token_id=image_token_id
95
+ )
96
+
97
+ # Load checkpoint
98
+ state_dict = torch.load(checkpoint_path, map_location="cpu")
99
+ if any(k.startswith("module.") for k in state_dict.keys()):
100
+ state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
101
+ model.load_state_dict(state_dict, strict=True)
102
+ model = model.to(device)
103
+ model.eval()
104
+
105
+ # ============ Inference ============
106
+ # Replace with your own image and text
107
+ image = "your_image.jpg" # PIL Image or path to image
108
+ text = "Your question here"
109
+
110
+ prompt = f"<image>{text}<|latent|><|latent|><|latent|>"
111
+
112
+ inputs = processor(
113
+ images=image,
114
+ text=prompt,
115
+ return_tensors="pt"
116
+ ).to(device)
117
+
118
+ with torch.no_grad():
119
+ outputs = model.generate(
120
+ input_ids=inputs["input_ids"],
121
+ attention_mask=inputs["attention_mask"],
122
+ pixel_values=inputs["pixel_values"],
123
+ max_new_tokens=512
124
+ )
125
+
126
+ response = processor.decode(outputs[0], skip_special_tokens=True)
127
+ print(response)
128
  ```