Update README.md

```python
# Download Chameleon model trained on M3CoT
chameleon_m3cot_path = hf_hub_download("ModalityDance/IVTLR_CHAMELEON_M3COT", "model.pth")

# Download Chameleon model trained on ScienceQA
chameleon_sqa_path = hf_hub_download("ModalityDance/IVTLR_CHAMELEON_SQA", "model.pth")
```
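
If you would rather keep the checkpoints in a project-local folder than in the Hugging Face cache, `hf_hub_download` also accepts a `local_dir` argument. A minimal sketch; the `checkpoints/...` directory names are arbitrary examples, not paths used by this repo:

```python
from huggingface_hub import hf_hub_download

# Download both checkpoints into a local directory instead of the default HF cache
chameleon_m3cot_path = hf_hub_download(
    "ModalityDance/IVTLR_CHAMELEON_M3COT", "model.pth", local_dir="checkpoints/m3cot"
)
chameleon_sqa_path = hf_hub_download(
    "ModalityDance/IVTLR_CHAMELEON_SQA", "model.pth", local_dir="checkpoints/sqa"
)
```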

---

### Quick Start

The following code shows how to load the pretrained IVT-LR model and run inference on a single image-text example. Replace `image` and `text` with your own input.

```python
from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
from chameleon_ivtlr import IVTLR
from peft import LoraConfig, get_peft_model
from huggingface_hub import hf_hub_download
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Download the IVT-LR checkpoint trained on M3CoT
checkpoint_path = hf_hub_download("ModalityDance/IVTLR_CHAMELEON_M3COT", "model.pth")

# Load the processor/tokenizer and register the latent-reasoning special tokens
processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
tokenizer = processor.tokenizer
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({
    "additional_special_tokens": ["<|start-latent|>", "<|end-latent|>", "<|latent|>"]
})

# Load the base model and wrap it with LoRA adapters
base_model = ChameleonForConditionalGeneration.from_pretrained(
    "facebook/chameleon-7b",
    device_map=device,  # follows the cuda/cpu fallback chosen above
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    attn_implementation="eager"
)
base_model.resize_token_embeddings(len(tokenizer))  # account for the added special tokens
processor.tokenizer = tokenizer

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    r=64, lora_alpha=16, lora_dropout=0.05, bias="none", inference_mode=False
)
base_model = get_peft_model(base_model, lora_config)

# Create the IVTLR model around the LoRA-wrapped base model
latent_id = tokenizer.convert_tokens_to_ids("<|latent|>")
start_id = tokenizer.convert_tokens_to_ids("<|start-latent|>")
end_id = tokenizer.convert_tokens_to_ids("<|end-latent|>")
image_token_id = tokenizer.convert_tokens_to_ids(processor.image_token)

model = IVTLR(
    base_model,
    latent_token_id=latent_id,
    start_latent_id=start_id,
    end_latent_id=end_id,
    eos_token_id=tokenizer.eos_token_id,
    image_token_id=image_token_id
)

# Load the checkpoint, stripping the "module." prefix left by DataParallel/DDP if present
state_dict = torch.load(checkpoint_path, map_location="cpu")
if any(k.startswith("module.") for k in state_dict.keys()):
    state_dict = {k.replace("module.", "", 1): v for k, v in state_dict.items()}
model.load_state_dict(state_dict, strict=True)
model = model.to(device)
model.eval()

# ============ Inference ============
# Replace with your own image and text
image = "your_image.jpg"  # PIL Image or path to an image file
text = "Your question here"

# The trailing <|latent|> placeholders reserve slots for the latent reasoning steps
prompt = f"<image>{text}<|latent|><|latent|><|latent|>"

inputs = processor(
    images=image,
    text=prompt,
    return_tensors="pt"
).to(device)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=512
    )

response = processor.decode(outputs[0], skip_special_tokens=True)
print(response)
```
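
If you plan to run several queries, it can help to wrap the inference steps above in a small convenience function. A minimal sketch that reuses the `processor`, `model`, and `device` objects defined in the Quick Start; the function name `answer` and the example inputs are ours, not part of the repo:

```python
from PIL import Image

def answer(image, question, max_new_tokens=512):
    # Same prompt format as the Quick Start: image placeholder, question,
    # then three <|latent|> slots for latent reasoning
    prompt = f"<image>{question}<|latent|><|latent|><|latent|>"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=max_new_tokens,
        )
    return processor.decode(outputs[0], skip_special_tokens=True)

# Hypothetical usage; substitute your own image file and question
print(answer(Image.open("your_image.jpg"), "Your question here"))
```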