Upload TaiVisionForCausalLM
Browse files
- config.json +6 -1
- model.safetensors +1 -1
- modeling_taivisionlm.py +9 -40
config.json
CHANGED
|
@@ -1,6 +1,10 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
| 2 |
"auto_map": {
|
| 3 |
-
"AutoConfig": "configuration_taivisionlm.TaiVisionLMConfig"
|
|
|
|
| 4 |
},
|
| 5 |
"hidden_size": 2048,
|
| 6 |
"ignore_index": -100,
|
|
@@ -21,6 +25,7 @@
|
|
| 21 |
"torch_dtype": "bfloat16",
|
| 22 |
"vocab_size": 32001
|
| 23 |
},
|
|
|
|
| 24 |
"transformers_version": "4.44.0",
|
| 25 |
"vision_config": {
|
| 26 |
"model_type": "siglip_vision_model",
|
|
|
|
| 1 |
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"TaiVisionForCausalLM"
|
| 4 |
+
],
|
| 5 |
"auto_map": {
|
| 6 |
+
"AutoConfig": "configuration_taivisionlm.TaiVisionLMConfig",
|
| 7 |
+
"AutoModelForCausalLM": "modeling_taivisionlm.TaiVisionForCausalLM"
|
| 8 |
},
|
| 9 |
"hidden_size": 2048,
|
| 10 |
"ignore_index": -100,
|
|
|
|
| 25 |
"torch_dtype": "bfloat16",
|
| 26 |
"vocab_size": 32001
|
| 27 |
},
|
| 28 |
+
"torch_dtype": "float32",
|
| 29 |
"transformers_version": "4.44.0",
|
| 30 |
"vision_config": {
|
| 31 |
"model_type": "siglip_vision_model",
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4806424752
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:11d50e45bc0203fb3be9a06add95e21a024690098db67cd7b97f29ae03c2bb57
|
| 3 |
size 4806424752
|
modeling_taivisionlm.py
CHANGED
|
@@ -156,18 +156,17 @@ class TaiVisionForCausalLM(TaiVisionPreTrainedModel):
|
|
| 156 |
self.language_model = language_model
|
| 157 |
self.post_init()
|
| 158 |
|
| 159 |
-
def
|
| 160 |
-
|
| 161 |
-
load the pretrained weights for language model and vision model
|
| 162 |
-
'''
|
| 163 |
-
import transformers
|
| 164 |
-
language_model = AutoModelForCausalLM.from_pretrained("benchang1110/Taiwan-tinyllama-v1.0-chat")
|
| 165 |
if language_model.vocab_size != self.vocab_size:
|
| 166 |
print("vocab size mismatch, resize the token embeddings for the pretained language model")
|
| 167 |
language_model.resize_token_embeddings(self.vocab_size)
|
| 168 |
-
self.language_model
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
# Copied from transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration.get_input_embeddings with PaliGemma->TaiVisionLM
|
| 173 |
def get_input_embeddings(self):
|
|
@@ -439,34 +438,4 @@ class TaiVisionForCausalLM(TaiVisionPreTrainedModel):
|
|
| 439 |
if cache_position[0] == 0:
|
| 440 |
model_inputs["pixel_values"] = pixel_values
|
| 441 |
|
| 442 |
-
return model_inputs
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
if __name__ == '__main__':
|
| 447 |
-
import transformers
|
| 448 |
-
config = transformers.AutoConfig.from_pretrained("benchang1110/TaiVision-base",trust_remote_code=True)
|
| 449 |
-
model = TaiVisionForCausalLM(config).to("cuda")
|
| 450 |
-
print(model)
|
| 451 |
-
model.save_pretrained
|
| 452 |
-
# Test forward
|
| 453 |
-
import torch
|
| 454 |
-
from PIL import Image
|
| 455 |
-
import requests
|
| 456 |
-
# Initialize processor
|
| 457 |
-
processor = transformers.AutoProcessor.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
|
| 458 |
-
|
| 459 |
-
# Load image
|
| 460 |
-
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
|
| 461 |
-
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
|
| 462 |
-
|
| 463 |
-
# Define prompt and label
|
| 464 |
-
prompt = "What is the color of the car?"
|
| 465 |
-
label = "I am fine, thank you."
|
| 466 |
-
|
| 467 |
-
# Process inputs
|
| 468 |
-
inputs = processor(prompts=prompt,images=image, return_tensors="pt", padding=False, max_length=512).to('cuda')
|
| 469 |
-
|
| 470 |
-
outputs = model.generate(**inputs, max_length=512, do_sample=True, pad_token_id=processor.tokenizer.pad_token_id)
|
| 471 |
-
print(processor.decode(outputs[0], skip_special_tokens=True))
|
| 472 |
-
|
|
|
|
| 156 |
self.language_model = language_model
|
| 157 |
self.post_init()
|
| 158 |
|
| 159 |
+
def load_language_model(self, model_id = "benchang1110/Taiwan-tinyllama-v1.0-chat"):
|
| 160 |
+
language_model = AutoModelForCausalLM.from_pretrained(model_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
if language_model.vocab_size != self.vocab_size:
|
| 162 |
print("vocab size mismatch, resize the token embeddings for the pretained language model")
|
| 163 |
language_model.resize_token_embeddings(self.vocab_size)
|
| 164 |
+
self.language_model.load_state_dict(language_model.state_dict(),strict=True)
|
| 165 |
+
|
| 166 |
+
def load_vision_model(self,model_id = "google/siglip-base-patch16-224"):
|
| 167 |
+
import transformers
|
| 168 |
+
vision_model = transformers.SiglipVisionModel.from_pretrained(model_id)
|
| 169 |
+
self.vision_tower.load_state_dict(vision_model.state_dict(),strict=True)
|
| 170 |
|
| 171 |
# Copied from transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration.get_input_embeddings with PaliGemma->TaiVisionLM
|
| 172 |
def get_input_embeddings(self):
|
|
|
|
| 438 |
if cache_position[0] == 0:
|
| 439 |
model_inputs["pixel_values"] = pixel_values
|
| 440 |
|
| 441 |
+
return model_inputs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|