UNIVA-Jason committed
Commit d55f16d · verified · 1 Parent(s): a16cf3c

Update README.md

Files changed (1)
  1. README.md +25 -32
README.md CHANGED
@@ -33,55 +33,48 @@ effective batch size: ~64
  pip install torch transformers pillow
 
  ## Inference Example
- from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
- 
- import torch
- 
- from PIL import Image
- 
- model_path = "YOUR_HF_USERNAME/25EMBAI-VLM-FM"
- 
+ ```
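+ # NOTE: the paths below point to the authors' local machine; substitute your own
+ # checkpoint directory (or a Hub model ID) and image file before running.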
+ from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
+ import torch
+ from PIL import Image
+ 
+ model_path = '/home/raid/models/25EMBAI_save_test'
+ vision_model = 'ViT-H-14-378-quickgelu'
+ vision_pretrained = 'dfn5b'
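+ # vision_model / vision_pretrained appear to name the OpenCLIP vision backbone
+ # (ViT-H/14 at 378 px, DFN-5B weights); they are not referenced again in this snippet.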
  dtype = torch.bfloat16
+ image_path = '/home/jason/git/UNIVA/25EMBAI_VLM_FM/qwen/train/sample.png'
 
- ### Load model
  model = AutoModel.from_pretrained(
-     model_path,
-     trust_remote_code=True,
- ).to(device="cuda", dtype=dtype)
- 
- ### Load tokenizer
+     model_path,
+     trust_remote_code=True
+ ).to(device='cuda', dtype=dtype)
  tokenizer = AutoTokenizer.from_pretrained(model_path)
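+ # NOTE: the tokenizer is loaded but never passed to generate_text below;
+ # the custom remote code presumably tokenizes the prompt internally.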
 
 
- ### Load image processor from model assets
  image_processor = AutoImageProcessor.from_pretrained(
      model_path,
      trust_remote_code=True,
  )
 
  model.eval()
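+ # eval() disables dropout; consider torch.inference_mode() around generation as well.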
 
- ### Load image
- img = Image.open("sample.png").convert("RGB")
+ img = Image.open(image_path).convert("RGB")
 
- ### Transform image → visual embeddings
  pixel = image_processor(img, return_tensors="pt")["pixel_values"].to(
-     dtype=dtype, device="cuda"
+     dtype=dtype,
+     device='cuda',
  )
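+ # pixel: the preprocessed pixel_values tensor, moved to the model's dtype and device.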
- 
- ### Prompt
- prompt = "please describe this image."
- 
- ### Multimodal generation
+ prompt = 'please describe this image.'
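+ 
+ # Sampling-based decoding: nucleus sampling (top_p=0.9) at temperature 0.7,
+ # up to 512 new tokens.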
  output = model.generate_text(
-     images=pixel,
-     prompt=prompt,
-     max_new_tokens=512,
-     do_sample=True,
-     top_p=0.9,
-     temperature=0.7,
- )
+     images=pixel,
+     prompt=prompt,
+     max_new_tokens=512,
+     do_sample=True,
+     top_p=0.9,
+     temperature=0.7,
+ )
 
  print(output)
+ ```
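+ 
+ The snippet assumes a CUDA GPU; `generate_text` appears to be a custom method
+ supplied by the checkpoint's remote code rather than the standard `transformers`
+ `generate` API. A minimal device fallback, a sketch assuming bfloat16 is only
+ worth using on GPU:
+ 
+ ```
+ # Pick a device at runtime instead of hard-coding 'cuda'.
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ dtype = torch.bfloat16 if device == "cuda" else torch.float32
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(device=device, dtype=dtype)
+ ```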
+
  # Limitations & Biases
  This model is an early-stage prototype. It will be updated and reorganized in future releases.
  Because it was trained on web-scale multimodal data: