UNIVA-Jason committed
Commit d55f16d · verified · 1 Parent(s): a16cf3c

Update README.md

Files changed (1)
  1. README.md +25 -32
README.md CHANGED
@@ -33,55 +33,48 @@ effective batch size: ~64
  pip install torch transformers pillow
 
  ## Inference Example
- from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
- 
- import torch
- 
- from PIL import Image
- 
- model_path = "YOUR_HF_USERNAME/25EMBAI-VLM-FM"
- 
+ ```
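+ # NOTE: the paths below point to the authors' local machine; substitute your own
+ # checkpoint directory (or a Hub model ID) and image file before running.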
+ from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
+ import torch
+ from PIL import Image
+ 
+ model_path = '/home/raid/models/25EMBAI_save_test'
+ vision_model = 'ViT-H-14-378-quickgelu'
+ vision_pretrained = 'dfn5b'
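+ # vision_model / vision_pretrained appear to name the OpenCLIP vision backbone
+ # (ViT-H/14 at 378 px, DFN-5B weights); they are not referenced again in this snippet.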
  dtype = torch.bfloat16
+ image_path = '/home/jason/git/UNIVA/25EMBAI_VLM_FM/qwen/train/sample.png'
 
- ### Load model
  model = AutoModel.from_pretrained(
-     model_path,
-     trust_remote_code=True,
- ).to(device="cuda", dtype=dtype)
- 
- ### Load tokenizer
+     model_path,
+     trust_remote_code=True
+ ).to(device='cuda', dtype=dtype)
  tokenizer = AutoTokenizer.from_pretrained(model_path)
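+ # NOTE: the tokenizer is loaded but never passed to generate_text below;
+ # the custom remote code presumably tokenizes the prompt internally.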
 
 
- ### Load image processor from model assets
  image_processor = AutoImageProcessor.from_pretrained(
      model_path,
      trust_remote_code=True,
  )
 
  model.eval()
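+ # eval() disables dropout; consider torch.inference_mode() around generation as well.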
 
- ### Load image
- img = Image.open("sample.png").convert("RGB")
+ img = Image.open(image_path).convert("RGB")
 
- ### Transform image → visual embeddings
  pixel = image_processor(img, return_tensors="pt")["pixel_values"].to(
-     dtype=dtype, device="cuda"
+     dtype=dtype,
+     device='cuda',
  )
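+ # pixel: the preprocessed pixel_values tensor, moved to the model's dtype and device.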
- 
- ### Prompt
- prompt = "please describe this image."
- 
- ### Multimodal generation
+ prompt = 'please describe this image.'
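+ 
+ # Sampling-based decoding: nucleus sampling (top_p=0.9) at temperature 0.7,
+ # up to 512 new tokens.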
  output = model.generate_text(
-     images=pixel,
-     prompt=prompt,
-     max_new_tokens=512,
-     do_sample=True,
-     top_p=0.9,
-     temperature=0.7,
- )
+     images=pixel,
+     prompt=prompt,
+     max_new_tokens=512,
+     do_sample=True,
+     top_p=0.9,
+     temperature=0.7,
+ )
 
  print(output)
+ ```
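+ 
+ The snippet assumes a CUDA GPU; `generate_text` appears to be a custom method
+ supplied by the checkpoint's remote code rather than the standard `transformers`
+ `generate` API. A minimal device fallback, a sketch assuming bfloat16 is only
+ worth using on GPU:
+ 
+ ```
+ # Pick a device at runtime instead of hard-coding 'cuda'.
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ dtype = torch.bfloat16 if device == "cuda" else torch.float32
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(device=device, dtype=dtype)
+ ```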
+
  # Limitations & Biases
  This model is an early-stage prototype. It will be updated and reorganized in future releases.
  Because it was trained on web-scale multimodal data: