OrlandoHugBot commited on
Commit
c6d5c95
·
verified ·
1 Parent(s): eb800e1

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +33 -10
README.md CHANGED
@@ -78,27 +78,50 @@ from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
78
  from diffusers import FlowMatchEulerDiscreteScheduler, AutoencoderKL
79
 
80
  # Load model components
81
- pretrained_model_name_or_path = "/path/to/unipicv2_qwen2_5_vl_7b_sd_3_5m_kontext"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
- transformer = SD3Transformer2DKontextModel.from_pretrained(
84
- pretrained_model_name_or_path, subfolder="transformer", torch_dtype=torch.bfloat16).cuda()
85
 
86
  vae = AutoencoderKL.from_pretrained(
87
- pretrained_model_name_or_path, subfolder="vae", torch_dtype=torch.bfloat16).cuda()
 
88
 
89
  # Load Qwen2.5-VL model
90
  lmm = Qwen2_5_VLForConditionalGeneration.from_pretrained(
91
- "Qwen/Qwen2.5-VL-7B-Instruct",
92
- torch_dtype=torch.bfloat16,
93
- attn_implementation="flash_attention_2").cuda()
94
 
95
- processor = Qwen2_5_VLProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
96
  processor.chat_template = processor.chat_template.replace(
97
  "{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}",
98
  "")
99
 
100
  conditioner = StableDiffusion3Conditioner.from_pretrained(
101
- pretrained_model_name_or_path, subfolder="conditioner", torch_dtype=torch.bfloat16).cuda()
102
 
103
  scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
104
 
@@ -166,7 +189,7 @@ min_pixels = max_pixels = int(image.height * 28 / 32 * image.width * 28 / 32)
166
  inputs = processor(
167
  text=texts, images=[image]*2,
168
  min_pixels=min_pixels, max_pixels=max_pixels,
169
- videos=None, padding=True, return_tensors="pt").to("cuda")
170
 
171
  # Process with vision understanding
172
  input_ids, attention_mask, pixel_values, image_grid_thw = \
 
78
  from diffusers import FlowMatchEulerDiscreteScheduler, AutoencoderKL
79
 
80
  # Load model components
81
+ pretrained_model_name_or_path = "/path/to/UniPic2-Metaquery-Flash/UniPic2-Metaquery"
82
+ vlm_path = "/path/to/UniPic2-Metaquery-Flash/Qwen2.5-VL-7B-Instruct-AWQ"
83
+
84
+
85
+ quant = "int4" # {"int4", "fp16"}
86
+
87
+ bnb4 = BitsAndBytesConfig(
88
+ load_in_4bit=True,
89
+ bnb_4bit_use_double_quant=True,
90
+ bnb_4bit_quant_type="nf4",
91
+ bnb_4bit_compute_dtype=torch.float16,  # align with LMM/Conditioner dtype
92
+ )
93
+
94
+ if quant == "int4":
95
+ transformer = SD3Transformer2DKontextModel.from_pretrained(
96
+ pretrained_model_name_or_path, subfolder="transformer",
97
+ quantization_config=bnb4, device_map="auto", low_cpu_mem_usage=True
98
+ )
99
+ elif quant == "fp16":
100
+ transformer = SD3Transformer2DKontextModel.from_pretrained(
101
+ pretrained_model_name_or_path, subfolder="transformer",
102
+ torch_dtype=torch.float16, device_map="auto", low_cpu_mem_usage=True
103
+ )
104
+ else:
105
+ raise ValueError(f"Unsupported quant: {quant}")
106
 
 
 
107
 
108
  vae = AutoencoderKL.from_pretrained(
109
+ pretrained_model_name_or_path, subfolder="vae",
110
+ torch_dtype=torch.bfloat16, device_map="auto", low_cpu_mem_usage=True)
111
 
112
  # Load Qwen2.5-VL model
113
  lmm = Qwen2_5_VLForConditionalGeneration.from_pretrained(
114
+ vlm_path,
115
+ torch_dtype=torch.bfloat16, device_map="auto",
116
+ attn_implementation="flash_attention_2")
117
 
118
+ processor = Qwen2_5_VLProcessor.from_pretrained(vlm_path)
119
  processor.chat_template = processor.chat_template.replace(
120
  "{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}",
121
  "")
122
 
123
  conditioner = StableDiffusion3Conditioner.from_pretrained(
124
+ pretrained_model_name_or_path, subfolder="conditioner", device_map="auto", torch_dtype=torch.bfloat16)
125
 
126
  scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
127
 
 
189
  inputs = processor(
190
  text=texts, images=[image]*2,
191
  min_pixels=min_pixels, max_pixels=max_pixels,
192
+ videos=None, padding=True, return_tensors="pt")
193
 
194
  # Process with vision understanding
195
  input_ids, attention_mask, pixel_values, image_grid_thw = \