Update README.md
Browse files
README.md
CHANGED
|
@@ -78,27 +78,50 @@ from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
|
|
| 78 |
from diffusers import FlowMatchEulerDiscreteScheduler, AutoencoderKL
|
| 79 |
|
| 80 |
# Load model components
|
| 81 |
-
pretrained_model_name_or_path = "/path/to/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
-
transformer = SD3Transformer2DKontextModel.from_pretrained(
|
| 84 |
-
pretrained_model_name_or_path, subfolder="transformer", torch_dtype=torch.bfloat16).cuda()
|
| 85 |
|
| 86 |
vae = AutoencoderKL.from_pretrained(
|
| 87 |
-
pretrained_model_name_or_path, subfolder="vae",
|
|
|
|
| 88 |
|
| 89 |
# Load Qwen2.5-VL model
|
| 90 |
lmm = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 91 |
-
|
| 92 |
-
torch_dtype=torch.bfloat16,
|
| 93 |
-
attn_implementation="flash_attention_2")
|
| 94 |
|
| 95 |
-
processor = Qwen2_5_VLProcessor.from_pretrained(
|
| 96 |
processor.chat_template = processor.chat_template.replace(
|
| 97 |
"{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}",
|
| 98 |
"")
|
| 99 |
|
| 100 |
conditioner = StableDiffusion3Conditioner.from_pretrained(
|
| 101 |
-
pretrained_model_name_or_path, subfolder="conditioner", torch_dtype=torch.bfloat16)
|
| 102 |
|
| 103 |
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
|
| 104 |
|
|
@@ -166,7 +189,7 @@ min_pixels = max_pixels = int(image.height * 28 / 32 * image.width * 28 / 32)
|
|
| 166 |
inputs = processor(
|
| 167 |
text=texts, images=[image]*2,
|
| 168 |
min_pixels=min_pixels, max_pixels=max_pixels,
|
| 169 |
-
videos=None, padding=True, return_tensors="pt")
|
| 170 |
|
| 171 |
# Process with vision understanding
|
| 172 |
input_ids, attention_mask, pixel_values, image_grid_thw = \
|
|
|
|
| 78 |
from diffusers import FlowMatchEulerDiscreteScheduler, AutoencoderKL
|
| 79 |
|
| 80 |
# Load model components
|
| 81 |
+
pretrained_model_name_or_path = "/path/to/UniPic2-Metaquery-Flash/UniPic2-Metaquery"
|
| 82 |
+
vlm_path = "/path/to/UniPic2-Metaquery-Flash/Qwen2.5-VL-7B-Instruct-AWQ"
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
quant = "int4" # {"int4", "fp16"}
|
| 86 |
+
|
| 87 |
+
bnb4 = BitsAndBytesConfig(
|
| 88 |
+
load_in_4bit=True,
|
| 89 |
+
bnb_4bit_use_double_quant=True,
|
| 90 |
+
bnb_4bit_quant_type="nf4",
|
| 91 |
+
bnb_4bit_compute_dtype=torch.float16,  # keep compute dtype aligned with the LMM / conditioner
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
if quant == "int4":
|
| 95 |
+
transformer = SD3Transformer2DKontextModel.from_pretrained(
|
| 96 |
+
pretrained_model_name_or_path, subfolder="transformer",
|
| 97 |
+
quantization_config=bnb4, device_map="auto", low_cpu_mem_usage=True
|
| 98 |
+
)
|
| 99 |
+
elif quant == "fp16":
|
| 100 |
+
transformer = SD3Transformer2DKontextModel.from_pretrained(
|
| 101 |
+
pretrained_model_name_or_path, subfolder="transformer",
|
| 102 |
+
torch_dtype=torch.float16, device_map="auto", low_cpu_mem_usage=True
|
| 103 |
+
)
|
| 104 |
+
else:
|
| 105 |
+
raise ValueError(f"Unsupported quant: {quant}")
|
| 106 |
|
|
|
|
|
|
|
| 107 |
|
| 108 |
vae = AutoencoderKL.from_pretrained(
|
| 109 |
+
pretrained_model_name_or_path, subfolder="vae",
|
| 110 |
+
torch_dtype=torch.bfloat16, device_map="auto", low_cpu_mem_usage=True)
|
| 111 |
|
| 112 |
# Load Qwen2.5-VL model
|
| 113 |
lmm = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 114 |
+
vlm_path,
|
| 115 |
+
torch_dtype=torch.bfloat16, device_map="auto",
|
| 116 |
+
attn_implementation="flash_attention_2")
|
| 117 |
|
| 118 |
+
processor = Qwen2_5_VLProcessor.from_pretrained(vlm_path)
|
| 119 |
processor.chat_template = processor.chat_template.replace(
|
| 120 |
"{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}",
|
| 121 |
"")
|
| 122 |
|
| 123 |
conditioner = StableDiffusion3Conditioner.from_pretrained(
|
| 124 |
+
pretrained_model_name_or_path, subfolder="conditioner", device_map="auto", torch_dtype=torch.bfloat16)
|
| 125 |
|
| 126 |
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
|
| 127 |
|
|
|
|
| 189 |
inputs = processor(
|
| 190 |
text=texts, images=[image]*2,
|
| 191 |
min_pixels=min_pixels, max_pixels=max_pixels,
|
| 192 |
+
videos=None, padding=True, return_tensors="pt")
|
| 193 |
|
| 194 |
# Process with vision understanding
|
| 195 |
input_ids, attention_mask, pixel_values, image_grid_thw = \
|