Instructions to use Balta10/fire with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Diffusers
How to use Balta10/fire with Diffusers:
pip install -U diffusers transformers accelerate
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("Balta10/fire", dtype=torch.bfloat16, device_map="cuda")

prompt = "Turn this cat into a dog"
input_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png")

image = pipe(image=input_image, prompt=prompt).images[0]
- Notebooks
- Google Colab
- Kaggle
Commit ·
6950426
0
Parent(s):
Duplicate from prithivMLmods/FireRed-Image-Edit-1.0-8bit
Browse filesCo-authored-by: Prithiv Sakthi <prithivMLmods@users.noreply.huggingface.co>
- .gitattributes +36 -0
- README.md +231 -0
- model_index.json +29 -0
- processor/added_tokens.json +24 -0
- processor/chat_template.jinja +7 -0
- processor/merges.txt +0 -0
- processor/preprocessor_config.json +39 -0
- processor/special_tokens_map.json +31 -0
- processor/tokenizer.json +3 -0
- processor/tokenizer_config.json +208 -0
- processor/video_preprocessor_config.json +45 -0
- processor/vocab.json +0 -0
- scheduler/scheduler_config.json +18 -0
- text_encoder/config.json +147 -0
- text_encoder/generation_config.json +6 -0
- text_encoder/model-00001-of-00002.safetensors +3 -0
- text_encoder/model-00002-of-00002.safetensors +3 -0
- text_encoder/model.safetensors.index.json +0 -0
- tokenizer/added_tokens.json +24 -0
- tokenizer/chat_template.jinja +54 -0
- tokenizer/merges.txt +0 -0
- tokenizer/special_tokens_map.json +31 -0
- tokenizer/tokenizer_config.json +207 -0
- tokenizer/vocab.json +0 -0
- transformer/config.json +36 -0
- transformer/diffusion_pytorch_model-00001-of-00003.safetensors +3 -0
- transformer/diffusion_pytorch_model-00002-of-00003.safetensors +3 -0
- transformer/diffusion_pytorch_model-00003-of-00003.safetensors +3 -0
- transformer/diffusion_pytorch_model.safetensors.index.json +0 -0
- vae/config.json +73 -0
- vae/diffusion_pytorch_model.safetensors +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
processor/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
library_name: diffusers
|
| 4 |
+
language:
|
| 5 |
+
- en
|
| 6 |
+
base_model:
|
| 7 |
+
- FireRedTeam/FireRed-Image-Edit-1.0
|
| 8 |
+
pipeline_tag: image-to-image
|
| 9 |
+
tags:
|
| 10 |
+
- art
|
| 11 |
+
- 8bit
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# **FireRed-Image-Edit-1.0-8bit**
|
| 15 |
+
|
| 16 |
+
> FireRed-Image-Edit-1.0-8bit is an 8-bit quantized edition of FireRed-Image-Edit-1.0 (FireRedTeam), engineered to deliver the same instruction-driven diffusion transformer image editing capabilities with significantly reduced memory footprint and improved inference efficiency. Built upon the original 1.6B-sample training corpus refined into over 100M high-quality text-to-image and editing pairs through cleaning, stratification, auto-labeling, and dual-stage semantic filtering, this quantized release preserves the model’s multi-stage training pipeline, including large-scale pre-training, supervised fine-tuning, and reinforcement learning with techniques such as Multi-Condition Aware Bucket Sampling for variable resolutions, Stochastic Instruction Alignment, Asymmetric Gradient Optimization for stable DPO, DiffusionNFT with layout-OCR rewards for precise text editing, and differentiable Consistency Loss for strong identity preservation. The 8-bit quantization reduces VRAM requirements and accelerates deployment while maintaining high alignment, semantic consistency, and visual fidelity across diverse editing scenarios such as photo restoration, object insertion and modification, style transfer with text fidelity, multi-image virtual try-on, and layout-aware text editing. Optimized for practical workflows and ComfyUI integration, this version enables broader accessibility on consumer-grade GPUs without substantial quality degradation, making it suitable for research, production, and lightweight deployment environments.
|
| 17 |
+
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
## Quick Start with Diffusers 🧨
|
| 21 |
+
|
| 22 |
+
### Install the required packages
|
| 23 |
+
|
| 24 |
+
```text
|
| 25 |
+
transformers # - transformers@v4.57.6
|
| 26 |
+
torch # - torch@v2.9.1+cu128
|
| 27 |
+
diffusers # - diffusers@v0.37.0.dev0
|
| 28 |
+
bitsandbytes # - bitsandbytes@v0.49.2
|
| 29 |
+
gradio # - gradio@v6.6.0
|
| 30 |
+
accelerate # - accelerate@v1.12.0
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
### Run FireRed-Image-Edit-1.0-8bit [Demo]
|
| 34 |
+
|
| 35 |
+
```py
|
| 36 |
+
import os
|
| 37 |
+
import gc
|
| 38 |
+
import gradio as gr
|
| 39 |
+
import numpy as np
|
| 40 |
+
#import spaces # Uncomment the Spaces-related modules if you are using HF ZeroGPU
|
| 41 |
+
|
| 42 |
+
import torch
|
| 43 |
+
import random
|
| 44 |
+
from PIL import Image
|
| 45 |
+
|
| 46 |
+
# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Startup diagnostics: which GPUs are exposed, which torch build, which device.
print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.__version__ =", torch.__version__)
print("Using device:", device)

from diffusers.models import QwenImageTransformer2DModel
from diffusers import QwenImageEditPlusPipeline
from diffusers.utils import load_image

# Hub repository that both the transformer and the full pipeline are read from.
MODEL_ID = "prithivMLmods/FireRed-Image-Edit-1.0-8bit"

dtype = torch.bfloat16

# Load the transformer from its own subfolder first so it can be handed to the
# pipeline explicitly instead of being loaded a second time.
transformer = QwenImageTransformer2DModel.from_pretrained(
    MODEL_ID,
    subfolder="transformer",
    torch_dtype=dtype,
)

pipe = QwenImageEditPlusPipeline.from_pretrained(
    MODEL_ID,
    transformer=transformer,
    torch_dtype=dtype,
)
pipe = pipe.to(device)

# Upper bound for the seed slider and for randomly drawn seeds.
MAX_SEED = np.iinfo(np.int32).max
|
| 71 |
+
|
| 72 |
+
def update_dimensions_on_upload(image):
    """Compute the output size for an uploaded image.

    The longer side is set to 1024 pixels, the shorter side is scaled to keep
    the aspect ratio, and both sides are rounded down to a multiple of 8.

    Args:
        image: Any object exposing ``size`` as ``(width, height)`` (for
            example a PIL image), or ``None``.

    Returns:
        A ``(width, height)`` tuple. ``(1024, 1024)`` when ``image`` is
        ``None``. Neither side is ever smaller than 8.
    """
    if image is None:
        return 1024, 1024

    original_width, original_height = image.size

    if original_width > original_height:
        new_width = 1024
        aspect_ratio = original_height / original_width
        new_height = int(new_width * aspect_ratio)
    else:
        new_height = 1024
        aspect_ratio = original_width / original_height
        new_width = int(new_height * aspect_ratio)

    # Round down to a multiple of 8, but never below 8: for extreme aspect
    # ratios the short side would otherwise floor to 0, which is not a valid
    # image dimension.
    new_width = max(8, (new_width // 8) * 8)
    new_height = max(8, (new_height // 8) * 8)

    return new_width, new_height
|
| 91 |
+
|
| 92 |
+
#@spaces.GPU
|
| 93 |
+
def infer(
    images,
    prompt,
    seed,
    randomize_seed,
    guidance_scale,
    steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Edit the uploaded image(s) according to ``prompt``.

    Args:
        images: Sequence of uploaded items. Each item may be a file path, a
            PIL image, an object exposing a ``name`` path attribute, or a
            tuple/list whose first element is one of those.
        prompt: Edit instruction forwarded to the pipeline.
        seed: Seed to use when ``randomize_seed`` is false.
        randomize_seed: When true, draw a fresh seed in ``[0, MAX_SEED]``.
        guidance_scale: Forwarded to the pipeline as ``true_cfg_scale``.
        steps: Forwarded to the pipeline as ``num_inference_steps``.
        progress: Gradio progress tracker; not referenced in the body.

    Returns:
        ``(result_image, seed)`` where ``seed`` is the integer seed actually
        used.

    Raises:
        gr.Error: If no images were supplied or none could be loaded.
    """
    gc.collect()
    torch.cuda.empty_cache()

    if not images:
        raise gr.Error("Please upload at least one image to edit.")

    pil_images = []
    for item in images:
        try:
            # Gallery entries may arrive as (image, caption)-style pairs.
            path_or_img = item[0] if isinstance(item, (tuple, list)) else item

            if isinstance(path_or_img, Image.Image):
                pil_images.append(path_or_img.convert("RGB"))
            else:
                # Plain path string, or a file-like wrapper exposing `.name`.
                source = path_or_img if isinstance(path_or_img, str) else path_or_img.name
                # convert() returns a new in-memory image, so the file handle
                # can be released as soon as the conversion is done.
                with Image.open(source) as opened:
                    pil_images.append(opened.convert("RGB"))
        except Exception as e:
            # Deliberate best effort: one unreadable upload must not abort
            # the whole request.
            print(f"Skipping invalid image item: {e}")
            continue

    if not pil_images:
        raise gr.Error("Could not process uploaded images.")

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    # NOTE(review): slider values may arrive as floats; manual_seed() and the
    # step count both need integers, so normalise them here.
    seed = int(seed)
    steps = int(steps)

    generator = torch.Generator(device=device).manual_seed(seed)
    negative_prompt = "worst quality, low quality, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, jpeg artifacts, signature, watermark, username, blurry"

    # The output size is derived from the first image only.
    width, height = update_dimensions_on_upload(pil_images[0])

    try:
        result_image = pipe(
            image=pil_images,
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=height,
            width=width,
            num_inference_steps=steps,
            generator=generator,
            true_cfg_scale=guidance_scale,
        ).images[0]

        return result_image, seed
    finally:
        # Free memory whether the pipeline call succeeded or raised.
        gc.collect()
        torch.cuda.empty_cache()
|
| 157 |
+
|
| 158 |
+
#@spaces.GPU
|
| 159 |
+
def infer_example(images, prompt):
    """Run ``infer`` with fixed example settings.

    Args:
        images: A single path string or a collection of image items.
        prompt: Edit instruction forwarded to ``infer``.

    Returns:
        ``(None, 0)`` when ``images`` is empty; otherwise the
        ``(result, seed)`` pair produced by ``infer``.
    """
    if not images:
        return None, 0

    # A lone path string is wrapped so that `infer` always receives a list.
    images_list = [images] if isinstance(images, str) else images

    # Fixed example settings: random seed, guidance 1.0, 20 steps.
    edited, used_seed = infer(
        images=images_list,
        prompt=prompt,
        seed=0,
        randomize_seed=True,
        guidance_scale=1.0,
        steps=20,
    )
    return edited, used_seed
|
| 177 |
+
|
| 178 |
+
css="""
|
| 179 |
+
#col-container {
|
| 180 |
+
margin: 0 auto;
|
| 181 |
+
max-width: 1000px;
|
| 182 |
+
}
|
| 183 |
+
#main-title h1 {font-size: 2.4em !important;}
|
| 184 |
+
"""
|
| 185 |
+
|
| 186 |
+
with gr.Blocks() as demo:
|
| 187 |
+
with gr.Column(elem_id="col-container"):
|
| 188 |
+
gr.Markdown("# **FireRed-Image-Edit-1.0-8bit**", elem_id="main-title")
|
| 189 |
+
|
| 190 |
+
with gr.Row(equal_height=True):
|
| 191 |
+
with gr.Column():
|
| 192 |
+
images = gr.Gallery(
|
| 193 |
+
label="Upload Images",
|
| 194 |
+
type="filepath",
|
| 195 |
+
columns=2,
|
| 196 |
+
rows=1,
|
| 197 |
+
height=300,
|
| 198 |
+
allow_preview=True
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
with gr.Row():
|
| 202 |
+
prompt = gr.Text(
|
| 203 |
+
label="Edit Prompt",
|
| 204 |
+
show_label=True,
|
| 205 |
+
placeholder="e.g., transform into anime..",
|
| 206 |
+
)
|
| 207 |
+
|
| 208 |
+
with gr.Row():
|
| 209 |
+
run_button = gr.Button("Edit Image", variant="primary")
|
| 210 |
+
|
| 211 |
+
with gr.Column():
|
| 212 |
+
output_image = gr.Image(label="Output Image", interactive=False, format="png", height=390)
|
| 213 |
+
|
| 214 |
+
with gr.Accordion("Advanced Settings", open=False, visible=True):
|
| 215 |
+
seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
|
| 216 |
+
randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
|
| 217 |
+
guidance_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, step=0.1, value=1.0)
|
| 218 |
+
steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=20)
|
| 219 |
+
|
| 220 |
+
run_button.click(
|
| 221 |
+
fn=infer,
|
| 222 |
+
inputs=[images, prompt, seed, randomize_seed, guidance_scale, steps],
|
| 223 |
+
outputs=[output_image, seed]
|
| 224 |
+
)
|
| 225 |
+
|
| 226 |
+
if __name__ == "__main__":
|
| 227 |
+
demo.queue(max_size=30).launch(css=css, mcp_server=True, ssr_mode=False, show_error=True)
|
| 228 |
+
```
|
| 229 |
+
|
| 230 |
+
> [!IMPORTANT]
|
| 231 |
+
> This repository follows the same release notes, terms and conditions, and license as the original model page, [FireRed-Image-Edit-1.0](https://huggingface.co/FireRedTeam/FireRed-Image-Edit-1.0).
|
model_index.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "QwenImageEditPlusPipeline",
|
| 3 |
+
"_diffusers_version": "0.37.0.dev0",
|
| 4 |
+
"_name_or_path": "FireRedTeam/FireRed-Image-Edit-1.0",
|
| 5 |
+
"processor": [
|
| 6 |
+
"transformers",
|
| 7 |
+
"Qwen2VLProcessor"
|
| 8 |
+
],
|
| 9 |
+
"scheduler": [
|
| 10 |
+
"diffusers",
|
| 11 |
+
"FlowMatchEulerDiscreteScheduler"
|
| 12 |
+
],
|
| 13 |
+
"text_encoder": [
|
| 14 |
+
"transformers",
|
| 15 |
+
"Qwen2_5_VLForConditionalGeneration"
|
| 16 |
+
],
|
| 17 |
+
"tokenizer": [
|
| 18 |
+
"transformers",
|
| 19 |
+
"Qwen2Tokenizer"
|
| 20 |
+
],
|
| 21 |
+
"transformer": [
|
| 22 |
+
"diffusers",
|
| 23 |
+
"QwenImageTransformer2DModel"
|
| 24 |
+
],
|
| 25 |
+
"vae": [
|
| 26 |
+
"diffusers",
|
| 27 |
+
"AutoencoderKLQwenImage"
|
| 28 |
+
]
|
| 29 |
+
}
|
processor/added_tokens.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<tool_call>": 151657,
|
| 4 |
+
"<|box_end|>": 151649,
|
| 5 |
+
"<|box_start|>": 151648,
|
| 6 |
+
"<|endoftext|>": 151643,
|
| 7 |
+
"<|file_sep|>": 151664,
|
| 8 |
+
"<|fim_middle|>": 151660,
|
| 9 |
+
"<|fim_pad|>": 151662,
|
| 10 |
+
"<|fim_prefix|>": 151659,
|
| 11 |
+
"<|fim_suffix|>": 151661,
|
| 12 |
+
"<|im_end|>": 151645,
|
| 13 |
+
"<|im_start|>": 151644,
|
| 14 |
+
"<|image_pad|>": 151655,
|
| 15 |
+
"<|object_ref_end|>": 151647,
|
| 16 |
+
"<|object_ref_start|>": 151646,
|
| 17 |
+
"<|quad_end|>": 151651,
|
| 18 |
+
"<|quad_start|>": 151650,
|
| 19 |
+
"<|repo_name|>": 151663,
|
| 20 |
+
"<|video_pad|>": 151656,
|
| 21 |
+
"<|vision_end|>": 151653,
|
| 22 |
+
"<|vision_pad|>": 151654,
|
| 23 |
+
"<|vision_start|>": 151652
|
| 24 |
+
}
|
processor/chat_template.jinja
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
|
| 2 |
+
You are a helpful assistant.<|im_end|>
|
| 3 |
+
{% endif %}<|im_start|>{{ message['role'] }}
|
| 4 |
+
{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
|
| 5 |
+
{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
|
| 6 |
+
{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
|
| 7 |
+
{% endif %}
|
processor/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
processor/preprocessor_config.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"crop_size": null,
|
| 3 |
+
"data_format": "channels_first",
|
| 4 |
+
"default_to_square": true,
|
| 5 |
+
"device": null,
|
| 6 |
+
"disable_grouping": null,
|
| 7 |
+
"do_center_crop": null,
|
| 8 |
+
"do_convert_rgb": true,
|
| 9 |
+
"do_normalize": true,
|
| 10 |
+
"do_pad": null,
|
| 11 |
+
"do_rescale": true,
|
| 12 |
+
"do_resize": true,
|
| 13 |
+
"image_mean": [
|
| 14 |
+
0.48145466,
|
| 15 |
+
0.4578275,
|
| 16 |
+
0.40821073
|
| 17 |
+
],
|
| 18 |
+
"image_processor_type": "Qwen2VLImageProcessorFast",
|
| 19 |
+
"image_std": [
|
| 20 |
+
0.26862954,
|
| 21 |
+
0.26130258,
|
| 22 |
+
0.27577711
|
| 23 |
+
],
|
| 24 |
+
"input_data_format": null,
|
| 25 |
+
"max_pixels": 12845056,
|
| 26 |
+
"merge_size": 2,
|
| 27 |
+
"min_pixels": 3136,
|
| 28 |
+
"pad_size": null,
|
| 29 |
+
"patch_size": 14,
|
| 30 |
+
"processor_class": "Qwen2VLProcessor",
|
| 31 |
+
"resample": 3,
|
| 32 |
+
"rescale_factor": 0.00392156862745098,
|
| 33 |
+
"return_tensors": null,
|
| 34 |
+
"size": {
|
| 35 |
+
"longest_edge": 12845056,
|
| 36 |
+
"shortest_edge": 3136
|
| 37 |
+
},
|
| 38 |
+
"temporal_patch_size": 2
|
| 39 |
+
}
|
processor/special_tokens_map.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|im_end|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|endoftext|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
+
}
|
processor/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
|
| 3 |
+
size 11421896
|
processor/tokenizer_config.json
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
}
|
| 181 |
+
},
|
| 182 |
+
"additional_special_tokens": [
|
| 183 |
+
"<|im_start|>",
|
| 184 |
+
"<|im_end|>",
|
| 185 |
+
"<|object_ref_start|>",
|
| 186 |
+
"<|object_ref_end|>",
|
| 187 |
+
"<|box_start|>",
|
| 188 |
+
"<|box_end|>",
|
| 189 |
+
"<|quad_start|>",
|
| 190 |
+
"<|quad_end|>",
|
| 191 |
+
"<|vision_start|>",
|
| 192 |
+
"<|vision_end|>",
|
| 193 |
+
"<|vision_pad|>",
|
| 194 |
+
"<|image_pad|>",
|
| 195 |
+
"<|video_pad|>"
|
| 196 |
+
],
|
| 197 |
+
"bos_token": null,
|
| 198 |
+
"clean_up_tokenization_spaces": false,
|
| 199 |
+
"eos_token": "<|im_end|>",
|
| 200 |
+
"errors": "replace",
|
| 201 |
+
"extra_special_tokens": {},
|
| 202 |
+
"model_max_length": 131072,
|
| 203 |
+
"pad_token": "<|endoftext|>",
|
| 204 |
+
"processor_class": "Qwen2VLProcessor",
|
| 205 |
+
"split_special_tokens": false,
|
| 206 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 207 |
+
"unk_token": null
|
| 208 |
+
}
|
processor/video_preprocessor_config.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"crop_size": null,
|
| 3 |
+
"data_format": "channels_first",
|
| 4 |
+
"default_to_square": true,
|
| 5 |
+
"device": null,
|
| 6 |
+
"do_center_crop": null,
|
| 7 |
+
"do_convert_rgb": true,
|
| 8 |
+
"do_normalize": true,
|
| 9 |
+
"do_pad": null,
|
| 10 |
+
"do_rescale": true,
|
| 11 |
+
"do_resize": true,
|
| 12 |
+
"do_sample_frames": false,
|
| 13 |
+
"fps": null,
|
| 14 |
+
"image_mean": [
|
| 15 |
+
0.48145466,
|
| 16 |
+
0.4578275,
|
| 17 |
+
0.40821073
|
| 18 |
+
],
|
| 19 |
+
"image_std": [
|
| 20 |
+
0.26862954,
|
| 21 |
+
0.26130258,
|
| 22 |
+
0.27577711
|
| 23 |
+
],
|
| 24 |
+
"input_data_format": null,
|
| 25 |
+
"max_frames": 768,
|
| 26 |
+
"max_pixels": 12845056,
|
| 27 |
+
"merge_size": 2,
|
| 28 |
+
"min_frames": 4,
|
| 29 |
+
"min_pixels": 3136,
|
| 30 |
+
"num_frames": null,
|
| 31 |
+
"pad_size": null,
|
| 32 |
+
"patch_size": 14,
|
| 33 |
+
"processor_class": "Qwen2VLProcessor",
|
| 34 |
+
"resample": 3,
|
| 35 |
+
"rescale_factor": 0.00392156862745098,
|
| 36 |
+
"return_metadata": false,
|
| 37 |
+
"size": {
|
| 38 |
+
"longest_edge": 12845056,
|
| 39 |
+
"shortest_edge": 3136
|
| 40 |
+
},
|
| 41 |
+
"size_divisor": null,
|
| 42 |
+
"temporal_patch_size": 2,
|
| 43 |
+
"video_metadata": null,
|
| 44 |
+
"video_processor_type": "Qwen2VLVideoProcessor"
|
| 45 |
+
}
|
processor/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
scheduler/scheduler_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "FlowMatchEulerDiscreteScheduler",
|
| 3 |
+
"_diffusers_version": "0.37.0.dev0",
|
| 4 |
+
"base_image_seq_len": 256,
|
| 5 |
+
"base_shift": 0.5,
|
| 6 |
+
"invert_sigmas": false,
|
| 7 |
+
"max_image_seq_len": 8192,
|
| 8 |
+
"max_shift": 0.9,
|
| 9 |
+
"num_train_timesteps": 1000,
|
| 10 |
+
"shift": 1.0,
|
| 11 |
+
"shift_terminal": 0.02,
|
| 12 |
+
"stochastic_sampling": false,
|
| 13 |
+
"time_shift_type": "exponential",
|
| 14 |
+
"use_beta_sigmas": false,
|
| 15 |
+
"use_dynamic_shifting": true,
|
| 16 |
+
"use_exponential_sigmas": false,
|
| 17 |
+
"use_karras_sigmas": false
|
| 18 |
+
}
|
text_encoder/config.json
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen2_5_VLForConditionalGeneration"
|
| 4 |
+
],
|
| 5 |
+
"attention_dropout": 0.0,
|
| 6 |
+
"bos_token_id": 151643,
|
| 7 |
+
"dtype": "bfloat16",
|
| 8 |
+
"eos_token_id": 151645,
|
| 9 |
+
"hidden_act": "silu",
|
| 10 |
+
"hidden_size": 3584,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": 18944,
|
| 13 |
+
"max_position_embeddings": 128000,
|
| 14 |
+
"max_window_layers": 28,
|
| 15 |
+
"model_type": "qwen2_5_vl",
|
| 16 |
+
"num_attention_heads": 28,
|
| 17 |
+
"num_hidden_layers": 28,
|
| 18 |
+
"num_key_value_heads": 4,
|
| 19 |
+
"quantization_config": {
|
| 20 |
+
"_load_in_4bit": false,
|
| 21 |
+
"_load_in_8bit": true,
|
| 22 |
+
"bnb_4bit_compute_dtype": "float32",
|
| 23 |
+
"bnb_4bit_quant_storage": "uint8",
|
| 24 |
+
"bnb_4bit_quant_type": "fp4",
|
| 25 |
+
"bnb_4bit_use_double_quant": false,
|
| 26 |
+
"llm_int8_enable_fp32_cpu_offload": false,
|
| 27 |
+
"llm_int8_has_fp16_weight": false,
|
| 28 |
+
"llm_int8_skip_modules": null,
|
| 29 |
+
"llm_int8_threshold": 6.0,
|
| 30 |
+
"load_in_4bit": false,
|
| 31 |
+
"load_in_8bit": true,
|
| 32 |
+
"quant_method": "bitsandbytes"
|
| 33 |
+
},
|
| 34 |
+
"rms_norm_eps": 1e-06,
|
| 35 |
+
"rope_scaling": {
|
| 36 |
+
"mrope_section": [
|
| 37 |
+
16,
|
| 38 |
+
24,
|
| 39 |
+
24
|
| 40 |
+
],
|
| 41 |
+
"rope_type": "default",
|
| 42 |
+
"type": "default"
|
| 43 |
+
},
|
| 44 |
+
"rope_theta": 1000000.0,
|
| 45 |
+
"sliding_window": 32768,
|
| 46 |
+
"text_config": {
|
| 47 |
+
"_name_or_path": "/home/user/.cache/huggingface/hub/models--FireRedTeam--FireRed-Image-Edit-1.0/snapshots/0aea9d520c801c9f4b691cc92f736a7a8628e6a2/text_encoder",
|
| 48 |
+
"architectures": [
|
| 49 |
+
"Qwen2_5_VLForConditionalGeneration"
|
| 50 |
+
],
|
| 51 |
+
"attention_dropout": 0.0,
|
| 52 |
+
"bos_token_id": 151643,
|
| 53 |
+
"dtype": "bfloat16",
|
| 54 |
+
"eos_token_id": 151645,
|
| 55 |
+
"hidden_act": "silu",
|
| 56 |
+
"hidden_size": 3584,
|
| 57 |
+
"image_token_id": 151655,
|
| 58 |
+
"initializer_range": 0.02,
|
| 59 |
+
"intermediate_size": 18944,
|
| 60 |
+
"layer_types": [
|
| 61 |
+
"full_attention",
|
| 62 |
+
"full_attention",
|
| 63 |
+
"full_attention",
|
| 64 |
+
"full_attention",
|
| 65 |
+
"full_attention",
|
| 66 |
+
"full_attention",
|
| 67 |
+
"full_attention",
|
| 68 |
+
"full_attention",
|
| 69 |
+
"full_attention",
|
| 70 |
+
"full_attention",
|
| 71 |
+
"full_attention",
|
| 72 |
+
"full_attention",
|
| 73 |
+
"full_attention",
|
| 74 |
+
"full_attention",
|
| 75 |
+
"full_attention",
|
| 76 |
+
"full_attention",
|
| 77 |
+
"full_attention",
|
| 78 |
+
"full_attention",
|
| 79 |
+
"full_attention",
|
| 80 |
+
"full_attention",
|
| 81 |
+
"full_attention",
|
| 82 |
+
"full_attention",
|
| 83 |
+
"full_attention",
|
| 84 |
+
"full_attention",
|
| 85 |
+
"full_attention",
|
| 86 |
+
"full_attention",
|
| 87 |
+
"full_attention",
|
| 88 |
+
"full_attention"
|
| 89 |
+
],
|
| 90 |
+
"max_position_embeddings": 128000,
|
| 91 |
+
"max_window_layers": 28,
|
| 92 |
+
"model_type": "qwen2_5_vl_text",
|
| 93 |
+
"num_attention_heads": 28,
|
| 94 |
+
"num_hidden_layers": 28,
|
| 95 |
+
"num_key_value_heads": 4,
|
| 96 |
+
"rms_norm_eps": 1e-06,
|
| 97 |
+
"rope_scaling": {
|
| 98 |
+
"mrope_section": [
|
| 99 |
+
16,
|
| 100 |
+
24,
|
| 101 |
+
24
|
| 102 |
+
],
|
| 103 |
+
"rope_type": "default",
|
| 104 |
+
"type": "default"
|
| 105 |
+
},
|
| 106 |
+
"rope_theta": 1000000.0,
|
| 107 |
+
"sliding_window": null,
|
| 108 |
+
"use_cache": true,
|
| 109 |
+
"use_sliding_window": false,
|
| 110 |
+
"video_token_id": 151656,
|
| 111 |
+
"vision_end_token_id": 151653,
|
| 112 |
+
"vision_start_token_id": 151652,
|
| 113 |
+
"vision_token_id": 151654,
|
| 114 |
+
"vocab_size": 152064
|
| 115 |
+
},
|
| 116 |
+
"tie_word_embeddings": false,
|
| 117 |
+
"transformers_version": "4.57.3",
|
| 118 |
+
"use_cache": true,
|
| 119 |
+
"use_sliding_window": false,
|
| 120 |
+
"vision_config": {
|
| 121 |
+
"depth": 32,
|
| 122 |
+
"dtype": "bfloat16",
|
| 123 |
+
"fullatt_block_indexes": [
|
| 124 |
+
7,
|
| 125 |
+
15,
|
| 126 |
+
23,
|
| 127 |
+
31
|
| 128 |
+
],
|
| 129 |
+
"hidden_act": "silu",
|
| 130 |
+
"hidden_size": 1280,
|
| 131 |
+
"in_channels": 3,
|
| 132 |
+
"in_chans": 3,
|
| 133 |
+
"initializer_range": 0.02,
|
| 134 |
+
"intermediate_size": 3420,
|
| 135 |
+
"model_type": "qwen2_5_vl",
|
| 136 |
+
"num_heads": 16,
|
| 137 |
+
"out_hidden_size": 3584,
|
| 138 |
+
"patch_size": 14,
|
| 139 |
+
"spatial_merge_size": 2,
|
| 140 |
+
"spatial_patch_size": 14,
|
| 141 |
+
"temporal_patch_size": 2,
|
| 142 |
+
"tokens_per_second": 2,
|
| 143 |
+
"window_size": 112
|
| 144 |
+
},
|
| 145 |
+
"vision_token_id": 151654,
|
| 146 |
+
"vocab_size": 152064
|
| 147 |
+
}
|
text_encoder/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 151643,
|
| 4 |
+
"eos_token_id": 151645,
|
| 5 |
+
"transformers_version": "4.57.3"
|
| 6 |
+
}
|
text_encoder/model-00001-of-00002.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cb92d92afd4a8124cf6d071c015495ae8eac1eca7781e0bd5d7ed0a49538d71d
|
| 3 |
+
size 4968241771
|
text_encoder/model-00002-of-00002.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:933addea488445e1d9c1dc8891c46ca280a635d8300d1c00fec91df3604c1351
|
| 3 |
+
size 4423721915
|
text_encoder/model.safetensors.index.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer/added_tokens.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<tool_call>": 151657,
|
| 4 |
+
"<|box_end|>": 151649,
|
| 5 |
+
"<|box_start|>": 151648,
|
| 6 |
+
"<|endoftext|>": 151643,
|
| 7 |
+
"<|file_sep|>": 151664,
|
| 8 |
+
"<|fim_middle|>": 151660,
|
| 9 |
+
"<|fim_pad|>": 151662,
|
| 10 |
+
"<|fim_prefix|>": 151659,
|
| 11 |
+
"<|fim_suffix|>": 151661,
|
| 12 |
+
"<|im_end|>": 151645,
|
| 13 |
+
"<|im_start|>": 151644,
|
| 14 |
+
"<|image_pad|>": 151655,
|
| 15 |
+
"<|object_ref_end|>": 151647,
|
| 16 |
+
"<|object_ref_start|>": 151646,
|
| 17 |
+
"<|quad_end|>": 151651,
|
| 18 |
+
"<|quad_start|>": 151650,
|
| 19 |
+
"<|repo_name|>": 151663,
|
| 20 |
+
"<|video_pad|>": 151656,
|
| 21 |
+
"<|vision_end|>": 151653,
|
| 22 |
+
"<|vision_pad|>": 151654,
|
| 23 |
+
"<|vision_start|>": 151652
|
| 24 |
+
}
|
tokenizer/chat_template.jinja
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 4 |
+
{{- messages[0]['content'] }}
|
| 5 |
+
{%- else %}
|
| 6 |
+
{{- 'You are a helpful assistant.' }}
|
| 7 |
+
{%- endif %}
|
| 8 |
+
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 9 |
+
{%- for tool in tools %}
|
| 10 |
+
{{- "\n" }}
|
| 11 |
+
{{- tool | tojson }}
|
| 12 |
+
{%- endfor %}
|
| 13 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 14 |
+
{%- else %}
|
| 15 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 16 |
+
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
|
| 17 |
+
{%- else %}
|
| 18 |
+
{{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
|
| 19 |
+
{%- endif %}
|
| 20 |
+
{%- endif %}
|
| 21 |
+
{%- for message in messages %}
|
| 22 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
|
| 23 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 24 |
+
{%- elif message.role == "assistant" %}
|
| 25 |
+
{{- '<|im_start|>' + message.role }}
|
| 26 |
+
{%- if message.content %}
|
| 27 |
+
{{- '\n' + message.content }}
|
| 28 |
+
{%- endif %}
|
| 29 |
+
{%- for tool_call in message.tool_calls %}
|
| 30 |
+
{%- if tool_call.function is defined %}
|
| 31 |
+
{%- set tool_call = tool_call.function %}
|
| 32 |
+
{%- endif %}
|
| 33 |
+
{{- '\n<tool_call>\n{"name": "' }}
|
| 34 |
+
{{- tool_call.name }}
|
| 35 |
+
{{- '", "arguments": ' }}
|
| 36 |
+
{{- tool_call.arguments | tojson }}
|
| 37 |
+
{{- '}\n</tool_call>' }}
|
| 38 |
+
{%- endfor %}
|
| 39 |
+
{{- '<|im_end|>\n' }}
|
| 40 |
+
{%- elif message.role == "tool" %}
|
| 41 |
+
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
|
| 42 |
+
{{- '<|im_start|>user' }}
|
| 43 |
+
{%- endif %}
|
| 44 |
+
{{- '\n<tool_response>\n' }}
|
| 45 |
+
{{- message.content }}
|
| 46 |
+
{{- '\n</tool_response>' }}
|
| 47 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 48 |
+
{{- '<|im_end|>\n' }}
|
| 49 |
+
{%- endif %}
|
| 50 |
+
{%- endif %}
|
| 51 |
+
{%- endfor %}
|
| 52 |
+
{%- if add_generation_prompt %}
|
| 53 |
+
{{- '<|im_start|>assistant\n' }}
|
| 54 |
+
{%- endif %}
|
tokenizer/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer/special_tokens_map.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|im_end|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|endoftext|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
+
}
|
tokenizer/tokenizer_config.json
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
}
|
| 181 |
+
},
|
| 182 |
+
"additional_special_tokens": [
|
| 183 |
+
"<|im_start|>",
|
| 184 |
+
"<|im_end|>",
|
| 185 |
+
"<|object_ref_start|>",
|
| 186 |
+
"<|object_ref_end|>",
|
| 187 |
+
"<|box_start|>",
|
| 188 |
+
"<|box_end|>",
|
| 189 |
+
"<|quad_start|>",
|
| 190 |
+
"<|quad_end|>",
|
| 191 |
+
"<|vision_start|>",
|
| 192 |
+
"<|vision_end|>",
|
| 193 |
+
"<|vision_pad|>",
|
| 194 |
+
"<|image_pad|>",
|
| 195 |
+
"<|video_pad|>"
|
| 196 |
+
],
|
| 197 |
+
"bos_token": null,
|
| 198 |
+
"clean_up_tokenization_spaces": false,
|
| 199 |
+
"eos_token": "<|im_end|>",
|
| 200 |
+
"errors": "replace",
|
| 201 |
+
"extra_special_tokens": {},
|
| 202 |
+
"model_max_length": 131072,
|
| 203 |
+
"pad_token": "<|endoftext|>",
|
| 204 |
+
"split_special_tokens": false,
|
| 205 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 206 |
+
"unk_token": null
|
| 207 |
+
}
|
tokenizer/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
transformer/config.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "QwenImageTransformer2DModel",
|
| 3 |
+
"_diffusers_version": "0.37.0.dev0",
|
| 4 |
+
"_name_or_path": "/home/user/.cache/huggingface/hub/models--FireRedTeam--FireRed-Image-Edit-1.0/snapshots/0aea9d520c801c9f4b691cc92f736a7a8628e6a2/transformer",
|
| 5 |
+
"attention_head_dim": 128,
|
| 6 |
+
"axes_dims_rope": [
|
| 7 |
+
16,
|
| 8 |
+
56,
|
| 9 |
+
56
|
| 10 |
+
],
|
| 11 |
+
"guidance_embeds": false,
|
| 12 |
+
"in_channels": 64,
|
| 13 |
+
"joint_attention_dim": 3584,
|
| 14 |
+
"num_attention_heads": 24,
|
| 15 |
+
"num_layers": 60,
|
| 16 |
+
"out_channels": 16,
|
| 17 |
+
"patch_size": 2,
|
| 18 |
+
"quantization_config": {
|
| 19 |
+
"_load_in_4bit": false,
|
| 20 |
+
"_load_in_8bit": true,
|
| 21 |
+
"bnb_4bit_compute_dtype": "float32",
|
| 22 |
+
"bnb_4bit_quant_storage": "uint8",
|
| 23 |
+
"bnb_4bit_quant_type": "fp4",
|
| 24 |
+
"bnb_4bit_use_double_quant": false,
|
| 25 |
+
"llm_int8_enable_fp32_cpu_offload": false,
|
| 26 |
+
"llm_int8_has_fp16_weight": false,
|
| 27 |
+
"llm_int8_skip_modules": null,
|
| 28 |
+
"llm_int8_threshold": 6.0,
|
| 29 |
+
"load_in_4bit": false,
|
| 30 |
+
"load_in_8bit": true,
|
| 31 |
+
"quant_method": "bitsandbytes"
|
| 32 |
+
},
|
| 33 |
+
"use_additional_t_cond": false,
|
| 34 |
+
"use_layer3d_rope": false,
|
| 35 |
+
"zero_cond_t": false
|
| 36 |
+
}
|
transformer/diffusion_pytorch_model-00001-of-00003.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:67153504e5fce615ef3ee760befc6f3c39b77aa2c3a1056a55061a66474d520d
|
| 3 |
+
size 9994210072
|
transformer/diffusion_pytorch_model-00002-of-00003.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:80fd4995453e3681329feb892561e57bffa96d7ad62ca5fb0e2786bff2504f84
|
| 3 |
+
size 9972642155
|
transformer/diffusion_pytorch_model-00003-of-00003.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:014c7e6a16601a9e8b843e2645de6786d1ea1f219cffd34ea7b13ada7916a791
|
| 3 |
+
size 491732851
|
transformer/diffusion_pytorch_model.safetensors.index.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
vae/config.json
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "AutoencoderKLQwenImage",
|
| 3 |
+
"_diffusers_version": "0.37.0.dev0",
|
| 4 |
+
"_name_or_path": "/home/user/.cache/huggingface/hub/models--FireRedTeam--FireRed-Image-Edit-1.0/snapshots/0aea9d520c801c9f4b691cc92f736a7a8628e6a2/vae",
|
| 5 |
+
"attn_scales": [],
|
| 6 |
+
"base_dim": 96,
|
| 7 |
+
"dim_mult": [
|
| 8 |
+
1,
|
| 9 |
+
2,
|
| 10 |
+
4,
|
| 11 |
+
4
|
| 12 |
+
],
|
| 13 |
+
"dropout": 0.0,
|
| 14 |
+
"input_channels": 3,
|
| 15 |
+
"latents_mean": [
|
| 16 |
+
-0.7571,
|
| 17 |
+
-0.7089,
|
| 18 |
+
-0.9113,
|
| 19 |
+
0.1075,
|
| 20 |
+
-0.1745,
|
| 21 |
+
0.9653,
|
| 22 |
+
-0.1517,
|
| 23 |
+
1.5508,
|
| 24 |
+
0.4134,
|
| 25 |
+
-0.0715,
|
| 26 |
+
0.5517,
|
| 27 |
+
-0.3632,
|
| 28 |
+
-0.1922,
|
| 29 |
+
-0.9497,
|
| 30 |
+
0.2503,
|
| 31 |
+
-0.2921
|
| 32 |
+
],
|
| 33 |
+
"latents_std": [
|
| 34 |
+
2.8184,
|
| 35 |
+
1.4541,
|
| 36 |
+
2.3275,
|
| 37 |
+
2.6558,
|
| 38 |
+
1.2196,
|
| 39 |
+
1.7708,
|
| 40 |
+
2.6052,
|
| 41 |
+
2.0743,
|
| 42 |
+
3.2687,
|
| 43 |
+
2.1526,
|
| 44 |
+
2.8652,
|
| 45 |
+
1.5579,
|
| 46 |
+
1.6382,
|
| 47 |
+
1.1253,
|
| 48 |
+
2.8251,
|
| 49 |
+
1.916
|
| 50 |
+
],
|
| 51 |
+
"num_res_blocks": 2,
|
| 52 |
+
"quantization_config": {
|
| 53 |
+
"_load_in_4bit": false,
|
| 54 |
+
"_load_in_8bit": true,
|
| 55 |
+
"bnb_4bit_compute_dtype": "float32",
|
| 56 |
+
"bnb_4bit_quant_storage": "uint8",
|
| 57 |
+
"bnb_4bit_quant_type": "fp4",
|
| 58 |
+
"bnb_4bit_use_double_quant": false,
|
| 59 |
+
"llm_int8_enable_fp32_cpu_offload": false,
|
| 60 |
+
"llm_int8_has_fp16_weight": false,
|
| 61 |
+
"llm_int8_skip_modules": null,
|
| 62 |
+
"llm_int8_threshold": 6.0,
|
| 63 |
+
"load_in_4bit": false,
|
| 64 |
+
"load_in_8bit": true,
|
| 65 |
+
"quant_method": "bitsandbytes"
|
| 66 |
+
},
|
| 67 |
+
"temperal_downsample": [
|
| 68 |
+
false,
|
| 69 |
+
true,
|
| 70 |
+
true
|
| 71 |
+
],
|
| 72 |
+
"z_dim": 16
|
| 73 |
+
}
|
vae/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c8bc8b758c649abef9ea407b95408389a3b2f610d0d10fcb054fe171d0a8344
|
| 3 |
+
size 253806966
|