ChuuniZ committed on
Commit
d7603bf
·
verified ·
1 Parent(s): 099f049

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. BiRefNet/README.md +8 -0
  2. BiRefNet/RMBG-2.0/BiRefNet_config.py +11 -0
  3. BiRefNet/RMBG-2.0/birefnet.py +2244 -0
  4. BiRefNet/RMBG-2.0/config.json +21 -0
  5. BiRefNet/RMBG-2.0/diagram1.png +0 -0
  6. BiRefNet/RMBG-2.0/preprocessor_config.json +23 -0
  7. Joy_caption/README.md +79 -0
  8. Joy_caption/app.py +536 -0
  9. Joy_caption/cgrkzexw-599808/config.yaml +39 -0
  10. Joy_caption/cgrkzexw-599808/text_model/README.md +202 -0
  11. Joy_caption/cgrkzexw-599808/text_model/adapter_config.json +34 -0
  12. Joy_caption/cgrkzexw-599808/text_model/tokenizer.json +0 -0
  13. Joy_caption/joycaption_alpha_two_cli_mod.ipynb +46 -0
  14. Joy_caption/requirements.txt +10 -0
  15. LLM/Florence-2-base-PromptGen-v2.0/configuration_florence2.py +340 -0
  16. LLM/Florence-2-base-PromptGen-v2.0/generation_config.json +13 -0
  17. LLM/Florence-2-base-PromptGen-v2.0/merges.txt +0 -0
  18. LLM/Florence-2-base-PromptGen-v2.0/processing_florence2.py +1088 -0
  19. LLM/Florence-2-large-PromptGen-v2.0/README.md +71 -0
  20. LLM/Florence-2-large-PromptGen-v2.0/added_tokens.json +1026 -0
  21. LLM/Florence-2-large-PromptGen-v2.0/config.json +138 -0
  22. LLM/Florence-2-large-PromptGen-v2.0/configuration_florence2.py +340 -0
  23. LLM/Florence-2-large-PromptGen-v2.0/generation_config.json +4 -0
  24. LLM/Florence-2-large-PromptGen-v2.0/merges.txt +0 -0
  25. LLM/Florence-2-large-PromptGen-v2.0/modeling_florence2.py +0 -0
  26. LLM/Florence-2-large-PromptGen-v2.0/preprocessor_config.json +33 -0
  27. LLM/Florence-2-large-PromptGen-v2.0/processing_florence2.py +1088 -0
  28. LLM/Florence-2-large-PromptGen-v2.0/special_tokens_map.json +0 -0
  29. LLM/Florence-2-large-PromptGen-v2.0/tokenizer.json +0 -0
  30. LLM/Florence-2-large-PromptGen-v2.0/tokenizer_config.json +0 -0
  31. LLM/Florence-2-large-PromptGen-v2.0/vocab.json +0 -0
  32. LLM/Llama-3.1-8B-Lexi-Uncensored-V2/README.md +155 -0
  33. LLM/Llama-3.1-8B-Lexi-Uncensored-V2/config.json +41 -0
  34. LLM/Llama-3.1-8B-Lexi-Uncensored-V2/generation_config.json +14 -0
  35. LLM/Llama-3.1-8B-Lexi-Uncensored-V2/model.safetensors.index.json +298 -0
  36. LLM/Llama-3.1-8B-Lexi-Uncensored-V2/special_tokens_map.json +23 -0
  37. LLM/Llama-3.1-8B-Lexi-Uncensored-V2/tokenizer.json +0 -0
  38. LLM/Llama-3.1-8B-Lexi-Uncensored-V2/tokenizer_config.json +2064 -0
  39. clip/siglip-so400m-patch14-384/README.md +112 -0
  40. clip/siglip-so400m-patch14-384/config.json +25 -0
  41. clip/siglip-so400m-patch14-384/preprocessor_config.json +23 -0
  42. clip/siglip-so400m-patch14-384/special_tokens_map.json +23 -0
  43. clip/siglip-so400m-patch14-384/tokenizer.json +0 -0
  44. clip/siglip-so400m-patch14-384/tokenizer_config.json +33 -0
  45. clip_interrogator/models--timm--vit_large_patch14_clip_224.openai/refs/main +1 -0
  46. controlnet/FLUX.1/FLUX.1-dev-ControlNet-Union-Pro-2.0/README.md +154 -0
  47. controlnet/FLUX.1/FLUX.1-dev-ControlNet-Union-Pro-2.0/conds/canny.png +0 -0
  48. controlnet/FLUX.1/FLUX.1-dev-ControlNet-Union-Pro-2.0/config.json +19 -0
  49. controlnet/FLUX.1/FLUX.1-dev-ControlNet-Union-Pro-2.0/controlnet_flux.py +509 -0
  50. controlnet/FLUX.1/FLUX.1-dev-ControlNet-Union-Pro-2.0/pipeline_flux_controlnet.py +1181 -0
BiRefNet/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ license: mit
4
+ ---
5
+
6
+ This repository stores the BiRefNet checkpoints. Please refer to the following repositories:
7
+ 1. Official implementation: https://github.com/zhengpeng7/birefnet
8
+ 2. ComfyUI BiRefNet node https://github.com/viperyl/ComfyUI-BiRefNet
BiRefNet/RMBG-2.0/BiRefNet_config.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
class BiRefNetConfig(PretrainedConfig):
    """Hugging Face configuration object shipped with the BiRefNet checkpoint.

    Args:
        bb_pretrained: Stored verbatim on the instance as ``bb_pretrained``.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "SegformerForSemanticSegmentation"

    def __init__(self, bb_pretrained=False, **kwargs):
        # Record the custom field first, then let the base class consume
        # every remaining keyword argument.
        self.bb_pretrained = bb_pretrained
        super().__init__(**kwargs)
BiRefNet/RMBG-2.0/birefnet.py ADDED
@@ -0,0 +1,2244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### config.py
2
+
3
+ import os
4
+ import math
5
+ from transformers import PretrainedConfig
6
+
7
class Config(PretrainedConfig):
    """Hard-coded experiment settings for BiRefNet.

    Every option is selected by indexing into a literal list/dict of
    alternatives, so changing a setting means editing the trailing index.
    Several attributes are derived from earlier ones (``training_set`` and
    ``lr`` from ``task``; ``lr`` and ``num_workers`` from ``batch_size``;
    ``cxt`` from ``bb``, ``mul_scl_ipt`` and ``cxt_num``), so statement order
    matters.

    NOTE(review): ``__init__`` takes no arguments and never calls
    ``super().__init__()`` in this block -- confirm that the attributes normally
    set by ``PretrainedConfig.__init__`` are not needed by callers.
    """

    def __init__(self) -> None:
        # PATH settings
        self.sys_home_dir = os.path.expanduser('~')     # Make up your file system as: SYS_HOME_DIR/codes/dis/BiRefNet, SYS_HOME_DIR/datasets/dis/xx, SYS_HOME_DIR/weights/xx

        # TASK settings
        self.task = ['DIS5K', 'COD', 'HRSOD', 'DIS5K+HRSOD+HRS10K', 'P3M-10k'][0]
        # Training-set name(s) for the selected task; '+' joins several sets.
        self.training_set = {
            'DIS5K': ['DIS-TR', 'DIS-TR+DIS-TE1+DIS-TE2+DIS-TE3+DIS-TE4'][0],
            'COD': 'TR-COD10K+TR-CAMO',
            'HRSOD': ['TR-DUTS', 'TR-HRSOD', 'TR-UHRSD', 'TR-DUTS+TR-HRSOD', 'TR-DUTS+TR-UHRSD', 'TR-HRSOD+TR-UHRSD', 'TR-DUTS+TR-HRSOD+TR-UHRSD'][5],
            'DIS5K+HRSOD+HRS10K': 'DIS-TE1+DIS-TE2+DIS-TE3+DIS-TE4+DIS-TR+TE-HRS10K+TE-HRSOD+TE-UHRSD+TR-HRS10K+TR-HRSOD+TR-UHRSD', # leave DIS-VD for evaluation.
            'P3M-10k': 'TR-P3M-10k',
        }[self.task]
        self.prompt4loc = ['dense', 'sparse'][0]

        # Faster-Training settings
        self.load_all = True
        self.compile = True     # 1. Triggers a CPU memory leak to some extent, which is an inherent problem of PyTorch.
                                #   Machines with > 70GB CPU memory can run the whole training on DIS5K with default setting.
                                # 2. Higher PyTorch version may fix it: https://github.com/pytorch/pytorch/issues/119607.
                                # 3. But compile in Pytorch > 2.0.1 seems to bring no acceleration for training.
        self.precisionHigh = True

        # MODEL settings
        self.ms_supervision = True
        self.out_ref = self.ms_supervision and True
        self.dec_ipt = True
        self.dec_ipt_split = True
        self.cxt_num = [0, 3][1]    # multi-scale skip connections from encoder
        self.mul_scl_ipt = ['', 'add', 'cat'][2]
        self.dec_att = ['', 'ASPP', 'ASPPDeformable'][2]
        self.squeeze_block = ['', 'BasicDecBlk_x1', 'ResBlk_x4', 'ASPP_x3', 'ASPPDeformable_x3'][1]
        self.dec_blk = ['BasicDecBlk', 'ResBlk', 'HierarAttDecBlk'][0]

        # TRAINING settings
        self.batch_size = 4
        # Per-task value (negative epoch counts below); index [1] selects the
        # per-task entry, index [0] would select 0.
        self.IoU_finetune_last_epochs = [
            0,
            {
                'DIS5K': -50,
                'COD': -20,
                'HRSOD': -20,
                'DIS5K+HRSOD+HRS10K': -20,
                'P3M-10k': -20,
            }[self.task]
        ][1]    # choose 0 to skip
        self.lr = (1e-4 if 'DIS5K' in self.task else 1e-5) * math.sqrt(self.batch_size / 4)     # DIS needs high lr to converge faster. The base lr is scaled by sqrt(batch_size / 4), i.e. not linearly.
        self.size = 1024
        self.num_workers = max(4, self.batch_size)          # will be decrease to min(it, batch_size) at the initialization of the data_loader

        # Backbone settings
        self.bb = [
            'vgg16', 'vgg16bn', 'resnet50',         # 0, 1, 2
            'swin_v1_t', 'swin_v1_s',               # 3, 4
            'swin_v1_b', 'swin_v1_l',               # 5-bs9, 6-bs4
            'pvt_v2_b0', 'pvt_v2_b1',               # 7, 8
            'pvt_v2_b2', 'pvt_v2_b5',               # 9-bs10, 10-bs5
        ][6]
        # Channel counts looked up per backbone name.
        self.lateral_channels_in_collection = {
            'vgg16': [512, 256, 128, 64], 'vgg16bn': [512, 256, 128, 64], 'resnet50': [1024, 512, 256, 64],
            'pvt_v2_b2': [512, 320, 128, 64], 'pvt_v2_b5': [512, 320, 128, 64],
            'swin_v1_b': [1024, 512, 256, 128], 'swin_v1_l': [1536, 768, 384, 192],
            'swin_v1_t': [768, 384, 192, 96], 'swin_v1_s': [768, 384, 192, 96],
            'pvt_v2_b0': [256, 160, 64, 32], 'pvt_v2_b1': [512, 320, 128, 64],
        }[self.bb]
        # With 'cat' multi-scale input every channel count is doubled.
        if self.mul_scl_ipt == 'cat':
            self.lateral_channels_in_collection = [channel * 2 for channel in self.lateral_channels_in_collection]
        # Drop the first entry, reverse the rest, keep the last `cxt_num` of them.
        self.cxt = self.lateral_channels_in_collection[1:][::-1][-self.cxt_num:] if self.cxt_num else []

        # MODEL settings - inactive
        self.lat_blk = ['BasicLatBlk'][0]
        self.dec_channels_inter = ['fixed', 'adap'][0]
        self.refine = ['', 'itself', 'RefUNet', 'Refiner', 'RefinerPVTInChannels4'][0]
        # `and` chains: these stay '' (falsy) while `refine` is the empty string.
        self.progressive_ref = self.refine and True
        self.ender = self.progressive_ref and False
        self.scale = self.progressive_ref and 2
        self.auxiliary_classification = False       # Only for DIS5K, where class labels are saved in `dataset.py`.
        self.refine_iteration = 1
        self.freeze_bb = False
        self.model = [
            'BiRefNet',
        ][0]
        # NOTE: this override happens after `lr` and `num_workers` were derived
        # from the earlier batch_size; they are not recomputed here.
        if self.dec_blk == 'HierarAttDecBlk':
            self.batch_size = 2 ** [0, 1, 2, 3, 4][2]

        # TRAINING settings - inactive
        self.preproc_methods = ['flip', 'enhance', 'rotate', 'pepper', 'crop'][:4]
        self.optimizer = ['Adam', 'AdamW'][1]
        self.lr_decay_epochs = [1e5]    # Set to negative N to decay the lr in the last N-th epoch.
        self.lr_decay_rate = 0.5
        # Loss
        self.lambdas_pix_last = {
            # not 0 means opening this loss
            # original rate -- 1 : 30 : 1.5 : 0.2, bce x 30
            'bce': 30 * 1,          # high performance
            'iou': 0.5 * 1,         # 0 / 255
            'iou_patch': 0.5 * 0,   # 0 / 255, win_size = (64, 64)
            'mse': 150 * 0,         # can smooth the saliency map
            'triplet': 3 * 0,
            'reg': 100 * 0,
            'ssim': 10 * 1,          # help contours,
            'cnt': 5 * 0,          # help contours
            'structure': 5 * 0,    # structure loss from codes of MVANet. A little improvement on DIS-TE[1,2,3], a bit more decrease on DIS-TE4.
        }
        self.lambdas_cls = {
            'ce': 5.0
        }
        # Adv
        self.lambda_adv_g = 10. * 0        # turn to 0 to avoid adv training
        self.lambda_adv_d = 3. * (self.lambda_adv_g > 0)

        # PATH settings - inactive
        self.data_root_dir = os.path.join(self.sys_home_dir, 'datasets/dis')
        self.weights_root_dir = os.path.join(self.sys_home_dir, 'weights')
        # Backbone name -> weight file path under `weights_root_dir`.
        self.weights = {
            'pvt_v2_b2': os.path.join(self.weights_root_dir, 'pvt_v2_b2.pth'),
            'pvt_v2_b5': os.path.join(self.weights_root_dir, ['pvt_v2_b5.pth', 'pvt_v2_b5_22k.pth'][0]),
            'swin_v1_b': os.path.join(self.weights_root_dir, ['swin_base_patch4_window12_384_22kto1k.pth', 'swin_base_patch4_window12_384_22k.pth'][0]),
            'swin_v1_l': os.path.join(self.weights_root_dir, ['swin_large_patch4_window12_384_22kto1k.pth', 'swin_large_patch4_window12_384_22k.pth'][0]),
            'swin_v1_t': os.path.join(self.weights_root_dir, ['swin_tiny_patch4_window7_224_22kto1k_finetune.pth'][0]),
            'swin_v1_s': os.path.join(self.weights_root_dir, ['swin_small_patch4_window7_224_22kto1k_finetune.pth'][0]),
            'pvt_v2_b0': os.path.join(self.weights_root_dir, ['pvt_v2_b0.pth'][0]),
            'pvt_v2_b1': os.path.join(self.weights_root_dir, ['pvt_v2_b1.pth'][0]),
        }

        # Callbacks - inactive
        self.verbose_eval = True
        self.only_S_MAE = False
        self.use_fp16 = False   # Bugs. It may cause nan in training.
        self.SDPA_enabled = False    # Bugs. Slower and errors occur in multi-GPUs

        # others
        self.device = [0, 'cpu'][0]     # .to(0) == .to('cuda:0')

        self.batch_size_valid = 1
        self.rand_seed = 7
        # Disabled: used to read `save_last` / `save_step` from a train.sh file.
        # run_sh_file = [f for f in os.listdir('.') if 'train.sh' == f] + [os.path.join('..', f) for f in os.listdir('..') if 'train.sh' == f]
        # with open(run_sh_file[0], 'r') as f:
        #     lines = f.readlines()
        #     self.save_last = int([l.strip() for l in lines if '"{}")'.format(self.task) in l and 'val_last=' in l][0].split('val_last=')[-1].split()[0])
        #     self.save_step = int([l.strip() for l in lines if '"{}")'.format(self.task) in l and 'step=' in l][0].split('step=')[-1].split()[0])
        # self.val_step = [0, self.save_step][0]

    def print_task(self) -> None:
        """Print the selected task name to stdout."""
        # Return task for choosing settings in shell scripts.
        print(self.task)
154
+
155
+
156
+
157
+ ### models/backbones/pvt_v2.py
158
+
159
+ import torch
160
+ import torch.nn as nn
161
+ from functools import partial
162
+
163
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
164
+ from timm.models.registry import register_model
165
+
166
+ import math
167
+
168
+ # from config import Config
169
+
170
+ # config = Config()
171
+
172
class Mlp(nn.Module):
    """Feed-forward block: Linear -> depth-wise conv -> activation -> Linear.

    Dropout (the same ``nn.Dropout`` instance) is applied after the activation
    and after the second linear layer.

    Args:
        in_features: Size of the last input dimension.
        hidden_features: Width of the hidden layer; falls back to
            ``in_features`` when falsy.
        out_features: Size of the last output dimension; falls back to
            ``in_features`` when falsy.
        act_layer: Zero-argument callable producing the activation module.
        drop: Dropout probability.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.dwconv = DWConv(hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear, LayerNorm and Conv2d sub-modules; ignore others."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if isinstance(m, nn.Conv2d):
            kernel_h, kernel_w = m.kernel_size[0], m.kernel_size[1]
            fan_out = (kernel_h * kernel_w * m.out_channels) // m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        """Apply the block to tokens ``x``; ``H`` and ``W`` are passed to the conv."""
        hidden = self.dwconv(self.fc1(x), H, W)
        hidden = self.drop(self.act(hidden))
        return self.drop(self.fc2(hidden))
208
+
209
+
210
class Attention(nn.Module):
    """Multi-head self-attention with optional spatial reduction of keys/values.

    When ``sr_ratio > 1`` the token sequence is reshaped to ``(B, C, H, W)``,
    down-sampled by a strided convolution and layer-normalised before the
    key/value projection; queries always use the full sequence.

    Args:
        dim: Channel width of the tokens; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        qkv_bias: Whether the q and kv projections have a bias.
        qk_scale: Attention scale; defaults to ``head_dim ** -0.5`` when falsy.
        attn_drop: Dropout probability on the attention weights.
        proj_drop: Dropout probability after the output projection.
        sr_ratio: Kernel size and stride of the key/value reduction conv.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."

        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.attn_drop_prob = attn_drop
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.sr_ratio = sr_ratio
        if sr_ratio > 1:
            self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
            self.norm = nn.LayerNorm(dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear, LayerNorm and Conv2d sub-modules; ignore others."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        """Attend over tokens ``x`` of shape ``(B, N, C)`` and return ``(B, N, C)``.

        ``H`` and ``W`` are only used when ``sr_ratio > 1``, to reshape the
        tokens into a feature map for the reduction conv.
        """
        B, N, C = x.shape
        head_dim = C // self.num_heads
        q = self.q(x).reshape(B, N, self.num_heads, head_dim).permute(0, 2, 1, 3)

        if self.sr_ratio > 1:
            x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
            x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
            kv_input = self.norm(x_)
        else:
            kv_input = x
        kv = self.kv(kv_input).reshape(B, -1, 2, self.num_heads, head_dim).permute(2, 0, 3, 1, 4)
        k, v = kv[0], kv[1]

        # `config` is read as a module-level global at call time.
        # NOTE(review): its definition is not in this part of the file --
        # confirm it is assigned before the first forward pass.
        if config.SDPA_enabled:
            # The functional SDPA call applies dropout whenever dropout_p > 0,
            # independent of train/eval mode, so it must be disabled explicitly
            # in eval to match the nn.Dropout used by the manual branch.
            # NOTE(review): this branch uses SDPA's default scaling, so a custom
            # `qk_scale` is only honoured by the manual branch.
            x = torch.nn.functional.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,
                dropout_p=self.attn_drop_prob if self.training else 0.0,
                is_causal=False,
            ).transpose(1, 2).reshape(B, N, C)
        else:
            attn = (q @ k.transpose(-2, -1)) * self.scale
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)

            x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x
277
+
278
+
279
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual.

    Args:
        dim: Channel width of the tokens.
        num_heads: Number of attention heads.
        mlp_ratio: Hidden width of the MLP as a multiple of ``dim``.
        qkv_bias, qk_scale, attn_drop, sr_ratio: Passed to ``Attention``.
        drop: Dropout probability used by both ``Attention`` (projection) and
            ``Mlp``.
        drop_path: Stochastic-depth rate; ``nn.Identity`` is used when it is
            not greater than zero.
        act_layer: Activation factory passed to ``Mlp``.
        norm_layer: Factory called with ``dim`` to build both norms.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            sr_ratio=sr_ratio,
        )
        # Stochastic depth on the residual branches.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        hidden_width = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=hidden_width, act_layer=act_layer, drop=drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear, LayerNorm and Conv2d sub-modules; ignore others."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if isinstance(m, nn.Conv2d):
            kernel_h, kernel_w = m.kernel_size[0], m.kernel_size[1]
            fan_out = (kernel_h * kernel_w * m.out_channels) // m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        """Run attention then MLP on ``x``, adding each result to its input."""
        attended = self.attn(self.norm1(x), H, W)
        x = x + self.drop_path(attended)
        transformed = self.mlp(self.norm2(x), H, W)
        x = x + self.drop_path(transformed)
        return x
317
+
318
+
319
class OverlapPatchEmbed(nn.Module):
    """Image to patch embedding via a strided convolution plus LayerNorm.

    The convolution uses ``kernel_size=patch_size``, the given ``stride`` and
    ``padding=patch_size // 2``. ``img_size`` is only used to pre-compute the
    ``H``, ``W`` and ``num_patches`` attributes; ``forward`` takes the actual
    grid size from the convolution output.
    """

    def __init__(self, img_size=224, patch_size=7, stride=4, in_channels=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        patch_h, patch_w = patch_size[0], patch_size[1]

        self.img_size = img_size
        self.patch_size = patch_size
        self.H = img_size[0] // patch_h
        self.W = img_size[1] // patch_w
        self.num_patches = self.H * self.W
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=stride,
            padding=(patch_h // 2, patch_w // 2),
        )
        self.norm = nn.LayerNorm(embed_dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear, LayerNorm and Conv2d sub-modules; ignore others."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if isinstance(m, nn.Conv2d):
            kernel_h, kernel_w = m.kernel_size[0], m.kernel_size[1]
            fan_out = (kernel_h * kernel_w * m.out_channels) // m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        """Return ``(tokens, H, W)`` where ``H``/``W`` come from the conv output."""
        feature_map = self.proj(x)
        H, W = feature_map.shape[2], feature_map.shape[3]
        tokens = self.norm(feature_map.flatten(2).transpose(1, 2))
        return tokens, H, W
360
+
361
+
362
class PyramidVisionTransformerImpr(nn.Module):
    """Four-stage pyramid transformer that returns one feature map per stage.

    Each stage is ``OverlapPatchEmbed`` -> ``depths[i]`` x ``Block`` -> norm,
    after which the tokens are reshaped back to ``(B, C, H, W)``.

    Args:
        img_size: Nominal input size, used only for the patch-embed bookkeeping.
        patch_size: Accepted for signature compatibility; not read here.
        in_channels: Number of channels of the input image.
        num_classes: Stored on the instance; no head is built by ``__init__``.
        embed_dims, num_heads, mlp_ratios, depths, sr_ratios: Per-stage
            settings, each a sequence of four values. The list defaults are
            only read, never mutated.
        qkv_bias, qk_scale, drop_rate, attn_drop_rate: Passed to every ``Block``.
        drop_path_rate: Maximum stochastic-depth rate; rates increase linearly
            from 0 over all blocks.
        norm_layer: Factory called with a channel width to build norm layers.
    """

    def __init__(self, img_size=224, patch_size=16, in_channels=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
                 num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
                 attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm,
                 depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1]):
        super().__init__()
        self.num_classes = num_classes
        self.depths = depths
        # Kept (as a copy) so that reset_classifier can size a head later.
        self.embed_dims = list(embed_dims)

        # patch_embed
        self.patch_embed1 = OverlapPatchEmbed(img_size=img_size, patch_size=7, stride=4, in_channels=in_channels,
                                              embed_dim=embed_dims[0])
        self.patch_embed2 = OverlapPatchEmbed(img_size=img_size // 4, patch_size=3, stride=2, in_channels=embed_dims[0],
                                              embed_dim=embed_dims[1])
        self.patch_embed3 = OverlapPatchEmbed(img_size=img_size // 8, patch_size=3, stride=2, in_channels=embed_dims[1],
                                              embed_dim=embed_dims[2])
        self.patch_embed4 = OverlapPatchEmbed(img_size=img_size // 16, patch_size=3, stride=2, in_channels=embed_dims[2],
                                              embed_dim=embed_dims[3])

        # transformer encoder
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule

        def build_stage(stage, offset):
            # One ModuleList of blocks; `offset` is the index of this stage's
            # first block in the global drop-path schedule.
            return nn.ModuleList([Block(
                dim=embed_dims[stage], num_heads=num_heads[stage], mlp_ratio=mlp_ratios[stage],
                qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[offset + i], norm_layer=norm_layer,
                sr_ratio=sr_ratios[stage])
                for i in range(depths[stage])])

        # Attribute names and creation order are kept so that checkpoints and
        # the RNG stream during construction are unaffected.
        cur = 0
        self.block1 = build_stage(0, cur)
        self.norm1 = norm_layer(embed_dims[0])

        cur += depths[0]
        self.block2 = build_stage(1, cur)
        self.norm2 = norm_layer(embed_dims[1])

        cur += depths[1]
        self.block3 = build_stage(2, cur)
        self.norm3 = norm_layer(embed_dims[2])

        cur += depths[2]
        self.block4 = build_stage(3, cur)
        self.norm4 = norm_layer(embed_dims[3])

        # classification head
        # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear, LayerNorm and Conv2d sub-modules; ignore others."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def init_weights(self, pretrained=None):
        """Placeholder kept for API compatibility; loads nothing.

        The checkpoint-loading call is disabled, so this is a no-op for every
        value of ``pretrained``.
        """
        # load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger)
        return None

    def reset_drop_path(self, drop_path_rate):
        """Re-assign ``drop_prob`` on every block from a new linear schedule.

        Blocks built with a zero rate hold ``nn.Identity`` as ``drop_path``;
        the attribute is still set on them, as before, but has no effect there.
        """
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))]
        cur = 0
        for stage in (self.block1, self.block2, self.block3, self.block4):
            for i, blk in enumerate(stage):
                blk.drop_path.drop_prob = dpr[cur + i]
            cur += len(stage)

    def freeze_patch_emb(self):
        """Stop gradient updates for the parameters of the first patch embedding."""
        # Assigning `module.requires_grad = False` only creates a plain
        # attribute; requires_grad_ is what updates the parameters themselves.
        self.patch_embed1.requires_grad_(False)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'}  # has pos_embed may be better

    def get_classifier(self):
        """Return the classification head, or ``None`` if none has been set.

        ``__init__`` does not create a head; one only exists after
        ``reset_classifier`` has been called.
        """
        return getattr(self, 'head', None)

    def reset_classifier(self, num_classes, global_pool=''):
        """Install a fresh head sized from the last stage's embedding width.

        ``global_pool`` is accepted for signature compatibility and not used.
        """
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x):
        """Return a list with the four stage outputs, each ``(B, C_i, H_i, W_i)``."""
        B = x.shape[0]
        outs = []
        stages = (
            (self.patch_embed1, self.block1, self.norm1),
            (self.patch_embed2, self.block2, self.norm2),
            (self.patch_embed3, self.block3, self.norm3),
            (self.patch_embed4, self.block4, self.norm4),
        )
        for patch_embed, blocks, norm in stages:
            x, H, W = patch_embed(x)
            for blk in blocks:
                x = blk(x, H, W)
            x = norm(x)
            # Tokens back to a feature map; also the input of the next stage.
            x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
            outs.append(x)

        return outs

    def forward(self, x):
        """Return the list of per-stage feature maps (no head is applied)."""
        return self.forward_features(x)
517
+
518
+
519
class DWConv(nn.Module):
    """Depth-wise 3x3 convolution applied to a token sequence.

    Tokens of shape ``(B, N, C)`` are viewed as a ``(B, C, H, W)`` map,
    convolved with one 3x3 filter per channel (stride 1, padding 1, with bias)
    and flattened back to ``(B, N, C)``.
    """

    def __init__(self, dim=768):
        super().__init__()
        self.dwconv = nn.Conv2d(dim, dim, 3, stride=1, padding=1, bias=True, groups=dim)

    def forward(self, x, H, W):
        """Convolve tokens ``x``; ``H * W`` must equal the sequence length."""
        batch, _, channels = x.shape
        feature_map = x.transpose(1, 2).view(batch, channels, H, W).contiguous()
        feature_map = self.dwconv(feature_map)
        return feature_map.flatten(2).transpose(1, 2)
531
+
532
+
533
def _conv_filter(state_dict, patch_size=16, in_channels=3):
    """Convert patch-embedding weights from manual patchify + linear proj to conv.

    Args:
        state_dict: Mapping from parameter name to tensor/array.
        patch_size: Side length of the square patch.
        in_channels: Number of input channels of the target convolution.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        A new dict with the same keys in the same order. Entries whose key
        contains ``'patch_embed.proj.weight'`` are reshaped to
        ``(out, in_channels, patch_size, patch_size)``; all other values are
        passed through unchanged.
    """
    conv_shape = (in_channels, patch_size, patch_size)
    return {
        k: v.reshape((v.shape[0],) + conv_shape) if 'patch_embed.proj.weight' in k else v
        for k, v in state_dict.items()
    }
542
+
543
+
544
+ ## @register_model
545
class pvt_v2_b0(PyramidVisionTransformerImpr):
    """PVT v2-B0 preset: embed dims [32, 64, 160, 256], depths [2, 2, 2, 2].

    Args:
        in_channels: Number of input image channels. Previously an
            ``in_channels`` keyword was swallowed by ``**kwargs`` and ignored;
            it is now forwarded, as in ``pvt_v2_b2``.
        **kwargs: Accepted for call-site compatibility and ignored.
    """

    def __init__(self, in_channels=3, **kwargs):
        super(pvt_v2_b0, self).__init__(
            patch_size=4, embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1, in_channels=in_channels)
551
+
552
+
553
+
554
+ ## @register_model
555
class pvt_v2_b1(PyramidVisionTransformerImpr):
    """PVT v2-B1 preset: embed dims [64, 128, 320, 512], depths [2, 2, 2, 2].

    Args:
        in_channels: Number of input image channels. Previously an
            ``in_channels`` keyword was swallowed by ``**kwargs`` and ignored;
            it is now forwarded, as in ``pvt_v2_b2``.
        **kwargs: Accepted for call-site compatibility and ignored.
    """

    def __init__(self, in_channels=3, **kwargs):
        super(pvt_v2_b1, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1, in_channels=in_channels)
561
+
562
+ ## @register_model
563
class pvt_v2_b2(PyramidVisionTransformerImpr):
    """PVT v2-B2 preset: embed dims [64, 128, 320, 512], depths [3, 4, 6, 3].

    Args:
        in_channels: Number of input image channels, forwarded to the base class.
        **kwargs: Accepted for call-site compatibility and ignored.
    """

    def __init__(self, in_channels=3, **kwargs):
        layer_norm = partial(nn.LayerNorm, eps=1e-6)
        super().__init__(
            patch_size=4,
            in_channels=in_channels,
            embed_dims=[64, 128, 320, 512],
            num_heads=[1, 2, 5, 8],
            mlp_ratios=[8, 8, 4, 4],
            depths=[3, 4, 6, 3],
            sr_ratios=[8, 4, 2, 1],
            qkv_bias=True,
            norm_layer=layer_norm,
            drop_rate=0.0,
            drop_path_rate=0.1,
        )
569
+
570
+ ## @register_model
571
class pvt_v2_b3(PyramidVisionTransformerImpr):
    """PVT v2-B3 preset: embed dims [64, 128, 320, 512], depths [3, 4, 18, 3].

    Args:
        in_channels: Number of input image channels. Previously an
            ``in_channels`` keyword was swallowed by ``**kwargs`` and ignored;
            it is now forwarded, as in ``pvt_v2_b2``.
        **kwargs: Accepted for call-site compatibility and ignored.
    """

    def __init__(self, in_channels=3, **kwargs):
        super(pvt_v2_b3, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1, in_channels=in_channels)
577
+
578
+ ## @register_model
579
class pvt_v2_b4(PyramidVisionTransformerImpr):
    """PVT v2-B4 preset: embed dims [64, 128, 320, 512], depths [3, 8, 27, 3].

    Args:
        in_channels: Number of input image channels. Previously an
            ``in_channels`` keyword was swallowed by ``**kwargs`` and ignored;
            it is now forwarded, as in ``pvt_v2_b2``.
        **kwargs: Accepted for call-site compatibility and ignored.
    """

    def __init__(self, in_channels=3, **kwargs):
        super(pvt_v2_b4, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1, in_channels=in_channels)
585
+
586
+
587
+ ## @register_model
588
class pvt_v2_b5(PyramidVisionTransformerImpr):
    """PVT v2-B5 preset: embed dims 64/128/320/512, depths 3/6/40/3, MLP ratio 4 in every stage.

    Args:
        in_channels (int): Number of input image channels, forwarded to the
            base class the same way ``pvt_v2_b2`` does. Default: 3
            (assumed to match the base-class default -- TODO confirm).
        **kwargs: Accepted for call-site compatibility; not forwarded.
    """

    def __init__(self, in_channels=3, **kwargs):
        super().__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1, in_channels=in_channels)
594
+
595
+
596
+
597
+ ### models/backbones/swin_v1.py
598
+
599
+ # --------------------------------------------------------
600
+ # Swin Transformer
601
+ # Copyright (c) 2021 Microsoft
602
+ # Licensed under The MIT License [see LICENSE for details]
603
+ # Written by Ze Liu, Yutong Lin, Yixuan Wei
604
+ # --------------------------------------------------------
605
+
606
+ import torch
607
+ import torch.nn as nn
608
+ import torch.nn.functional as F
609
+ import torch.utils.checkpoint as checkpoint
610
+ import numpy as np
611
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
612
+
613
+ # from config import Config
614
+
615
+
616
+ # config = Config()
617
+
618
class Mlp(nn.Module):
    """Two-layer feed-forward network: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features (int): Size of the last input dimension.
        hidden_features (int | None): Width of the hidden layer; falls back to
            ``in_features`` when falsy.
        out_features (int | None): Size of the last output dimension; falls back
            to ``in_features`` when falsy.
        act_layer: Zero-argument factory for the activation module. Default: nn.GELU
        drop (float): Dropout probability used after both linear layers. Default: 0.0
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        hidden_width = hidden_features if hidden_features else in_features
        output_width = out_features if out_features else in_features
        self.fc1 = nn.Linear(in_features, hidden_width)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_width, output_width)
        # A single dropout module is shared by both positions in forward().
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
637
+
638
+
639
def window_partition(x, window_size):
    """Split a feature map into non-overlapping square windows.

    Args:
        x: Tensor of shape (B, H, W, C); H and W must be divisible by
            ``window_size`` for the reshape to succeed.
        window_size (int): Side length of each window.

    Returns:
        Tensor of shape (num_windows*B, window_size, window_size, C).
    """
    batch, height, width, channels = x.shape
    grid = x.view(batch,
                  height // window_size, window_size,
                  width // window_size, window_size,
                  channels)
    # Bring the two window-grid axes next to each other before flattening them into the batch axis.
    return grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, channels)
652
+
653
+
654
def window_reverse(windows, window_size, H, W):
    """Reassemble windows produced by window partitioning into a feature map.

    Args:
        windows: Tensor of shape (num_windows*B, window_size, window_size, C).
        window_size (int): Window size.
        H (int): Height of the (padded) feature map; a multiple of ``window_size``.
        W (int): Width of the (padded) feature map; a multiple of ``window_size``.

    Returns:
        x: Tensor of shape (B, H, W, C).
    """
    windows_per_col = H // window_size
    windows_per_row = W // window_size
    # Exact integer arithmetic: the batch size is the window count divided by the
    # number of windows per image (avoids a round trip through float division).
    B = windows.shape[0] // (windows_per_col * windows_per_row)
    x = windows.view(B, windows_per_col, windows_per_row, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x
669
+
670
+
671
class WindowAttention(nn.Module):
    """ Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing='ij'))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1  # row-major flattening of the (dh, dw) pair
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop_prob = attn_drop
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """ Forward function.

        Both code paths compute softmax(scale * q @ k^T + relative_position_bias [+ mask]) @ v;
        the module-level ``config.SDPA_enabled`` flag only selects the kernel.

        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        head_dim = C // self.num_heads
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww

        if config.SDPA_enabled:
            # Additive attention bias: the relative position bias plus, for shifted
            # windows, the per-window mask. Window index varies fastest along the
            # batch axis, so tiling the (nW, nH, N, N) bias B_//nW times lines it up.
            attn_bias = relative_position_bias.unsqueeze(0)  # 1, nH, N, N
            if mask is not None:
                nW = mask.shape[0]
                attn_bias = (attn_bias + mask.unsqueeze(1)).repeat(B_ // nW, 1, 1, 1)  # B_, nH, N, N
            # The fused kernel already multiplies the logits by head_dim ** -0.5, so
            # pre-scale q by the ratio that makes the effective scale equal self.scale.
            q = q * (self.scale * head_dim ** 0.5)
            x = torch.nn.functional.scaled_dot_product_attention(
                q, k, v,
                attn_mask=attn_bias.to(q.dtype),
                # The functional API applies dropout unconditionally; disable it in eval mode.
                dropout_p=self.attn_drop_prob if self.training else 0.0,
                is_causal=False
            ).transpose(1, 2).reshape(B_, N, C)
        else:
            q = q * self.scale
            attn = (q @ k.transpose(-2, -1))
            attn = attn + relative_position_bias.unsqueeze(0)

            if mask is not None:
                nW = mask.shape[0]
                attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
                attn = attn.view(-1, self.num_heads, N, N)
                attn = self.softmax(attn)
            else:
                attn = self.softmax(attn)

            attn = self.attn_drop(attn)

            x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
760
+
761
+
762
class SwinTransformerBlock(nn.Module):
    """One Swin Transformer block: (shifted) window attention followed by an MLP, each with a residual.

    The spatial size of the token grid is not passed to ``forward``; the caller
    must set ``self.H`` and ``self.W`` before every call.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA (0 disables the cyclic shift).
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        # Attention sub-layer.
        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        # Feed-forward sub-layer.
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)

        # Token-grid size, assigned by the owning layer before each forward pass.
        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """Apply window attention and the MLP to a flattened token grid.

        Args:
            x: Input feature, tensor size (B, H*W, C) where H, W are ``self.H``, ``self.W``.
            mask_matrix: Attention mask for cyclic shift; only used when ``shift_size > 0``.

        Returns:
            Tensor of size (B, H*W, C).
        """
        B, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W, "input feature has wrong size"

        ws = self.window_size
        shift = self.shift_size
        use_shift = shift > 0

        residual = x
        feat = self.norm1(x).view(B, H, W, C)

        # Zero-pad on the right/bottom so both spatial dims are multiples of the window size.
        pad_r = (ws - W % ws) % ws
        pad_b = (ws - H % ws) % ws
        feat = F.pad(feat, (0, 0, 0, pad_r, 0, pad_b))
        _, Hp, Wp, _ = feat.shape

        # Cyclic shift (SW-MSA only); the mask keeps wrapped-around regions apart.
        if use_shift:
            feat = torch.roll(feat, shifts=(-shift, -shift), dims=(1, 2))
        attn_mask = mask_matrix if use_shift else None

        # Window partition -> attention -> merge windows back.
        tokens = window_partition(feat, ws).view(-1, ws * ws, C)  # nW*B, ws*ws, C
        tokens = self.attn(tokens, mask=attn_mask)
        feat = window_reverse(tokens.view(-1, ws, ws, C), ws, Hp, Wp)  # B, Hp, Wp, C

        # Undo the cyclic shift and strip the padding.
        if use_shift:
            feat = torch.roll(feat, shifts=(shift, shift), dims=(1, 2))
        if pad_r > 0 or pad_b > 0:
            feat = feat[:, :H, :W, :].contiguous()
        feat = feat.view(B, H * W, C)

        # Residual connections around attention and the feed-forward network.
        x = residual + self.drop_path(feat)
        return x + self.drop_path(self.mlp(self.norm2(x)))
862
+
863
+
864
class PatchMerging(nn.Module):
    """Downsample a token grid by 2x: concatenate each 2x2 neighbourhood, normalise, project 4*dim -> 2*dim.

    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x, H, W):
        """Merge 2x2 patches.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.

        Returns:
            Tensor of size (B, ceil(H/2)*ceil(W/2), 2*C).
        """
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        grid = x.view(B, H, W, C)

        # Odd sizes get one zero row/column appended so every 2x2 cell is complete.
        if H % 2 == 1 or W % 2 == 1:
            grid = F.pad(grid, (0, 0, 0, W % 2, 0, H % 2))

        # Channel order: (even row, even col), (odd, even), (even, odd), (odd, odd).
        quadrants = [grid[:, row::2, col::2, :] for col in (0, 1) for row in (0, 1)]
        merged = torch.cat(quadrants, -1).view(B, -1, 4 * C)  # B, H/2*W/2, 4*C

        return self.reduction(self.norm(merged))
905
+
906
+
907
class BasicLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage.

    Blocks alternate between non-shifted (even index) and shifted (odd index)
    window attention; an optional downsample module runs after the last block.

    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        num_heads (int): Number of attention head.
        window_size (int): Local window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate; a sequence
            supplies one rate per block. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 use_checkpoint=False):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # Tuples are documented as valid per-block rates, so index them the same way as lists.
        per_block_drop_path = isinstance(drop_path, (list, tuple))

        # build blocks
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop,
                attn_drop=attn_drop,
                drop_path=drop_path[i] if per_block_drop_path else drop_path,
                norm_layer=norm_layer)
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """ Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.

        Returns:
            Tuple ``(x, H, W, x_down, Wh, Ww)``: the stage output with its resolution,
            followed by the downsampled output with its resolution. Without a
            downsample module the second triple repeats the first.
        """

        # calculate attention mask for SW-MSA
        Hp = int(np.ceil(H / self.window_size)) * self.window_size
        Wp = int(np.ceil(W / self.window_size)) * self.window_size
        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
        h_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        w_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        # Label each of the 3x3 regions with a distinct id.
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
        # Token pairs from different regions get -100 (suppressed by softmax), same-region pairs get 0.
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))

        for blk in self.blocks:
            # Blocks read the grid size from attributes rather than from arguments.
            blk.H, blk.W = H, W
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, attn_mask)
            else:
                x = blk(x, attn_mask)
        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        else:
            return x, H, W, x, H, W
1009
+
1010
+
1011
class PatchEmbed(nn.Module):
    """Project an image into a grid of patch embeddings with a strided convolution.

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_channels (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, patch_size=4, in_channels=3, embed_dim=96, norm_layer=None):
        super().__init__()
        self.patch_size = to_2tuple(patch_size)
        self.in_channels = in_channels
        self.embed_dim = embed_dim

        # Kernel and stride both equal the patch size, so patches do not overlap.
        self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=self.patch_size, stride=self.patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer is not None else None

    def forward(self, x):
        """Embed ``x`` of shape (B, C, H, W); returns (B, embed_dim, Wh, Ww)."""
        _, _, H, W = x.size()
        patch_h, patch_w = self.patch_size[0], self.patch_size[1]

        # Pad on the right, then on the bottom, up to the next multiple of the patch size.
        if W % patch_w != 0:
            x = F.pad(x, (0, patch_w - W % patch_w))
        if H % patch_h != 0:
            x = F.pad(x, (0, 0, 0, patch_h - H % patch_h))

        x = self.proj(x)  # B C Wh Ww
        if self.norm is None:
            return x

        # The norm layer acts on the channel axis, so go through (B, Wh*Ww, C) and back.
        Wh, Ww = x.size(2), x.size(3)
        tokens = self.norm(x.flatten(2).transpose(1, 2))
        return tokens.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
1052
+
1053
+
1054
class SwinTransformer(nn.Module):
    """ Swin Transformer backbone.
        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030

    Args:
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute position embedding. Default 224.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_channels (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (tuple[int]): Depths of each Swin Transformer stage.
        num_heads (tuple[int]): Number of attention head of each stage.
        window_size (int): Window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters.
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 pretrain_img_size=224,
                 patch_size=4,
                 in_channels=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.2,
                 norm_layer=nn.LayerNorm,
                 ape=False,
                 patch_norm=True,
                 out_indices=(0, 1, 2, 3),
                 frozen_stages=-1,
                 use_checkpoint=False):
        super().__init__()

        self.pretrain_img_size = pretrain_img_size
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]

            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
            trunc_normal_(self.absolute_pos_embed, std=.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule

        # build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2 ** i_layer),  # channel width doubles at every stage
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,  # no downsample after the last stage
                use_checkpoint=use_checkpoint)
            self.layers.append(layer)

        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output
        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f'norm{i_layer}'
            self.add_module(layer_name, layer)

        self._freeze_stages()

    def _freeze_stages(self):
        """Put the first ``frozen_stages`` parts of the network in eval mode and stop their gradients."""
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False

        if self.frozen_stages >= 1 and self.ape:
            self.absolute_pos_embed.requires_grad = False

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def forward(self, x):
        """Forward function.

        Returns:
            Tuple with one normalised feature map of shape (B, num_features[i], H_i, W_i)
            for every stage index ``i`` listed in ``out_indices``.
        """
        x = self.patch_embed(x)

        Wh, Ww = x.size(2), x.size(3)
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
            x = (x + absolute_pos_embed)  # B C Wh Ww

        outs = []
        x = x.flatten(2).transpose(1, 2)  # B Wh*Ww C
        x = self.pos_drop(x)
        for i in range(self.num_layers):
            layer = self.layers[i]
            # x_out/H/W: this stage's output; x/Wh/Ww: input to the next stage.
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)

            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                x_out = norm_layer(x_out)

                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
                outs.append(out)

        return tuple(outs)

    def train(self, mode=True):
        """Convert the model into training mode while keeping frozen layers frozen.

        Returns:
            self, matching ``nn.Module.train`` so that ``model.train()`` and
            ``model.eval()`` can be chained or assigned.
        """
        super(SwinTransformer, self).train(mode)
        self._freeze_stages()
        return self
1210
+
1211
def swin_v1_t():
    """Build the Swin-T preset: embed_dim 96, depths 2/2/6/2, heads 3/6/12/24, window 7."""
    return SwinTransformer(embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7)
1214
+
1215
def swin_v1_s():
    """Build the Swin-S preset: embed_dim 96, depths 2/2/18/2, heads 3/6/12/24, window 7."""
    return SwinTransformer(embed_dim=96, depths=[2, 2, 18, 2], num_heads=[3, 6, 12, 24], window_size=7)
1218
+
1219
def swin_v1_b():
    """Build the Swin-B preset: embed_dim 128, depths 2/2/18/2, heads 4/8/16/32, window 12."""
    return SwinTransformer(embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=12)
1222
+
1223
def swin_v1_l():
    """Build the Swin-L preset: embed_dim 192, depths 2/2/18/2, heads 6/12/24/48, window 12."""
    return SwinTransformer(embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12)
1226
+
1227
+
1228
+
1229
+ ### models/modules/deform_conv.py
1230
+
1231
+ import torch
1232
+ import torch.nn as nn
1233
+ from torchvision.ops import deform_conv2d
1234
+
1235
+
1236
class DeformableConv2d(nn.Module):
    """Modulated deformable convolution with offsets and modulation masks predicted from the input.

    Two auxiliary convolutions predict, per output location, the sampling offsets
    (2 values per kernel tap) and a modulation scalar (1 value per kernel tap);
    ``regular_conv`` only stores the weight/bias used by ``deform_conv2d``.
    The auxiliary convolutions are zero-initialised, so the layer starts with
    zero offsets and a modulation of exactly 1.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int | tuple): Kernel size; an int is expanded to a square kernel. Default: 3
        stride (int | tuple): Convolution stride. Default: 1
        padding (int | tuple): Padding passed to all convolutions. Default: 1
        bias (bool): Whether the main convolution has a bias. Default: False
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=1,
                 bias=False):

        super(DeformableConv2d, self).__init__()

        # isinstance also admits tuple subclasses such as torch.Size.
        assert isinstance(kernel_size, (tuple, int))

        kernel_size = tuple(kernel_size) if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.stride = stride if isinstance(stride, tuple) else (stride, stride)
        self.padding = padding

        # One (dy, dx) pair per kernel tap.
        self.offset_conv = nn.Conv2d(in_channels,
                                     2 * kernel_size[0] * kernel_size[1],
                                     kernel_size=kernel_size,
                                     stride=stride,
                                     padding=self.padding,
                                     bias=True)

        nn.init.constant_(self.offset_conv.weight, 0.)
        nn.init.constant_(self.offset_conv.bias, 0.)

        # One modulation scalar per kernel tap.
        self.modulator_conv = nn.Conv2d(in_channels,
                                        1 * kernel_size[0] * kernel_size[1],
                                        kernel_size=kernel_size,
                                        stride=stride,
                                        padding=self.padding,
                                        bias=True)

        nn.init.constant_(self.modulator_conv.weight, 0.)
        nn.init.constant_(self.modulator_conv.bias, 0.)

        self.regular_conv = nn.Conv2d(in_channels,
                                      out_channels=out_channels,
                                      kernel_size=kernel_size,
                                      stride=stride,
                                      padding=self.padding,
                                      bias=bias)

    def forward(self, x):
        """Apply the deformable convolution to ``x`` (channels-first feature map)."""
        offset = self.offset_conv(x)
        # 2 * sigmoid keeps the modulation in (0, 2) and equal to 1 at zero initialisation.
        modulator = 2. * torch.sigmoid(self.modulator_conv(x))

        x = deform_conv2d(
            input=x,
            offset=offset,
            weight=self.regular_conv.weight,
            bias=self.regular_conv.bias,
            padding=self.padding,
            mask=modulator,
            stride=self.stride,
        )
        return x
1297
+
1298
+
1299
+
1300
+
1301
+ ### utils.py
1302
+
1303
+ import torch.nn as nn
1304
+
1305
+
1306
def build_act_layer(act_layer):
    """Return a new activation module for the given name.

    Args:
        act_layer (str): One of 'ReLU', 'SiLU' or 'GELU'. ReLU and SiLU are
            created with ``inplace=True``.

    Raises:
        NotImplementedError: If the name is not one of the supported activations.
    """
    supported = (
        ('ReLU', lambda: nn.ReLU(inplace=True)),
        ('SiLU', lambda: nn.SiLU(inplace=True)),
        ('GELU', nn.GELU),
    )
    for name, make in supported:
        if act_layer == name:
            return make()

    raise NotImplementedError(f'build_act_layer does not support {act_layer}')
1315
+
1316
+
1317
def build_norm_layer(dim,
                     norm_layer,
                     in_format='channels_last',
                     out_format='channels_last',
                     eps=1e-6):
    """Build a normalisation layer wrapped with the layout permutations it needs.

    BatchNorm2d runs on channels-first tensors and LayerNorm on channels-last
    tensors, so a permutation module is inserted before and/or after the norm
    whenever the requested input/output format differs from that layout.

    Args:
        dim (int): Number of channels to normalise.
        norm_layer (str): 'BN' for ``nn.BatchNorm2d`` or 'LN' for ``nn.LayerNorm``.
        in_format (str): 'channels_last' or 'channels_first'. Default: 'channels_last'
        out_format (str): 'channels_last' or 'channels_first'. Default: 'channels_last'
        eps (float): Epsilon used by LayerNorm only. Default: 1e-6

    Returns:
        nn.Sequential holding the optional permutations and the norm layer.

    Raises:
        NotImplementedError: If ``norm_layer`` is neither 'BN' nor 'LN'.
    """
    if norm_layer == 'BN':
        before = [to_channels_first()] if in_format == 'channels_last' else []
        norm = nn.BatchNorm2d(dim)
        after = [to_channels_last()] if out_format == 'channels_last' else []
    elif norm_layer == 'LN':
        before = [to_channels_last()] if in_format == 'channels_first' else []
        norm = nn.LayerNorm(dim, eps=eps)
        after = [to_channels_first()] if out_format == 'channels_first' else []
    else:
        raise NotImplementedError(
            f'build_norm_layer does not support {norm_layer}')
    return nn.Sequential(*before, norm, *after)
1339
+
1340
+
1341
class to_channels_first(nn.Module):
    """Permute a 4-D tensor from (B, H, W, C) to (B, C, H, W)."""

    def forward(self, x):
        channels_first_order = (0, 3, 1, 2)
        return x.permute(*channels_first_order)
1348
+
1349
+
1350
class to_channels_last(nn.Module):
    """Permute a 4-D tensor from (B, C, H, W) to (B, H, W, C)."""

    def forward(self, x):
        channels_last_order = (0, 2, 3, 1)
        return x.permute(*channels_last_order)
1357
+
1358
+
1359
+
1360
+ ### dataset.py
1361
+
1362
# Class names as one comma-separated string (adjacent literals are concatenated),
# split into a list below. Spellings are kept exactly as they appear in the data.
_class_labels_TR_sorted = (
    'Airplane, Ant, Antenna, Archery, Axe, BabyCarriage, Bag, BalanceBeam, Balcony, Balloon, Basket, BasketballHoop, Beatle, Bed, Bee, Bench, Bicycle, '
    'BicycleFrame, BicycleStand, Boat, Bonsai, BoomLift, Bridge, BunkBed, Butterfly, Button, Cable, CableLift, Cage, Camcorder, Cannon, Canoe, Car, '
    'CarParkDropArm, Carriage, Cart, Caterpillar, CeilingLamp, Centipede, Chair, Clip, Clock, Clothes, CoatHanger, Comb, ConcretePumpTruck, Crack, Crane, '
    'Cup, DentalChair, Desk, DeskChair, Diagram, DishRack, DoorHandle, Dragonfish, Dragonfly, Drum, Earphone, Easel, ElectricIron, Excavator, Eyeglasses, '
    'Fan, Fence, Fencing, FerrisWheel, FireExtinguisher, Fishing, Flag, FloorLamp, Forklift, GasStation, Gate, Gear, Goal, Golf, GymEquipment, Hammock, '
    'Handcart, Handcraft, Handrail, HangGlider, Harp, Harvester, Headset, Helicopter, Helmet, Hook, HorizontalBar, Hydrovalve, IroningTable, Jewelry, Key, '
    'KidsPlayground, Kitchenware, Kite, Knife, Ladder, LaundryRack, Lightning, Lobster, Locust, Machine, MachineGun, MagazineRack, Mantis, Medal, MemorialArchway, '
    'Microphone, Missile, MobileHolder, Monitor, Mosquito, Motorcycle, MovingTrolley, Mower, MusicPlayer, MusicStand, ObservationTower, Octopus, OilWell, '
    'OlympicLogo, OperatingTable, OutdoorFitnessEquipment, Parachute, Pavilion, Piano, Pipe, PlowHarrow, PoleVault, Punchbag, Rack, Racket, Rifle, Ring, Robot, '
    'RockClimbing, Rope, Sailboat, Satellite, Scaffold, Scale, Scissor, Scooter, Sculpture, Seadragon, Seahorse, Seal, SewingMachine, Ship, Shoe, ShoppingCart, '
    'ShoppingTrolley, Shower, Shrimp, Signboard, Skateboarding, Skeleton, Skiing, Spade, SpeedBoat, Spider, Spoon, Stair, Stand, Stationary, SteeringWheel, '
    'Stethoscope, Stool, Stove, StreetLamp, SweetStand, Swing, Sword, TV, Table, TableChair, TableLamp, TableTennis, Tank, Tapeline, Teapot, Telescope, Tent, '
    'TobaccoPipe, Toy, Tractor, TrafficLight, TrafficSign, Trampoline, TransmissionTower, Tree, Tricycle, TrimmerCover, Tripod, Trombone, Truck, Trumpet, Tuba, '
    'UAV, Umbrella, UnevenBars, UtilityPole, VacuumCleaner, Violin, Wakesurfing, Watch, WaterTower, WateringPot, Well, WellLid, Wheel, Wheelchair, WindTurbine, Windmill, WineGlass, WireWhisk, Yacht'
)
# Every literal except the last ends with ', ', so one split on ', ' yields clean names.
class_labels_TR_sorted = _class_labels_TR_sorted.split(', ')
1379
+
1380
+
1381
+ ### models/backbones/build_backbones.py
1382
+
1383
+ import torch
1384
+ import torch.nn as nn
1385
+ from collections import OrderedDict
1386
+ from torchvision.models import vgg16, vgg16_bn, VGG16_Weights, VGG16_BN_Weights, resnet50, ResNet50_Weights
1387
+ # from models.pvt_v2 import pvt_v2_b0, pvt_v2_b1, pvt_v2_b2, pvt_v2_b5
1388
+ # from models.swin_v1 import swin_v1_t, swin_v1_s, swin_v1_b, swin_v1_l
1389
+ # from config import Config
1390
+
1391
+
1392
+ config = Config()
1393
+
1394
def build_backbone(bb_name, pretrained=True, params_settings=''):
    """Build a backbone that exposes its stages as ``conv1`` .. ``conv4``.

    Args:
        bb_name (str): 'vgg16', 'vgg16bn', 'resnet50', or the name of a backbone
            constructor defined in this module (e.g. a ``pvt_v2_*`` / ``swin_v1_*`` factory).
        pretrained (bool): Load pretrained weights. torchvision models use their
            ImageNet weights; other backbones go through ``load_weights``. Default: True
        params_settings (str): Python source for the constructor arguments of
            non-torchvision backbones, inserted verbatim into the call. Default: ''

    Returns:
        The backbone module. For non-torchvision backbones with ``pretrained=True``
        this is whatever ``load_weights`` returns.
    """
    # torchvision deprecated passing weights through `pretrained=`; the explicit
    # `weights=` enums below are the ones the legacy keyword resolved to.
    if bb_name == 'vgg16':
        bb_net = list(vgg16(weights=VGG16_Weights.DEFAULT if pretrained else None).children())[0]
        bb = nn.Sequential(OrderedDict({'conv1': bb_net[:4], 'conv2': bb_net[4:9], 'conv3': bb_net[9:16], 'conv4': bb_net[16:23]}))
    elif bb_name == 'vgg16bn':
        bb_net = list(vgg16_bn(weights=VGG16_BN_Weights.DEFAULT if pretrained else None).children())[0]
        bb = nn.Sequential(OrderedDict({'conv1': bb_net[:6], 'conv2': bb_net[6:13], 'conv3': bb_net[13:23], 'conv4': bb_net[23:33]}))
    elif bb_name == 'resnet50':
        # IMAGENET1K_V1 (not DEFAULT) keeps the checkpoint that `pretrained=<truthy>` used to load.
        bb_net = list(resnet50(weights=ResNet50_Weights.IMAGENET1K_V1 if pretrained else None).children())
        bb = nn.Sequential(OrderedDict({'conv1': nn.Sequential(*bb_net[0:3]), 'conv2': bb_net[4], 'conv3': bb_net[5], 'conv4': bb_net[6]}))
    else:
        # NOTE(review): eval() executes bb_name/params_settings as code; both must
        # come from trusted configuration, never from user-controlled input.
        bb = eval('{}({})'.format(bb_name, params_settings))
        if pretrained:
            bb = load_weights(bb, bb_name)
    return bb
1409
+
1410
def _select_compatible_weights(saved, model_dict):
    """Keep entries of ``saved`` whose key exists in ``model_dict``; fall back to the model's own tensor on a shape mismatch."""
    return {
        k: v if v.size() == model_dict[k].size() else model_dict[k]
        for k, v in saved.items()
        if k in model_dict
    }


def load_weights(model, model_name):
    """Load the checkpoint registered for ``model_name`` in ``config.weights`` into ``model``.

    Parameters whose shape differs from the model's are skipped (the model keeps
    its current value), which tolerates local modifications of the backbone.
    If no key matches at the top level and the checkpoint has exactly one
    top-level entry, that entry is tried as a nested state dict.

    Args:
        model (nn.Module): Model to receive the weights.
        model_name (str): Key into ``config.weights`` giving the checkpoint path.

    Returns:
        The model with weights loaded, or ``None`` if no matching weights were found.
    """
    # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources.
    save_model = torch.load(config.weights[model_name], map_location='cpu')
    model_dict = model.state_dict()
    # to ignore the weights with mismatched size when I modify the backbone itself.
    state_dict = _select_compatible_weights(save_model, model_dict)
    if not state_dict:
        save_model_keys = list(save_model.keys())
        sub_item = save_model_keys[0] if len(save_model_keys) == 1 else None
        # Only descend when there is a single wrapper key; indexing with None would raise KeyError.
        nested = save_model[sub_item] if sub_item else None
        if nested is not None and hasattr(nested, 'items'):
            state_dict = _select_compatible_weights(nested, model_dict)
        if not state_dict:
            print('Weights are not successully loaded. Check the state dict of weights file.')
            return None
        print('Found correct weights in the "{}" item of loaded state_dict.'.format(sub_item))
    model_dict.update(state_dict)
    model.load_state_dict(model_dict)
    return model
1427
+
1428
+
1429
+
1430
+ ### models/modules/decoder_blocks.py
1431
+
1432
+ import torch
1433
+ import torch.nn as nn
1434
+ # from models.aspp import ASPP, ASPPDeformable
1435
+ # from config import Config
1436
+
1437
+
1438
+ # config = Config()
1439
+
1440
+
1441
class BasicDecBlk(nn.Module):
    """Decoder block: 3x3 conv -> norm -> ReLU -> optional attention -> 3x3 conv -> norm.

    The hidden width is ``in_channels // 4`` when ``config.dec_channels_inter``
    is ``'adap'`` and 64 otherwise; the ``inter_channels`` argument is
    overridden by that rule. Batch norm is used only when
    ``config.batch_size > 1``; otherwise the norm layers are identities.
    ``config.dec_att`` selects an ``ASPP`` or ``ASPPDeformable`` attention
    stage; any other value means no attention stage.
    """

    def __init__(self, in_channels=64, out_channels=64, inter_channels=64):
        super().__init__()
        if config.dec_channels_inter == 'adap':
            inter_channels = in_channels // 4
        else:
            inter_channels = 64
        use_bn = config.batch_size > 1

        self.conv_in = nn.Conv2d(in_channels, inter_channels, 3, 1, padding=1)
        self.relu_in = nn.ReLU(inplace=True)
        attention = config.dec_att
        if attention == 'ASPP':
            self.dec_att = ASPP(in_channels=inter_channels)
        elif attention == 'ASPPDeformable':
            self.dec_att = ASPPDeformable(in_channels=inter_channels)
        self.conv_out = nn.Conv2d(inter_channels, out_channels, 3, 1, padding=1)
        self.bn_in = nn.BatchNorm2d(inter_channels) if use_bn else nn.Identity()
        self.bn_out = nn.BatchNorm2d(out_channels) if use_bn else nn.Identity()

    def forward(self, x):
        """Apply the block to ``x`` and return the result."""
        hidden = self.relu_in(self.bn_in(self.conv_in(x)))
        # The attention stage only exists for the two recognised config values.
        attention = getattr(self, 'dec_att', None)
        if attention is not None:
            hidden = attention(hidden)
        return self.bn_out(self.conv_out(hidden))
1464
+
1465
+
1466
class ResBlk(nn.Module):
    """Residual decoder block with a 1x1 convolution on the skip path.

    The main path is 3x3 conv -> norm -> ReLU -> optional attention -> 3x3 conv
    -> norm. ``out_channels`` defaults to ``in_channels``. The hidden width is
    ``in_channels // 4`` when ``config.dec_channels_inter`` is ``'adap'`` and
    64 otherwise, which overrides the ``inter_channels`` argument. Norm layers
    are batch norm only when ``config.batch_size > 1``.
    """

    def __init__(self, in_channels=64, out_channels=None, inter_channels=64):
        super().__init__()
        out_channels = in_channels if out_channels is None else out_channels
        if config.dec_channels_inter == 'adap':
            inter_channels = in_channels // 4
        else:
            inter_channels = 64
        use_bn = config.batch_size > 1

        self.conv_in = nn.Conv2d(in_channels, inter_channels, 3, 1, padding=1)
        self.bn_in = nn.BatchNorm2d(inter_channels) if use_bn else nn.Identity()
        self.relu_in = nn.ReLU(inplace=True)

        attention = config.dec_att
        if attention == 'ASPP':
            self.dec_att = ASPP(in_channels=inter_channels)
        elif attention == 'ASPPDeformable':
            self.dec_att = ASPPDeformable(in_channels=inter_channels)

        self.conv_out = nn.Conv2d(inter_channels, out_channels, 3, 1, padding=1)
        self.bn_out = nn.BatchNorm2d(out_channels) if use_bn else nn.Identity()

        # Projects the input to ``out_channels`` so it can be added to the main path.
        self.conv_resi = nn.Conv2d(in_channels, out_channels, 1, 1, 0)

    def forward(self, x):
        """Return the main-path output plus the projected input."""
        shortcut = self.conv_resi(x)
        hidden = self.relu_in(self.bn_in(self.conv_in(x)))
        attention = getattr(self, 'dec_att', None)
        if attention is not None:
            hidden = attention(hidden)
        hidden = self.bn_out(self.conv_out(hidden))
        return hidden + shortcut
1497
+
1498
+
1499
+
1500
+ ### models/modules/lateral_blocks.py
1501
+
1502
+ import numpy as np
1503
+ import torch
1504
+ import torch.nn as nn
1505
+ import torch.nn.functional as F
1506
+ from functools import partial
1507
+
1508
+ # from config import Config
1509
+
1510
+
1511
+ # config = Config()
1512
+
1513
+
1514
class BasicLatBlk(nn.Module):
    """Lateral connection: a single 1x1 convolution from ``in_channels`` to ``out_channels``.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels of the produced feature map.
        inter_channels: Unused; kept so the signature stays parallel to the
            decoder blocks and existing callers keep working.
    """

    def __init__(self, in_channels=64, out_channels=64, inter_channels=64):
        super(BasicLatBlk, self).__init__()
        # The block has no hidden layer, so no intermediate width is derived
        # from the global config here (the previous computation was never used).
        self.conv = nn.Conv2d(in_channels, out_channels, 1, 1, 0)

    def forward(self, x):
        """Return the 1x1-projected feature map."""
        return self.conv(x)
1523
+
1524
+
1525
+
1526
+ ### models/modules/aspp.py
1527
+
1528
+ import torch
1529
+ import torch.nn as nn
1530
+ import torch.nn.functional as F
1531
+ # from models.deform_conv import DeformableConv2d
1532
+ # from config import Config
1533
+
1534
+
1535
+ # config = Config()
1536
+
1537
+
1538
+ class _ASPPModule(nn.Module):
1539
+ def __init__(self, in_channels, planes, kernel_size, padding, dilation):
1540
+ super(_ASPPModule, self).__init__()
1541
+ self.atrous_conv = nn.Conv2d(in_channels, planes, kernel_size=kernel_size,
1542
+ stride=1, padding=padding, dilation=dilation, bias=False)
1543
+ self.bn = nn.BatchNorm2d(planes) if config.batch_size > 1 else nn.Identity()
1544
+ self.relu = nn.ReLU(inplace=True)
1545
+
1546
+ def forward(self, x):
1547
+ x = self.atrous_conv(x)
1548
+ x = self.bn(x)
1549
+
1550
+ return self.relu(x)
1551
+
1552
+
1553
class ASPP(nn.Module):
    """Atrous spatial pyramid pooling with four dilated branches and a global-pool branch.

    Each branch produces ``256 // down_scale`` channels; the five branch
    outputs are concatenated and fused by a bias-free 1x1 convolution, a norm,
    a ReLU and dropout (p=0.5). ``out_channels`` defaults to ``in_channels``.
    Only ``output_stride`` values 16 and 8 are supported.

    Raises:
        NotImplementedError: For any other ``output_stride``.
    """

    def __init__(self, in_channels=64, out_channels=None, output_stride=16):
        super().__init__()
        self.down_scale = 1
        out_channels = in_channels if out_channels is None else out_channels
        self.in_channelster = 256 // self.down_scale
        width = self.in_channelster
        use_bn = config.batch_size > 1

        # Dilation rates per supported output stride.
        for stride, rates in ((16, [1, 6, 12, 18]), (8, [1, 12, 24, 36])):
            if output_stride == stride:
                dilations = rates
                break
        else:
            raise NotImplementedError

        self.aspp1 = _ASPPModule(in_channels, width, 1, padding=0, dilation=dilations[0])
        self.aspp2 = _ASPPModule(in_channels, width, 3, padding=dilations[1], dilation=dilations[1])
        self.aspp3 = _ASPPModule(in_channels, width, 3, padding=dilations[2], dilation=dilations[2])
        self.aspp4 = _ASPPModule(in_channels, width, 3, padding=dilations[3], dilation=dilations[3])

        self.global_avg_pool = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Conv2d(in_channels, width, 1, stride=1, bias=False),
            nn.BatchNorm2d(width) if use_bn else nn.Identity(),
            nn.ReLU(inplace=True),
        )
        self.conv1 = nn.Conv2d(width * 5, out_channels, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels) if use_bn else nn.Identity()
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Fuse the five branch outputs computed from ``x``."""
        branches = [self.aspp1(x), self.aspp2(x), self.aspp3(x), self.aspp4(x)]
        pooled = self.global_avg_pool(x)
        # Broadcast the 1x1 pooled descriptor back to the branch resolution.
        pooled = F.interpolate(pooled, size=branches[0].size()[2:], mode='bilinear', align_corners=True)
        fused = torch.cat((*branches, pooled), dim=1)

        fused = self.relu(self.bn1(self.conv1(fused)))
        return self.dropout(fused)
1595
+
1596
+
1597
+ ##################### Deformable
1598
class _ASPPModuleDeformable(nn.Module):
    """One deformable ASPP branch: bias-free ``DeformableConv2d`` -> norm -> ReLU.

    The norm is batch norm when ``config.batch_size > 1`` and an identity
    otherwise.
    """

    def __init__(self, in_channels, planes, kernel_size, padding):
        super().__init__()
        self.atrous_conv = DeformableConv2d(
            in_channels,
            planes,
            kernel_size=kernel_size,
            stride=1,
            padding=padding,
            bias=False,
        )
        if config.batch_size > 1:
            self.bn = nn.BatchNorm2d(planes)
        else:
            self.bn = nn.Identity()
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu(norm(deformable_conv(x)))``."""
        features = self.atrous_conv(x)
        features = self.bn(features)
        return self.relu(features)
1611
+
1612
+
1613
class ASPPDeformable(nn.Module):
    """ASPP variant built from deformable convolution branches.

    There is one 1x1 deformable branch, one deformable branch per entry of
    ``parallel_block_sizes`` (kernel size ``k``, padding ``k // 2``) and one
    global-average-pool branch. Every branch produces ``256 // down_scale``
    channels; the concatenation is fused by a bias-free 1x1 convolution, a
    norm, a ReLU and dropout (p=0.5). ``out_channels`` defaults to
    ``in_channels``.
    """

    def __init__(self, in_channels, out_channels=None, parallel_block_sizes=[1, 3, 7]):
        super().__init__()
        self.down_scale = 1
        out_channels = in_channels if out_channels is None else out_channels
        self.in_channelster = 256 // self.down_scale
        width = self.in_channelster
        use_bn = config.batch_size > 1

        self.aspp1 = _ASPPModuleDeformable(in_channels, width, 1, padding=0)
        deform_branches = []
        for conv_size in parallel_block_sizes:
            deform_branches.append(
                _ASPPModuleDeformable(in_channels, width, conv_size, padding=int(conv_size//2))
            )
        self.aspp_deforms = nn.ModuleList(deform_branches)

        self.global_avg_pool = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Conv2d(in_channels, width, 1, stride=1, bias=False),
            nn.BatchNorm2d(width) if use_bn else nn.Identity(),
            nn.ReLU(inplace=True),
        )
        # +2 accounts for the 1x1 branch and the global-pool branch.
        n_branches = 2 + len(self.aspp_deforms)
        self.conv1 = nn.Conv2d(width * n_branches, out_channels, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels) if use_bn else nn.Identity()
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Fuse all branch outputs computed from ``x``."""
        first = self.aspp1(x)
        deformed = []
        for branch in self.aspp_deforms:
            deformed.append(branch(x))
        pooled = self.global_avg_pool(x)
        # Broadcast the 1x1 pooled descriptor back to the branch resolution.
        pooled = F.interpolate(pooled, size=first.size()[2:], mode='bilinear', align_corners=True)
        fused = torch.cat((first, *deformed, pooled), dim=1)

        fused = self.relu(self.bn1(self.conv1(fused)))
        return self.dropout(fused)
1647
+
1648
+
1649
+
1650
+ ### models/refinement/refiner.py
1651
+
1652
+ import torch
1653
+ import torch.nn as nn
1654
+ from collections import OrderedDict
1655
+ import torch
1656
+ import torch.nn as nn
1657
+ import torch.nn.functional as F
1658
+ from torchvision.models import vgg16, vgg16_bn
1659
+ from torchvision.models import resnet50
1660
+
1661
+ # from config import Config
1662
+ # from dataset import class_labels_TR_sorted
1663
+ # from models.build_backbone import build_backbone
1664
+ # from models.decoder_blocks import BasicDecBlk
1665
+ # from models.lateral_blocks import BasicLatBlk
1666
+ # from models.ing import *
1667
+ # from models.stem_layer import StemLayer
1668
+
1669
+
1670
class RefinerPVTInChannels4(nn.Module):
    """Refiner that feeds its input straight into a backbone built with ``in_channels=4``.

    The encoder is ``build_backbone(config.bb, params_settings='in_channels=4')``,
    followed by a ``BasicDecBlk`` squeeze on the deepest feature and a
    ``Decoder``.

    NOTE(review): ``in_channels`` is accepted but not used; the backbone is
    always built with the literal ``'in_channels=4'``.
    NOTE(review): ``Decoder`` is resolved when the constructor runs, and this
    file defines two classes with that name - confirm which one is intended to
    be bound here.
    """

    def __init__(self, in_channels=3+1):
        super(RefinerPVTInChannels4, self).__init__()
        self.config = Config()
        self.epoch = 1
        self.bb = build_backbone(self.config.bb, params_settings='in_channels=4')

        # Per-backbone feature widths, deepest stage first.
        lateral_channels_in_collection = {
            'vgg16': [512, 256, 128, 64], 'vgg16bn': [512, 256, 128, 64], 'resnet50': [1024, 512, 256, 64],
            'pvt_v2_b2': [512, 320, 128, 64], 'pvt_v2_b5': [512, 320, 128, 64],
            'swin_v1_b': [1024, 512, 256, 128], 'swin_v1_l': [1536, 768, 384, 192],
        }
        channels = lateral_channels_in_collection[self.config.bb]
        self.squeeze_module = BasicDecBlk(channels[0], channels[0])

        self.decoder = Decoder(channels)
        # The former ``if 0:`` backbone-freezing branch could never execute and
        # has been removed; all parameters stay trainable, as before.

    def forward(self, x):
        """Run encoder and decoder.

        Args:
            x: Input tensor, or a list of tensors that is concatenated along
                dim 1 first.

        Returns:
            Whatever ``self.decoder`` returns for ``[x, x1, x2, x3, x4]``.
        """
        if isinstance(x, list):
            x = torch.cat(x, dim=1)
        ########## Encoder ##########
        if self.config.bb in ['vgg16', 'vgg16bn', 'resnet50']:
            # torchvision backbones are exposed as four named stages.
            x1 = self.bb.conv1(x)
            x2 = self.bb.conv2(x1)
            x3 = self.bb.conv3(x2)
            x4 = self.bb.conv4(x3)
        else:
            x1, x2, x3, x4 = self.bb(x)

        x4 = self.squeeze_module(x4)

        ########## Decoder ##########
        features = [x, x1, x2, x3, x4]
        scaled_preds = self.decoder(features)

        return scaled_preds
1712
+
1713
+
1714
class Refiner(nn.Module):
    """Refiner that maps its input to 3 channels with a ``StemLayer`` before the backbone.

    The stem uses batch norm when ``config.batch_size > 1`` and layer norm
    otherwise. The encoder is ``build_backbone(config.bb)``, followed by a
    ``BasicDecBlk`` squeeze on the deepest feature and a ``Decoder``.

    NOTE(review): ``Decoder`` is resolved when the constructor runs, and this
    file defines two classes with that name - confirm which one is intended to
    be bound here.

    Args:
        in_channels: Channel count of the (concatenated) input given to the stem.
    """

    def __init__(self, in_channels=3+1):
        super(Refiner, self).__init__()
        self.config = Config()
        self.epoch = 1
        self.stem_layer = StemLayer(in_channels=in_channels, inter_channels=48, out_channels=3, norm_layer='BN' if self.config.batch_size > 1 else 'LN')
        self.bb = build_backbone(self.config.bb)

        # Per-backbone feature widths, deepest stage first.
        lateral_channels_in_collection = {
            'vgg16': [512, 256, 128, 64], 'vgg16bn': [512, 256, 128, 64], 'resnet50': [1024, 512, 256, 64],
            'pvt_v2_b2': [512, 320, 128, 64], 'pvt_v2_b5': [512, 320, 128, 64],
            'swin_v1_b': [1024, 512, 256, 128], 'swin_v1_l': [1536, 768, 384, 192],
        }
        channels = lateral_channels_in_collection[self.config.bb]
        self.squeeze_module = BasicDecBlk(channels[0], channels[0])

        self.decoder = Decoder(channels)
        # The former ``if 0:`` backbone-freezing branch could never execute and
        # has been removed; all parameters stay trainable, as before.

    def forward(self, x):
        """Run stem, encoder and decoder.

        Args:
            x: Input tensor, or a list of tensors that is concatenated along
                dim 1 first.

        Returns:
            Whatever ``self.decoder`` returns for ``[x, x1, x2, x3, x4]``,
            where ``x`` is the stem output.
        """
        if isinstance(x, list):
            x = torch.cat(x, dim=1)
        x = self.stem_layer(x)
        ########## Encoder ##########
        if self.config.bb in ['vgg16', 'vgg16bn', 'resnet50']:
            # torchvision backbones are exposed as four named stages.
            x1 = self.bb.conv1(x)
            x2 = self.bb.conv2(x1)
            x3 = self.bb.conv3(x2)
            x4 = self.bb.conv4(x3)
        else:
            x1, x2, x3, x4 = self.bb(x)

        x4 = self.squeeze_module(x4)

        ########## Decoder ##########
        features = [x, x1, x2, x3, x4]
        scaled_preds = self.decoder(features)

        return scaled_preds
1758
+
1759
+
1760
class Decoder(nn.Module):
    """Top-down decoder used by the refiners.

    Four ``BasicDecBlk`` stages reduce ``channels[0]`` down to
    ``channels[3] // 2``; between stages the feature map is bilinearly resized
    to the next encoder feature and summed with a ``BasicLatBlk`` projection
    of it. A final 1x1 convolution yields the 1-channel prediction. When
    ``config.ms_supervision`` is set, 1x1 heads on the three deeper stages
    provide extra outputs.

    NOTE(review): another class named ``Decoder`` is defined later in this
    file; at module level the later definition is the one the name refers to
    once the whole file has been executed.

    Args:
        channels: Four channel counts, deepest stage first.
    """

    def __init__(self, channels):
        super(Decoder, self).__init__()
        self.config = Config()
        # Direct references; evaluating constant strings added nothing.
        DecoderBlock = BasicDecBlk
        LateralBlock = BasicLatBlk

        self.decoder_block4 = DecoderBlock(channels[0], channels[1])
        self.decoder_block3 = DecoderBlock(channels[1], channels[2])
        self.decoder_block2 = DecoderBlock(channels[2], channels[3])
        self.decoder_block1 = DecoderBlock(channels[3], channels[3]//2)

        self.lateral_block4 = LateralBlock(channels[1], channels[1])
        self.lateral_block3 = LateralBlock(channels[2], channels[2])
        self.lateral_block2 = LateralBlock(channels[3], channels[3])

        if self.config.ms_supervision:
            self.conv_ms_spvn_4 = nn.Conv2d(channels[1], 1, 1, 1, 0)
            self.conv_ms_spvn_3 = nn.Conv2d(channels[2], 1, 1, 1, 0)
            self.conv_ms_spvn_2 = nn.Conv2d(channels[3], 1, 1, 1, 0)
        self.conv_out1 = nn.Sequential(nn.Conv2d(channels[3]//2, 1, 1, 1, 0))

    def forward(self, features):
        """Decode ``[x, x1, x2, x3, x4]`` into a list of prediction maps.

        Returns:
            A list ending with the prediction resized to ``x``'s spatial size.
            With ``config.ms_supervision`` it is preceded by the three
            side outputs, deepest first.
        """
        x, x1, x2, x3, x4 = features
        outs = []
        p4 = self.decoder_block4(x4)
        _p4 = F.interpolate(p4, size=x3.shape[2:], mode='bilinear', align_corners=True)
        _p3 = _p4 + self.lateral_block4(x3)

        p3 = self.decoder_block3(_p3)
        _p3 = F.interpolate(p3, size=x2.shape[2:], mode='bilinear', align_corners=True)
        _p2 = _p3 + self.lateral_block3(x2)

        p2 = self.decoder_block2(_p2)
        _p2 = F.interpolate(p2, size=x1.shape[2:], mode='bilinear', align_corners=True)
        _p1 = _p2 + self.lateral_block2(x1)

        _p1 = self.decoder_block1(_p1)
        _p1 = F.interpolate(_p1, size=x.shape[2:], mode='bilinear', align_corners=True)
        p1_out = self.conv_out1(_p1)

        if self.config.ms_supervision:
            outs.append(self.conv_ms_spvn_4(p4))
            outs.append(self.conv_ms_spvn_3(p3))
            outs.append(self.conv_ms_spvn_2(p2))
        outs.append(p1_out)
        return outs
1807
+
1808
+
1809
class RefUNet(nn.Module):
    """Small U-Net used for refinement.

    Four encoder stages (the last three start with a 2x2 ceil-mode max-pool),
    a bottleneck after one more pooling step, and four decoder stages that
    each take the upsampled previous output concatenated with the matching
    encoder feature. All stages are 64 channels wide; the output is a single
    1-channel map at the input resolution.

    Args:
        in_channels: Channels of the (concatenated) input.
    """

    def __init__(self, in_channels=3+1):
        super(RefUNet, self).__init__()

        def conv_bn_relu(n_in):
            # 3x3 conv -> batch norm -> ReLU, 64 output channels.
            return [nn.Conv2d(n_in, 64, 3, 1, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True)]

        def pool():
            return nn.MaxPool2d(2, 2, ceil_mode=True)

        self.encoder_1 = nn.Sequential(nn.Conv2d(in_channels, 64, 3, 1, 1), *conv_bn_relu(64))
        self.encoder_2 = nn.Sequential(pool(), *conv_bn_relu(64))
        self.encoder_3 = nn.Sequential(pool(), *conv_bn_relu(64))
        self.encoder_4 = nn.Sequential(pool(), *conv_bn_relu(64))

        self.pool4 = pool()
        #####
        self.decoder_5 = nn.Sequential(*conv_bn_relu(64))
        #####
        # Decoder stages take 128 channels: upsampled features + skip features.
        self.decoder_4 = nn.Sequential(*conv_bn_relu(128))
        self.decoder_3 = nn.Sequential(*conv_bn_relu(128))
        self.decoder_2 = nn.Sequential(*conv_bn_relu(128))
        self.decoder_1 = nn.Sequential(*conv_bn_relu(128))

        self.conv_d0 = nn.Conv2d(64, 1, 3, 1, 1)

        # Kept as an attribute for backward compatibility; forward() resizes to
        # the skip tensor instead so odd intermediate sizes still line up.
        self.upscore2 = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)

    @staticmethod
    def _upsample_like(src, ref):
        """Bilinearly resize ``src`` to the spatial size of ``ref``."""
        return F.interpolate(src, size=ref.shape[2:], mode='bilinear', align_corners=True)

    def forward(self, x):
        """Refine ``x`` and return ``[prediction]``.

        Args:
            x: Input tensor, or a list of tensors that is concatenated along
                dim 1 first.

        Returns:
            A one-element list holding the 1-channel map at input resolution.
        """
        outs = []
        if isinstance(x, list):
            x = torch.cat(x, dim=1)
        hx = x

        hx1 = self.encoder_1(hx)
        hx2 = self.encoder_2(hx1)
        hx3 = self.encoder_3(hx2)
        hx4 = self.encoder_4(hx3)

        hx = self.decoder_5(self.pool4(hx4))
        # Ceil-mode pooling makes a fixed x2 upsample overshoot whenever a
        # pooled dimension was odd; matching the skip tensor's size is
        # identical for even sizes and also works for all others.
        hx = torch.cat((self._upsample_like(hx, hx4), hx4), 1)

        d4 = self.decoder_4(hx)
        hx = torch.cat((self._upsample_like(d4, hx3), hx3), 1)

        d3 = self.decoder_3(hx)
        hx = torch.cat((self._upsample_like(d3, hx2), hx2), 1)

        d2 = self.decoder_2(hx)
        hx = torch.cat((self._upsample_like(d2, hx1), hx1), 1)

        d1 = self.decoder_1(hx)

        x = self.conv_d0(d1)
        outs.append(x)
        return outs
1905
+
1906
+
1907
+
1908
+ ### models/stem_layer.py
1909
+
1910
+ import torch.nn as nn
1911
+ # from utils import build_act_layer, build_norm_layer
1912
+
1913
+
1914
class StemLayer(nn.Module):
    r""" Stem layer of InternImage
    Two stride-1 3x3 convolutions, each followed by a normalization layer,
    with one activation between them.

    Args:
        in_channels (int): number of input channels
        inter_channels (int): number of channels after the first convolution
        out_channels (int): number of output channels
        act_layer (str): activation layer
        norm_layer (str): normalization layer
    """

    def __init__(self,
                 in_channels=3+1,
                 inter_channels=48,
                 out_channels=96,
                 act_layer='GELU',
                 norm_layer='BN'):
        super().__init__()
        # Both norms take and return channels-first tensors.
        data_format = ('channels_first', 'channels_first')

        self.conv1 = nn.Conv2d(in_channels, inter_channels, kernel_size=3, stride=1, padding=1)
        self.norm1 = build_norm_layer(inter_channels, norm_layer, *data_format)
        self.act = build_act_layer(act_layer)
        self.conv2 = nn.Conv2d(inter_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.norm2 = build_norm_layer(out_channels, norm_layer, *data_format)

    def forward(self, x):
        """Return ``norm2(conv2(act(norm1(conv1(x)))))``."""
        hidden = self.act(self.norm1(self.conv1(x)))
        return self.norm2(self.conv2(hidden))
1955
+
1956
+
1957
+ ### models/birefnet.py
1958
+
1959
+ import torch
1960
+ import torch.nn as nn
1961
+ import torch.nn.functional as F
1962
+ from kornia.filters import laplacian
1963
+ from transformers import PreTrainedModel
1964
+
1965
+ # from config import Config
1966
+ # from dataset import class_labels_TR_sorted
1967
+ # from models.build_backbone import build_backbone
1968
+ # from models.decoder_blocks import BasicDecBlk, ResBlk, HierarAttDecBlk
1969
+ # from models.lateral_blocks import BasicLatBlk
1970
+ # from models.aspp import ASPP, ASPPDeformable
1971
+ # from models.ing import *
1972
+ # from models.refiner import Refiner, RefinerPVTInChannels4, RefUNet
1973
+ # from models.stem_layer import StemLayer
1974
+ from .BiRefNet_config import BiRefNetConfig
1975
+
1976
+
1977
class BiRefNet(
    PreTrainedModel
):
    """BiRefNet segmentation model wrapped as a ``transformers`` ``PreTrainedModel``.

    The Hugging Face config passed to the constructor is only used for the
    base-class initialisation and for ``bb_pretrained``; afterwards
    ``self.config`` is replaced by the project ``Config()``, which drives the
    architecture (backbone, squeeze block, decoder, optional refiner).
    """

    config_class = BiRefNetConfig

    def __init__(self, bb_pretrained=True, config=BiRefNetConfig()):
        """Build backbone, optional heads and decoder.

        Args:
            bb_pretrained: Overridden by ``config.bb_pretrained``; kept for
                signature compatibility.
            config: Hugging Face configuration object.
        """
        super(BiRefNet, self).__init__(config)
        bb_pretrained = config.bb_pretrained
        self.config = Config()
        self.epoch = 1
        self.bb = build_backbone(self.config.bb, pretrained=bb_pretrained)

        channels = self.config.lateral_channels_in_collection

        if self.config.auxiliary_classification:
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
            self.cls_head = nn.Sequential(
                nn.Linear(channels[0], len(class_labels_TR_sorted))
            )

        if self.config.squeeze_block:
            # ``squeeze_block`` has the form '<BlockClass>_x<count>'.
            # NOTE(review): both parts are eval()'d; safe only while the value
            # comes from trusted project configuration.
            spec = self.config.squeeze_block.split('_x')
            squeeze_cls = eval(spec[0])
            squeeze_repeat = eval(spec[1])
            self.squeeze_module = nn.Sequential(*[
                squeeze_cls(channels[0]+sum(self.config.cxt), channels[0])
                for _ in range(squeeze_repeat)
            ])

        self.decoder = Decoder(channels)

        if self.config.ender:
            self.dec_end = nn.Sequential(
                nn.Conv2d(1, 16, 3, 1, 1),
                nn.Conv2d(16, 1, 3, 1, 1),
                nn.ReLU(inplace=True),
            )

        # refine patch-level segmentation
        if self.config.refine:
            if self.config.refine == 'itself':
                self.stem_layer = StemLayer(in_channels=3+1, inter_channels=48, out_channels=3, norm_layer='BN' if self.config.batch_size > 1 else 'LN')
            else:
                # NOTE(review): eval() of a config-provided class name.
                self.refiner = eval('{}({})'.format(self.config.refine, 'in_channels=3+1'))

        if self.config.freeze_bb:
            # Freeze the backbone, but not a backbone nested inside a refiner.
            # (A leftover debug print of the parameter generator object was
            # removed here; it printed only the generator's repr.)
            for key, value in self.named_parameters():
                if 'bb.' in key and 'refiner.' not in key:
                    value.requires_grad = False

    def forward_enc(self, x):
        """Run the backbone and assemble the encoder features.

        Depending on ``config.mul_scl_ipt`` ('cat' or 'add'), features of a
        half-resolution copy of ``x`` are resized and concatenated with or
        added to the full-resolution features. With a non-empty ``config.cxt``
        the last ``len(config.cxt)`` of the shallower features are resized and
        concatenated in front of ``x4``.

        Returns:
            ``((x1, x2, x3, x4), class_preds)`` where ``class_preds`` is
            ``None`` unless training with ``config.auxiliary_classification``.
        """
        if self.config.bb in ['vgg16', 'vgg16bn', 'resnet50']:
            x1 = self.bb.conv1(x)
            x2 = self.bb.conv2(x1)
            x3 = self.bb.conv3(x2)
            x4 = self.bb.conv4(x3)
        else:
            x1, x2, x3, x4 = self.bb(x)
            if self.config.mul_scl_ipt in ('cat', 'add'):
                B, C, H, W = x.shape
                half_feats = self.bb(F.interpolate(x, size=(H//2, W//2), mode='bilinear', align_corners=True))
                x1_, x2_, x3_, x4_ = half_feats
                merged = []
                for full, half in zip((x1, x2, x3, x4), (x1_, x2_, x3_, x4_)):
                    half = F.interpolate(half, size=full.shape[2:], mode='bilinear', align_corners=True)
                    if self.config.mul_scl_ipt == 'cat':
                        merged.append(torch.cat([full, half], dim=1))
                    else:
                        merged.append(full + half)
                x1, x2, x3, x4 = merged
        class_preds = self.cls_head(self.avgpool(x4).view(x4.shape[0], -1)) if self.training and self.config.auxiliary_classification else None
        if self.config.cxt:
            # Only the features that are actually kept get resized.
            context = [
                F.interpolate(feat, size=x4.shape[2:], mode='bilinear', align_corners=True)
                for feat in [x1, x2, x3][-len(self.config.cxt):]
            ]
            x4 = torch.cat((*context, x4), dim=1)
        return (x1, x2, x3, x4), class_preds

    def forward_ori(self, x):
        """Encode, squeeze and decode ``x``.

        When training with ``config.out_ref``, a Laplacian (kernel size 5) of
        the channel-mean of ``x`` is appended to the decoder inputs.

        Returns:
            ``(scaled_preds, class_preds)``.
        """
        ########## Encoder ##########
        (x1, x2, x3, x4), class_preds = self.forward_enc(x)
        if self.config.squeeze_block:
            x4 = self.squeeze_module(x4)
        ########## Decoder ##########
        features = [x, x1, x2, x3, x4]
        if self.training and self.config.out_ref:
            features.append(laplacian(torch.mean(x, dim=1).unsqueeze(1), kernel_size=5))
        scaled_preds = self.decoder(features)
        return scaled_preds, class_preds

    def forward(self, x):
        """Return ``[scaled_preds, [class_preds]]`` in training mode, else ``scaled_preds``."""
        scaled_preds, class_preds = self.forward_ori(x)
        class_preds_lst = [class_preds]
        return [scaled_preds, class_preds_lst] if self.training else scaled_preds
2075
+
2076
+
2077
class Decoder(nn.Module):
    """BiRefNet decoder with optional input injection and gradient-guided attention.

    Block classes are taken from ``config.dec_blk`` / ``config.lat_blk``.
    With ``config.dec_ipt`` the input image (optionally cut into patches that
    are stacked along the channel axis) is embedded by ``SimpleConvs`` and
    concatenated to the features of every stage. With ``config.out_ref`` each
    of the three deeper stages is gated by a sigmoid attention map; in
    training mode the gradient predictions and their labels are collected as
    well. With ``config.ms_supervision`` the three deeper stages get 1x1 side
    outputs.

    Args:
        channels: Four channel counts, deepest stage first.
    """

    def __init__(self, channels):
        super().__init__()
        self.config = Config()
        DecoderBlock = eval(self.config.dec_blk)
        LateralBlock = eval(self.config.lat_blk)
        use_ipt = self.config.dec_ipt

        if use_ipt:
            self.split = self.config.dec_ipt_split
            N_dec_ipt = 64
            DBlock = SimpleConvs
            ic = 64
            ipt_cha_opt = 1

            def ipt_width(base):
                # Output width of an input-embedding block.
                return [N_dec_ipt, base // 8][ipt_cha_opt]

            def ipt_in(n_patches):
                # Patches are stacked on the channel axis, 3 channels each.
                return n_patches * 3 if self.split else 3

            self.ipt_blk5 = DBlock(ipt_in(2**10), ipt_width(channels[0]), inter_channels=ic)
            self.ipt_blk4 = DBlock(ipt_in(2**8), ipt_width(channels[0]), inter_channels=ic)
            self.ipt_blk3 = DBlock(ipt_in(2**6), ipt_width(channels[1]), inter_channels=ic)
            self.ipt_blk2 = DBlock(ipt_in(2**4), ipt_width(channels[2]), inter_channels=ic)
            self.ipt_blk1 = DBlock(ipt_in(2**0), ipt_width(channels[3]), inter_channels=ic)
            # Extra channels concatenated before decoder blocks 4..1 and conv_out1.
            extra = [
                ipt_width(channels[0]),
                ipt_width(channels[0]),
                ipt_width(channels[1]),
                ipt_width(channels[2]),
                ipt_width(channels[3]),
            ]
        else:
            self.split = None
            extra = [0, 0, 0, 0, 0]

        self.decoder_block4 = DecoderBlock(channels[0]+extra[0], channels[1])
        self.decoder_block3 = DecoderBlock(channels[1]+extra[1], channels[2])
        self.decoder_block2 = DecoderBlock(channels[2]+extra[2], channels[3])
        self.decoder_block1 = DecoderBlock(channels[3]+extra[3], channels[3]//2)
        self.conv_out1 = nn.Sequential(nn.Conv2d(channels[3]//2+extra[4], 1, 1, 1, 0))

        self.lateral_block4 = LateralBlock(channels[1], channels[1])
        self.lateral_block3 = LateralBlock(channels[2], channels[2])
        self.lateral_block2 = LateralBlock(channels[3], channels[3])

        if self.config.ms_supervision:
            self.conv_ms_spvn_4 = nn.Conv2d(channels[1], 1, 1, 1, 0)
            self.conv_ms_spvn_3 = nn.Conv2d(channels[2], 1, 1, 1, 0)
            self.conv_ms_spvn_2 = nn.Conv2d(channels[3], 1, 1, 1, 0)

            if self.config.out_ref:
                _N = 16
                use_bn = self.config.batch_size > 1

                def gdt_stem(n_in):
                    return nn.Sequential(
                        nn.Conv2d(n_in, _N, 3, 1, 1),
                        nn.BatchNorm2d(_N) if use_bn else nn.Identity(),
                        nn.ReLU(inplace=True),
                    )

                def gdt_head():
                    return nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))

                self.gdt_convs_4 = gdt_stem(channels[1])
                self.gdt_convs_3 = gdt_stem(channels[2])
                self.gdt_convs_2 = gdt_stem(channels[3])

                self.gdt_convs_pred_4 = gdt_head()
                self.gdt_convs_pred_3 = gdt_head()
                self.gdt_convs_pred_2 = gdt_head()

                self.gdt_convs_attn_4 = gdt_head()
                self.gdt_convs_attn_3 = gdt_head()
                self.gdt_convs_attn_2 = gdt_head()

    def get_patches_batch(self, x, p):
        """Cut every sample of ``x`` into patches of ``p``'s spatial size.

        Patches are taken column by column (split along the last axis first,
        then along the second-to-last) and stacked along the channel axis.
        """
        patch_h, patch_w = p.shape[2:]
        per_sample = []
        for idx in range(x.shape[0]):
            tiles = []
            for column in torch.split(x[idx], split_size_or_sections=patch_w, dim=-1):
                for tile in torch.split(column, split_size_or_sections=patch_h, dim=-2):
                    tiles.append(tile.unsqueeze(0))
            per_sample.append(torch.cat(tiles, dim=1))
        return torch.cat(per_sample, dim=0)

    def _inject_input(self, x, feat, ipt_blk, size):
        """Concatenate the embedded (optionally patch-split) input to ``feat``."""
        source = self.get_patches_batch(x, feat) if self.split else x
        source = F.interpolate(source, size=size, mode='bilinear', align_corners=True)
        return torch.cat((feat, ipt_blk(source)), 1)

    def _gdt_gate(self, level, p, m, gdt_gt, outs_gdt_pred, outs_gdt_label):
        """Gate ``p`` with its gradient attention; in training also record label and prediction."""
        p_gdt = getattr(self, 'gdt_convs_{}'.format(level))(p)
        if self.training:
            # Label: ground-truth gradient masked by the stage's side output.
            outs_gdt_label.append(
                gdt_gt * F.interpolate(m, size=gdt_gt.shape[2:], mode='bilinear', align_corners=True)
            )
            outs_gdt_pred.append(getattr(self, 'gdt_convs_pred_{}'.format(level))(p_gdt))
        attention = getattr(self, 'gdt_convs_attn_{}'.format(level))(p_gdt).sigmoid()
        return p * attention

    def forward(self, features):
        """Decode the encoder features.

        Args:
            features: ``[x, x1, x2, x3, x4]``, plus a trailing gradient map
                when training with ``config.out_ref``.

        Returns:
            The list of prediction maps (side outputs first when
            ``config.ms_supervision`` is set, final map last). When training
            with ``config.out_ref`` the result is
            ``([gradient_preds, gradient_labels], outs)`` instead.
        """
        collect_gdt = self.training and self.config.out_ref
        outs_gdt_pred = []
        outs_gdt_label = []
        gdt_gt = None
        if collect_gdt:
            x, x1, x2, x3, x4, gdt_gt = features
        else:
            x, x1, x2, x3, x4 = features
        outs = []

        # ---- stage 4 ----
        if self.config.dec_ipt:
            x4 = self._inject_input(x, x4, self.ipt_blk5, x4.shape[2:])
        p4 = self.decoder_block4(x4)
        m4 = self.conv_ms_spvn_4(p4) if self.config.ms_supervision else None
        if self.config.out_ref:
            p4 = self._gdt_gate(4, p4, m4, gdt_gt, outs_gdt_pred, outs_gdt_label)
        _p4 = F.interpolate(p4, size=x3.shape[2:], mode='bilinear', align_corners=True)
        _p3 = _p4 + self.lateral_block4(x3)

        # ---- stage 3 ----
        if self.config.dec_ipt:
            _p3 = self._inject_input(x, _p3, self.ipt_blk4, x3.shape[2:])
        p3 = self.decoder_block3(_p3)
        m3 = self.conv_ms_spvn_3(p3) if self.config.ms_supervision else None
        if self.config.out_ref:
            p3 = self._gdt_gate(3, p3, m3, gdt_gt, outs_gdt_pred, outs_gdt_label)
        _p3 = F.interpolate(p3, size=x2.shape[2:], mode='bilinear', align_corners=True)
        _p2 = _p3 + self.lateral_block3(x2)

        # ---- stage 2 ----
        if self.config.dec_ipt:
            _p2 = self._inject_input(x, _p2, self.ipt_blk3, x2.shape[2:])
        p2 = self.decoder_block2(_p2)
        m2 = self.conv_ms_spvn_2(p2) if self.config.ms_supervision else None
        if self.config.out_ref:
            p2 = self._gdt_gate(2, p2, m2, gdt_gt, outs_gdt_pred, outs_gdt_label)
        _p2 = F.interpolate(p2, size=x1.shape[2:], mode='bilinear', align_corners=True)
        _p1 = _p2 + self.lateral_block2(x1)

        # ---- stage 1 and output ----
        if self.config.dec_ipt:
            _p1 = self._inject_input(x, _p1, self.ipt_blk2, x1.shape[2:])
        _p1 = self.decoder_block1(_p1)
        _p1 = F.interpolate(_p1, size=x.shape[2:], mode='bilinear', align_corners=True)

        if self.config.dec_ipt:
            _p1 = self._inject_input(x, _p1, self.ipt_blk1, x.shape[2:])
        p1_out = self.conv_out1(_p1)

        if self.config.ms_supervision:
            outs.extend([m4, m3, m2])
        outs.append(p1_out)
        if collect_gdt:
            return ([outs_gdt_pred, outs_gdt_label], outs)
        return outs
2233
+
2234
+
2235
class SimpleConvs(nn.Module):
    """Two stacked 3x3 convolutions (stride 1, padding 1) with no activation in between.

    Args:
        in_channels: Channels of the input.
        out_channels: Channels of the output.
        inter_channels: Channels between the two convolutions.
    """

    def __init__(
        self, in_channels: int, out_channels: int, inter_channels=64
    ) -> None:
        super(SimpleConvs, self).__init__()
        kernel, stride, padding = 3, 1, 1
        self.conv1 = nn.Conv2d(in_channels, inter_channels, kernel, stride, padding)
        self.conv_out = nn.Conv2d(inter_channels, out_channels, kernel, stride, padding)

    def forward(self, x):
        """Return ``conv_out(conv1(x))``; the spatial size is unchanged."""
        hidden = self.conv1(x)
        return self.conv_out(hidden)
BiRefNet/RMBG-2.0/config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "ZhengPeng7/BiRefNet",
3
+ "architectures": [
4
+ "BiRefNet"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "BiRefNet_config.BiRefNetConfig",
8
+ "AutoModelForImageSegmentation": "birefnet.BiRefNet"
9
+ },
10
+ "custom_pipelines": {
11
+ "image-segmentation": {
12
+ "pt": [
13
+ "AutoModelForImageSegmentation"
14
+ ],
15
+ "tf": [],
16
+ "type": "image"
17
+ }
18
+ },
19
+ "bb_pretrained": false,
20
+ "model_type": "birefnet"
21
+ }
BiRefNet/RMBG-2.0/diagram1.png ADDED
BiRefNet/RMBG-2.0/preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "feature_extractor_type": "ViTFeatureExtractor",
6
+ "image_mean": [
7
+ 0.485,
8
+ 0.456,
9
+ 0.406
10
+ ],
11
+ "image_processor_type": "ViTFeatureExtractor",
12
+ "image_std": [
13
+ 0.229,
14
+ 0.224,
15
+ 0.225
16
+ ],
17
+ "resample": 2,
18
+ "rescale_factor": 0.00392156862745098,
19
+ "size": {
20
+ "height": 1024,
21
+ "width": 1024
22
+ }
23
+ }
Joy_caption/README.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ ---
6
+ # Image Captioning App
7
+
8
+ This is a mod of [Wi-zz/joy-caption-pre-alpha](https://huggingface.co/Wi-zz/joy-caption-pre-alpha) and [fancyfeast/joy-caption-alpha-two](https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-two). Thanks to [dominic1021](https://huggingface.co/dominic1021), [IceHibiki](https://huggingface.co/IceHibiki), [BullseyeMxP](https://huggingface.co/BullseyeMxP), [Wakeme](https://huggingface.co/Wakeme).
9
+
10
+ # Notice: I will contribute to Wi-zz after shaping the code.
11
+
12
+ ## Overview
13
+
14
+ This application generates descriptive, natural-language captions for images using advanced ML models. It processes single images or entire directories, combining a CLIP vision model with an LLM to produce accurate, contextual captions, and it also supports captioning NSFW images. This project is an extension of the original author's work, aimed at improving performance. The original repo is located here: https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-two.
15
+
16
+ ## Features
17
+
18
+ - Single image and batch processing
19
+ - Multiple directory support
20
+ - Custom output directory
21
+ - Adjustable batch size
22
+ - Progress tracking
23
+
24
+ ## Usage
25
+
26
+ | Command | Description |
27
+ |---------|-------------|
28
+ | `python app.py image.jpg` | Process a single image |
29
+ | `python app.py /path/to/directory` | Process all images in a directory |
30
+ | `python app.py /path/to/dir1 /path/to/dir2` | Process multiple directories |
31
+ | `python app.py /path/to/dir --output /path/to/output` | Specify output directory |
32
+ | `python app.py /path/to/dir --bs 8` | Set batch size (default: 4) |
33
+
34
+ ## Technical Details
35
+
36
+ - **Models**: CLIP (vision), LLM (language), custom ImageAdapter
37
+ - **Optimization**: CUDA-enabled GPU support
38
+ - **Error Handling**: Skips problematic images in batch processing
39
+
40
+ ## Requirements
41
+
42
+ - Python 3.x
43
+ - PyTorch
44
+ - Transformers library
45
+ - PEFT library
46
+ - CUDA-capable GPU (recommended)
47
+
48
+ ## Installation
49
+
50
+ Windows
51
+
52
+ ```bash
53
+ git clone https://huggingface.co/John6666/joy-caption-alpha-two-cli-mod
54
+ cd joy-caption-alpha-two-cli-mod
55
+ python -m venv venv
56
+ .\venv\Scripts\activate
57
+ # Change as per https://pytorch.org/get-started/locally/
58
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
59
+ pip install -r requirements.txt
60
+ ```
61
+
62
+ Linux
63
+
64
+ ```bash
65
+ git clone https://huggingface.co/John6666/joy-caption-alpha-two-cli-mod
66
+ cd joy-caption-alpha-two-cli-mod
67
+ python3 -m venv venv
68
+ source venv/bin/activate
69
+ pip3 install torch torchvision torchaudio
70
+ pip3 install -r requirements.txt
71
+ ```
72
+
73
+ ## Contributing
74
+
75
+ Contributions are welcome! Please feel free to submit a Pull Request.
76
+
77
+ ## License
78
+
79
+ This project is licensed under the [MIT License](LICENSE).
Joy_caption/app.py ADDED
@@ -0,0 +1,536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.amp.autocast_mode
3
+ import os
4
+ import sys
5
+ import logging
6
+ import warnings
7
+ import argparse
8
+ from PIL import Image
9
+ from pathlib import Path
10
+ from tqdm import tqdm
11
+ from torch import nn
12
+ from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
13
+ from typing import List, Union
14
+ import torchvision.transforms.functional as TVF
15
+ from peft import PeftModel
16
+ import gc
17
+ import sys
18
+ IS_COLAB = 'google.colab' in sys.modules
19
+
20
+ # Constants
21
# File suffixes (compared lower-cased) that are treated as images.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')
HF_TOKEN = os.environ.get("HF_TOKEN")
# Everything checkpoint-related is resolved relative to this script's directory.
BASE_DIR = Path(__file__).resolve().parent
CHECKPOINT_PATH = BASE_DIR / "cgrkzexw-599808"
CLIP_PATH = "google/siglip-so400m-patch14-384"
DEFAULT_MODEL_PATH = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"
#DEFAULT_MODEL_PATH = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" # Default in Alpha One Two.
#DEFAULT_MODEL_PATH = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2" # Works better but full weight.
LORA_PATH = CHECKPOINT_PATH / "text_model"

# Prompt templates per caption type. Each entry holds three variants, in order:
#   [0] no length constraint,
#   [1] numeric word limit (uses the {word_count} placeholder),
#   [2] named length such as "short" (uses the {length} placeholder).
CAPTION_TYPE_MAP = {
    "Descriptive": [
        "Write a descriptive caption for this image in a formal tone.",
        "Write a descriptive caption for this image in a formal tone within {word_count} words.",
        "Write a {length} descriptive caption for this image in a formal tone.",
    ],
    "Descriptive (Informal)": [
        "Write a descriptive caption for this image in a casual tone.",
        "Write a descriptive caption for this image in a casual tone within {word_count} words.",
        "Write a {length} descriptive caption for this image in a casual tone.",
    ],
    "Training Prompt": [
        "Write a stable diffusion prompt for this image.",
        "Write a stable diffusion prompt for this image within {word_count} words.",
        "Write a {length} stable diffusion prompt for this image.",
    ],
    "MidJourney": [
        "Write a MidJourney prompt for this image.",
        "Write a MidJourney prompt for this image within {word_count} words.",
        "Write a {length} MidJourney prompt for this image.",
    ],
    "Booru tag list": [
        "Write a list of Booru tags for this image.",
        "Write a list of Booru tags for this image within {word_count} words.",
        "Write a {length} list of Booru tags for this image.",
    ],
    "Booru-like tag list": [
        "Write a list of Booru-like tags for this image.",
        "Write a list of Booru-like tags for this image within {word_count} words.",
        "Write a {length} list of Booru-like tags for this image.",
    ],
    "Art Critic": [
        "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.",
        "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it within {word_count} words.",
        "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it {length}.",
    ],
    "Product Listing": [
        "Write a caption for this image as though it were a product listing.",
        "Write a caption for this image as though it were a product listing. Keep it under {word_count} words.",
        "Write a {length} caption for this image as though it were a product listing.",
    ],
    "Social Media Post": [
        "Write a caption for this image as if it were being used for a social media post.",
        "Write a caption for this image as if it were being used for a social media post. Limit the caption to {word_count} words.",
        "Write a {length} caption for this image as if it were being used for a social media post.",
    ],
}
77
+
78
class ImageAdapter(nn.Module):
    """Project vision hidden states into the text model's embedding width.

    The adapter is LayerNorm (optional) -> positional embedding (optional) ->
    Linear -> GELU -> Linear, and it brackets the projected tokens with two
    learned marker embeddings.

    NOTE: this file defines ``ImageAdapter`` a second time further down; the
    later definition is the one the rest of the module ends up using.
    """

    def __init__(self, input_features: int, output_features: int, ln1: bool, pos_emb: bool, num_image_tokens: int, deep_extract: bool):
        super().__init__()
        self.deep_extract = deep_extract

        # Deep extraction concatenates five hidden layers along the feature axis.
        if deep_extract:
            input_features *= 5

        self.linear1 = nn.Linear(input_features, output_features)
        self.activation = nn.GELU()
        self.linear2 = nn.Linear(output_features, output_features)
        self.ln1 = nn.LayerNorm(input_features) if ln1 else nn.Identity()
        self.pos_emb = nn.Parameter(torch.zeros(num_image_tokens, input_features)) if pos_emb else None

        # Other tokens (<|image_start|>, <|image_end|>, <|eot_id|>)
        self.other_tokens = nn.Embedding(3, output_features)
        self.other_tokens.weight.data.normal_(mean=0.0, std=0.02)  # Matches HF's implementation of llama3

    def forward(self, vision_outputs: torch.Tensor):
        """Return ``[<|image_start|>, projected image tokens, <|image_end|>]`` embeddings.

        ``vision_outputs`` is indexed per hidden layer (``[-2]``, and with
        ``deep_extract`` also ``[3]``, ``[7]``, ``[13]``, ``[20]``).
        """
        penultimate = vision_outputs[-2]
        if self.deep_extract:
            selected_layers = (
                penultimate,
                vision_outputs[3],
                vision_outputs[7],
                vision_outputs[13],
                vision_outputs[20],
            )
            x = torch.concat(selected_layers, dim=-1)
            assert len(x.shape) == 3, f"Expected 3, got {len(x.shape)}"  # batch, tokens, features
            assert x.shape[-1] == penultimate.shape[-1] * 5, f"Expected {penultimate.shape[-1] * 5}, got {x.shape[-1]}"
        else:
            x = penultimate

        x = self.ln1(x)

        if self.pos_emb is not None:
            assert x.shape[-2:] == self.pos_emb.shape, f"Expected {self.pos_emb.shape}, got {x.shape[-2:]}"
            x = x + self.pos_emb

        x = self.linear2(self.activation(self.linear1(x)))

        # <|image_start|>, IMAGE, <|image_end|>
        batch = x.shape[0]
        marker_ids = torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(batch, -1)
        other_tokens = self.other_tokens(marker_ids)
        assert other_tokens.shape == (batch, 2, x.shape[2]), f"Expected {(x.shape[0], 2, x.shape[2])}, got {other_tokens.shape}"
        start_token = other_tokens[:, 0:1]
        end_token = other_tokens[:, 1:2]
        return torch.cat((start_token, x, end_token), dim=1)

    def get_eot_embedding(self):
        """Return the learned embedding stored at index 2 (<|eot_id|>)."""
        eot_index = torch.tensor([2], device=self.other_tokens.weight.device)
        return self.other_tokens(eot_index).squeeze(0)
129
+
130
+
131
# Global Variables
# Runtime switches read by load_models(); main() rebinds them from the CLI.
IS_NF4 = True                    # load the LLM 4-bit (NF4) quantised instead of bfloat16
IS_LORA = True                   # apply the LoRA text model when it is available
MODEL_PATH = DEFAULT_MODEL_PATH  # LLM repo to load
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Running on {device}")

# Keep console output readable: silence UserWarnings and non-error transformers logs.
warnings.filterwarnings("ignore", category=UserWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)
140
+
141
class ImageAdapter(nn.Module):
    """Map vision-tower hidden states onto the language model's embedding size.

    Pipeline: optional LayerNorm, optional additive positional embedding,
    ``Linear -> GELU -> Linear``, then the result is wrapped between the
    learned <|image_start|> and <|image_end|> embeddings.

    This is the second ``ImageAdapter`` definition in the file and therefore
    the one that is bound to the name once the module has been executed.
    """

    def __init__(self, input_features: int, output_features: int, ln1: bool, pos_emb: bool, num_image_tokens: int, deep_extract: bool):
        super().__init__()
        self.deep_extract = deep_extract

        # With deep extraction five layers are concatenated feature-wise.
        projected_in = input_features * 5 if self.deep_extract else input_features

        self.linear1 = nn.Linear(projected_in, output_features)
        self.activation = nn.GELU()
        self.linear2 = nn.Linear(output_features, output_features)
        if ln1:
            self.ln1 = nn.LayerNorm(projected_in)
        else:
            self.ln1 = nn.Identity()
        if pos_emb:
            self.pos_emb = nn.Parameter(torch.zeros(num_image_tokens, projected_in))
        else:
            self.pos_emb = None

        # (A per-mode embedding token existed upstream but is disabled in this build.)

        # Other tokens (<|image_start|>, <|image_end|>, <|eot_id|>)
        self.other_tokens = nn.Embedding(3, output_features)
        self.other_tokens.weight.data.normal_(mean=0.0, std=0.02)  # Matches HF's implementation of llama3

    def forward(self, vision_outputs: torch.Tensor):
        """Project the selected hidden layer(s) and add the two image marker tokens.

        Uses ``vision_outputs[-2]`` and, when ``deep_extract`` is set, also
        layers 3, 7, 13 and 20 concatenated along the last dimension.
        """
        second_to_last = vision_outputs[-2]
        if not self.deep_extract:
            x = second_to_last
        else:
            x = torch.concat(
                (second_to_last, vision_outputs[3], vision_outputs[7], vision_outputs[13], vision_outputs[20]),
                dim=-1,
            )
            assert len(x.shape) == 3, f"Expected 3, got {len(x.shape)}"  # batch, tokens, features
            assert x.shape[-1] == second_to_last.shape[-1] * 5, f"Expected {second_to_last.shape[-1] * 5}, got {x.shape[-1]}"

        x = self.ln1(x)

        if self.pos_emb is not None:
            assert x.shape[-2:] == self.pos_emb.shape, f"Expected {self.pos_emb.shape}, got {x.shape[-2:]}"
            x = x + self.pos_emb

        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)

        # <|image_start|>, IMAGE, <|image_end|>
        n_batch = x.shape[0]
        start_end_ids = torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(n_batch, -1)
        other_tokens = self.other_tokens(start_end_ids)
        assert other_tokens.shape == (n_batch, 2, x.shape[2]), f"Expected {(x.shape[0], 2, x.shape[2])}, got {other_tokens.shape}"
        pieces = (other_tokens[:, 0:1], x, other_tokens[:, 1:2])
        return torch.cat(pieces, dim=1)

    def get_eot_embedding(self):
        """Return the embedding row reserved for <|eot_id|> (index 2)."""
        index = torch.tensor([2], device=self.other_tokens.weight.device)
        embedding = self.other_tokens(index)
        return embedding.squeeze(0)
201
+
202
def _load_vision_tower(require_custom_weights: bool):
    """Load the vision processor and vision tower, applying fine-tuned weights if present.

    Raises FileNotFoundError when ``require_custom_weights`` is set and
    ``clip_model.pt`` is missing from the checkpoint directory.
    """
    print("Loading CLIP 📎")
    clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
    clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model
    custom_weights = CHECKPOINT_PATH / "clip_model.pt"
    if custom_weights.exists():
        print("Loading VLM's custom vision model 📎")
        # NOTE(security): weights_only=False unpickles arbitrary objects; only
        # load checkpoints that come from a trusted source.
        checkpoint = torch.load(custom_weights, map_location='cpu', weights_only=False)
        # Drop the prefix left on the keys when the checkpoint was saved.
        checkpoint = {k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()}
        clip_model.load_state_dict(checkpoint)
        del checkpoint
    elif require_custom_weights:
        raise FileNotFoundError(f"Missing fine-tuned vision weights: {custom_weights}")
    clip_model.eval().requires_grad_(False).to(device)
    return clip_processor, clip_model


def _load_tokenizer():
    """Load the tokenizer shipped alongside the LoRA text model."""
    print("Loading tokenizer 🪙")
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH / "text_model", use_fast=True)
    if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
        raise TypeError(f"Tokenizer is of type {type(tokenizer)}")
    return tokenizer


def _load_image_adapter(clip_model, text_model):
    """Build the ImageAdapter, load its checkpoint and move it to ``device``."""
    print("Loading image adapter 🖼️")
    image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
    # NOTE(security): same caveat as the vision checkpoint (weights_only=False).
    adapter_state = torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu", weights_only=False)
    image_adapter.load_state_dict(adapter_state)
    return image_adapter.eval().to(device)


def load_models():
    """Load every model needed for captioning.

    Reads the module globals ``IS_NF4``, ``IS_LORA`` and ``MODEL_PATH``.

    Returns:
        ``(clip_processor, clip_model, tokenizer, text_model, image_adapter)``.

    On any loading error the message is printed and the process exits with
    status 1. CUDA cache and garbage are released in every case.
    """
    try:
        if IS_NF4:
            from transformers import BitsAndBytesConfig
            nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                                            bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16)
            print("Loading in NF4")
        else:
            print("Loading in bfloat16")

        # The NF4 path has always insisted on the fine-tuned vision weights,
        # while the bfloat16 path falls back to the stock tower when they are absent.
        clip_processor, clip_model = _load_vision_tower(require_custom_weights=IS_NF4)
        tokenizer = _load_tokenizer()

        print(f"Loading LLM: {MODEL_PATH} 🤖")
        if IS_NF4:
            text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, quantization_config=nf4_config).eval()
        else:
            # device_map="auto" may cause LoRA issue
            text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16).eval()

        # The LoRA is intentionally never applied on the NF4 path (it was
        # switched off there with a hard-coded `False`), so only bfloat16 uses it.
        if not IS_NF4 and IS_LORA and LORA_PATH.exists():
            print("Loading VLM's custom text model 🤖")
            text_model = PeftModel.from_pretrained(model=text_model, model_id=LORA_PATH, device_map=device)
            text_model = text_model.merge_and_unload(safe_merge=True)  # to avoid PEFT bug https://github.com/huggingface/transformers/issues/28515
        else:
            print("VLM's custom text model isn't loaded 🤖")

        # Fix: the adapter is now moved to `device` on both paths. Previously the
        # bfloat16 path left it on the CPU although it receives tensors on `device`.
        image_adapter = _load_image_adapter(clip_model, text_model)
    except Exception as e:
        print(f"Error loading models: {e}")
        sys.exit(1)
    finally:
        torch.cuda.empty_cache()
        gc.collect()
    return clip_processor, clip_model, tokenizer, text_model, image_adapter
275
+
276
def _build_caption_prompt(caption_type: str, caption_length: Union[str, int], extra_options: list[str],
                          name_input: str, custom_prompt: str) -> str:
    """Return the user prompt for the requested caption type, length and extras."""
    # 'any' means no length specified
    length = None if caption_length == "any" else caption_length

    if isinstance(length, str):
        try:
            length = int(length)
        except ValueError:
            pass  # a named length such as "long" stays a string

    # Pick the template variant: 0 = no length, 1 = word count, 2 = named length.
    if length is None:
        map_idx = 0
    elif isinstance(length, int):
        map_idx = 1
    elif isinstance(length, str):
        map_idx = 2
    else:
        raise ValueError(f"Invalid caption length: {length}")

    prompt_str = CAPTION_TYPE_MAP[caption_type][map_idx]

    # Add extra options
    if len(extra_options) > 0:
        prompt_str += " " + " ".join(extra_options)

    # Add name, length, word_count
    prompt_str = prompt_str.format(name=name_input, length=caption_length, word_count=caption_length)

    # A custom prompt overrides everything built above.
    if custom_prompt.strip() != "":
        prompt_str = custom_prompt.strip()
    return prompt_str


@torch.inference_mode()
def stream_chat(input_images: List[Image.Image], caption_type: str, caption_length: Union[str, int], extra_options: list[str], name_input: str, custom_prompt: str,
                max_new_tokens: int, top_p: float, temperature: float, batch_size: int, pbar: tqdm, models: tuple) -> List[str]:
    """Caption every image in ``input_images`` and return the captions in order.

    Args:
        input_images: images to caption.
        caption_type: key of ``CAPTION_TYPE_MAP``.
        caption_length: ``"any"``, a named length, or a word count (int or numeric string).
        extra_options: sentences appended to the prompt.
        name_input: value substituted for ``{name}`` in the prompt.
        custom_prompt: when non-blank, replaces the generated prompt entirely.
        max_new_tokens, top_p, temperature: sampling settings passed to ``generate``.
        batch_size: number of images accounted for per progress-bar update.
        pbar: progress bar to advance, or ``None``.
        models: the tuple returned by ``load_models()``.

    Returns:
        One caption per input image, positionally aligned with ``input_images``.
        An image that fails preprocessing yields an empty string so that
        callers pairing captions with files stay aligned.
    """
    _clip_processor, clip_model, tokenizer, text_model, image_adapter = models
    torch.cuda.empty_cache()
    all_captions = []

    prompt_str = _build_caption_prompt(caption_type, caption_length, extra_options, name_input, custom_prompt)

    # For debugging
    print(f"Prompt: {prompt_str}")

    # ---- Prompt-only work: identical for every image, so it is done once ----
    convo = [
        {
            "role": "system",
            "content": "You are a helpful image captioner.",
        },
        {
            "role": "user",
            "content": prompt_str,
        },
    ]

    # Format the conversation
    convo_string = tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = True)
    assert isinstance(convo_string, str)

    # Tokenize the conversation
    # prompt_str is tokenized separately so we can do the calculations below
    convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False)
    prompt_tokens = tokenizer.encode(prompt_str, return_tensors="pt", add_special_tokens=False, truncation=False)
    assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor)
    convo_tokens = convo_tokens.squeeze(0)  # Squeeze just to make the following easier
    prompt_tokens = prompt_tokens.squeeze(0)

    # Calculate where to inject the image
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    eot_id_indices = (convo_tokens == eot_id).nonzero(as_tuple=True)[0].tolist()
    assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}"

    preamble_len = eot_id_indices[1] - prompt_tokens.shape[0]  # Number of tokens before the prompt

    # Embed the tokens
    convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to(device))

    for start in range(0, len(input_images), batch_size):
        batch = input_images[start:start + batch_size]

        # Fix: iterate the current batch only. Looping over all of
        # `input_images` here captioned every image once per batch.
        for input_image in batch:
            try:
                # Preprocess image
                # NOTE: I found the default processor for so400M to have worse results than just using PIL directly
                image = input_image.resize((384, 384), Image.LANCZOS)
                pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
                pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
                pixel_values = pixel_values.to(device)
            except ValueError as e:
                print(f"Error processing image: {e}")
                print("Skipping this image and continuing...")
                all_captions.append("")  # placeholder keeps results aligned with inputs
                continue

            # Embed image
            # This results in Batch x Image Tokens x Features
            with torch.amp.autocast_mode.autocast(device, enabled=True):
                vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
                image_features = vision_outputs.hidden_states
                embedded_images = image_adapter(image_features).to(device)

            # Construct the input
            input_embeds = torch.cat([
                convo_embeds[:, :preamble_len],  # Part before the prompt
                embedded_images.to(dtype=convo_embeds.dtype),  # Image
                convo_embeds[:, preamble_len:],  # The prompt and anything after it
            ], dim=1).to(device)

            input_ids = torch.cat([
                convo_tokens[:preamble_len].unsqueeze(0),
                torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),  # Dummy tokens for the image (TODO: Should probably use a special token here so as not to confuse any generation algorithms that might be inspecting the input)
                convo_tokens[preamble_len:].unsqueeze(0),
            ], dim=1).to(device)
            attention_mask = torch.ones_like(input_ids)

            generate_ids = text_model.generate(input_ids=input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, do_sample=True,
                                               suppress_tokens=None, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature)

            # Trim off the prompt
            generate_ids = generate_ids[:, input_ids.shape[1]:]
            last_token = generate_ids[0][-1]
            if last_token == tokenizer.eos_token_id or last_token == eot_id:
                generate_ids = generate_ids[:, :-1]

            caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
            all_captions.append(caption.strip())

        # `is not None` avoids relying on tqdm's truthiness rules.
        if pbar is not None:
            pbar.update(len(batch))

    return all_captions
407
+
408
def process_directory(input_dir: Path, output_dir: Path, caption_type: str, caption_length: Union[str, int], extra_options: list[str], name_input: str, custom_prompt: str,
                      max_new_tokens: int, top_p: float, temperature: float, batch_size: int, models: tuple):
    """Caption every not-yet-captioned image in ``input_dir``.

    For each image ``<stem>.<ext>`` a ``<stem>.txt`` is written to
    ``output_dir``; images that already have such a file are skipped.
    Unreadable images are reported and skipped instead of aborting the run,
    and an image without a usable caption gets no ``.txt`` file, so it is
    picked up again on the next run.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    # is_file() keeps directories that merely carry an image-like suffix out.
    image_files = sorted(f for f in input_dir.iterdir() if f.is_file() and f.suffix.lower() in IMAGE_EXTENSIONS)
    images_to_process = [f for f in image_files if not (output_dir / f"{f.stem}.txt").exists()]

    if not images_to_process:
        print("No new images to process.")
        return

    with tqdm(total=len(images_to_process), desc="Processing images", unit="image") as pbar:
        for i in range(0, len(images_to_process), batch_size):
            batch_files = images_to_process[i:i+batch_size]
            loaded_files = []
            batch_images = []
            try:
                for path in batch_files:
                    try:
                        # The context manager closes the source file handle;
                        # convert() returns an independent in-memory copy.
                        with Image.open(path) as raw_image:
                            batch_images.append(raw_image.convert('RGB'))
                    except (OSError, ValueError) as e:
                        print(f"Error opening image {path.name}: {e}")
                        print("Skipping this image and continuing...")
                        pbar.update(1)  # stream_chat never sees this file
                        continue
                    loaded_files.append(path)

                if not batch_images:
                    continue

                captions = stream_chat(batch_images, caption_type, caption_length, extra_options, name_input, custom_prompt,
                                       max_new_tokens, top_p, temperature, batch_size, pbar, models)

                # Writing from a shorter/longer list would attach captions to
                # the wrong files, so drop the whole batch in that case.
                if len(captions) != len(loaded_files):
                    print(f"Expected {len(loaded_files)} captions, got {len(captions)}; skipping this batch.")
                    continue

                for file, caption in zip(loaded_files, captions):
                    if not caption:
                        print(f"No caption generated for {file.name}; skipping.")
                        continue
                    with open(output_dir / f"{file.stem}.txt", 'w', encoding='utf-8') as f:
                        f.write(caption)
            finally:
                # Release image memory even if captioning raised.
                for img in batch_images:
                    img.close()
432
+
433
def parse_arguments():
    """Parse the command line and return the resulting ``argparse.Namespace``.

    ``--extra`` may be given several times; each occurrence adds one of the
    predefined sentences to ``args.extra`` (a list, empty by default).
    """
    extra_option_choices = [
        "If there is a person/character in the image you must refer to them as {name}.",
        "Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style).",
        "Include information about lighting.",
        "Include information about camera angle.",
        "Include information about whether there is a watermark or not.",
        "Include information about whether there are JPEG artifacts or not.",
        "If it is a photo you MUST include information about what camera was likely used and details such as aperture, shutter speed, ISO, etc.",
        "Do NOT include anything sexual; keep it PG.",
        "Do NOT mention the image's resolution.",
        "You MUST include information about the subjective aesthetic quality of the image from low to very high.",
        "Include information on the image's composition style, such as leading lines, rule of thirds, or symmetry.",
        "Do NOT mention any text that is in the image.",
        "Specify the depth of field and whether the background is in focus or blurred.",
        "If applicable, mention the likely use of artificial or natural lighting sources.",
        "Do NOT use any ambiguous language.",
        "Include whether the image is sfw, suggestive, or nsfw.",
        "ONLY describe the most important elements of the image.",
    ]

    parser = argparse.ArgumentParser(description="Process images and generate captions.")
    parser.add_argument("input", nargs='+', help="Input image file or directory (or multiple directories)")
    parser.add_argument("--output", help="Output directory (optional)")
    parser.add_argument("--bs", type=int, default=4, help="Batch size (default: 4)")
    # Derive the choices from the prompt table so the two cannot drift apart.
    parser.add_argument("--type", type=str, default="Descriptive",
                        choices=list(CAPTION_TYPE_MAP),
                        help='Caption Type (default: "Descriptive")')
    parser.add_argument("--len", default="long",
                        choices=["any", "very short", "short", "medium-length", "long", "very long"] + [str(i) for i in range(20, 261, 10)],
                        help='Caption Length (default: "long")')
    # Fix: `type=list[str]` turned the argument into a list of characters,
    # which never matched `choices`, so the option could not be used at all.
    # `append` collects one validated sentence per `--extra` occurrence.
    parser.add_argument("--extra", action="append", default=[], type=str, metavar="OPTION",
                        choices=extra_option_choices,
                        help='Extra Options (repeat --extra to add several)')
    parser.add_argument("--name", type=str, default="", help='Person/Character Name (if applicable)')
    parser.add_argument("--prompt", type=str, default="", help='Custom Prompt (optional, will override all other settings)')
    parser.add_argument("--model", type=str, default=DEFAULT_MODEL_PATH,
                        help='Huggingface LLM repo (default: "unsloth/Meta-Llama-3.1-8B-bnb-4bit")')
    parser.add_argument("--bf16", action="store_true", default=False, help="Use bfloat16 (default: NF4)")
    parser.add_argument("--nolora", action="store_true", default=False, help="Disable VLM's custom text model (default: Enable)")
    parser.add_argument("--tokens", type=int, default=300, help="Max tokens (default: 300)")
    parser.add_argument("--topp", type=float, default=0.9, help="Top-P (default: 0.9)")
    parser.add_argument("--temp", type=float, default=0.6, help="Temperature (default: 0.6)")
    return parser.parse_args()
474
+
475
def is_valid_repo(repo_id):
    """Return True when ``repo_id`` looks like ``owner/name`` and exists on the Hub.

    Any exception raised while checking is printed and reported as False.
    """
    from huggingface_hub import HfApi
    import re
    try:
        well_formed = re.fullmatch(r'^[^/,\s\"\']+/[^/,\s\"\']+$', repo_id) is not None
        if not well_formed:
            return False
        hub = HfApi()
        return True if hub.repo_exists(repo_id=repo_id) else False
    except Exception as e:
        print(f"Failed to connect {repo_id}. {e}")
        return False
486
+
487
def main():
    """Command-line entry point: load the models, then caption every input path.

    Each positional input may be a single image file or a directory. A single
    image gets its caption written next to it as ``<name>.txt``; a directory
    is handled by ``process_directory`` (output goes to ``--output`` when
    given, otherwise into the directory itself).
    """
    global MODEL_PATH, IS_NF4, IS_LORA
    args = parse_arguments()
    # argparse (nargs='+') guarantees at least one input, so no usage fallback is needed.
    input_paths = [Path(input_path) for input_path in args.input]
    batch_size = args.bs
    caption_type = args.type
    caption_length = args.len
    extra_options = args.extra
    name_input = args.name
    custom_prompt = args.prompt
    max_new_tokens = args.tokens
    top_p = args.topp
    temperature = args.temp
    IS_NF4 = not args.bf16
    IS_LORA = not args.nolora
    if not is_valid_repo(args.model):
        # Say why we stop; is_valid_repo itself only prints on exceptions.
        print(f"Invalid or unreachable Hugging Face repo: {args.model}")
        sys.exit(1)
    MODEL_PATH = args.model
    models = load_models()

    for input_path in input_paths:
        if input_path.is_file() and input_path.suffix.lower() in IMAGE_EXTENSIONS:
            output_path = input_path.with_suffix('.txt')
            print(f"Processing single image 🎞️: {input_path.name}")
            try:
                # Close the source file right away; convert() returns a copy.
                with Image.open(input_path) as raw_image:
                    image = raw_image.convert('RGB')
            except (OSError, ValueError) as e:
                print(f"Error opening image {input_path.name}: {e}")
                print("Skipping...")
                continue
            try:
                with tqdm(total=1, desc="Processing image", unit="image") as pbar:
                    captions = stream_chat([image], caption_type, caption_length, extra_options, name_input, custom_prompt,
                                           max_new_tokens, top_p, temperature, 1, pbar, models)
            finally:
                image.close()
            # stream_chat may yield nothing usable for an image it had to skip.
            if not captions or not captions[0]:
                print(f"No caption generated for {input_path.name}")
                continue
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(captions[0])
            print(f"Output saved to {output_path}")
        elif input_path.is_dir():
            output_path = Path(args.output) if args.output else input_path
            print(f"Processing directory 📁: {input_path}")
            print(f"Output directory 📦: {output_path}")
            print(f"Batch size 🗄️: {batch_size}")
            process_directory(input_path, output_path, caption_type, caption_length, extra_options, name_input, custom_prompt,
                              max_new_tokens, top_p, temperature, batch_size, models)
        else:
            print(f"Invalid input: {input_path}")
            print("Skipping...")


if __name__ == "__main__":
    main()
Joy_caption/cgrkzexw-599808/config.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_project: joy-caption-1
2
+ device_batch_size: 2
3
+ batch_size: 256
4
+ learning_rate: 0.0002
5
+ warmup_samples: 18000
6
+ max_samples: 600000
7
+ save_every: 50000
8
+ test_every: 50000
9
+ use_amp: true
10
+ grad_scaler: true
11
+ lr_scheduler_type: cosine
12
+ min_lr_ratio: 0.0
13
+ allow_tf32: true
14
+ seed: 69
15
+ num_workers: 8
16
+ optimizer_type: adamw
17
+ adam_beta1: 0.9
18
+ adam_beta2: 0.999
19
+ adam_eps: 1.0e-08
20
+ adam_weight_decay: 0.0
21
+ clip_grad_norm: 1.0
22
+ dataset: fancyfeast/joy-captioning-20240924a
23
+ clip_model: google/siglip-so400m-patch14-384
24
+ text_model: ../lora-train/lora_model_vwbzycxh
25
+ resume: null
26
+ gradient_checkpointing: false
27
+ test_size: 2048
28
+ grad_scaler_init: 65536.0
29
+ max_caption_length: 257
30
+ num_image_tokens: 32
31
+ adapter_type: mlp
32
+ text_model_dtype: bfloat16
33
+ pre_test: false
34
+ train_image_model: true
35
+ image_model_lr: null
36
+ train_lora: true
37
+ lora_r: 64
38
+ lora_alpha: 16
39
+ lora_dropout: 0.1
Joy_caption/cgrkzexw-599808/text_model/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/Meta-Llama-3.1-8B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
Joy_caption/cgrkzexw-599808/text_model/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/Meta-Llama-3.1-8B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "v_proj",
25
+ "gate_proj",
26
+ "down_proj",
27
+ "o_proj",
28
+ "k_proj",
29
+ "up_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
Joy_caption/cgrkzexw-599808/text_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Joy_caption/joycaption_alpha_two_cli_mod.ipynb ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "id": "ZgkQ4kDil23W"
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "!git clone https://huggingface.co/John6666/joy-caption-alpha-two-cli-mod/\n",
12
+ "!pip install -r /content/joy-caption-alpha-two-cli-mod/requirements.txt\n",
13
+ "!pip install bitsandbytes triton\n",
14
+ "!pip install accelerate==0.30.1\n",
15
+ "!python /content/joy-caption-alpha-two-cli-mod/app.py"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": null,
21
+ "metadata": {
22
+ "id": "gPwD8BVsnU7p"
23
+ },
24
+ "outputs": [],
25
+ "source": [
26
+ "!python /content/joy-caption-alpha-two-cli-mod/app.py"
27
+ ]
28
+ }
29
+ ],
30
+ "metadata": {
31
+ "accelerator": "GPU",
32
+ "colab": {
33
+ "gpuType": "T4",
34
+ "provenance": []
35
+ },
36
+ "kernelspec": {
37
+ "display_name": "Python 3",
38
+ "name": "python3"
39
+ },
40
+ "language_info": {
41
+ "name": "python"
42
+ }
43
+ },
44
+ "nbformat": 4,
45
+ "nbformat_minor": 0
46
+ }
Joy_caption/requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ huggingface_hub>=0.23.4
2
+ accelerate
3
+ torch
4
+ transformers==4.44.0
5
+ sentencepiece
6
+ bitsandbytes
7
+ Pillow
8
+ protobuf
9
+ peft==0.12.0
10
+ torchvision
LLM/Florence-2-base-PromptGen-v2.0/configuration_florence2.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import warnings
15
+ """ Florence-2 configuration"""
16
+
17
+ from typing import Optional
18
+
19
+ from transformers import AutoConfig
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.utils import logging
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+ class Florence2VisionConfig(PretrainedConfig):
26
+ r"""
27
+ This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
28
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
29
+ defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
30
+
31
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
32
+ documentation from [`PretrainedConfig`] for more information.
33
+
34
+ Args:
35
+ drop_path_rate (`float`, *optional*, defaults to 0.1):
36
+ The dropout rate of the drop path layer.
37
+ patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
38
+ The patch size of the image.
39
+ patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
40
+ The patch stride of the image.
41
+ patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
42
+ The patch padding of the image.
43
+ patch_prenorm (`List[bool]`, *optional*, defaults to [False, True, True, True]):
44
+ Whether to apply layer normalization before the patch embedding layer.
45
+ enable_checkpoint (`bool`, *optional*, defaults to False):
46
+ Whether to enable checkpointing.
47
+ dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
48
+ The dimension of the embedding layer.
49
+ num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
50
+ The number of attention heads.
51
+ num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
52
+ The number of groups.
53
+ depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
54
+ The depth of the model.
55
+ window_size (`int`, *optional*, defaults to 12):
56
+ The window size of the model.
57
+ projection_dim (`int`, *optional*, defaults to 1024):
58
+ The dimension of the projection layer.
59
+ visual_temporal_embedding (`dict`, *optional*):
60
+ The configuration of the visual temporal embedding.
61
+ image_pos_embed (`dict`, *optional*):
62
+ The configuration of the image position embedding.
63
+ image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
64
+ The source of the image feature.
65
+ Example:
66
+
67
+ ```python
68
+ >>> from transformers import Florence2VisionConfig, Florence2VisionModel
69
+
70
+ >>> # Initializing a Florence2 Vision style configuration
71
+ >>> configuration = Florence2VisionConfig()
72
+
73
+ >>> # Initializing a model (with random weights)
74
+ >>> model = Florence2VisionModel(configuration)
75
+
76
+ >>> # Accessing the model configuration
77
+ >>> configuration = model.config
78
+ ```"""
79
+
80
+ model_type = "florence2_vision"
81
+ keys_to_ignore_at_inference = ["past_key_values"]
82
+
83
+ def __init__(
84
+ self,
85
+ drop_path_rate=0.1,
86
+ patch_size=[7, 3, 3, 3],
87
+ patch_stride=[4, 2, 2, 2],
88
+ patch_padding=[3, 1, 1, 1],
89
+ patch_prenorm=[False, True, True, True],
90
+ enable_checkpoint=False,
91
+ dim_embed=[256, 512, 1024, 2048],
92
+ num_heads=[8, 16, 32, 64],
93
+ num_groups=[8, 16, 32, 64],
94
+ depths=[1, 1, 9, 1],
95
+ window_size=12,
96
+ projection_dim=1024,
97
+ visual_temporal_embedding=None,
98
+ image_pos_embed=None,
99
+ image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
100
+ **kwargs,
101
+ ):
102
+ self.drop_path_rate = drop_path_rate
103
+ self.patch_size = patch_size
104
+ self.patch_stride = patch_stride
105
+ self.patch_padding = patch_padding
106
+ self.patch_prenorm = patch_prenorm
107
+ self.enable_checkpoint = enable_checkpoint
108
+ self.dim_embed = dim_embed
109
+ self.num_heads = num_heads
110
+ self.num_groups = num_groups
111
+ self.depths = depths
112
+ self.window_size = window_size
113
+ self.projection_dim = projection_dim
114
+ self.visual_temporal_embedding = visual_temporal_embedding
115
+ self.image_pos_embed = image_pos_embed
116
+ self.image_feature_source = image_feature_source
117
+
118
+ super().__init__(**kwargs)
119
+
120
+
121
+
122
+ class Florence2LanguageConfig(PretrainedConfig):
123
+ r"""
124
+ This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
125
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
126
+ defaults will yield a similar configuration to that of the BART
127
+ [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
128
+
129
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
130
+ documentation from [`PretrainedConfig`] for more information.
131
+
132
+
133
+ Args:
134
+ vocab_size (`int`, *optional*, defaults to 51289):
135
+ Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
136
+ `input_ids` passed when calling [`Florence2LanguageModel`].
137
+ d_model (`int`, *optional*, defaults to 1024):
138
+ Dimensionality of the layers and the pooler layer.
139
+ encoder_layers (`int`, *optional*, defaults to 12):
140
+ Number of encoder layers.
141
+ decoder_layers (`int`, *optional*, defaults to 12):
142
+ Number of decoder layers.
143
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
144
+ Number of attention heads for each attention layer in the Transformer encoder.
145
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
146
+ Number of attention heads for each attention layer in the Transformer decoder.
147
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
148
+ Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
149
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
150
+ Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
151
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
152
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
153
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
154
+ dropout (`float`, *optional*, defaults to 0.1):
155
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
156
+ attention_dropout (`float`, *optional*, defaults to 0.0):
157
+ The dropout ratio for the attention probabilities.
158
+ activation_dropout (`float`, *optional*, defaults to 0.0):
159
+ The dropout ratio for activations inside the fully connected layer.
160
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
161
+ The dropout ratio for classifier.
162
+ max_position_embeddings (`int`, *optional*, defaults to 1024):
163
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
164
+ just in case (e.g., 512 or 1024 or 2048).
165
+ init_std (`float`, *optional*, defaults to 0.02):
166
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
167
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
168
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
169
+ for more details.
170
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
171
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
172
+ for more details.
173
+ scale_embedding (`bool`, *optional*, defaults to `False`):
174
+ Whether to scale embeddings by a factor of sqrt(d_model).
175
+ use_cache (`bool`, *optional*, defaults to `True`):
176
+ Whether or not the model should return the last key/values attentions (not used by all models).
177
+ num_labels (`int`, *optional*, defaults to 3):
178
+ The number of labels to use in [`Florence2LanguageForSequenceClassification`].
179
+ forced_eos_token_id (`int`, *optional*, defaults to 2):
180
+ The id of the token to force as the last generated token when `max_length` is reached. Usually set to
181
+ `eos_token_id`.
182
+
183
+ Example:
184
+
185
+ ```python
186
+ >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel
187
+
188
+ >>> # Initializing a Florence2 Language style configuration
189
+ >>> configuration = Florence2LanguageConfig()
190
+
191
+ >>> # Initializing a model (with random weights)
192
+ >>> model = Florence2LangaugeModel(configuration)
193
+
194
+ >>> # Accessing the model configuration
195
+ >>> configuration = model.config
196
+ ```"""
197
+
198
+ model_type = "florence2_language"
199
+ keys_to_ignore_at_inference = ["past_key_values"]
200
+ attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
201
+
202
+ def __init__(
203
+ self,
204
+ vocab_size=51289,
205
+ max_position_embeddings=1024,
206
+ encoder_layers=12,
207
+ encoder_ffn_dim=4096,
208
+ encoder_attention_heads=16,
209
+ decoder_layers=12,
210
+ decoder_ffn_dim=4096,
211
+ decoder_attention_heads=16,
212
+ encoder_layerdrop=0.0,
213
+ decoder_layerdrop=0.0,
214
+ activation_function="gelu",
215
+ d_model=1024,
216
+ dropout=0.1,
217
+ attention_dropout=0.0,
218
+ activation_dropout=0.0,
219
+ init_std=0.02,
220
+ classifier_dropout=0.0,
221
+ scale_embedding=False,
222
+ use_cache=True,
223
+ num_labels=3,
224
+ pad_token_id=1,
225
+ bos_token_id=0,
226
+ eos_token_id=2,
227
+ is_encoder_decoder=True,
228
+ decoder_start_token_id=2,
229
+ forced_eos_token_id=2,
230
+ **kwargs,
231
+ ):
232
+ self.vocab_size = vocab_size
233
+ self.max_position_embeddings = max_position_embeddings
234
+ self.d_model = d_model
235
+ self.encoder_ffn_dim = encoder_ffn_dim
236
+ self.encoder_layers = encoder_layers
237
+ self.encoder_attention_heads = encoder_attention_heads
238
+ self.decoder_ffn_dim = decoder_ffn_dim
239
+ self.decoder_layers = decoder_layers
240
+ self.decoder_attention_heads = decoder_attention_heads
241
+ self.dropout = dropout
242
+ self.attention_dropout = attention_dropout
243
+ self.activation_dropout = activation_dropout
244
+ self.activation_function = activation_function
245
+ self.init_std = init_std
246
+ self.encoder_layerdrop = encoder_layerdrop
247
+ self.decoder_layerdrop = decoder_layerdrop
248
+ self.classifier_dropout = classifier_dropout
249
+ self.use_cache = use_cache
250
+ self.num_hidden_layers = encoder_layers
251
+ self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
252
+
253
+ super().__init__(
254
+ num_labels=num_labels,
255
+ pad_token_id=pad_token_id,
256
+ bos_token_id=bos_token_id,
257
+ eos_token_id=eos_token_id,
258
+ is_encoder_decoder=is_encoder_decoder,
259
+ decoder_start_token_id=decoder_start_token_id,
260
+ forced_eos_token_id=forced_eos_token_id,
261
+ **kwargs,
262
+ )
263
+
264
+ # ensure backward compatibility for BART CNN models
265
+ if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
266
+ self.forced_bos_token_id = self.bos_token_id
267
+ warnings.warn(
268
+ f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
269
+ "The config can simply be saved and uploaded again to be fixed."
270
+ )
271
+
272
+ class Florence2Config(PretrainedConfig):
273
+ r"""
274
+ This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate a
275
+ Florence-2 model according to the specified arguments, defining the model architecture.
276
+
277
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
278
+ documentation from [`PretrainedConfig`] for more information.
279
+
280
+ Args:
281
+ vision_config (`Florence2VisionConfig`, *optional*):
282
+ Custom vision config or dict
283
+ text_config (`Union[AutoConfig, dict]`, *optional*):
284
+ The config object of the text backbone.
285
+ ignore_index (`int`, *optional*, defaults to -100):
286
+ The ignore index for the loss function.
287
+ vocab_size (`int`, *optional*, defaults to 51289):
288
+ Vocabulary size of the Florence2 model. Defines the number of different tokens that can be represented by the
289
+ `input_ids` passed when calling [`~Florence2ForConditionalGeneration`]
290
+ projection_dim (`int`, *optional*, defaults to 1024):
291
+ Dimension of the multimodal projection space.
292
+
293
+ Example:
294
+
295
+ ```python
296
+ >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig
297
+
298
+ >>> # Initializing a clip-like vision config
299
+ >>> vision_config = CLIPVisionConfig()
300
+
301
+ >>> # Initializing a Bart config
302
+ >>> text_config = BartConfig()
303
+
304
+ >>> # Initializing a Florence-2 configuration
305
+ >>> configuration = Florence2Config(vision_config, text_config)
306
+
307
+ >>> # Initializing a model from the florence-2 configuration
308
+ >>> model = Florence2ForConditionalGeneration(configuration)
309
+
310
+ >>> # Accessing the model configuration
311
+ >>> configuration = model.config
312
+ ```"""
313
+
314
+ model_type = "florence2"
315
+ is_composition = False
316
+
317
+ def __init__(
318
+ self,
319
+ vision_config=None,
320
+ text_config=None,
321
+ ignore_index=-100,
322
+ vocab_size=51289,
323
+ projection_dim=1024,
324
+ **kwargs,
325
+ ):
326
+ self.ignore_index = ignore_index
327
+ self.vocab_size = vocab_size
328
+ self.projection_dim = projection_dim
329
+ if vision_config is not None:
330
+ vision_config = PretrainedConfig(**vision_config)
331
+ self.vision_config = vision_config
332
+ self.vocab_size = self.vocab_size
333
+
334
+ self.text_config = text_config
335
+ if text_config is not None:
336
+ self.text_config = Florence2LanguageConfig(**text_config)
337
+
338
+
339
+ super().__init__(**kwargs)
340
+
LLM/Florence-2-base-PromptGen-v2.0/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "early_stopping": true,
6
+ "eos_token_id": 2,
7
+ "forced_bos_token_id": 0,
8
+ "forced_eos_token_id": 2,
9
+ "no_repeat_ngram_size": 3,
10
+ "num_beams": 3,
11
+ "pad_token_id": 1,
12
+ "transformers_version": "4.44.2"
13
+ }
LLM/Florence-2-base-PromptGen-v2.0/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Florence-2-base-PromptGen-v2.0/processing_florence2.py ADDED
@@ -0,0 +1,1088 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for Florence-2.
17
+ """
18
+
19
+ import re
20
+ import logging
21
+ from typing import List, Optional, Union
22
+ import numpy as np
23
+
24
+ import torch
25
+
26
+ from transformers.feature_extraction_utils import BatchFeature
27
+ from transformers.image_utils import ImageInput, is_valid_image
28
+ from transformers.processing_utils import ProcessorMixin
29
+ from transformers.tokenization_utils_base import (
30
+ PaddingStrategy,
31
+ PreTokenizedInput,
32
+ TextInput,
33
+ TruncationStrategy,
34
+ )
35
+ from transformers.utils import TensorType
36
+
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ # Copied from transformers.models.idefics2.processing_idefics2.is_url
41
+ def is_url(val) -> bool:
42
+ return isinstance(val, str) and val.startswith("http")
43
+
44
+ # Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
45
+ def is_image_or_image_url(elem):
46
+ return is_url(elem) or is_valid_image(elem)
47
+
48
+
49
+ def _is_str_or_image(elem):
50
+ return isinstance(elem, (str)) or is_image_or_image_url(elem)
51
+
52
+
53
class Florence2Processor(ProcessorMixin):
    r"""
    Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single
    processor.

    [`Florence2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BartTokenizerFast`]. See the
    [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.

    Args:
        image_processor ([`CLIPImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`BartTokenizerFast`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("BartTokenizer", "BartTokenizerFast")

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
    ):
        """
        Register the Florence-2 special tokens on `tokenizer` and build the task lookup tables.

        Raises:
            ValueError: If `image_processor` or `tokenizer` is `None`, or if `image_processor` has no
                `image_seq_length` attribute.
        """
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        if not hasattr(image_processor, "image_seq_length"):
            raise ValueError("Image processor is missing an `image_seq_length` attribute.")

        self.image_seq_length = image_processor.image_seq_length

        # The order of this list is preserved exactly as it is handed to `add_special_tokens`.
        tokens_to_add = {
            'additional_special_tokens': \
                tokenizer.additional_special_tokens + \
                ['<od>', '</od>', '<ocr>', '</ocr>'] + \
                [f'<loc_{x}>' for x in range(1000)] + \
                ['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
        }
        tokenizer.add_special_tokens(tokens_to_add)

        # Task token -> name of the post-processing routine applied to the generated text.
        self.tasks_answer_post_processing_type = {
            '<OCR>': 'pure_text',
            '<OCR_WITH_REGION>': 'ocr',
            '<CAPTION>': 'pure_text',
            '<DETAILED_CAPTION>': 'pure_text',
            '<MORE_DETAILED_CAPTION>': 'pure_text',
            '<OD>': 'description_with_bboxes',
            '<DENSE_REGION_CAPTION>': 'description_with_bboxes',
            '<CAPTION_TO_PHRASE_GROUNDING>': "phrase_grounding",
            '<REFERRING_EXPRESSION_SEGMENTATION>': 'polygons',
            '<REGION_TO_SEGMENTATION>': 'polygons',
            '<OPEN_VOCABULARY_DETECTION>': 'description_with_bboxes_or_polygons',
            '<REGION_TO_CATEGORY>': 'pure_text',
            '<REGION_TO_DESCRIPTION>': 'pure_text',
            '<REGION_TO_OCR>': 'pure_text',
            '<REGION_PROPOSAL>': 'bboxes'
        }

        # Task tokens that must appear alone; each is replaced by a fixed prompt.
        self.task_prompts_without_inputs = {
            '<OCR>': 'What is the text in the image?',
            '<OCR_WITH_REGION>': 'What is the text in the image, with regions?',
            '<CAPTION>': 'What does the image describe?',
            '<DETAILED_CAPTION>': 'Describe in detail what is shown in the image.',
            '<MORE_DETAILED_CAPTION>': 'Describe with a paragraph what is shown in the image.',
            '<OD>': 'Locate the objects with category name in the image.',
            '<DENSE_REGION_CAPTION>': 'Locate the objects in the image, with their descriptions.',
            '<REGION_PROPOSAL>': 'Locate the region proposals in the image.'
        }

        # Task tokens followed by user text; the remainder of the string fills `{input}`.
        self.task_prompts_with_input = {
            '<CAPTION_TO_PHRASE_GROUNDING>': "Locate the phrases in the caption: {input}",
            '<REFERRING_EXPRESSION_SEGMENTATION>': 'Locate {input} in the image with mask',
            '<REGION_TO_SEGMENTATION>': 'What is the polygon mask of region {input}',
            '<OPEN_VOCABULARY_DETECTION>': 'Locate {input} in the image.',
            '<REGION_TO_CATEGORY>': 'What is the region {input}?',
            '<REGION_TO_DESCRIPTION>': 'What does the region {input} describe?',
            '<REGION_TO_OCR>': 'What text is in the region {input}?',
        }

        # Built after `add_special_tokens` because the post-processor snapshots
        # `tokenizer.all_special_tokens` when it is constructed.
        self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)

        super().__init__(image_processor, tokenizer)

    def _construct_prompts(self, text):
        """
        Replace task tokens in each entry of `text` with the corresponding natural-language prompt.

        Entries that contain no known task token are passed through unchanged.

        Args:
            text (`List[str]`): Prompts, each optionally containing one task token.

        Returns:
            `List[str]`: The expanded prompts, in the same order as `text`.

        Raises:
            AssertionError: If an entry contains an input-free task token together with any other text.
        """
        prompts = []
        for _text in text:
            # 1. fixed task prompts without additional inputs
            for task_token, task_prompt in self.task_prompts_without_inputs.items():
                if task_token in _text:
                    assert _text == task_token, f"Task token {task_token} should be the only token in the text."
                    _text = task_prompt
                    break
            # 2. task prompts with additional inputs
            for task_token, task_prompt in self.task_prompts_with_input.items():
                if task_token in _text:
                    _text = task_prompt.format(input=_text.replace(task_token, ''))
                    break
            prompts.append(_text)
        return prompts

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        images: ImageInput = None,
        tokenize_newline_separately: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length=None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
        do_resize: bool = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        data_format: Optional["ChannelDimension"] = "channels_first",  # noqa: F821
        input_data_format: Optional[
            Union[str, "ChannelDimension"]  # noqa: F821
        ] = None,
        resample: "PILImageResampling" = None,  # noqa: F821
        do_convert_rgb: bool = None,
        do_thumbnail: bool = None,
        do_align_long_axis: bool = None,
        do_rescale: bool = None,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequence(s) and image(s). The images are run through
        the image processor, task tokens in `text` are expanded into prompts, and the prompts are tokenized.

        Args:
            text (`str`, `List[str]`):
                The prompt or batch of prompts. A prompt may be a task token such as `<OD>`, or a task token
                followed by extra input for the tasks that take one. When `None`, a warning is logged and an
                empty prompt is used.
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Required.
            tokenize_newline_separately (`bool`, defaults to `True`):
                Accepted for signature compatibility; not used by this method.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Padding strategy forwarded to the tokenizer.
            truncation (`bool`, *optional*):
                Truncation strategy forwarded to the tokenizer.
            max_length (`int`, *optional*):
                Total sequence budget. `image_seq_length` is subtracted from it before it is forwarded to the
                tokenizer, so that the image tokens are accounted for.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                Tensor framework for the returned values; forwarded to both the image processor and the
                tokenizer.
            do_resize, do_normalize, image_mean, image_std, data_format, input_data_format, resample,
            do_convert_rgb, do_rescale:
                Forwarded unchanged to the image processor.
            do_thumbnail, do_align_long_axis:
                Accepted for signature compatibility; not used by this method.

        Returns:
            [`BatchFeature`]: The tokenizer outputs (requested without `token_type_ids`) plus:

            - **pixel_values** -- Pixel values produced by the image processor.

        Raises:
            ValueError: If `images` is `None`, or if both `text` and `images` are lists and there are fewer
                images than prompts.
        """
        if images is None:
            raise ValueError("`images` are expected as arguments to a `Florence2Processor` instance.")
        if text is None:
            logger.warning_once(
                "You are using Florence-2 without a text prompt."
            )
            text = ""

        if isinstance(text, list) and isinstance(images, list):
            if len(images) < len(text):
                raise ValueError(
                    f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image."
                )
        # A single prompt is wrapped so that the rest of the method always handles a batch.
        if _is_str_or_image(text):
            text = [text]

        pixel_values = self.image_processor(
            images,
            do_resize=do_resize,
            do_normalize=do_normalize,
            do_rescale=do_rescale,
            return_tensors=return_tensors,
            image_mean=image_mean,
            image_std=image_std,
            input_data_format=input_data_format,
            data_format=data_format,
            resample=resample,
            do_convert_rgb=do_convert_rgb,
        )["pixel_values"]

        if max_length is not None:
            max_length -= self.image_seq_length  # max_length has to account for the image tokens

        text = self._construct_prompts(text)

        inputs = self.tokenizer(
            text,
            return_tensors=return_tensors,
            padding=padding,
            max_length=max_length,
            truncation=truncation,
            return_token_type_ids=False,
        )

        return BatchFeature(data={**inputs, "pixel_values": pixel_values})

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
    def model_input_names(self):
        """Tokenizer input names followed by image-processor input names, de-duplicated in order."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    def post_process_generation(self, text, task, image_size):
        """
        Post-process the output of the model to each of the task outputs.

        Args:
            text (`str`): The text to post-process.
            task (`str`): The task token (e.g. `<OD>`). Unknown tasks are treated as `pure_text`.
            image_size (`Tuple[int, int]`): The size of the image as `(width, height)`; the quantizers and the
                OCR parser in this file unpack it in that order.

        Returns:
            `dict`: A single-entry dict `{task: answer}`. `answer` is a string for `pure_text` tasks and a dict
            of lists (boxes / quad boxes / polygons with their labels) for the other tasks.

        Raises:
            ValueError: If the post-processing type resolved for `task` has no formatting branch here.
        """
        answer_type = self.tasks_answer_post_processing_type.get(task, 'pure_text')
        task_answer = self.post_processor(
            text=text,
            image_size=image_size,
            parse_tasks=answer_type,
        )[answer_type]

        if answer_type == 'pure_text':
            # remove the special tokens
            final_answer = task_answer.replace('<s>', '').replace('</s>', '')
        elif answer_type in ['od', 'description_with_bboxes', 'bboxes']:
            final_answer = {
                'bboxes': [_od_instance['bbox'] for _od_instance in task_answer],
                'labels': [str(_od_instance['cat_name']) for _od_instance in task_answer],
            }
        elif answer_type in ['ocr']:
            final_answer = {
                'quad_boxes': [_od_instance['quad_box'] for _od_instance in task_answer],
                'labels': [str(_od_instance['text']) for _od_instance in task_answer],
            }
        elif answer_type in ['phrase_grounding']:
            bboxes = []
            labels = []
            # One phrase may ground to several boxes; the label is repeated per box.
            for _grounded_phrase in task_answer:
                for _bbox in _grounded_phrase['bbox']:
                    bboxes.append(_bbox)
                    labels.append(_grounded_phrase['cat_name'])
            final_answer = {'bboxes': bboxes, 'labels': labels}
        elif answer_type in ['description_with_polygons', 'polygons']:
            final_answer = {
                'polygons': [result['polygons'] for result in task_answer],
                'labels': [result['cat_name'] for result in task_answer],
            }
        elif answer_type in ['description_with_bboxes_or_polygons']:
            bboxes = []
            bboxes_labels = []
            polygons = []
            polygons_labels = []
            for result in task_answer:
                label = result['cat_name']
                if 'polygons' in result:
                    polygons.append(result['polygons'])
                    polygons_labels.append(label)
                else:
                    bboxes.append(result['bbox'])
                    bboxes_labels.append(label)
            final_answer = {'bboxes': bboxes, 'bboxes_labels': bboxes_labels, 'polygons': polygons, 'polygons_labels': polygons_labels}
        else:
            raise ValueError('Unknown task answer post processing type: {}'.format(answer_type))

        return {task: final_answer}
376
+
377
class BoxQuantizer(object):
    """
    Convert (xmin, ymin, xmax, ymax) boxes between pixel coordinates and bin indices.

    `bins` is a `(width_bins, height_bins)` pair. Only the `'floor'` mode is implemented: `'round'` raises
    `NotImplementedError` and any other mode raises `ValueError`.
    """

    def __init__(self, mode, bins):
        self.mode = mode
        self.bins = bins

    def quantize(self, boxes: torch.Tensor, size):
        """
        Map pixel boxes to integer bin indices.

        Each coordinate is divided by the per-bin pixel size, floored and clamped to `[0, bins - 1]`.
        `size` is the image size as `(width, height)`; `boxes` has four entries on its last dimension.
        """
        num_w, num_h = self.bins  # Quantization bins.
        img_w, img_h = size  # Original image size.
        step_w = img_w / num_w
        step_h = img_h / num_h
        x0, y0, x1, y1 = boxes.split(1, dim=-1)  # Four slices with a trailing dimension of 1.

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        layout = (
            (x0, step_w, num_w),
            (y0, step_h, num_h),
            (x1, step_w, num_w),
            (y1, step_h, num_h),
        )
        binned = [
            (column / step).floor().clamp(0, count - 1)
            for column, step, count in layout
        ]
        return torch.cat(binned, dim=-1).int()

    def dequantize(self, boxes: torch.Tensor, size):
        """
        Map bin indices back to pixel coordinates.

        `size` is the image size as `(width, height)`. The returned coordinate is the centre of each bin.
        """
        num_w, num_h = self.bins  # Quantization bins.
        img_w, img_h = size  # Original image size.
        step_w = img_w / num_w
        step_h = img_h / num_h
        x0, y0, x1, y1 = boxes.split(1, dim=-1)  # Four slices with a trailing dimension of 1.

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        # Add 0.5 to use the center position of the bin as the coordinate.
        layout = ((x0, step_w), (y0, step_h), (x1, step_w), (y1, step_h))
        restored = [(column + 0.5) * step for column, step in layout]
        return torch.cat(restored, dim=-1)
437
+
438
+
439
class CoordinatesQuantizer(object):
    """
    Convert (x, y) coordinates between pixel space and bin indices.

    `bins` is a `(width_bins, height_bins)` pair. Only the `'floor'` mode is implemented: `'round'` raises
    `NotImplementedError` and any other mode raises `ValueError`.
    """

    def __init__(self, mode, bins):
        self.mode = mode
        self.bins = bins

    def quantize(self, coordinates: torch.Tensor, size):
        """
        Map pixel coordinates to integer bin indices.

        Each value is divided by the per-bin pixel size, floored and clamped to `[0, bins - 1]`.
        `size` is the image size as `(width, height)`; the last dimension of `coordinates` must be 2.
        """
        num_w, num_h = self.bins  # Quantization bins.
        img_w, img_h = size  # Original image size.
        step_w = img_w / num_w
        step_h = img_h / num_h
        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
        xs, ys = coordinates.split(1, dim=-1)  # Two slices with a trailing dimension of 1.

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        binned_x = (xs / step_w).floor().clamp(0, num_w - 1)
        binned_y = (ys / step_h).floor().clamp(0, num_h - 1)
        return torch.cat((binned_x, binned_y), dim=-1).int()

    def dequantize(self, coordinates: torch.Tensor, size):
        """
        Map bin indices back to pixel coordinates.

        `size` is the image size as `(width, height)`. The returned coordinate is the centre of each bin.
        """
        num_w, num_h = self.bins  # Quantization bins.
        img_w, img_h = size  # Original image size.
        step_w = img_w / num_w
        step_h = img_h / num_h
        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
        xs, ys = coordinates.split(1, dim=-1)  # Two slices with a trailing dimension of 1.

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        # Add 0.5 to use the center position of the bin as the coordinate.
        restored_x = (xs + 0.5) * step_w
        restored_y = (ys + 0.5) * step_h
        return torch.cat((restored_x, restored_y), dim=-1)
496
+
497
+
498
+ class Florence2PostProcesser(object):
499
+ """
500
+ Florence-2 post process for converting text prediction to various tasks results.
501
+
502
+ Args:
503
+ config: A dict of configs.
504
+ tokenizer: A tokenizer for decoding text to spans.
505
+ sample config:
506
+ UNIFIED_POST_PROCESS:
507
+ # commom configs
508
+ NUM_BBOX_HEIGHT_BINS: 1000
509
+ NUM_BBOX_WIDTH_BINS: 1000
510
+ COORDINATES_HEIGHT_BINS: 1000
511
+ COORDINATES_WIDTH_BINS: 1000
512
+ # task specific configs, override the common configs
513
+ PRASE_TASKS:
514
+ - TASK_NAME: 'video_dense_caption'
515
+ PATTERN: 'r<time_(\d+)><time_(\d+)>([a-zA-Z0-9 ]+)'
516
+ SCORE_MODE: 'avg_cat_name_scores'
517
+ NUM_BINS: 100
518
+ - TASK_NAME: 'od'
519
+ PATTERN: 'r<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>([a-zA-Z0-9 ]+)'
520
+ SCORE_MODE: 'avg_cat_name_scores'
521
+
522
+ Returns:
523
+ parsed_dict (dict): A dict of parsed results.
524
+ """
525
+ def __init__(
526
+ self,
527
+ tokenizer=None
528
+ ):
529
+ parse_tasks = []
530
+ parse_task_configs = {}
531
+ config = self._create_default_config()
532
+ for task in config['PARSE_TASKS']:
533
+ parse_tasks.append(task['TASK_NAME'])
534
+ parse_task_configs[task['TASK_NAME']] = task
535
+
536
+ self.config = config
537
+ self.parse_tasks = parse_tasks
538
+ self.parse_tasks_configs = parse_task_configs
539
+
540
+ self.tokenizer = tokenizer
541
+ if self.tokenizer is not None:
542
+ self.all_special_tokens = set(self.tokenizer.all_special_tokens)
543
+
544
+ self.init_quantizers()
545
+ self.black_list_of_phrase_grounding = self._create_black_list_of_phrase_grounding()
546
+
547
+ def _create_black_list_of_phrase_grounding(self):
548
+ black_list = {}
549
+
550
+ if 'phrase_grounding' in self.parse_tasks and self.parse_tasks_configs['phrase_grounding']['FILTER_BY_BLACK_LIST']:
551
+ black_list = set(
552
+ ['it', 'I', 'me', 'mine',
553
+ 'you', 'your', 'yours',
554
+ 'he', 'him', 'his',
555
+ 'she', 'her', 'hers',
556
+ 'they', 'them', 'their', 'theirs',
557
+ 'one', 'oneself',
558
+ 'we', 'us', 'our', 'ours',
559
+ 'you', 'your', 'yours',
560
+ 'they', 'them', 'their', 'theirs',
561
+ 'mine', 'yours', 'his', 'hers', 'its',
562
+ 'ours', 'yours', 'theirs',
563
+ 'myself', 'yourself', 'himself', 'herself', 'itself',
564
+ 'ourselves', 'yourselves', 'themselves',
565
+ 'this', 'that',
566
+ 'these', 'those',
567
+ 'who', 'whom', 'whose', 'which', 'what',
568
+ 'who', 'whom', 'whose', 'which', 'that',
569
+ 'all', 'another', 'any', 'anybody', 'anyone', 'anything',
570
+ 'each', 'everybody', 'everyone', 'everything',
571
+ 'few', 'many', 'nobody', 'none', 'one', 'several',
572
+ 'some', 'somebody', 'someone', 'something',
573
+ 'each other', 'one another',
574
+ 'myself', 'yourself', 'himself', 'herself', 'itself',
575
+ 'ourselves', 'yourselves', 'themselves',
576
+ 'the image', 'image', 'images', 'the', 'a', 'an', 'a group',
577
+ 'other objects', 'lots', 'a set',
578
+ ]
579
+ )
580
+
581
+ return black_list
582
+
583
+ def _create_default_config(self):
584
+ config = {
585
+ 'NUM_BBOX_HEIGHT_BINS': 1000,
586
+ 'NUM_BBOX_WIDTH_BINS': 1000,
587
+ 'BOX_QUANTIZATION_MODE': 'floor',
588
+ 'COORDINATES_HEIGHT_BINS': 1000,
589
+ 'COORDINATES_WIDTH_BINS': 1000,
590
+ 'COORDINATES_QUANTIZATION_MODE': 'floor',
591
+ 'PARSE_TASKS': [
592
+ {
593
+ 'TASK_NAME': 'od',
594
+ 'PATTERN': r'([a-zA-Z0-9 ]+)<loc_(\\d+)><loc_(\\d+)><loc_(\\d+)><loc_(\\d+)>'
595
+ },
596
+ {
597
+ 'TASK_NAME': 'ocr',
598
+ 'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
599
+ 'AREA_THRESHOLD': 0.00
600
+ },
601
+ {
602
+ 'TASK_NAME': 'phrase_grounding',
603
+ 'FILTER_BY_BLACK_LIST': True
604
+ },
605
+ {
606
+ 'TASK_NAME': 'pure_text',
607
+ },
608
+ {
609
+ 'TASK_NAME': 'description_with_bboxes',
610
+ },
611
+ {
612
+ 'TASK_NAME': 'description_with_polygons',
613
+ },
614
+ {
615
+ 'TASK_NAME': 'polygons',
616
+ },
617
+ {
618
+ 'TASK_NAME': 'bboxes',
619
+ },
620
+ {
621
+ 'TASK_NAME': 'description_with_bboxes_or_polygons',
622
+ }
623
+ ]
624
+ }
625
+
626
+ return config
627
+
628
+ def init_quantizers(self):
629
+ # we have box_quantizer (od, grounding) and coordinates_quantizer (ocr, referring_segmentation)
630
+ num_bbox_height_bins = self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
631
+ num_bbox_width_bins = self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
632
+ box_quantization_mode = self.config.get('BOX_QUANTIZATION_MODE', 'floor')
633
+ self.box_quantizer = BoxQuantizer(
634
+ box_quantization_mode,
635
+ (num_bbox_width_bins, num_bbox_height_bins),
636
+ )
637
+
638
+ num_bbox_height_bins = self.config['COORDINATES_HEIGHT_BINS'] if 'COORDINATES_HEIGHT_BINS' in self.config else self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
639
+ num_bbox_width_bins = self.config['COORDINATES_WIDTH_BINS'] if 'COORDINATES_WIDTH_BINS' in self.config else self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
640
+ box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE') if 'COORDINATES_QUANTIZATION_MODE' in self.config else self.config.get('BOX_QUANTIZATION_MODE', 'floor')
641
+ self.coordinates_quantizer = CoordinatesQuantizer(
642
+ box_quantization_mode,
643
+ (num_bbox_width_bins, num_bbox_height_bins),
644
+ )
645
+
646
+ def decode_with_spans(self, tokenizer, token_ids):
647
+ filtered_tokens = tokenizer.convert_ids_to_tokens(
648
+ token_ids, skip_special_tokens=False)
649
+ assert len(filtered_tokens) == len(token_ids)
650
+
651
+ # To avoid mixing byte-level and unicode for byte-level BPT
652
+ # we need to build string separately for added tokens and byte-level tokens
653
+ # cf. https://github.com/huggingface/transformers/issues/1133
654
+ sub_texts = []
655
+ for token in filtered_tokens:
656
+ if token in self.all_special_tokens:
657
+ sub_texts.append(token)
658
+ else:
659
+ if isinstance(tokenizer, (BartTokenizer, BartTokenizerFast)):
660
+ sub_text = tokenizer.convert_tokens_to_string([token])
661
+ elif isinstance(tokenizer, (T5Tokenizer, T5TokenizerFast)):
662
+ # Ref: https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol
663
+ # Note: Do not strip sub_text as it may have functional whitespace
664
+ sub_text = token.replace('▁', ' ')
665
+ else:
666
+ raise ValueError(f'type {type(tokenizer)} not supported')
667
+ sub_texts.append(sub_text)
668
+
669
+ text = ''
670
+ spans = []
671
+ for sub_text in sub_texts:
672
+ span = (len(text), len(text) + len(sub_text)) # [start index, end index).
673
+ text += sub_text
674
+ spans.append(span)
675
+
676
+ # Text format:
677
+ # 1. T5Tokenizer/T5TokenizerFast:
678
+ # "<loc_1><loc_2><loc_3><loc_4> transplanting dog<loc_1><loc_2><loc_3><loc_4> cat</s>"
679
+ # Equivalent to t5_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
680
+ # 2. BartTokenizer (need to double check):
681
+ # "<s><loc_1><loc_2><loc_3><loc_4>transplanting dog<loc_1><loc_2><loc_3><loc_4>cat</s>"
682
+ # Equivalent to bart_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
683
+ return text, spans
684
+
685
+ def parse_od_from_text_and_spans(
686
+ self,
687
+ text,
688
+ pattern,
689
+ image_size,
690
+ phrase_centric=False
691
+ ):
692
+ parsed = list(re.finditer(pattern, text))
693
+
694
+ instances = []
695
+ for i in range(len(parsed)):
696
+ # Prepare instance.
697
+ instance = {}
698
+
699
+ if phrase_centric:
700
+ bbox_bins = [int(parsed[i].group(j)) for j in range(2, 6)]
701
+ else:
702
+ bbox_bins = [int(parsed[i].group(j)) for j in range(1, 5)]
703
+ instance['bbox'] = self.box_quantizer.dequantize(
704
+ boxes=torch.tensor(bbox_bins),
705
+ size=image_size
706
+ ).tolist()
707
+
708
+ if phrase_centric:
709
+ instance['cat_name'] = parsed[i].group(1).lower().strip()
710
+ else:
711
+ instance['cat_name'] = parsed[i].group(5).lower().strip()
712
+ instances.append(instance)
713
+
714
+ return instances
715
+
716
+ def parse_ocr_from_text_and_spans(self,
717
+ text,
718
+ pattern,
719
+ image_size,
720
+ area_threshold=-1.0,
721
+ ):
722
+ bboxes = []
723
+ labels = []
724
+ text = text.replace('<s>', '')
725
+ # ocr with regions
726
+ parsed = re.findall(pattern, text)
727
+ instances = []
728
+ image_width, image_height = image_size
729
+
730
+ for ocr_line in parsed:
731
+ ocr_content = ocr_line[0]
732
+ quad_box = ocr_line[1:]
733
+ quad_box = [int(i) for i in quad_box]
734
+ quad_box = self.coordinates_quantizer.dequantize(
735
+ torch.tensor(np.array(quad_box).reshape(-1, 2)),
736
+ size=image_size
737
+ ).reshape(-1).tolist()
738
+
739
+ if area_threshold > 0:
740
+ x_coords = [i for i in quad_box[0::2]]
741
+ y_coords = [i for i in quad_box[1::2]]
742
+
743
+ # apply the Shoelace formula
744
+ area = 0.5 * abs(sum(x_coords[i] * y_coords[i + 1] - x_coords[i + 1] * y_coords[i] for i in range(4 - 1)))
745
+
746
+ if area < (image_width * image_height) * area_threshold:
747
+ continue
748
+
749
+ bboxes.append(quad_box)
750
+ labels.append(ocr_content)
751
+ instances.append({
752
+ 'quad_box': quad_box,
753
+ 'text': ocr_content,
754
+ })
755
+ return instances
756
+
757
+ def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
758
+ # ignore <s> </s> and <pad>
759
+ cur_span = 0
760
+ if text.startswith('<s>'):
761
+ cur_span += 3
762
+
763
+ text = text.replace('<s>', '')
764
+ text = text.replace('</s>', '')
765
+ text = text.replace('<pad>', '')
766
+
767
+ pattern = r"([^<]+(?:<loc_\d+>){4,})"
768
+ phrases = re.findall(pattern, text)
769
+
770
+ # pattern should be text pattern and od pattern
771
+ pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
772
+ box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
773
+
774
+ instances = []
775
+ for pharse_text in phrases:
776
+ phrase_text_strip = pharse_text.replace('<ground>', '', 1)
777
+ phrase_text_strip = pharse_text.replace('<obj>', '', 1)
778
+
779
+ if phrase_text_strip == '':
780
+ cur_span += len(pharse_text)
781
+ continue
782
+
783
+ # Prepare instance.
784
+ instance = {}
785
+
786
+ # parse phrase, get string
787
+ phrase = re.search(pattern, phrase_text_strip)
788
+ if phrase is None:
789
+ cur_span += len(pharse_text)
790
+ continue
791
+
792
+ # parse bboxes by box_pattern
793
+ bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
794
+ if len(bboxes_parsed) == 0:
795
+ cur_span += len(pharse_text)
796
+ continue
797
+
798
+ phrase = phrase.group()
799
+ # remove leading and trailing spaces
800
+ phrase = phrase.strip()
801
+
802
+ if phrase in self.black_list_of_phrase_grounding:
803
+ cur_span += len(pharse_text)
804
+ continue
805
+
806
+ # a list of list
807
+ bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
808
+ instance['bbox'] = self.box_quantizer.dequantize(
809
+ boxes=torch.tensor(bbox_bins),
810
+ size=image_size
811
+ ).tolist()
812
+
813
+ # exclude non-ascii characters
814
+ phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
815
+ instance['cat_name'] = phrase
816
+
817
+ instances.append(instance)
818
+
819
+ return instances
820
+
821
+ def parse_description_with_bboxes_from_text_and_spans(self, text, pattern, image_size, allow_empty_phrase=False):
822
+ # temporary parse solution, split by '.'
823
+ # ignore <s> </s> and <pad>
824
+
825
+ text = text.replace('<s>', '')
826
+ text = text.replace('</s>', '')
827
+ text = text.replace('<pad>', '')
828
+
829
+ if allow_empty_phrase:
830
+ pattern = rf"(?:(?:<loc_\d+>){{4,}})"
831
+ else:
832
+ pattern = r"([^<]+(?:<loc_\d+>){4,})"
833
+ phrases = re.findall(pattern, text)
834
+
835
+ # pattern should be text pattern and od pattern
836
+ pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
837
+ box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
838
+
839
+ instances = []
840
+ for pharse_text in phrases:
841
+ phrase_text_strip = pharse_text.replace('<ground>', '', 1)
842
+ phrase_text_strip = pharse_text.replace('<obj>', '', 1)
843
+
844
+ if phrase_text_strip == '' and not allow_empty_phrase:
845
+ continue
846
+
847
+ # parse phrase, get string
848
+ phrase = re.search(pattern, phrase_text_strip)
849
+ if phrase is None:
850
+ continue
851
+
852
+ phrase = phrase.group()
853
+ # remove leading and trailing spaces
854
+ phrase = phrase.strip()
855
+
856
+ # parse bboxes by box_pattern
857
+ bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
858
+ if len(bboxes_parsed) == 0:
859
+ continue
860
+
861
+ # a list of list
862
+ bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
863
+
864
+ bboxes = self.box_quantizer.dequantize(
865
+ boxes=torch.tensor(bbox_bins),
866
+ size=image_size
867
+ ).tolist()
868
+
869
+ phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
870
+ for _bboxes in bboxes:
871
+ # Prepare instance.
872
+ instance = {}
873
+ instance['bbox'] = _bboxes
874
+ # exclude non-ascii characters
875
+ instance['cat_name'] = phrase
876
+ instances.append(instance)
877
+
878
+ return instances
879
+
880
def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
                                                        allow_empty_phrase=False,
                                                        polygon_sep_token='<sep>',
                                                        polygon_start_token='<poly>',
                                                        polygon_end_token='</poly>',
                                                        with_box_at_start=False,
                                                        ):
    """Parse phrases and their ``<loc_N>`` polygon tokens out of generated text.

    The text is scanned for runs of at least four location / polygon marker
    tokens, optionally preceded by a free-text phrase.  Every run is split
    into polygon instances (delimited by ``polygon_start_token`` and
    ``polygon_end_token`` when both are present) and each instance into
    polygons (delimited by ``polygon_sep_token``).

    Args:
        text: model output; ``<s>``, ``</s>`` and ``<pad>`` are removed first.
        pattern: accepted for signature compatibility only; it is always
            replaced by a pattern built from the polygon tokens.
        image_size: forwarded unchanged to the quantizers' ``dequantize``
            (``__call__`` documents it as ``(width, height)``).
        allow_empty_phrase: when True, token runs without a leading phrase
            are parsed as well.
        polygon_sep_token: token separating polygons of one instance.
        polygon_start_token: token opening a polygon instance.
        polygon_end_token: token closing a polygon instance.
        with_box_at_start: when True, the first four values of the first
            polygon of an instance are taken as its bounding box.

    Returns:
        A list of dicts with ``'cat_name'`` (the phrase), ``'polygons'``
        (a list of flat coordinate lists) and, only when a box was
        collected, ``'bbox'``.
    """
    # ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
    # ignore <s> </s> and <pad>
    text = text.replace('<s>', '')
    text = text.replace('</s>', '')
    text = text.replace('<pad>', '')

    sep = re.escape(polygon_sep_token)
    start = re.escape(polygon_start_token)
    end = re.escape(polygon_end_token)

    # A run of four or more location / polygon marker tokens.
    token_run = rf"(?:<loc_\d+>|{sep}|{start}|{end}){{4,}}"
    if allow_empty_phrase:
        pattern = rf"(?:{token_run})"
    else:
        # [^<]+ matches one or more characters that are not '<', i.e. the
        # phrase text that precedes the first token.
        pattern = rf"([^<]+{token_run})"
    phrases = re.findall(pattern, text)

    # The phrase ends where the first structural token begins.  The escaped
    # start token is listed next to the literal '<poly>' so that a custom
    # polygon_start_token is not swallowed into the phrase.
    phrase_string_pattern = (
        rf'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_|<poly>|{start})'
    )
    box_pattern = rf'((?:<loc_\d+>)+)(?:{sep}|$)'

    # one polygons instance is separated by polygon_start_token and polygon_end_token
    polygons_instance_pattern = rf'{start}(.*?){end}'

    # Compiled once; used for every polygon of every phrase.
    loc_value_pattern = re.compile(r'<loc_(\d+)>')

    instances = []
    for phrase_text in phrases:
        # exclude loc_\d+>
        # need to get span if want to include category score
        phrase_text_strip = re.sub(r'^loc_\d+>', '', phrase_text, count=1)

        if phrase_text_strip == '' and not allow_empty_phrase:
            continue

        # parse phrase, get string
        phrase = re.search(phrase_string_pattern, phrase_text_strip)
        if phrase is None:
            continue
        # remove leading and trailing spaces
        phrase = phrase.group().strip()

        # split by polygon_start_token and polygon_end_token first using
        # polygons_instance_pattern; without both tokens the whole run is a
        # single instance
        if polygon_start_token in phrase_text and polygon_end_token in phrase_text:
            polygons_instances_parsed = list(re.finditer(polygons_instance_pattern, phrase_text))
        else:
            polygons_instances_parsed = [phrase_text]

        for _polygons_instances_parsed in polygons_instances_parsed:
            # Prepare instance.
            instance = {}

            if isinstance(_polygons_instances_parsed, str):
                instance_text = _polygons_instances_parsed
            else:
                instance_text = _polygons_instances_parsed.group(1)
            polygons_parsed = list(re.finditer(box_pattern, instance_text))
            if len(polygons_parsed) == 0:
                continue

            # a list of list (polygon)
            bbox = []
            polygons = []
            for _polygon_parsed in polygons_parsed:
                # group 1: the whole run of <loc_N> tokens of this polygon
                _polygon = _polygon_parsed.group(1)
                # parse into list of int
                _polygon = [
                    int(_loc_parsed.group(1))
                    for _loc_parsed in loc_value_pattern.finditer(_polygon)
                ]
                if with_box_at_start and len(bbox) == 0:
                    if len(_polygon) > 4:
                        # leading four values are the box, the rest the polygon
                        bbox = _polygon[:4]
                        _polygon = _polygon[4:]
                    else:
                        # no valid bbox prediction
                        bbox = [0, 0, 0, 0]
                # abandon last element if is not paired
                if len(_polygon) % 2 == 1:
                    _polygon = _polygon[:-1]

                # reshape into (n, 2) for dequantization, then flatten back
                _polygon = self.coordinates_quantizer.dequantize(
                    torch.tensor(np.array(_polygon).reshape(-1, 2)),
                    size=image_size
                ).reshape(-1).tolist()
                polygons.append(_polygon)

            instance['cat_name'] = phrase
            instance['polygons'] = polygons
            if len(bbox) != 0:
                instance['bbox'] = self.box_quantizer.dequantize(
                    boxes=torch.tensor([bbox]),
                    size=image_size
                ).tolist()[0]

            instances.append(instance)

    return instances
990
+
991
def __call__(
    self,
    text=None,
    image_size=None,
    parse_tasks=None,
):
    """
    Run the configured parsers over ``text`` and collect their results.

    Args:
        text: model outputs; must not be None.
        image_size: (width, height), forwarded unchanged to every parser.
        parse_tasks: a task name or a list of task names to parse; if None,
            every task in ``self.parse_tasks`` is parsed.

    Returns:
        A dict holding the raw ``text`` under ``'text'`` plus one entry per
        parsed task, keyed by the task name.

    Raises:
        AssertionError: if ``text`` is None or a requested task is not in
            ``self.parse_tasks``.
        ValueError: if a task from ``self.parse_tasks`` has no parser here.
    """
    if parse_tasks is not None:
        if isinstance(parse_tasks, str):
            parse_tasks = [parse_tasks]
        for requested in parse_tasks:
            assert requested in self.parse_tasks, f'parse task {requested} not supported'

    # sequence or text should be provided
    assert text is not None, 'text should be provided'

    parsed_dict = {'text': text}

    for task in self.parse_tasks:
        wanted = parse_tasks is None or task in parse_tasks
        if not wanted:
            continue

        pattern = self.parse_tasks_configs[task].get('PATTERN', None)
        # Keyword arguments every parser call below has in common.
        shared = {'pattern': pattern, 'image_size': image_size}

        if task == 'pure_text':
            parsed = text
        elif task == 'ocr':
            parsed = self.parse_ocr_from_text_and_spans(
                text,
                area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.0),
                **shared,
            )
        elif task == 'phrase_grounding':
            parsed = self.parse_phrase_grounding_from_text_and_spans(text, **shared)
        elif task == 'description_with_bboxes':
            parsed = self.parse_description_with_bboxes_from_text_and_spans(text, **shared)
        elif task == 'bboxes':
            parsed = self.parse_description_with_bboxes_from_text_and_spans(
                text, allow_empty_phrase=True, **shared
            )
        elif task == 'description_with_polygons':
            parsed = self.parse_description_with_polygons_from_text_and_spans(text, **shared)
        elif task == 'polygons':
            parsed = self.parse_description_with_polygons_from_text_and_spans(
                text, allow_empty_phrase=True, **shared
            )
        elif task == 'description_with_bboxes_or_polygons':
            # only support either polygons or bboxes, not both at the same time
            if '<poly>' in text:
                parsed = self.parse_description_with_polygons_from_text_and_spans(text, **shared)
            else:
                parsed = self.parse_description_with_bboxes_from_text_and_spans(text, **shared)
        else:
            raise ValueError("task {} is not supported".format(task))

        parsed_dict[task] = parsed

    return parsed_dict
LLM/Florence-2-large-PromptGen-v2.0/README.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+ # Florence-2-large-PromptGen v2.0
5
+ This upgrade is based on PromptGen 1.5 with some new features added to the model:
6
+
7
+ ## Features:
8
+ * Improved caption quality for \<GENERATE_TAGS\>, \<DETAILED_CAPTION\> and \<MORE_DETAILED_CAPTION\>.
9
+ <img style="width:100%; hight:100%" src="https://msdn.miaoshouai.com/miaoshou/bo/2024-11-05_03-15-15.png" />
10
+ <img style="width:100%; hight:100%" src="https://msdn.miaoshouai.com/miaoshou/bo/2024-11-05_03-40-29.png" />
11
+ * A new \<ANALYZE\> instruction, which helps the model better understand the image composition of the input image.
12
+ <img style="width:100%; hight:100%" src="https://msdn.miaoshouai.com/miaoshou/bo/2024-11-05_03-42-58.png" />
13
+ <img style="width:100%; hight:100%" src="https://msdn.miaoshouai.com/miaoshou/bo/2024-11-05_07-42-36.png" />
14
+ * Memory efficient compared to other models! This is a really lightweight caption model that needs only a little more than 1 GB of VRAM and produces lightning-fast, high-quality image captions.
15
+ <img style="width:100%; hight:100%" src="https://msdn.miaoshouai.com/miaoshou/bo/2024-09-05_12-56-39.png" />
16
+ * Designed to handle image captions for the Flux model for both T5XXL CLIP and CLIP_L. Miaoshou Tagger has a new node called "Flux CLIP Text Encode", which eliminates the need to run two separate tagger tools for caption creation. You can easily populate both CLIPs in a single generation, significantly boosting speed when working with Flux models.
17
+ <img style="width:100%; hight:100%" src="https://msdn.miaoshouai.com/miaoshou/bo/2024-09-05_14-11-02.png" />
18
+
19
+ ## Instruction prompt:
20
+ \<GENERATE_TAGS\> generate prompt as danbooru style tags<br>
21
+ \<CAPTION\> a one line caption for the image<br>
22
+ \<DETAILED_CAPTION\> a structured caption format which detects the position of the subjects in the image<br>
23
+ \<MORE_DETAILED_CAPTION\> a very detailed description for the image<br>
24
+ \<ANALYZE\> image composition analysis mode<br>
25
+ \<MIXED_CAPTION\> a mixed caption style combining the more detailed caption and tags. This is extremely useful for the FLUX model when using T5XXL and CLIP_L together. A new node in MiaoshouTagger ComfyUI has been added to support this instruction.<br>
26
+ \<MIXED_CAPTION_PLUS\> Combine the power of mixed caption with analyze.<br>
27
+
28
+ ## Version History:
29
+ For version 2.0, you will notice the following:
30
+ 1. \<ANALYZE\> along with a beta node in ComfyUI for partial image analysis
31
+ 2. A new instruction for \<MIXED_CAPTION_PLUS\>
32
+ 3. Much improved accuracy for \<GENERATE_TAGS\>, \<DETAILED_CAPTION\> and \<MORE_DETAILED_CAPTION\>
33
+
34
+
35
+ ## How to use:
36
+
37
+ To use this model, you can load it directly from the Hugging Face Model Hub:
38
+
39
+ ```python
40
+
41
+ model = AutoModelForCausalLM.from_pretrained("MiaoshouAI/Florence-2-large-PromptGen-v2.0", trust_remote_code=True)
42
+ processor = AutoProcessor.from_pretrained("MiaoshouAI/Florence-2-large-PromptGen-v2.0", trust_remote_code=True)
43
+
44
+ prompt = "<MORE_DETAILED_CAPTION>"
45
+
46
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
47
+ image = Image.open(requests.get(url, stream=True).raw)
48
+
49
+ inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
50
+
51
+ generated_ids = model.generate(
52
+     input_ids=inputs["input_ids"],
53
+     pixel_values=inputs["pixel_values"],
54
+     max_new_tokens=1024,
55
+     do_sample=False,
56
+     num_beams=3
57
+ )
58
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
59
+
60
+ parsed_answer = processor.post_process_generation(generated_text, task=prompt, image_size=(image.width, image.height))
61
+
62
+ print(parsed_answer)
63
+ ```
64
+
65
+ ## Use under MiaoshouAI Tagger ComfyUI
66
+ If you just want to use this model, you can use it under ComfyUI-Miaoshouai-Tagger
67
+
68
+ https://github.com/miaoshouai/ComfyUI-Miaoshouai-Tagger
69
+
70
+ Detailed usage and installation instructions are available there.
71
+ (If you have already installed MiaoshouAI Tagger, you need to update the node in ComfyUI Manager first or use git pull to get the latest update.)
LLM/Florence-2-large-PromptGen-v2.0/added_tokens.json ADDED
@@ -0,0 +1,1026 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</cap>": 51270,
3
+ "</dcap>": 51274,
4
+ "</grounding>": 51276,
5
+ "</ncap>": 51272,
6
+ "</ocr>": 50268,
7
+ "</od>": 50266,
8
+ "</poly>": 51287,
9
+ "</proposal>": 51285,
10
+ "</region_cap>": 51281,
11
+ "</region_to_desciption>": 51283,
12
+ "</seg>": 51278,
13
+ "<and>": 51288,
14
+ "<cap>": 51269,
15
+ "<dcap>": 51273,
16
+ "<grounding>": 51275,
17
+ "<loc_0>": 50269,
18
+ "<loc_100>": 50369,
19
+ "<loc_101>": 50370,
20
+ "<loc_102>": 50371,
21
+ "<loc_103>": 50372,
22
+ "<loc_104>": 50373,
23
+ "<loc_105>": 50374,
24
+ "<loc_106>": 50375,
25
+ "<loc_107>": 50376,
26
+ "<loc_108>": 50377,
27
+ "<loc_109>": 50378,
28
+ "<loc_10>": 50279,
29
+ "<loc_110>": 50379,
30
+ "<loc_111>": 50380,
31
+ "<loc_112>": 50381,
32
+ "<loc_113>": 50382,
33
+ "<loc_114>": 50383,
34
+ "<loc_115>": 50384,
35
+ "<loc_116>": 50385,
36
+ "<loc_117>": 50386,
37
+ "<loc_118>": 50387,
38
+ "<loc_119>": 50388,
39
+ "<loc_11>": 50280,
40
+ "<loc_120>": 50389,
41
+ "<loc_121>": 50390,
42
+ "<loc_122>": 50391,
43
+ "<loc_123>": 50392,
44
+ "<loc_124>": 50393,
45
+ "<loc_125>": 50394,
46
+ "<loc_126>": 50395,
47
+ "<loc_127>": 50396,
48
+ "<loc_128>": 50397,
49
+ "<loc_129>": 50398,
50
+ "<loc_12>": 50281,
51
+ "<loc_130>": 50399,
52
+ "<loc_131>": 50400,
53
+ "<loc_132>": 50401,
54
+ "<loc_133>": 50402,
55
+ "<loc_134>": 50403,
56
+ "<loc_135>": 50404,
57
+ "<loc_136>": 50405,
58
+ "<loc_137>": 50406,
59
+ "<loc_138>": 50407,
60
+ "<loc_139>": 50408,
61
+ "<loc_13>": 50282,
62
+ "<loc_140>": 50409,
63
+ "<loc_141>": 50410,
64
+ "<loc_142>": 50411,
65
+ "<loc_143>": 50412,
66
+ "<loc_144>": 50413,
67
+ "<loc_145>": 50414,
68
+ "<loc_146>": 50415,
69
+ "<loc_147>": 50416,
70
+ "<loc_148>": 50417,
71
+ "<loc_149>": 50418,
72
+ "<loc_14>": 50283,
73
+ "<loc_150>": 50419,
74
+ "<loc_151>": 50420,
75
+ "<loc_152>": 50421,
76
+ "<loc_153>": 50422,
77
+ "<loc_154>": 50423,
78
+ "<loc_155>": 50424,
79
+ "<loc_156>": 50425,
80
+ "<loc_157>": 50426,
81
+ "<loc_158>": 50427,
82
+ "<loc_159>": 50428,
83
+ "<loc_15>": 50284,
84
+ "<loc_160>": 50429,
85
+ "<loc_161>": 50430,
86
+ "<loc_162>": 50431,
87
+ "<loc_163>": 50432,
88
+ "<loc_164>": 50433,
89
+ "<loc_165>": 50434,
90
+ "<loc_166>": 50435,
91
+ "<loc_167>": 50436,
92
+ "<loc_168>": 50437,
93
+ "<loc_169>": 50438,
94
+ "<loc_16>": 50285,
95
+ "<loc_170>": 50439,
96
+ "<loc_171>": 50440,
97
+ "<loc_172>": 50441,
98
+ "<loc_173>": 50442,
99
+ "<loc_174>": 50443,
100
+ "<loc_175>": 50444,
101
+ "<loc_176>": 50445,
102
+ "<loc_177>": 50446,
103
+ "<loc_178>": 50447,
104
+ "<loc_179>": 50448,
105
+ "<loc_17>": 50286,
106
+ "<loc_180>": 50449,
107
+ "<loc_181>": 50450,
108
+ "<loc_182>": 50451,
109
+ "<loc_183>": 50452,
110
+ "<loc_184>": 50453,
111
+ "<loc_185>": 50454,
112
+ "<loc_186>": 50455,
113
+ "<loc_187>": 50456,
114
+ "<loc_188>": 50457,
115
+ "<loc_189>": 50458,
116
+ "<loc_18>": 50287,
117
+ "<loc_190>": 50459,
118
+ "<loc_191>": 50460,
119
+ "<loc_192>": 50461,
120
+ "<loc_193>": 50462,
121
+ "<loc_194>": 50463,
122
+ "<loc_195>": 50464,
123
+ "<loc_196>": 50465,
124
+ "<loc_197>": 50466,
125
+ "<loc_198>": 50467,
126
+ "<loc_199>": 50468,
127
+ "<loc_19>": 50288,
128
+ "<loc_1>": 50270,
129
+ "<loc_200>": 50469,
130
+ "<loc_201>": 50470,
131
+ "<loc_202>": 50471,
132
+ "<loc_203>": 50472,
133
+ "<loc_204>": 50473,
134
+ "<loc_205>": 50474,
135
+ "<loc_206>": 50475,
136
+ "<loc_207>": 50476,
137
+ "<loc_208>": 50477,
138
+ "<loc_209>": 50478,
139
+ "<loc_20>": 50289,
140
+ "<loc_210>": 50479,
141
+ "<loc_211>": 50480,
142
+ "<loc_212>": 50481,
143
+ "<loc_213>": 50482,
144
+ "<loc_214>": 50483,
145
+ "<loc_215>": 50484,
146
+ "<loc_216>": 50485,
147
+ "<loc_217>": 50486,
148
+ "<loc_218>": 50487,
149
+ "<loc_219>": 50488,
150
+ "<loc_21>": 50290,
151
+ "<loc_220>": 50489,
152
+ "<loc_221>": 50490,
153
+ "<loc_222>": 50491,
154
+ "<loc_223>": 50492,
155
+ "<loc_224>": 50493,
156
+ "<loc_225>": 50494,
157
+ "<loc_226>": 50495,
158
+ "<loc_227>": 50496,
159
+ "<loc_228>": 50497,
160
+ "<loc_229>": 50498,
161
+ "<loc_22>": 50291,
162
+ "<loc_230>": 50499,
163
+ "<loc_231>": 50500,
164
+ "<loc_232>": 50501,
165
+ "<loc_233>": 50502,
166
+ "<loc_234>": 50503,
167
+ "<loc_235>": 50504,
168
+ "<loc_236>": 50505,
169
+ "<loc_237>": 50506,
170
+ "<loc_238>": 50507,
171
+ "<loc_239>": 50508,
172
+ "<loc_23>": 50292,
173
+ "<loc_240>": 50509,
174
+ "<loc_241>": 50510,
175
+ "<loc_242>": 50511,
176
+ "<loc_243>": 50512,
177
+ "<loc_244>": 50513,
178
+ "<loc_245>": 50514,
179
+ "<loc_246>": 50515,
180
+ "<loc_247>": 50516,
181
+ "<loc_248>": 50517,
182
+ "<loc_249>": 50518,
183
+ "<loc_24>": 50293,
184
+ "<loc_250>": 50519,
185
+ "<loc_251>": 50520,
186
+ "<loc_252>": 50521,
187
+ "<loc_253>": 50522,
188
+ "<loc_254>": 50523,
189
+ "<loc_255>": 50524,
190
+ "<loc_256>": 50525,
191
+ "<loc_257>": 50526,
192
+ "<loc_258>": 50527,
193
+ "<loc_259>": 50528,
194
+ "<loc_25>": 50294,
195
+ "<loc_260>": 50529,
196
+ "<loc_261>": 50530,
197
+ "<loc_262>": 50531,
198
+ "<loc_263>": 50532,
199
+ "<loc_264>": 50533,
200
+ "<loc_265>": 50534,
201
+ "<loc_266>": 50535,
202
+ "<loc_267>": 50536,
203
+ "<loc_268>": 50537,
204
+ "<loc_269>": 50538,
205
+ "<loc_26>": 50295,
206
+ "<loc_270>": 50539,
207
+ "<loc_271>": 50540,
208
+ "<loc_272>": 50541,
209
+ "<loc_273>": 50542,
210
+ "<loc_274>": 50543,
211
+ "<loc_275>": 50544,
212
+ "<loc_276>": 50545,
213
+ "<loc_277>": 50546,
214
+ "<loc_278>": 50547,
215
+ "<loc_279>": 50548,
216
+ "<loc_27>": 50296,
217
+ "<loc_280>": 50549,
218
+ "<loc_281>": 50550,
219
+ "<loc_282>": 50551,
220
+ "<loc_283>": 50552,
221
+ "<loc_284>": 50553,
222
+ "<loc_285>": 50554,
223
+ "<loc_286>": 50555,
224
+ "<loc_287>": 50556,
225
+ "<loc_288>": 50557,
226
+ "<loc_289>": 50558,
227
+ "<loc_28>": 50297,
228
+ "<loc_290>": 50559,
229
+ "<loc_291>": 50560,
230
+ "<loc_292>": 50561,
231
+ "<loc_293>": 50562,
232
+ "<loc_294>": 50563,
233
+ "<loc_295>": 50564,
234
+ "<loc_296>": 50565,
235
+ "<loc_297>": 50566,
236
+ "<loc_298>": 50567,
237
+ "<loc_299>": 50568,
238
+ "<loc_29>": 50298,
239
+ "<loc_2>": 50271,
240
+ "<loc_300>": 50569,
241
+ "<loc_301>": 50570,
242
+ "<loc_302>": 50571,
243
+ "<loc_303>": 50572,
244
+ "<loc_304>": 50573,
245
+ "<loc_305>": 50574,
246
+ "<loc_306>": 50575,
247
+ "<loc_307>": 50576,
248
+ "<loc_308>": 50577,
249
+ "<loc_309>": 50578,
250
+ "<loc_30>": 50299,
251
+ "<loc_310>": 50579,
252
+ "<loc_311>": 50580,
253
+ "<loc_312>": 50581,
254
+ "<loc_313>": 50582,
255
+ "<loc_314>": 50583,
256
+ "<loc_315>": 50584,
257
+ "<loc_316>": 50585,
258
+ "<loc_317>": 50586,
259
+ "<loc_318>": 50587,
260
+ "<loc_319>": 50588,
261
+ "<loc_31>": 50300,
262
+ "<loc_320>": 50589,
263
+ "<loc_321>": 50590,
264
+ "<loc_322>": 50591,
265
+ "<loc_323>": 50592,
266
+ "<loc_324>": 50593,
267
+ "<loc_325>": 50594,
268
+ "<loc_326>": 50595,
269
+ "<loc_327>": 50596,
270
+ "<loc_328>": 50597,
271
+ "<loc_329>": 50598,
272
+ "<loc_32>": 50301,
273
+ "<loc_330>": 50599,
274
+ "<loc_331>": 50600,
275
+ "<loc_332>": 50601,
276
+ "<loc_333>": 50602,
277
+ "<loc_334>": 50603,
278
+ "<loc_335>": 50604,
279
+ "<loc_336>": 50605,
280
+ "<loc_337>": 50606,
281
+ "<loc_338>": 50607,
282
+ "<loc_339>": 50608,
283
+ "<loc_33>": 50302,
284
+ "<loc_340>": 50609,
285
+ "<loc_341>": 50610,
286
+ "<loc_342>": 50611,
287
+ "<loc_343>": 50612,
288
+ "<loc_344>": 50613,
289
+ "<loc_345>": 50614,
290
+ "<loc_346>": 50615,
291
+ "<loc_347>": 50616,
292
+ "<loc_348>": 50617,
293
+ "<loc_349>": 50618,
294
+ "<loc_34>": 50303,
295
+ "<loc_350>": 50619,
296
+ "<loc_351>": 50620,
297
+ "<loc_352>": 50621,
298
+ "<loc_353>": 50622,
299
+ "<loc_354>": 50623,
300
+ "<loc_355>": 50624,
301
+ "<loc_356>": 50625,
302
+ "<loc_357>": 50626,
303
+ "<loc_358>": 50627,
304
+ "<loc_359>": 50628,
305
+ "<loc_35>": 50304,
306
+ "<loc_360>": 50629,
307
+ "<loc_361>": 50630,
308
+ "<loc_362>": 50631,
309
+ "<loc_363>": 50632,
310
+ "<loc_364>": 50633,
311
+ "<loc_365>": 50634,
312
+ "<loc_366>": 50635,
313
+ "<loc_367>": 50636,
314
+ "<loc_368>": 50637,
315
+ "<loc_369>": 50638,
316
+ "<loc_36>": 50305,
317
+ "<loc_370>": 50639,
318
+ "<loc_371>": 50640,
319
+ "<loc_372>": 50641,
320
+ "<loc_373>": 50642,
321
+ "<loc_374>": 50643,
322
+ "<loc_375>": 50644,
323
+ "<loc_376>": 50645,
324
+ "<loc_377>": 50646,
325
+ "<loc_378>": 50647,
326
+ "<loc_379>": 50648,
327
+ "<loc_37>": 50306,
328
+ "<loc_380>": 50649,
329
+ "<loc_381>": 50650,
330
+ "<loc_382>": 50651,
331
+ "<loc_383>": 50652,
332
+ "<loc_384>": 50653,
333
+ "<loc_385>": 50654,
334
+ "<loc_386>": 50655,
335
+ "<loc_387>": 50656,
336
+ "<loc_388>": 50657,
337
+ "<loc_389>": 50658,
338
+ "<loc_38>": 50307,
339
+ "<loc_390>": 50659,
340
+ "<loc_391>": 50660,
341
+ "<loc_392>": 50661,
342
+ "<loc_393>": 50662,
343
+ "<loc_394>": 50663,
344
+ "<loc_395>": 50664,
345
+ "<loc_396>": 50665,
346
+ "<loc_397>": 50666,
347
+ "<loc_398>": 50667,
348
+ "<loc_399>": 50668,
349
+ "<loc_39>": 50308,
350
+ "<loc_3>": 50272,
351
+ "<loc_400>": 50669,
352
+ "<loc_401>": 50670,
353
+ "<loc_402>": 50671,
354
+ "<loc_403>": 50672,
355
+ "<loc_404>": 50673,
356
+ "<loc_405>": 50674,
357
+ "<loc_406>": 50675,
358
+ "<loc_407>": 50676,
359
+ "<loc_408>": 50677,
360
+ "<loc_409>": 50678,
361
+ "<loc_40>": 50309,
362
+ "<loc_410>": 50679,
363
+ "<loc_411>": 50680,
364
+ "<loc_412>": 50681,
365
+ "<loc_413>": 50682,
366
+ "<loc_414>": 50683,
367
+ "<loc_415>": 50684,
368
+ "<loc_416>": 50685,
369
+ "<loc_417>": 50686,
370
+ "<loc_418>": 50687,
371
+ "<loc_419>": 50688,
372
+ "<loc_41>": 50310,
373
+ "<loc_420>": 50689,
374
+ "<loc_421>": 50690,
375
+ "<loc_422>": 50691,
376
+ "<loc_423>": 50692,
377
+ "<loc_424>": 50693,
378
+ "<loc_425>": 50694,
379
+ "<loc_426>": 50695,
380
+ "<loc_427>": 50696,
381
+ "<loc_428>": 50697,
382
+ "<loc_429>": 50698,
383
+ "<loc_42>": 50311,
384
+ "<loc_430>": 50699,
385
+ "<loc_431>": 50700,
386
+ "<loc_432>": 50701,
387
+ "<loc_433>": 50702,
388
+ "<loc_434>": 50703,
389
+ "<loc_435>": 50704,
390
+ "<loc_436>": 50705,
391
+ "<loc_437>": 50706,
392
+ "<loc_438>": 50707,
393
+ "<loc_439>": 50708,
394
+ "<loc_43>": 50312,
395
+ "<loc_440>": 50709,
396
+ "<loc_441>": 50710,
397
+ "<loc_442>": 50711,
398
+ "<loc_443>": 50712,
399
+ "<loc_444>": 50713,
400
+ "<loc_445>": 50714,
401
+ "<loc_446>": 50715,
402
+ "<loc_447>": 50716,
403
+ "<loc_448>": 50717,
404
+ "<loc_449>": 50718,
405
+ "<loc_44>": 50313,
406
+ "<loc_450>": 50719,
407
+ "<loc_451>": 50720,
408
+ "<loc_452>": 50721,
409
+ "<loc_453>": 50722,
410
+ "<loc_454>": 50723,
411
+ "<loc_455>": 50724,
412
+ "<loc_456>": 50725,
413
+ "<loc_457>": 50726,
414
+ "<loc_458>": 50727,
415
+ "<loc_459>": 50728,
416
+ "<loc_45>": 50314,
417
+ "<loc_460>": 50729,
418
+ "<loc_461>": 50730,
419
+ "<loc_462>": 50731,
420
+ "<loc_463>": 50732,
421
+ "<loc_464>": 50733,
422
+ "<loc_465>": 50734,
423
+ "<loc_466>": 50735,
424
+ "<loc_467>": 50736,
425
+ "<loc_468>": 50737,
426
+ "<loc_469>": 50738,
427
+ "<loc_46>": 50315,
428
+ "<loc_470>": 50739,
429
+ "<loc_471>": 50740,
430
+ "<loc_472>": 50741,
431
+ "<loc_473>": 50742,
432
+ "<loc_474>": 50743,
433
+ "<loc_475>": 50744,
434
+ "<loc_476>": 50745,
435
+ "<loc_477>": 50746,
436
+ "<loc_478>": 50747,
437
+ "<loc_479>": 50748,
438
+ "<loc_47>": 50316,
439
+ "<loc_480>": 50749,
440
+ "<loc_481>": 50750,
441
+ "<loc_482>": 50751,
442
+ "<loc_483>": 50752,
443
+ "<loc_484>": 50753,
444
+ "<loc_485>": 50754,
445
+ "<loc_486>": 50755,
446
+ "<loc_487>": 50756,
447
+ "<loc_488>": 50757,
448
+ "<loc_489>": 50758,
449
+ "<loc_48>": 50317,
450
+ "<loc_490>": 50759,
451
+ "<loc_491>": 50760,
452
+ "<loc_492>": 50761,
453
+ "<loc_493>": 50762,
454
+ "<loc_494>": 50763,
455
+ "<loc_495>": 50764,
456
+ "<loc_496>": 50765,
457
+ "<loc_497>": 50766,
458
+ "<loc_498>": 50767,
459
+ "<loc_499>": 50768,
460
+ "<loc_49>": 50318,
461
+ "<loc_4>": 50273,
462
+ "<loc_500>": 50769,
463
+ "<loc_501>": 50770,
464
+ "<loc_502>": 50771,
465
+ "<loc_503>": 50772,
466
+ "<loc_504>": 50773,
467
+ "<loc_505>": 50774,
468
+ "<loc_506>": 50775,
469
+ "<loc_507>": 50776,
470
+ "<loc_508>": 50777,
471
+ "<loc_509>": 50778,
472
+ "<loc_50>": 50319,
473
+ "<loc_510>": 50779,
474
+ "<loc_511>": 50780,
475
+ "<loc_512>": 50781,
476
+ "<loc_513>": 50782,
477
+ "<loc_514>": 50783,
478
+ "<loc_515>": 50784,
479
+ "<loc_516>": 50785,
480
+ "<loc_517>": 50786,
481
+ "<loc_518>": 50787,
482
+ "<loc_519>": 50788,
483
+ "<loc_51>": 50320,
484
+ "<loc_520>": 50789,
485
+ "<loc_521>": 50790,
486
+ "<loc_522>": 50791,
487
+ "<loc_523>": 50792,
488
+ "<loc_524>": 50793,
489
+ "<loc_525>": 50794,
490
+ "<loc_526>": 50795,
491
+ "<loc_527>": 50796,
492
+ "<loc_528>": 50797,
493
+ "<loc_529>": 50798,
494
+ "<loc_52>": 50321,
495
+ "<loc_530>": 50799,
496
+ "<loc_531>": 50800,
497
+ "<loc_532>": 50801,
498
+ "<loc_533>": 50802,
499
+ "<loc_534>": 50803,
500
+ "<loc_535>": 50804,
501
+ "<loc_536>": 50805,
502
+ "<loc_537>": 50806,
503
+ "<loc_538>": 50807,
504
+ "<loc_539>": 50808,
505
+ "<loc_53>": 50322,
506
+ "<loc_540>": 50809,
507
+ "<loc_541>": 50810,
508
+ "<loc_542>": 50811,
509
+ "<loc_543>": 50812,
510
+ "<loc_544>": 50813,
511
+ "<loc_545>": 50814,
512
+ "<loc_546>": 50815,
513
+ "<loc_547>": 50816,
514
+ "<loc_548>": 50817,
515
+ "<loc_549>": 50818,
516
+ "<loc_54>": 50323,
517
+ "<loc_550>": 50819,
518
+ "<loc_551>": 50820,
519
+ "<loc_552>": 50821,
520
+ "<loc_553>": 50822,
521
+ "<loc_554>": 50823,
522
+ "<loc_555>": 50824,
523
+ "<loc_556>": 50825,
524
+ "<loc_557>": 50826,
525
+ "<loc_558>": 50827,
526
+ "<loc_559>": 50828,
527
+ "<loc_55>": 50324,
528
+ "<loc_560>": 50829,
529
+ "<loc_561>": 50830,
530
+ "<loc_562>": 50831,
531
+ "<loc_563>": 50832,
532
+ "<loc_564>": 50833,
533
+ "<loc_565>": 50834,
534
+ "<loc_566>": 50835,
535
+ "<loc_567>": 50836,
536
+ "<loc_568>": 50837,
537
+ "<loc_569>": 50838,
538
+ "<loc_56>": 50325,
539
+ "<loc_570>": 50839,
540
+ "<loc_571>": 50840,
541
+ "<loc_572>": 50841,
542
+ "<loc_573>": 50842,
543
+ "<loc_574>": 50843,
544
+ "<loc_575>": 50844,
545
+ "<loc_576>": 50845,
546
+ "<loc_577>": 50846,
547
+ "<loc_578>": 50847,
548
+ "<loc_579>": 50848,
549
+ "<loc_57>": 50326,
550
+ "<loc_580>": 50849,
551
+ "<loc_581>": 50850,
552
+ "<loc_582>": 50851,
553
+ "<loc_583>": 50852,
554
+ "<loc_584>": 50853,
555
+ "<loc_585>": 50854,
556
+ "<loc_586>": 50855,
557
+ "<loc_587>": 50856,
558
+ "<loc_588>": 50857,
559
+ "<loc_589>": 50858,
560
+ "<loc_58>": 50327,
561
+ "<loc_590>": 50859,
562
+ "<loc_591>": 50860,
563
+ "<loc_592>": 50861,
564
+ "<loc_593>": 50862,
565
+ "<loc_594>": 50863,
566
+ "<loc_595>": 50864,
567
+ "<loc_596>": 50865,
568
+ "<loc_597>": 50866,
569
+ "<loc_598>": 50867,
570
+ "<loc_599>": 50868,
571
+ "<loc_59>": 50328,
572
+ "<loc_5>": 50274,
573
+ "<loc_600>": 50869,
574
+ "<loc_601>": 50870,
575
+ "<loc_602>": 50871,
576
+ "<loc_603>": 50872,
577
+ "<loc_604>": 50873,
578
+ "<loc_605>": 50874,
579
+ "<loc_606>": 50875,
580
+ "<loc_607>": 50876,
581
+ "<loc_608>": 50877,
582
+ "<loc_609>": 50878,
583
+ "<loc_60>": 50329,
584
+ "<loc_610>": 50879,
585
+ "<loc_611>": 50880,
586
+ "<loc_612>": 50881,
587
+ "<loc_613>": 50882,
588
+ "<loc_614>": 50883,
589
+ "<loc_615>": 50884,
590
+ "<loc_616>": 50885,
591
+ "<loc_617>": 50886,
592
+ "<loc_618>": 50887,
593
+ "<loc_619>": 50888,
594
+ "<loc_61>": 50330,
595
+ "<loc_620>": 50889,
596
+ "<loc_621>": 50890,
597
+ "<loc_622>": 50891,
598
+ "<loc_623>": 50892,
599
+ "<loc_624>": 50893,
600
+ "<loc_625>": 50894,
601
+ "<loc_626>": 50895,
602
+ "<loc_627>": 50896,
603
+ "<loc_628>": 50897,
604
+ "<loc_629>": 50898,
605
+ "<loc_62>": 50331,
606
+ "<loc_630>": 50899,
607
+ "<loc_631>": 50900,
608
+ "<loc_632>": 50901,
609
+ "<loc_633>": 50902,
610
+ "<loc_634>": 50903,
611
+ "<loc_635>": 50904,
612
+ "<loc_636>": 50905,
613
+ "<loc_637>": 50906,
614
+ "<loc_638>": 50907,
615
+ "<loc_639>": 50908,
616
+ "<loc_63>": 50332,
617
+ "<loc_640>": 50909,
618
+ "<loc_641>": 50910,
619
+ "<loc_642>": 50911,
620
+ "<loc_643>": 50912,
621
+ "<loc_644>": 50913,
622
+ "<loc_645>": 50914,
623
+ "<loc_646>": 50915,
624
+ "<loc_647>": 50916,
625
+ "<loc_648>": 50917,
626
+ "<loc_649>": 50918,
627
+ "<loc_64>": 50333,
628
+ "<loc_650>": 50919,
629
+ "<loc_651>": 50920,
630
+ "<loc_652>": 50921,
631
+ "<loc_653>": 50922,
632
+ "<loc_654>": 50923,
633
+ "<loc_655>": 50924,
634
+ "<loc_656>": 50925,
635
+ "<loc_657>": 50926,
636
+ "<loc_658>": 50927,
637
+ "<loc_659>": 50928,
638
+ "<loc_65>": 50334,
639
+ "<loc_660>": 50929,
640
+ "<loc_661>": 50930,
641
+ "<loc_662>": 50931,
642
+ "<loc_663>": 50932,
643
+ "<loc_664>": 50933,
644
+ "<loc_665>": 50934,
645
+ "<loc_666>": 50935,
646
+ "<loc_667>": 50936,
647
+ "<loc_668>": 50937,
648
+ "<loc_669>": 50938,
649
+ "<loc_66>": 50335,
650
+ "<loc_670>": 50939,
651
+ "<loc_671>": 50940,
652
+ "<loc_672>": 50941,
653
+ "<loc_673>": 50942,
654
+ "<loc_674>": 50943,
655
+ "<loc_675>": 50944,
656
+ "<loc_676>": 50945,
657
+ "<loc_677>": 50946,
658
+ "<loc_678>": 50947,
659
+ "<loc_679>": 50948,
660
+ "<loc_67>": 50336,
661
+ "<loc_680>": 50949,
662
+ "<loc_681>": 50950,
663
+ "<loc_682>": 50951,
664
+ "<loc_683>": 50952,
665
+ "<loc_684>": 50953,
666
+ "<loc_685>": 50954,
667
+ "<loc_686>": 50955,
668
+ "<loc_687>": 50956,
669
+ "<loc_688>": 50957,
670
+ "<loc_689>": 50958,
671
+ "<loc_68>": 50337,
672
+ "<loc_690>": 50959,
673
+ "<loc_691>": 50960,
674
+ "<loc_692>": 50961,
675
+ "<loc_693>": 50962,
676
+ "<loc_694>": 50963,
677
+ "<loc_695>": 50964,
678
+ "<loc_696>": 50965,
679
+ "<loc_697>": 50966,
680
+ "<loc_698>": 50967,
681
+ "<loc_699>": 50968,
682
+ "<loc_69>": 50338,
683
+ "<loc_6>": 50275,
684
+ "<loc_700>": 50969,
685
+ "<loc_701>": 50970,
686
+ "<loc_702>": 50971,
687
+ "<loc_703>": 50972,
688
+ "<loc_704>": 50973,
689
+ "<loc_705>": 50974,
690
+ "<loc_706>": 50975,
691
+ "<loc_707>": 50976,
692
+ "<loc_708>": 50977,
693
+ "<loc_709>": 50978,
694
+ "<loc_70>": 50339,
695
+ "<loc_710>": 50979,
696
+ "<loc_711>": 50980,
697
+ "<loc_712>": 50981,
698
+ "<loc_713>": 50982,
699
+ "<loc_714>": 50983,
700
+ "<loc_715>": 50984,
701
+ "<loc_716>": 50985,
702
+ "<loc_717>": 50986,
703
+ "<loc_718>": 50987,
704
+ "<loc_719>": 50988,
705
+ "<loc_71>": 50340,
706
+ "<loc_720>": 50989,
707
+ "<loc_721>": 50990,
708
+ "<loc_722>": 50991,
709
+ "<loc_723>": 50992,
710
+ "<loc_724>": 50993,
711
+ "<loc_725>": 50994,
712
+ "<loc_726>": 50995,
713
+ "<loc_727>": 50996,
714
+ "<loc_728>": 50997,
715
+ "<loc_729>": 50998,
716
+ "<loc_72>": 50341,
717
+ "<loc_730>": 50999,
718
+ "<loc_731>": 51000,
719
+ "<loc_732>": 51001,
720
+ "<loc_733>": 51002,
721
+ "<loc_734>": 51003,
722
+ "<loc_735>": 51004,
723
+ "<loc_736>": 51005,
724
+ "<loc_737>": 51006,
725
+ "<loc_738>": 51007,
726
+ "<loc_739>": 51008,
727
+ "<loc_73>": 50342,
728
+ "<loc_740>": 51009,
729
+ "<loc_741>": 51010,
730
+ "<loc_742>": 51011,
731
+ "<loc_743>": 51012,
732
+ "<loc_744>": 51013,
733
+ "<loc_745>": 51014,
734
+ "<loc_746>": 51015,
735
+ "<loc_747>": 51016,
736
+ "<loc_748>": 51017,
737
+ "<loc_749>": 51018,
738
+ "<loc_74>": 50343,
739
+ "<loc_750>": 51019,
740
+ "<loc_751>": 51020,
741
+ "<loc_752>": 51021,
742
+ "<loc_753>": 51022,
743
+ "<loc_754>": 51023,
744
+ "<loc_755>": 51024,
745
+ "<loc_756>": 51025,
746
+ "<loc_757>": 51026,
747
+ "<loc_758>": 51027,
748
+ "<loc_759>": 51028,
749
+ "<loc_75>": 50344,
750
+ "<loc_760>": 51029,
751
+ "<loc_761>": 51030,
752
+ "<loc_762>": 51031,
753
+ "<loc_763>": 51032,
754
+ "<loc_764>": 51033,
755
+ "<loc_765>": 51034,
756
+ "<loc_766>": 51035,
757
+ "<loc_767>": 51036,
758
+ "<loc_768>": 51037,
759
+ "<loc_769>": 51038,
760
+ "<loc_76>": 50345,
761
+ "<loc_770>": 51039,
762
+ "<loc_771>": 51040,
763
+ "<loc_772>": 51041,
764
+ "<loc_773>": 51042,
765
+ "<loc_774>": 51043,
766
+ "<loc_775>": 51044,
767
+ "<loc_776>": 51045,
768
+ "<loc_777>": 51046,
769
+ "<loc_778>": 51047,
770
+ "<loc_779>": 51048,
771
+ "<loc_77>": 50346,
772
+ "<loc_780>": 51049,
773
+ "<loc_781>": 51050,
774
+ "<loc_782>": 51051,
775
+ "<loc_783>": 51052,
776
+ "<loc_784>": 51053,
777
+ "<loc_785>": 51054,
778
+ "<loc_786>": 51055,
779
+ "<loc_787>": 51056,
780
+ "<loc_788>": 51057,
781
+ "<loc_789>": 51058,
782
+ "<loc_78>": 50347,
783
+ "<loc_790>": 51059,
784
+ "<loc_791>": 51060,
785
+ "<loc_792>": 51061,
786
+ "<loc_793>": 51062,
787
+ "<loc_794>": 51063,
788
+ "<loc_795>": 51064,
789
+ "<loc_796>": 51065,
790
+ "<loc_797>": 51066,
791
+ "<loc_798>": 51067,
792
+ "<loc_799>": 51068,
793
+ "<loc_79>": 50348,
794
+ "<loc_7>": 50276,
795
+ "<loc_800>": 51069,
796
+ "<loc_801>": 51070,
797
+ "<loc_802>": 51071,
798
+ "<loc_803>": 51072,
799
+ "<loc_804>": 51073,
800
+ "<loc_805>": 51074,
801
+ "<loc_806>": 51075,
802
+ "<loc_807>": 51076,
803
+ "<loc_808>": 51077,
804
+ "<loc_809>": 51078,
805
+ "<loc_80>": 50349,
806
+ "<loc_810>": 51079,
807
+ "<loc_811>": 51080,
808
+ "<loc_812>": 51081,
809
+ "<loc_813>": 51082,
810
+ "<loc_814>": 51083,
811
+ "<loc_815>": 51084,
812
+ "<loc_816>": 51085,
813
+ "<loc_817>": 51086,
814
+ "<loc_818>": 51087,
815
+ "<loc_819>": 51088,
816
+ "<loc_81>": 50350,
817
+ "<loc_820>": 51089,
818
+ "<loc_821>": 51090,
819
+ "<loc_822>": 51091,
820
+ "<loc_823>": 51092,
821
+ "<loc_824>": 51093,
822
+ "<loc_825>": 51094,
823
+ "<loc_826>": 51095,
824
+ "<loc_827>": 51096,
825
+ "<loc_828>": 51097,
826
+ "<loc_829>": 51098,
827
+ "<loc_82>": 50351,
828
+ "<loc_830>": 51099,
829
+ "<loc_831>": 51100,
830
+ "<loc_832>": 51101,
831
+ "<loc_833>": 51102,
832
+ "<loc_834>": 51103,
833
+ "<loc_835>": 51104,
834
+ "<loc_836>": 51105,
835
+ "<loc_837>": 51106,
836
+ "<loc_838>": 51107,
837
+ "<loc_839>": 51108,
838
+ "<loc_83>": 50352,
839
+ "<loc_840>": 51109,
840
+ "<loc_841>": 51110,
841
+ "<loc_842>": 51111,
842
+ "<loc_843>": 51112,
843
+ "<loc_844>": 51113,
844
+ "<loc_845>": 51114,
845
+ "<loc_846>": 51115,
846
+ "<loc_847>": 51116,
847
+ "<loc_848>": 51117,
848
+ "<loc_849>": 51118,
849
+ "<loc_84>": 50353,
850
+ "<loc_850>": 51119,
851
+ "<loc_851>": 51120,
852
+ "<loc_852>": 51121,
853
+ "<loc_853>": 51122,
854
+ "<loc_854>": 51123,
855
+ "<loc_855>": 51124,
856
+ "<loc_856>": 51125,
857
+ "<loc_857>": 51126,
858
+ "<loc_858>": 51127,
859
+ "<loc_859>": 51128,
860
+ "<loc_85>": 50354,
861
+ "<loc_860>": 51129,
862
+ "<loc_861>": 51130,
863
+ "<loc_862>": 51131,
864
+ "<loc_863>": 51132,
865
+ "<loc_864>": 51133,
866
+ "<loc_865>": 51134,
867
+ "<loc_866>": 51135,
868
+ "<loc_867>": 51136,
869
+ "<loc_868>": 51137,
870
+ "<loc_869>": 51138,
871
+ "<loc_86>": 50355,
872
+ "<loc_870>": 51139,
873
+ "<loc_871>": 51140,
874
+ "<loc_872>": 51141,
875
+ "<loc_873>": 51142,
876
+ "<loc_874>": 51143,
877
+ "<loc_875>": 51144,
878
+ "<loc_876>": 51145,
879
+ "<loc_877>": 51146,
880
+ "<loc_878>": 51147,
881
+ "<loc_879>": 51148,
882
+ "<loc_87>": 50356,
883
+ "<loc_880>": 51149,
884
+ "<loc_881>": 51150,
885
+ "<loc_882>": 51151,
886
+ "<loc_883>": 51152,
887
+ "<loc_884>": 51153,
888
+ "<loc_885>": 51154,
889
+ "<loc_886>": 51155,
890
+ "<loc_887>": 51156,
891
+ "<loc_888>": 51157,
892
+ "<loc_889>": 51158,
893
+ "<loc_88>": 50357,
894
+ "<loc_890>": 51159,
895
+ "<loc_891>": 51160,
896
+ "<loc_892>": 51161,
897
+ "<loc_893>": 51162,
898
+ "<loc_894>": 51163,
899
+ "<loc_895>": 51164,
900
+ "<loc_896>": 51165,
901
+ "<loc_897>": 51166,
902
+ "<loc_898>": 51167,
903
+ "<loc_899>": 51168,
904
+ "<loc_89>": 50358,
905
+ "<loc_8>": 50277,
906
+ "<loc_900>": 51169,
907
+ "<loc_901>": 51170,
908
+ "<loc_902>": 51171,
909
+ "<loc_903>": 51172,
910
+ "<loc_904>": 51173,
911
+ "<loc_905>": 51174,
912
+ "<loc_906>": 51175,
913
+ "<loc_907>": 51176,
914
+ "<loc_908>": 51177,
915
+ "<loc_909>": 51178,
916
+ "<loc_90>": 50359,
917
+ "<loc_910>": 51179,
918
+ "<loc_911>": 51180,
919
+ "<loc_912>": 51181,
920
+ "<loc_913>": 51182,
921
+ "<loc_914>": 51183,
922
+ "<loc_915>": 51184,
923
+ "<loc_916>": 51185,
924
+ "<loc_917>": 51186,
925
+ "<loc_918>": 51187,
926
+ "<loc_919>": 51188,
927
+ "<loc_91>": 50360,
928
+ "<loc_920>": 51189,
929
+ "<loc_921>": 51190,
930
+ "<loc_922>": 51191,
931
+ "<loc_923>": 51192,
932
+ "<loc_924>": 51193,
933
+ "<loc_925>": 51194,
934
+ "<loc_926>": 51195,
935
+ "<loc_927>": 51196,
936
+ "<loc_928>": 51197,
937
+ "<loc_929>": 51198,
938
+ "<loc_92>": 50361,
939
+ "<loc_930>": 51199,
940
+ "<loc_931>": 51200,
941
+ "<loc_932>": 51201,
942
+ "<loc_933>": 51202,
943
+ "<loc_934>": 51203,
944
+ "<loc_935>": 51204,
945
+ "<loc_936>": 51205,
946
+ "<loc_937>": 51206,
947
+ "<loc_938>": 51207,
948
+ "<loc_939>": 51208,
949
+ "<loc_93>": 50362,
950
+ "<loc_940>": 51209,
951
+ "<loc_941>": 51210,
952
+ "<loc_942>": 51211,
953
+ "<loc_943>": 51212,
954
+ "<loc_944>": 51213,
955
+ "<loc_945>": 51214,
956
+ "<loc_946>": 51215,
957
+ "<loc_947>": 51216,
958
+ "<loc_948>": 51217,
959
+ "<loc_949>": 51218,
960
+ "<loc_94>": 50363,
961
+ "<loc_950>": 51219,
962
+ "<loc_951>": 51220,
963
+ "<loc_952>": 51221,
964
+ "<loc_953>": 51222,
965
+ "<loc_954>": 51223,
966
+ "<loc_955>": 51224,
967
+ "<loc_956>": 51225,
968
+ "<loc_957>": 51226,
969
+ "<loc_958>": 51227,
970
+ "<loc_959>": 51228,
971
+ "<loc_95>": 50364,
972
+ "<loc_960>": 51229,
973
+ "<loc_961>": 51230,
974
+ "<loc_962>": 51231,
975
+ "<loc_963>": 51232,
976
+ "<loc_964>": 51233,
977
+ "<loc_965>": 51234,
978
+ "<loc_966>": 51235,
979
+ "<loc_967>": 51236,
980
+ "<loc_968>": 51237,
981
+ "<loc_969>": 51238,
982
+ "<loc_96>": 50365,
983
+ "<loc_970>": 51239,
984
+ "<loc_971>": 51240,
985
+ "<loc_972>": 51241,
986
+ "<loc_973>": 51242,
987
+ "<loc_974>": 51243,
988
+ "<loc_975>": 51244,
989
+ "<loc_976>": 51245,
990
+ "<loc_977>": 51246,
991
+ "<loc_978>": 51247,
992
+ "<loc_979>": 51248,
993
+ "<loc_97>": 50366,
994
+ "<loc_980>": 51249,
995
+ "<loc_981>": 51250,
996
+ "<loc_982>": 51251,
997
+ "<loc_983>": 51252,
998
+ "<loc_984>": 51253,
999
+ "<loc_985>": 51254,
1000
+ "<loc_986>": 51255,
1001
+ "<loc_987>": 51256,
1002
+ "<loc_988>": 51257,
1003
+ "<loc_989>": 51258,
1004
+ "<loc_98>": 50367,
1005
+ "<loc_990>": 51259,
1006
+ "<loc_991>": 51260,
1007
+ "<loc_992>": 51261,
1008
+ "<loc_993>": 51262,
1009
+ "<loc_994>": 51263,
1010
+ "<loc_995>": 51264,
1011
+ "<loc_996>": 51265,
1012
+ "<loc_997>": 51266,
1013
+ "<loc_998>": 51267,
1014
+ "<loc_999>": 51268,
1015
+ "<loc_99>": 50368,
1016
+ "<loc_9>": 50278,
1017
+ "<ncap>": 51271,
1018
+ "<ocr>": 50267,
1019
+ "<od>": 50265,
1020
+ "<poly>": 51286,
1021
+ "<proposal>": 51284,
1022
+ "<region_cap>": 51280,
1023
+ "<region_to_desciption>": 51282,
1024
+ "<seg>": 51277,
1025
+ "<sep>": 51279
1026
+ }
LLM/Florence-2-large-PromptGen-v2.0/config.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/Florence-2-large",
3
+ "architectures": [
4
+ "Florence2ForConditionalGeneration"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_florence2.Florence2Config",
8
+ "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
9
+ },
10
+ "bos_token_id": 0,
11
+ "eos_token_id": 2,
12
+ "ignore_index": -100,
13
+ "is_encoder_decoder": true,
14
+ "model_type": "florence2",
15
+ "pad_token_id": 1,
16
+ "projection_dim": 1024,
17
+ "text_config": {
18
+ "_attn_implementation_autoset": true,
19
+ "_name_or_path": "",
20
+ "activation_dropout": 0.1,
21
+ "activation_function": "gelu",
22
+ "add_bias_logits": false,
23
+ "add_cross_attention": false,
24
+ "add_final_layer_norm": false,
25
+ "architectures": null,
26
+ "attention_dropout": 0.1,
27
+ "bad_words_ids": null,
28
+ "begin_suppress_tokens": null,
29
+ "bos_token_id": 0,
30
+ "chunk_size_feed_forward": 0,
31
+ "classif_dropout": 0.1,
32
+ "classifier_dropout": 0.0,
33
+ "cross_attention_hidden_size": null,
34
+ "d_model": 1024,
35
+ "decoder_attention_heads": 16,
36
+ "decoder_ffn_dim": 4096,
37
+ "decoder_layerdrop": 0.0,
38
+ "decoder_layers": 12,
39
+ "decoder_start_token_id": 2,
40
+ "diversity_penalty": 0.0,
41
+ "do_sample": false,
42
+ "dropout": 0.1,
43
+ "early_stopping": true,
44
+ "encoder_attention_heads": 16,
45
+ "encoder_ffn_dim": 4096,
46
+ "encoder_layerdrop": 0.0,
47
+ "encoder_layers": 12,
48
+ "encoder_no_repeat_ngram_size": 0,
49
+ "eos_token_id": 2,
50
+ "exponential_decay_length_penalty": null,
51
+ "finetuning_task": null,
52
+ "forced_bos_token_id": 0,
53
+ "forced_eos_token_id": 2,
54
+ "gradient_checkpointing": false,
55
+ "id2label": {
56
+ "0": "LABEL_0",
57
+ "1": "LABEL_1",
58
+ "2": "LABEL_2"
59
+ },
60
+ "init_std": 0.02,
61
+ "is_decoder": false,
62
+ "is_encoder_decoder": true,
63
+ "label2id": {
64
+ "LABEL_0": 0,
65
+ "LABEL_1": 1,
66
+ "LABEL_2": 2
67
+ },
68
+ "length_penalty": 1.0,
69
+ "max_length": 20,
70
+ "max_position_embeddings": 1024,
71
+ "min_length": 0,
72
+ "model_type": "florence2_language",
73
+ "no_repeat_ngram_size": 3,
74
+ "normalize_before": false,
75
+ "num_beam_groups": 1,
76
+ "num_beams": 3,
77
+ "num_hidden_layers": 12,
78
+ "num_return_sequences": 1,
79
+ "output_attentions": false,
80
+ "output_hidden_states": false,
81
+ "output_scores": false,
82
+ "pad_token_id": 1,
83
+ "prefix": null,
84
+ "problem_type": null,
85
+ "pruned_heads": {},
86
+ "remove_invalid_values": false,
87
+ "repetition_penalty": 1.0,
88
+ "return_dict": true,
89
+ "return_dict_in_generate": false,
90
+ "scale_embedding": false,
91
+ "sep_token_id": null,
92
+ "suppress_tokens": null,
93
+ "task_specific_params": null,
94
+ "temperature": 1.0,
95
+ "tf_legacy_loss": false,
96
+ "tie_encoder_decoder": false,
97
+ "tie_word_embeddings": true,
98
+ "tokenizer_class": null,
99
+ "top_k": 50,
100
+ "top_p": 1.0,
101
+ "torch_dtype": null,
102
+ "torchscript": false,
103
+ "typical_p": 1.0,
104
+ "use_bfloat16": false,
105
+ "use_cache": true,
106
+ "vocab_size": 51289
107
+ },
108
+ "torch_dtype": "float32",
109
+ "transformers_version": "4.46.1",
110
+ "vision_config": {
111
+ "model_type": "davit",
112
+ "drop_path_rate": 0.1,
113
+ "patch_size": [7, 3, 3, 3],
114
+ "patch_stride": [4, 2, 2, 2],
115
+ "patch_padding": [3, 1, 1, 1],
116
+ "patch_prenorm": [false, true, true, true],
117
+ "enable_checkpoint": false,
118
+ "dim_embed": [256, 512, 1024, 2048],
119
+ "num_heads": [8, 16, 32, 64],
120
+ "num_groups": [8, 16, 32, 64],
121
+ "depths": [1, 1, 9, 1],
122
+ "window_size": 12,
123
+ "projection_dim": 1024,
124
+ "visual_temporal_embedding": {
125
+ "type": "COSINE",
126
+ "max_temporal_embeddings": 100
127
+ },
128
+ "image_pos_embed": {
129
+ "type": "learned_abs_2d",
130
+ "max_pos_embeddings": 50
131
+ },
132
+ "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
133
+ },
134
+ "vocab_size": 51289,
135
+ "torch_dtype": "float16",
136
+ "transformers_version": "4.41.0.dev0",
137
+ "is_encoder_decoder": true
138
+ }
LLM/Florence-2-large-PromptGen-v2.0/configuration_florence2.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import warnings
15
+ """ Florence-2 configuration"""
16
+
17
+ from typing import Optional
18
+
19
+ from transformers import AutoConfig
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.utils import logging
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
class Florence2VisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Florence2VisionModel architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    List-valued arguments are shallow-copied before being stored, so mutating a list on one configuration never
    alters the defaults seen by configurations created later.

    Args:
        drop_path_rate (`float`, *optional*, defaults to 0.1):
            The dropout rate of the drop path layer.
        patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
            The patch size of the image.
        patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
            The patch stride of the image.
        patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
            The patch padding of the image.
        patch_prenorm (`List[bool]`, *optional*, defaults to [False, True, True, True]):
            Whether to apply layer normalization before the patch embedding layer.
        enable_checkpoint (`bool`, *optional*, defaults to False):
            Whether to enable checkpointing.
        dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
            The dimension of the embedding layer.
        num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
            The number of attention heads.
        num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
            The number of groups.
        depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
            The depth of the model.
        window_size (`int`, *optional*, defaults to 12):
            The window size of the model.
        projection_dim (`int`, *optional*, defaults to 1024):
            The dimension of the projection layer.
        visual_temporal_embedding (`dict`, *optional*):
            The configuration of the visual temporal embedding.
        image_pos_embed (`dict`, *optional*):
            The configuration of the image position embedding.
        image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
            The source of the image feature.
    Example:

    ```python
    >>> from transformers import Florence2VisionConfig, Florence2VisionModel

    >>> # Initializing a Florence2 Vision style configuration
    >>> configuration = Florence2VisionConfig()

    >>> # Initializing a model (with random weights)
    >>> model = Florence2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2_vision"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        drop_path_rate=0.1,
        patch_size=[7, 3, 3, 3],
        patch_stride=[4, 2, 2, 2],
        patch_padding=[3, 1, 1, 1],
        patch_prenorm=[False, True, True, True],
        enable_checkpoint=False,
        dim_embed=[256, 512, 1024, 2048],
        num_heads=[8, 16, 32, 64],
        num_groups=[8, 16, 32, 64],
        depths=[1, 1, 9, 1],
        window_size=12,
        projection_dim=1024,
        visual_temporal_embedding=None,
        image_pos_embed=None,
        image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
        **kwargs,
    ):
        def _own(value):
            # The list defaults in the signature are created once and shared by
            # every call; storing a copy keeps in-place edits on one config from
            # leaking into the defaults. Non-list values are stored untouched.
            return list(value) if isinstance(value, list) else value

        self.drop_path_rate = drop_path_rate
        self.patch_size = _own(patch_size)
        self.patch_stride = _own(patch_stride)
        self.patch_padding = _own(patch_padding)
        self.patch_prenorm = _own(patch_prenorm)
        self.enable_checkpoint = enable_checkpoint
        self.dim_embed = _own(dim_embed)
        self.num_heads = _own(num_heads)
        self.num_groups = _own(num_groups)
        self.depths = _own(depths)
        self.window_size = window_size
        self.projection_dim = projection_dim
        self.visual_temporal_embedding = visual_temporal_embedding
        self.image_pos_embed = image_pos_embed
        self.image_feature_source = _own(image_feature_source)

        # Remaining keyword arguments are handled by the PretrainedConfig base class.
        super().__init__(**kwargs)
119
+
120
+
121
+
122
class Florence2LanguageConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the BART
    [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 51289):
            Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Florence2LanguageModel`].
        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            If `True`, embeddings are scaled by a factor of sqrt(d_model).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        num_labels (`int`, *optional*, defaults to 3):
            The number of labels to use in [`Florence2LanguageForSequenceClassification`].
        forced_eos_token_id (`int`, *optional*, defaults to 2):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
            `eos_token_id`.

    Example:

    ```python
    >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel

    >>> # Initializing a Florence2 Language style configuration
    >>> configuration = Florence2LanguageConfig()

    >>> # Initializing a model (with random weights)
    >>> model = Florence2LanguageModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2_language"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Lets generic code read `num_attention_heads` / `hidden_size` from this config.
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=51289,
        max_position_embeddings=1024,
        encoder_layers=12,
        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
        decoder_layers=12,
        decoder_ffn_dim=4096,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=1024,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        num_labels=3,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        is_encoder_decoder=True,
        decoder_start_token_id=2,
        forced_eos_token_id=2,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
        # `num_hidden_layers` mirrors the encoder depth.
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        super().__init__(
            num_labels=num_labels,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
            **kwargs,
        )

        # ensure backward compatibility for BART CNN models
        if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
            self.forced_bos_token_id = self.bos_token_id
            warnings.warn(
                f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
                "The config can simply be saved and uploaded again to be fixed.",
                # Attribute the warning to the code that built the config, not to this module.
                stacklevel=2,
            )
271
+
272
class Florence2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an
    Florence-2 model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Florence2VisionConfig`, *optional*):
            Custom vision config or dict. A mapping is wrapped in a [`PretrainedConfig`]; an existing
            [`PretrainedConfig`] instance is stored as given.
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone. A mapping is converted to a [`Florence2LanguageConfig`]; an
            existing [`PretrainedConfig`] instance is stored as given.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        vocab_size (`int`, *optional*, defaults to 51289):
            Vocabulary size of the Florence2model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~Florence2ForConditionalGeneration`]
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimension of the multimodal projection space.

    Example:

    ```python
    >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig

    >>> # Initializing a clip-like vision config
    >>> vision_config = CLIPVisionConfig()

    >>> # Initializing a Bart config
    >>> text_config = BartConfig()

    >>> # Initializing a Florence-2 configuration
    >>> configuration = Florence2Config(vision_config, text_config)

    >>> # Initializing a model from the florence-2 configuration
    >>> model = Florence2ForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        vocab_size=51289,
        projection_dim=1024,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.vocab_size = vocab_size
        self.projection_dim = projection_dim

        # A mapping (e.g. the `vision_config` section of config.json) is expanded
        # into a generic PretrainedConfig. An already-built config object cannot be
        # `**`-unpacked, so it is kept as-is instead of raising TypeError.
        if vision_config is not None and not isinstance(vision_config, PretrainedConfig):
            vision_config = PretrainedConfig(**vision_config)
        self.vision_config = vision_config

        # Same treatment for the text backbone: mappings become a
        # Florence2LanguageConfig, config objects pass through unchanged.
        if text_config is not None and not isinstance(text_config, PretrainedConfig):
            text_config = Florence2LanguageConfig(**text_config)
        self.text_config = text_config

        super().__init__(**kwargs)
340
+
LLM/Florence-2-large-PromptGen-v2.0/generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "num_beams": 3,
3
+ "transformers_version": "4.46.1"
4
+ }
LLM/Florence-2-large-PromptGen-v2.0/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Florence-2-large-PromptGen-v2.0/modeling_florence2.py ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Florence-2-large-PromptGen-v2.0/preprocessor_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_florence2.Florence2Processor"
4
+ },
5
+ "crop_size": {
6
+ "height": 768,
7
+ "width": 768
8
+ },
9
+ "do_center_crop": false,
10
+ "do_convert_rgb": null,
11
+ "do_normalize": true,
12
+ "do_rescale": true,
13
+ "do_resize": true,
14
+ "image_mean": [
15
+ 0.485,
16
+ 0.456,
17
+ 0.406
18
+ ],
19
+ "image_processor_type": "CLIPImageProcessor",
20
+ "image_seq_length": 577,
21
+ "image_std": [
22
+ 0.229,
23
+ 0.224,
24
+ 0.225
25
+ ],
26
+ "processor_class": "Florence2Processor",
27
+ "resample": 3,
28
+ "rescale_factor": 0.00392156862745098,
29
+ "size": {
30
+ "height": 768,
31
+ "width": 768
32
+ }
33
+ }
LLM/Florence-2-large-PromptGen-v2.0/processing_florence2.py ADDED
@@ -0,0 +1,1088 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for Florence-2.
17
+ """
18
+
19
+ import re
20
+ import logging
21
+ from typing import List, Optional, Union
22
+ import numpy as np
23
+
24
+ import torch
25
+
26
+ from transformers.feature_extraction_utils import BatchFeature
27
+ from transformers.image_utils import ImageInput, is_valid_image
28
+ from transformers.processing_utils import ProcessorMixin
29
+ from transformers.tokenization_utils_base import (
30
+ PaddingStrategy,
31
+ PreTokenizedInput,
32
+ TextInput,
33
+ TruncationStrategy,
34
+ )
35
+ from transformers.utils import TensorType
36
+
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ # Copied from transformers.models.idefics2.processing_idefics2.is_url
41
def is_url(val) -> bool:
    """Return True when *val* is a string that begins with ``"http"``."""
    if not isinstance(val, str):
        return False
    return val.startswith("http")
43
+
44
+ # Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
45
def is_image_or_image_url(elem):
    """Return a truthy value when *elem* passes ``is_url`` or, failing that, ``is_valid_image``."""
    if is_url(elem):
        return True
    # Not an http-prefixed string: defer to the image validity check.
    return is_valid_image(elem)
47
+
48
+
49
+ def _is_str_or_image(elem):
50
+ return isinstance(elem, (str)) or is_image_or_image_url(elem)
51
+
52
+
53
class Florence2Processor(ProcessorMixin):
    r"""
    Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor.

    [`Florence2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BartTokenizerFast`]. See the
    [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.

    Args:
        image_processor ([`CLIPImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`BartTokenizerFast`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("BartTokenizer", "BartTokenizerFast")

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
    ):
        """
        Validate the two components, register the Florence-2 special tokens on the tokenizer and build the
        task lookup tables.

        Raises:
            ValueError: If `image_processor` or `tokenizer` is `None`, or if `image_processor` has no
                `image_seq_length` attribute.
        """
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        if not hasattr(image_processor, "image_seq_length"):
            raise ValueError("Image processor is missing an `image_seq_length` attribute.")

        self.image_seq_length = image_processor.image_seq_length

        # NOTE: this mutates the tokenizer passed in by the caller. The token spellings (including
        # '<region_to_desciption>') are runtime vocabulary entries and are kept exactly as they were.
        tokens_to_add = {
            'additional_special_tokens': (
                tokenizer.additional_special_tokens
                + ['<od>', '</od>', '<ocr>', '</ocr>']
                + [f'<loc_{x}>' for x in range(1000)]
                + ['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
            )
        }
        tokenizer.add_special_tokens(tokens_to_add)

        # Task token -> name of the post-processing routine applied to the generated text.
        self.tasks_answer_post_processing_type = {
            '<OCR>': 'pure_text',
            '<OCR_WITH_REGION>': 'ocr',
            '<CAPTION>': 'pure_text',
            '<DETAILED_CAPTION>': 'pure_text',
            '<MORE_DETAILED_CAPTION>': 'pure_text',
            '<OD>': 'description_with_bboxes',
            '<DENSE_REGION_CAPTION>': 'description_with_bboxes',
            '<CAPTION_TO_PHRASE_GROUNDING>': "phrase_grounding",
            '<REFERRING_EXPRESSION_SEGMENTATION>': 'polygons',
            '<REGION_TO_SEGMENTATION>': 'polygons',
            '<OPEN_VOCABULARY_DETECTION>': 'description_with_bboxes_or_polygons',
            '<REGION_TO_CATEGORY>': 'pure_text',
            '<REGION_TO_DESCRIPTION>': 'pure_text',
            '<REGION_TO_OCR>': 'pure_text',
            '<REGION_PROPOSAL>': 'bboxes'
        }

        # Task tokens that must be the whole prompt; each is replaced by a fixed sentence.
        self.task_prompts_without_inputs = {
            '<OCR>': 'What is the text in the image?',
            '<OCR_WITH_REGION>': 'What is the text in the image, with regions?',
            '<CAPTION>': 'What does the image describe?',
            '<DETAILED_CAPTION>': 'Describe in detail what is shown in the image.',
            '<MORE_DETAILED_CAPTION>': 'Describe with a paragraph what is shown in the image.',
            '<OD>': 'Locate the objects with category name in the image.',
            '<DENSE_REGION_CAPTION>': 'Locate the objects in the image, with their descriptions.',
            '<REGION_PROPOSAL>': 'Locate the region proposals in the image.'
        }

        # Task tokens followed by user text; the remaining text is substituted for `{input}`.
        self.task_prompts_with_input = {
            '<CAPTION_TO_PHRASE_GROUNDING>': "Locate the phrases in the caption: {input}",
            '<REFERRING_EXPRESSION_SEGMENTATION>': 'Locate {input} in the image with mask',
            '<REGION_TO_SEGMENTATION>': 'What is the polygon mask of region {input}',
            '<OPEN_VOCABULARY_DETECTION>': 'Locate {input} in the image.',
            '<REGION_TO_CATEGORY>': 'What is the region {input}?',
            '<REGION_TO_DESCRIPTION>': 'What does the region {input} describe?',
            '<REGION_TO_OCR>': 'What text is in the region {input}?',
        }

        self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)

        super().__init__(image_processor, tokenizer)

    def _construct_prompts(self, text):
        """
        Replace task tokens in each entry of `text` with the corresponding natural-language prompt.

        Args:
            text: Iterable of prompt strings.

        Returns:
            A list with one prompt per input entry. Entries that contain no known task token are returned
            unchanged.

        Raises:
            AssertionError: If an input-free task token appears together with any other text.
        """
        prompts = []
        for _text in text:
            # 1. fixed task prompts without additional inputs
            for task_token, task_prompt in self.task_prompts_without_inputs.items():
                if task_token in _text:
                    assert _text == task_token, f"Task token {task_token} should be the only token in the text."
                    _text = task_prompt
                    break
            # 2. task prompts with additional inputs
            for task_token, task_prompt in self.task_prompts_with_input.items():
                if task_token in _text:
                    _text = task_prompt.format(input=_text.replace(task_token, ''))
                    break
            prompts.append(_text)
        return prompts

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        images: ImageInput = None,
        tokenize_newline_separately: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length=None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
        do_resize: bool = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        data_format: Optional["ChannelDimension"] = "channels_first",  # noqa: F821
        input_data_format: Optional[
            Union[str, "ChannelDimension"]  # noqa: F821
        ] = None,
        resample: "PILImageResampling" = None,  # noqa: F821
        do_convert_rgb: bool = None,
        do_thumbnail: bool = None,
        do_align_long_axis: bool = None,
        do_rescale: bool = None,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). The images are passed to
        `self.image_processor`, task tokens in `text` are expanded into prompts, and the prompts are passed to
        `self.tokenizer`.

        Args:
            text (`str`, `List[str]`):
                The prompt or batch of prompts. A single string is wrapped into a one-element list. When `None`,
                a warning is logged and an empty prompt is used.
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Required.
            tokenize_newline_separately (`bool`, defaults to `True`):
                Accepted for signature compatibility; not used by this method.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Forwarded to the tokenizer.
            truncation (`bool`, *optional*):
                Forwarded to the tokenizer.
            max_length (`int`, *optional*):
                Maximum total length. `self.image_seq_length` is subtracted from it before it is forwarded to
                the tokenizer, to leave room for the image tokens.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                Forwarded to both the image processor and the tokenizer.
            do_resize, do_normalize, image_mean, image_std, data_format, input_data_format, resample, do_convert_rgb:
                Forwarded to the image processor.
            do_thumbnail, do_align_long_axis, do_rescale:
                Accepted for signature compatibility; not forwarded to the image processor by this method.

        Returns:
            [`BatchFeature`]: The tokenizer outputs (requested without token type ids) plus:

            - **pixel_values** -- Pixel values produced by the image processor.

        Raises:
            ValueError: If `images` is `None`, or if both `text` and `images` are lists and there are fewer
                images than prompts.
        """
        if images is None:
            raise ValueError("`images` are expected as arguments to a `Florence2Processor` instance.")
        if text is None:
            # `logger` is a standard-library logger, which has no `warning_once` method.
            logger.warning("You are using Florence-2 without a text prompt.")
            text = ""

        if isinstance(text, list) and isinstance(images, list):
            if len(images) < len(text):
                raise ValueError(
                    f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image."
                )
        if _is_str_or_image(text):
            text = [text]

        pixel_values = self.image_processor(
            images,
            do_resize=do_resize,
            do_normalize=do_normalize,
            return_tensors=return_tensors,
            image_mean=image_mean,
            image_std=image_std,
            input_data_format=input_data_format,
            data_format=data_format,
            resample=resample,
            do_convert_rgb=do_convert_rgb,
        )["pixel_values"]

        if max_length is not None:
            max_length -= self.image_seq_length  # max_length has to account for the image tokens

        text = self._construct_prompts(text)

        inputs = self.tokenizer(
            text,
            return_tensors=return_tensors,
            padding=padding,
            max_length=max_length,
            truncation=truncation,
            return_token_type_ids=False,
        )

        return BatchFeature(data={**inputs, "pixel_values": pixel_values})

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
    def model_input_names(self):
        """Tokenizer input names followed by image-processor input names, de-duplicated in order."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    def post_process_generation(self, text, task, image_size):
        """
        Post-process the output of the model to each of the task outputs.

        Args:
            text (`str`): The text to post-process.
            task (`str`): The task to post-process the text for. Unknown tasks are treated as `'pure_text'`.
            image_size (`Tuple[int, int]`): The size of the image. The quantizers defined in this file unpack
                it as (width, height).

        Returns:
            `dict`: A single-entry dict `{task: answer}`. `answer` is a string for text tasks and a dict of
            lists (boxes / quad boxes / polygons plus labels) for the other tasks.

        Raises:
            ValueError: If the post-processing type resolved for `task` is not handled here.
        """
        task_answer_post_processing_type = self.tasks_answer_post_processing_type.get(task, 'pure_text')
        task_answer = self.post_processor(
            text=text,
            image_size=image_size,
            parse_tasks=task_answer_post_processing_type,
        )[task_answer_post_processing_type]

        if task_answer_post_processing_type == 'pure_text':
            # remove the special tokens
            final_answer = task_answer.replace('<s>', '').replace('</s>', '')
        elif task_answer_post_processing_type in ['od', 'description_with_bboxes', 'bboxes']:
            bboxes_od = [_od_instance['bbox'] for _od_instance in task_answer]
            labels_od = [str(_od_instance['cat_name']) for _od_instance in task_answer]
            final_answer = {'bboxes': bboxes_od, 'labels': labels_od}
        elif task_answer_post_processing_type in ['ocr']:
            bboxes = [_od_instance['quad_box'] for _od_instance in task_answer]
            labels = [str(_od_instance['text']) for _od_instance in task_answer]
            final_answer = {'quad_boxes': bboxes, 'labels': labels}
        elif task_answer_post_processing_type in ['phrase_grounding']:
            # One phrase may carry several boxes; flatten to one (box, label) pair per box.
            bboxes = []
            labels = []
            for _grounded_phrase in task_answer:
                for _bbox in _grounded_phrase['bbox']:
                    bboxes.append(_bbox)
                    labels.append(_grounded_phrase['cat_name'])
            final_answer = {'bboxes': bboxes, 'labels': labels}
        elif task_answer_post_processing_type in ['description_with_polygons', 'polygons']:
            labels = []
            polygons = []
            for result in task_answer:
                labels.append(result['cat_name'])
                polygons.append(result['polygons'])
            final_answer = {'polygons': polygons, 'labels': labels}
        elif task_answer_post_processing_type in ['description_with_bboxes_or_polygons']:
            bboxes = []
            bboxes_labels = []
            polygons = []
            polygons_labels = []
            for result in task_answer:
                label = result['cat_name']
                if 'polygons' in result:
                    polygons.append(result['polygons'])
                    polygons_labels.append(label)
                else:
                    bboxes.append(result['bbox'])
                    bboxes_labels.append(label)
            final_answer = {'bboxes': bboxes, 'bboxes_labels': bboxes_labels, 'polygons': polygons, 'polygons_labels': polygons_labels}
        else:
            raise ValueError('Unknown task answer post processing type: {}'.format(task_answer_post_processing_type))

        return {task: final_answer}
376
+
377
class BoxQuantizer(object):
    """
    Map (xmin, ymin, xmax, ymax) boxes between image coordinates and integer bin indices.

    Args:
        mode: Quantization mode. Only 'floor' is implemented; 'round' raises `NotImplementedError`
            and anything else raises `ValueError`.
        bins: Pair `(bins_w, bins_h)` giving the number of bins along each axis.
    """

    def __init__(self, mode, bins):
        self.mode = mode
        self.bins = bins

    def quantize(self, boxes: torch.Tensor, size):
        """
        Convert boxes in image coordinates to bin indices.

        Args:
            boxes: Tensor whose last dimension holds (xmin, ymin, xmax, ymax).
            size: Pair `(width, height)` of the image.

        Returns:
            Integer tensor with the same layout as `boxes`, each value clamped to `[0, bins - 1]`.
        """
        n_bins_w, n_bins_h = self.bins
        img_w, img_h = size
        step_w = img_w / n_bins_w
        step_h = img_h / n_bins_h
        x_lo, y_lo, x_hi, y_hi = boxes.split(1, dim=-1)

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        columns = [
            (coord / step).floor().clamp(0, n_bins - 1)
            for coord, step, n_bins in (
                (x_lo, step_w, n_bins_w),
                (y_lo, step_h, n_bins_h),
                (x_hi, step_w, n_bins_w),
                (y_hi, step_h, n_bins_h),
            )
        ]
        return torch.cat(columns, dim=-1).int()

    def dequantize(self, boxes: torch.Tensor, size):
        """
        Convert bin indices back to image coordinates.

        Args:
            boxes: Tensor whose last dimension holds (xmin, ymin, xmax, ymax) bin indices.
            size: Pair `(width, height)` of the image.

        Returns:
            Tensor with the same layout as `boxes`, holding the centre coordinate of each bin.
        """
        n_bins_w, n_bins_h = self.bins
        img_w, img_h = size
        step_w = img_w / n_bins_w
        step_h = img_h / n_bins_h
        x_lo, y_lo, x_hi, y_hi = boxes.split(1, dim=-1)

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        # The +0.5 shift places the result at the centre of the bin rather than its lower edge.
        columns = [
            (index + 0.5) * step
            for index, step in (
                (x_lo, step_w),
                (y_lo, step_h),
                (x_hi, step_w),
                (y_hi, step_h),
            )
        ]
        return torch.cat(columns, dim=-1)
437
+
438
+
439
class CoordinatesQuantizer(object):
    """
    Map (x, y) coordinates of shape (N, 2) between image coordinates and integer bin indices.

    Args:
        mode: Quantization mode. Only 'floor' is implemented; 'round' raises `NotImplementedError`
            and anything else raises `ValueError`.
        bins: Pair `(bins_w, bins_h)` giving the number of bins along each axis.
    """

    def __init__(self, mode, bins):
        self.mode = mode
        self.bins = bins

    def quantize(self, coordinates: torch.Tensor, size):
        """
        Convert coordinates in image space to bin indices.

        Args:
            coordinates: Tensor whose last dimension has size 2 and holds (x, y).
            size: Pair `(width, height)` of the image.

        Returns:
            Integer tensor with the same layout, each value clamped to `[0, bins - 1]`.
        """
        n_bins_w, n_bins_h = self.bins
        img_w, img_h = size
        step_w = img_w / n_bins_w
        step_h = img_h / n_bins_h
        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
        xs, ys = coordinates.split(1, dim=-1)  # two tensors of shape [N, 1]

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        columns = [
            (coord / step).floor().clamp(0, n_bins - 1)
            for coord, step, n_bins in ((xs, step_w, n_bins_w), (ys, step_h, n_bins_h))
        ]
        return torch.cat(columns, dim=-1).int()

    def dequantize(self, coordinates: torch.Tensor, size):
        """
        Convert bin indices back to image coordinates.

        Args:
            coordinates: Tensor whose last dimension has size 2 and holds (x, y) bin indices.
            size: Pair `(width, height)` of the image.

        Returns:
            Tensor with the same layout, holding the centre coordinate of each bin.
        """
        n_bins_w, n_bins_h = self.bins
        img_w, img_h = size
        step_w = img_w / n_bins_w
        step_h = img_h / n_bins_h
        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
        xs, ys = coordinates.split(1, dim=-1)  # two tensors of shape [N, 1]

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        # The +0.5 shift places the result at the centre of the bin rather than its lower edge.
        columns = [(index + 0.5) * step for index, step in ((xs, step_w), (ys, step_h))]
        return torch.cat(columns, dim=-1)
496
+
497
+
498
+ class Florence2PostProcesser(object):
499
+ r"""
500
+ Florence-2 post process for converting text prediction to various tasks results.
501
+
502
+ Args:
503
+ config: A dict of configs.
504
+ tokenizer: A tokenizer for decoding text to spans.
505
+ sample config:
506
+ UNIFIED_POST_PROCESS:
507
+ # commom configs
508
+ NUM_BBOX_HEIGHT_BINS: 1000
509
+ NUM_BBOX_WIDTH_BINS: 1000
510
+ COORDINATES_HEIGHT_BINS: 1000
511
+ COORDINATES_WIDTH_BINS: 1000
512
+ # task specific configs, override the common configs
513
+ PRASE_TASKS:
514
+ - TASK_NAME: 'video_dense_caption'
515
+ PATTERN: 'r<time_(\d+)><time_(\d+)>([a-zA-Z0-9 ]+)'
516
+ SCORE_MODE: 'avg_cat_name_scores'
517
+ NUM_BINS: 100
518
+ - TASK_NAME: 'od'
519
+ PATTERN: 'r<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>([a-zA-Z0-9 ]+)'
520
+ SCORE_MODE: 'avg_cat_name_scores'
521
+
522
+ Returns:
523
+ parsed_dict (dict): A dict of parsed results.
524
+ """
525
def __init__(
    self,
    tokenizer=None
):
    """
    Build the parser state from the default config.

    Stores the config, the ordered list of task names, a task-name -> task-config lookup, the
    tokenizer (and, when one is given, the set of its special tokens), then creates the
    quantizers and the phrase-grounding black list.
    """
    default_config = self._create_default_config()
    task_entries = default_config['PARSE_TASKS']

    self.config = default_config
    self.parse_tasks = [entry['TASK_NAME'] for entry in task_entries]
    self.parse_tasks_configs = {entry['TASK_NAME']: entry for entry in task_entries}

    self.tokenizer = tokenizer
    if tokenizer is not None:
        # `all_special_tokens` is only defined when a tokenizer was supplied.
        self.all_special_tokens = set(tokenizer.all_special_tokens)

    self.init_quantizers()
    self.black_list_of_phrase_grounding = self._create_black_list_of_phrase_grounding()
546
+
547
+ def _create_black_list_of_phrase_grounding(self):
548
+ black_list = {}
549
+
550
+ if 'phrase_grounding' in self.parse_tasks and self.parse_tasks_configs['phrase_grounding']['FILTER_BY_BLACK_LIST']:
551
+ black_list = set(
552
+ ['it', 'I', 'me', 'mine',
553
+ 'you', 'your', 'yours',
554
+ 'he', 'him', 'his',
555
+ 'she', 'her', 'hers',
556
+ 'they', 'them', 'their', 'theirs',
557
+ 'one', 'oneself',
558
+ 'we', 'us', 'our', 'ours',
559
+ 'you', 'your', 'yours',
560
+ 'they', 'them', 'their', 'theirs',
561
+ 'mine', 'yours', 'his', 'hers', 'its',
562
+ 'ours', 'yours', 'theirs',
563
+ 'myself', 'yourself', 'himself', 'herself', 'itself',
564
+ 'ourselves', 'yourselves', 'themselves',
565
+ 'this', 'that',
566
+ 'these', 'those',
567
+ 'who', 'whom', 'whose', 'which', 'what',
568
+ 'who', 'whom', 'whose', 'which', 'that',
569
+ 'all', 'another', 'any', 'anybody', 'anyone', 'anything',
570
+ 'each', 'everybody', 'everyone', 'everything',
571
+ 'few', 'many', 'nobody', 'none', 'one', 'several',
572
+ 'some', 'somebody', 'someone', 'something',
573
+ 'each other', 'one another',
574
+ 'myself', 'yourself', 'himself', 'herself', 'itself',
575
+ 'ourselves', 'yourselves', 'themselves',
576
+ 'the image', 'image', 'images', 'the', 'a', 'an', 'a group',
577
+ 'other objects', 'lots', 'a set',
578
+ ]
579
+ )
580
+
581
+ return black_list
582
+
583
+ def _create_default_config(self):
584
+ config = {
585
+ 'NUM_BBOX_HEIGHT_BINS': 1000,
586
+ 'NUM_BBOX_WIDTH_BINS': 1000,
587
+ 'BOX_QUANTIZATION_MODE': 'floor',
588
+ 'COORDINATES_HEIGHT_BINS': 1000,
589
+ 'COORDINATES_WIDTH_BINS': 1000,
590
+ 'COORDINATES_QUANTIZATION_MODE': 'floor',
591
+ 'PARSE_TASKS': [
592
+ {
593
+ 'TASK_NAME': 'od',
594
+ 'PATTERN': r'([a-zA-Z0-9 ]+)<loc_(\\d+)><loc_(\\d+)><loc_(\\d+)><loc_(\\d+)>'
595
+ },
596
+ {
597
+ 'TASK_NAME': 'ocr',
598
+ 'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
599
+ 'AREA_THRESHOLD': 0.00
600
+ },
601
+ {
602
+ 'TASK_NAME': 'phrase_grounding',
603
+ 'FILTER_BY_BLACK_LIST': True
604
+ },
605
+ {
606
+ 'TASK_NAME': 'pure_text',
607
+ },
608
+ {
609
+ 'TASK_NAME': 'description_with_bboxes',
610
+ },
611
+ {
612
+ 'TASK_NAME': 'description_with_polygons',
613
+ },
614
+ {
615
+ 'TASK_NAME': 'polygons',
616
+ },
617
+ {
618
+ 'TASK_NAME': 'bboxes',
619
+ },
620
+ {
621
+ 'TASK_NAME': 'description_with_bboxes_or_polygons',
622
+ }
623
+ ]
624
+ }
625
+
626
+ return config
627
+
628
def init_quantizers(self):
    """
    Create `self.box_quantizer` and `self.coordinates_quantizer` from `self.config`.

    The box quantizer reads the `NUM_BBOX_*` / `BOX_QUANTIZATION_MODE` keys. The coordinates
    quantizer reads the `COORDINATES_*` keys and falls back to the box settings (and then to
    1000 bins / 'floor') for any key that is absent.
    """
    cfg = self.config

    # we have box_quantizer (od, grounding) and coordinates_quantizer (ocr, referring_segmentation)
    box_bins_h = cfg.get('NUM_BBOX_HEIGHT_BINS', 1000)
    box_bins_w = cfg.get('NUM_BBOX_WIDTH_BINS', 1000)
    box_mode = cfg.get('BOX_QUANTIZATION_MODE', 'floor')
    self.box_quantizer = BoxQuantizer(
        box_mode,
        (box_bins_w, box_bins_h),
    )

    coord_bins_h = cfg.get('COORDINATES_HEIGHT_BINS', cfg.get('NUM_BBOX_HEIGHT_BINS', 1000))
    coord_bins_w = cfg.get('COORDINATES_WIDTH_BINS', cfg.get('NUM_BBOX_WIDTH_BINS', 1000))
    coord_mode = cfg.get('COORDINATES_QUANTIZATION_MODE', cfg.get('BOX_QUANTIZATION_MODE', 'floor'))
    self.coordinates_quantizer = CoordinatesQuantizer(
        coord_mode,
        (coord_bins_w, coord_bins_h),
    )
645
+
646
def decode_with_spans(self, tokenizer, token_ids):
    """
    Decode `token_ids` to text and record the character span of every token.

    Args:
        tokenizer: A Bart or T5 tokenizer (slow or fast variant).
        token_ids: Sequence of token ids to decode.

    Returns:
        `(text, spans)` where `text` is the concatenation of the per-token strings and `spans[i]`
        is the half-open `(start, end)` character range of token `i` inside `text`.

    Raises:
        ValueError: If a non-special token is met and `tokenizer` is not a Bart or T5 tokenizer.
        AssertionError: If the tokenizer returns a different number of tokens than ids.
    """
    # Imported here because the module-level import block does not bring these classes into
    # scope; without this the isinstance checks below raise NameError.
    from transformers import BartTokenizer, BartTokenizerFast, T5Tokenizer, T5TokenizerFast

    filtered_tokens = tokenizer.convert_ids_to_tokens(
        token_ids, skip_special_tokens=False)
    assert len(filtered_tokens) == len(token_ids)

    # To avoid mixing byte-level and unicode for byte-level BPT
    # we need to build string separately for added tokens and byte-level tokens
    # cf. https://github.com/huggingface/transformers/issues/1133
    sub_texts = []
    for token in filtered_tokens:
        if token in self.all_special_tokens:
            sub_texts.append(token)
        else:
            if isinstance(tokenizer, (BartTokenizer, BartTokenizerFast)):
                sub_text = tokenizer.convert_tokens_to_string([token])
            elif isinstance(tokenizer, (T5Tokenizer, T5TokenizerFast)):
                # Ref: https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol
                # Note: Do not strip sub_text as it may have functional whitespace
                sub_text = token.replace('▁', ' ')
            else:
                raise ValueError(f'type {type(tokenizer)} not supported')
            sub_texts.append(sub_text)

    # Track a running offset instead of repeatedly growing the output string.
    spans = []
    offset = 0
    for sub_text in sub_texts:
        end = offset + len(sub_text)
        spans.append((offset, end))  # [start index, end index).
        offset = end
    text = ''.join(sub_texts)

    # Text format:
    # 1. T5Tokenizer/T5TokenizerFast:
    #      "<loc_1><loc_2><loc_3><loc_4> transplanting dog<loc_1><loc_2><loc_3><loc_4> cat</s>"
    #    Equivalent to t5_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
    # 2. BartTokenizer (need to double check):
    #      "<s><loc_1><loc_2><loc_3><loc_4>transplanting dog<loc_1><loc_2><loc_3><loc_4>cat</s>"
    #    Equivalent to bart_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
    return text, spans
684
+
685
def parse_od_from_text_and_spans(
    self,
    text,
    pattern,
    image_size,
    phrase_centric=False
):
    """
    Extract (box, category) instances from `text` using `pattern`.

    Args:
        text: Generated text to scan.
        pattern: Regex with five groups: four location bins and one category name.
        image_size: Image size forwarded to the box quantizer.
        phrase_centric: When True the category is group 1 and the bins are groups 2-5;
            otherwise the bins are groups 1-4 and the category is group 5.

    Returns:
        A list of dicts with keys 'bbox' (dequantized box as a list) and 'cat_name'
        (lower-cased, stripped category).
    """
    if phrase_centric:
        bin_groups, name_group = range(2, 6), 1
    else:
        bin_groups, name_group = range(1, 5), 5

    instances = []
    for match in re.finditer(pattern, text):
        bbox_bins = [int(match.group(idx)) for idx in bin_groups]
        bbox = self.box_quantizer.dequantize(
            boxes=torch.tensor(bbox_bins),
            size=image_size
        ).tolist()
        cat_name = match.group(name_group).lower().strip()
        instances.append({'bbox': bbox, 'cat_name': cat_name})

    return instances
715
+
716
def parse_ocr_from_text_and_spans(self,
                                  text,
                                  pattern,
                                  image_size,
                                  area_threshold=-1.0,
                                  ):
    """
    Extract OCR lines with their quadrilateral boxes from `text`.

    Args:
        text: Generated text; any '<s>' is removed before matching.
        pattern: Regex with nine groups: the text followed by eight location bins
            (four x/y vertex pairs).
        image_size: Pair `(width, height)`, also forwarded to the coordinates quantizer.
        area_threshold: When greater than 0, quads whose area is smaller than
            `area_threshold * width * height` are dropped. The default disables the filter.

    Returns:
        A list of dicts with keys 'quad_box' (flat list of eight dequantized coordinates) and
        'text'.
    """
    text = text.replace('<s>', '')
    image_width, image_height = image_size

    instances = []
    for ocr_line in re.findall(pattern, text):
        ocr_content = ocr_line[0]
        quad_bins = [int(value) for value in ocr_line[1:]]
        quad_box = self.coordinates_quantizer.dequantize(
            torch.tensor(np.array(quad_bins).reshape(-1, 2)),
            size=image_size
        ).reshape(-1).tolist()

        if area_threshold > 0:
            x_coords = quad_box[0::2]
            y_coords = quad_box[1::2]
            n_vertices = len(x_coords)

            # Shoelace formula. The index wraps around so the edge from the last vertex back
            # to the first is included; omitting it gives a wrong area unless the first
            # vertex happens to sit at the origin.
            twice_area = sum(
                x_coords[i] * y_coords[(i + 1) % n_vertices]
                - x_coords[(i + 1) % n_vertices] * y_coords[i]
                for i in range(n_vertices)
            )
            area = 0.5 * abs(twice_area)

            if area < (image_width * image_height) * area_threshold:
                continue

        instances.append({
            'quad_box': quad_box,
            'text': ocr_content,
        })
    return instances
756
+
757
def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
    """
    Extract grounded phrases, each with all of its boxes, from `text`.

    Args:
        text: Generated text; '<s>', '</s>' and '<pad>' are removed before matching.
        pattern: Accepted for signature compatibility; the method uses its own fixed patterns.
        image_size: Image size forwarded to the box quantizer.

    Returns:
        A list of dicts with keys 'bbox' (list of dequantized boxes for the phrase) and
        'cat_name' (the phrase with non-ASCII characters removed). Phrases found in
        `self.black_list_of_phrase_grounding` are skipped.
    """
    # ignore <s> </s> and <pad>
    for special_token in ('<s>', '</s>', '<pad>'):
        text = text.replace(special_token, '')

    # A chunk is a run of plain text followed by at least four location tokens.
    chunk_pattern = r"([^<]+(?:<loc_\d+>){4,})"
    # The phrase is everything before the first structural / location token.
    phrase_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
    box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'

    instances = []
    for phrase_text in re.findall(chunk_pattern, text):
        # Strip both markers; chaining keeps the first replacement from being discarded.
        phrase_text_strip = phrase_text.replace('<ground>', '', 1).replace('<obj>', '', 1)
        if phrase_text_strip == '':
            continue

        # parse phrase, get string
        phrase_match = re.search(phrase_pattern, phrase_text_strip)
        if phrase_match is None:
            continue

        # parse bboxes by box_pattern
        box_matches = list(re.finditer(box_pattern, phrase_text))
        if not box_matches:
            continue

        phrase = phrase_match.group().strip()
        if phrase in self.black_list_of_phrase_grounding:
            continue

        # a list of list
        bbox_bins = [[int(box_match.group(j)) for j in range(1, 5)] for box_match in box_matches]
        instances.append({
            'bbox': self.box_quantizer.dequantize(
                boxes=torch.tensor(bbox_bins),
                size=image_size
            ).tolist(),
            # exclude non-ascii characters
            'cat_name': phrase.encode('ascii', errors='ignore').decode('ascii'),
        })

    return instances
820
+
821
def parse_description_with_bboxes_from_text_and_spans(self, text, pattern, image_size, allow_empty_phrase=False):
    """
    Extract one instance per box, labelled with the description that precedes it.

    Args:
        text: Generated text; '<s>', '</s>' and '<pad>' are removed before matching.
        pattern: Accepted for signature compatibility; the method uses its own fixed patterns.
        image_size: Image size forwarded to the box quantizer.
        allow_empty_phrase: When True, runs of location tokens are matched on their own and
            every instance gets an empty 'cat_name'. When False, a run of location tokens must
            be preceded by description text.

    Returns:
        A list of dicts with keys 'bbox' (one dequantized box) and 'cat_name' (the description
        with non-ASCII characters removed).
    """
    # temporary parse solution, split by '.'
    # ignore <s> </s> and <pad>
    for special_token in ('<s>', '</s>', '<pad>'):
        text = text.replace(special_token, '')

    if allow_empty_phrase:
        chunk_pattern = r"(?:(?:<loc_\d+>){4,})"
    else:
        chunk_pattern = r"([^<]+(?:<loc_\d+>){4,})"

    # The phrase is everything before the first structural / location token.
    phrase_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
    box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'

    instances = []
    for phrase_text in re.findall(chunk_pattern, text):
        # Strip both markers; chaining keeps the first replacement from being discarded.
        phrase_text_strip = phrase_text.replace('<ground>', '', 1).replace('<obj>', '', 1)
        if phrase_text_strip == '' and not allow_empty_phrase:
            continue

        # parse phrase, get string
        phrase_match = re.search(phrase_pattern, phrase_text_strip)
        if phrase_match is None:
            continue
        phrase = phrase_match.group().strip()

        # parse bboxes by box_pattern
        box_matches = list(re.finditer(box_pattern, phrase_text))
        if not box_matches:
            continue

        # a list of list
        bbox_bins = [[int(box_match.group(j)) for j in range(1, 5)] for box_match in box_matches]
        bboxes = self.box_quantizer.dequantize(
            boxes=torch.tensor(bbox_bins),
            size=image_size
        ).tolist()

        # exclude non-ascii characters
        phrase = phrase.encode('ascii', errors='ignore').decode('ascii')
        instances.extend({'bbox': bbox, 'cat_name': phrase} for bbox in bboxes)

    return instances
879
+
880
def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
                                                        allow_empty_phrase=False,
                                                        polygon_sep_token='<sep>',
                                                        polygon_start_token='<poly>',
                                                        polygon_end_token='</poly>',
                                                        with_box_at_start=False,
                                                        ):
    """Parse generated text into phrase/polygon instances.

    Expected ref_seg layout: '<expression><x1><y1><x2><y2><><><sep><><><><>'.

    Args:
        text: Generated text to parse.
        pattern: Accepted for interface compatibility only; it is replaced
            by a regular expression built inside this method.
        image_size: Forwarded unchanged as ``size`` to the quantizers'
            ``dequantize`` calls.
        allow_empty_phrase: When True, chunks may consist of location and
            polygon tokens only, with no leading phrase.
        polygon_sep_token: Token that separates polygons of one instance.
        polygon_start_token: Token that opens one polygon instance.
        polygon_end_token: Token that closes one polygon instance.
        with_box_at_start: When True, the first polygon of each instance is
            expected to begin with four box values.

    Returns:
        A list of dicts with keys ``'cat_name'`` and ``'polygons'`` (a list
        of flat coordinate lists), plus ``'bbox'`` when a box was read.
    """
    # Sequence markers carry no content; drop them before matching.
    for special_token in ('<s>', '</s>', '<pad>'):
        text = text.replace(special_token, '')

    sep_re = re.escape(polygon_sep_token)
    start_re = re.escape(polygon_start_token)
    end_re = re.escape(polygon_end_token)

    if allow_empty_phrase:
        pattern = rf"(?:(?:<loc_\d+>|{sep_re}|{start_re}|{end_re}){{4,}})"
    else:
        # [^<]+ requires at least one non-'<' character (the phrase) ahead
        # of the run of location / polygon tokens.
        pattern = rf"([^<]+(?:<loc_\d+>|{sep_re}|{start_re}|{end_re}){{4,}})"
    chunks = re.findall(pattern, text)

    phrase_string_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_|<poly>)'
    # One polygon: a run of <loc_N> tokens ended by the separator or end of text.
    box_pattern = rf'((?:<loc_\d+>)+)(?:{sep_re}|$)'
    # One polygons instance is delimited by the start and end tokens.
    polygons_instance_pattern = rf'{start_re}(.*?){end_re}'

    instances = []
    for chunk in chunks:
        # Remove a leading 'loc_N>' remnant, if any, before reading the phrase.
        # (A span would be needed here to also recover a category score.)
        stripped_chunk = re.sub(r'^loc_\d+>', '', chunk, count=1)
        if stripped_chunk == '' and not allow_empty_phrase:
            continue

        phrase_match = re.search(phrase_string_pattern, stripped_chunk)
        if phrase_match is None:
            continue
        phrase = phrase_match.group().strip()

        # Split into explicit <poly>...</poly> instances when both
        # delimiters are present; otherwise treat the chunk as one instance.
        if polygon_start_token in chunk and polygon_end_token in chunk:
            raw_instances = list(re.finditer(polygons_instance_pattern, chunk))
        else:
            raw_instances = [chunk]

        for raw_instance in raw_instances:
            segment = raw_instance if isinstance(raw_instance, str) else raw_instance.group(1)
            polygon_matches = list(re.finditer(box_pattern, segment))
            if not polygon_matches:
                continue

            bbox = []
            polygons = []
            for polygon_match in polygon_matches:
                # Group 1 is the whole run of <loc_N> tokens; read it as ints.
                coords = [
                    int(loc_match.group(1))
                    for loc_match in re.finditer(r'<loc_(\d+)>', polygon_match.group(1))
                ]
                if with_box_at_start and not bbox:
                    if len(coords) > 4:
                        bbox, coords = coords[:4], coords[4:]
                    else:
                        # No valid bbox prediction: fall back to a zero box.
                        bbox = [0, 0, 0, 0]
                # Drop a trailing value that has no partner.
                if len(coords) % 2 == 1:
                    coords = coords[:-1]

                # Dequantize as (n, 2) pairs, then flatten back to a list.
                pairs = torch.tensor(np.array(coords).reshape(-1, 2))
                dequantized = self.coordinates_quantizer.dequantize(pairs, size=image_size)
                polygons.append(dequantized.reshape(-1).tolist())

            instance = {'cat_name': phrase, 'polygons': polygons}
            if bbox:
                instance['bbox'] = self.box_quantizer.dequantize(
                    boxes=torch.tensor([bbox]),
                    size=image_size
                ).tolist()[0]
            instances.append(instance)

    return instances
990
+
991
def __call__(
    self,
    text=None,
    image_size=None,
    parse_tasks=None,
):
    """Run the configured parsers over model output text.

    Args:
        text: Model output to parse; must not be None.
        image_size: (width, height), forwarded to the individual parsers.
        parse_tasks: A task name or a list of task names to run. When None,
            every task in ``self.parse_tasks`` is run.

    Returns:
        A dict holding the input under ``'text'`` plus one entry per
        executed task, keyed by the task name.

    Raises:
        AssertionError: If a requested task is not in ``self.parse_tasks``
            or ``text`` is None.
        ValueError: If a configured task has no parser in this method.
    """
    if parse_tasks is not None:
        if isinstance(parse_tasks, str):
            parse_tasks = [parse_tasks]
        for _parse_task in parse_tasks:
            assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'

    # sequence or text should be provided
    assert text is not None, 'text should be provided'

    parsed_dict = {'text': text}

    for task in self.parse_tasks:
        if parse_tasks is not None and task not in parse_tasks:
            continue

        pattern = self.parse_tasks_configs[task].get('PATTERN', None)
        # Keyword arguments shared by every parser call below.
        common = {'pattern': pattern, 'image_size': image_size}

        if task == 'pure_text':
            # Nothing to parse: the raw text is the result.
            result = text
        elif task == 'ocr':
            result = self.parse_ocr_from_text_and_spans(
                text,
                area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.0),
                **common,
            )
        elif task == 'phrase_grounding':
            result = self.parse_phrase_grounding_from_text_and_spans(text, **common)
        elif task == 'description_with_bboxes':
            result = self.parse_description_with_bboxes_from_text_and_spans(text, **common)
        elif task == 'description_with_polygons':
            result = self.parse_description_with_polygons_from_text_and_spans(text, **common)
        elif task == 'polygons':
            result = self.parse_description_with_polygons_from_text_and_spans(
                text, allow_empty_phrase=True, **common
            )
        elif task == 'bboxes':
            result = self.parse_description_with_bboxes_from_text_and_spans(
                text, allow_empty_phrase=True, **common
            )
        elif task == 'description_with_bboxes_or_polygons':
            # Only one of polygons or bboxes is supported per text, not both.
            if '<poly>' in text:
                result = self.parse_description_with_polygons_from_text_and_spans(text, **common)
            else:
                result = self.parse_description_with_bboxes_from_text_and_spans(text, **common)
        else:
            raise ValueError("task {} is not supported".format(task))

        # Every branch stores its result under the task's own name.
        parsed_dict[task] = result

    return parsed_dict
LLM/Florence-2-large-PromptGen-v2.0/special_tokens_map.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Florence-2-large-PromptGen-v2.0/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Florence-2-large-PromptGen-v2.0/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Florence-2-large-PromptGen-v2.0/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Llama-3.1-8B-Lexi-Uncensored-V2/README.md ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: llama3.1
3
+ model-index:
4
+ - name: Llama-3.1-8B-Lexi-Uncensored-V2
5
+ results:
6
+ - task:
7
+ type: text-generation
8
+ name: Text Generation
9
+ dataset:
10
+ name: IFEval (0-Shot)
11
+ type: HuggingFaceH4/ifeval
12
+ args:
13
+ num_few_shot: 0
14
+ metrics:
15
+ - type: inst_level_strict_acc and prompt_level_strict_acc
16
+ value: 77.92
17
+ name: strict accuracy
18
+ source:
19
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2
20
+ name: Open LLM Leaderboard
21
+ - task:
22
+ type: text-generation
23
+ name: Text Generation
24
+ dataset:
25
+ name: BBH (3-Shot)
26
+ type: BBH
27
+ args:
28
+ num_few_shot: 3
29
+ metrics:
30
+ - type: acc_norm
31
+ value: 29.69
32
+ name: normalized accuracy
33
+ source:
34
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2
35
+ name: Open LLM Leaderboard
36
+ - task:
37
+ type: text-generation
38
+ name: Text Generation
39
+ dataset:
40
+ name: MATH Lvl 5 (4-Shot)
41
+ type: hendrycks/competition_math
42
+ args:
43
+ num_few_shot: 4
44
+ metrics:
45
+ - type: exact_match
46
+ value: 16.92
47
+ name: exact match
48
+ source:
49
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2
50
+ name: Open LLM Leaderboard
51
+ - task:
52
+ type: text-generation
53
+ name: Text Generation
54
+ dataset:
55
+ name: GPQA (0-shot)
56
+ type: Idavidrein/gpqa
57
+ args:
58
+ num_few_shot: 0
59
+ metrics:
60
+ - type: acc_norm
61
+ value: 4.36
62
+ name: acc_norm
63
+ source:
64
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2
65
+ name: Open LLM Leaderboard
66
+ - task:
67
+ type: text-generation
68
+ name: Text Generation
69
+ dataset:
70
+ name: MuSR (0-shot)
71
+ type: TAUR-Lab/MuSR
72
+ args:
73
+ num_few_shot: 0
74
+ metrics:
75
+ - type: acc_norm
76
+ value: 7.77
77
+ name: acc_norm
78
+ source:
79
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2
80
+ name: Open LLM Leaderboard
81
+ - task:
82
+ type: text-generation
83
+ name: Text Generation
84
+ dataset:
85
+ name: MMLU-PRO (5-shot)
86
+ type: TIGER-Lab/MMLU-Pro
87
+ config: main
88
+ split: test
89
+ args:
90
+ num_few_shot: 5
91
+ metrics:
92
+ - type: acc
93
+ value: 30.9
94
+ name: accuracy
95
+ source:
96
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2
97
+ name: Open LLM Leaderboard
98
+ library_name: transformers
99
+ ---
100
+
101
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/644ad182f434a6a63b18eee6/7mnEJyioRzQaWz8xLM4KI.png)
102
+
103
+ VERSION 2 Update Notes:
104
+ ---
105
+ - More compliant
106
+ - Smarter
107
+ - For best response, use this system prompt (feel free to expand upon it as you wish):
108
+
109
+ Think step by step with a logical reasoning and intellectual sense before you provide any response.
110
+
111
+ - For a more uncensored and compliant response, you can expand the system message differently, or simply enter a dot "." as the system message.
112
+
113
+ - IMPORTANT: Upon further investigation, the Q4 seems to have refusal issues sometimes.
114
+ Some of the fine-tuning appears to be lost due to the quantization. I will look into it for V3.
115
+ Until then, I suggest you run F16 or Q8 if possible.
116
+
117
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/644ad182f434a6a63b18eee6/zaHhRjsk3rvo_YewgXV2Z.png)
118
+
119
+ GENERAL INFO:
120
+ ---
121
+
122
+ This model is based on Llama-3.1-8b-Instruct, and is governed by [META LLAMA 3.1 COMMUNITY LICENSE AGREEMENT](https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/LICENSE)
123
+
124
+ Lexi is uncensored, which makes the model compliant. You are advised to implement your own alignment layer before exposing the model as a service. It will be highly compliant with any requests, even unethical ones.
125
+
126
+ You are responsible for any content you create using this model. Please use it responsibly.
127
+
128
+ Lexi is licensed according to Meta's Llama license. I grant permission for any use, including commercial use, that is in accordance with Meta's Llama-3.1 license.
129
+
130
+ IMPORTANT:
131
+ ---
132
+ Use the same template as the official Llama 3.1 8B instruct.
133
+ System tokens must be present during inference, even if you set an empty system message. If you are unsure, just add a short system message as you wish.
134
+
135
+ FEEDBACK:
136
+ ---
137
+ If you find any issues or have suggestions for improvements, feel free to leave a review and I will look into it for upcoming improvements and next version.
138
+
139
+
140
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/644ad182f434a6a63b18eee6/uqJv-R1LeJEfMxi1nmTH5.png)
141
+
142
+
143
+ # [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
144
+ Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_Orenguteng__Llama-3.1-8B-Lexi-Uncensored-V2)
145
+
146
+ | Metric |Value|
147
+ |-------------------|----:|
148
+ |Avg. |27.93|
149
+ |IFEval (0-Shot) |77.92|
150
+ |BBH (3-Shot) |29.69|
151
+ |MATH Lvl 5 (4-Shot)|16.92|
152
+ |GPQA (0-shot) | 4.36|
153
+ |MuSR (0-shot) | 7.77|
154
+ |MMLU-PRO (5-shot) |30.90|
155
+
LLM/Llama-3.1-8B-Lexi-Uncensored-V2/config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "unsloth/meta-llama-3.1-8b-instruct",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 128000,
9
+ "eos_token_id": [
10
+ 128001,
11
+ 128008,
12
+ 128009
13
+ ],
14
+ "hidden_act": "silu",
15
+ "hidden_size": 4096,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 14336,
18
+ "max_position_embeddings": 131072,
19
+ "mlp_bias": false,
20
+ "model_type": "llama",
21
+ "num_attention_heads": 32,
22
+ "num_hidden_layers": 32,
23
+ "num_key_value_heads": 8,
24
+ "pad_token_id": 128004,
25
+ "pretraining_tp": 1,
26
+ "rms_norm_eps": 1e-05,
27
+ "rope_scaling": {
28
+ "factor": 8.0,
29
+ "high_freq_factor": 4.0,
30
+ "low_freq_factor": 1.0,
31
+ "original_max_position_embeddings": 8192,
32
+ "rope_type": "llama3"
33
+ },
34
+ "rope_theta": 500000.0,
35
+ "tie_word_embeddings": false,
36
+ "torch_dtype": "bfloat16",
37
+ "transformers_version": "4.44.0.dev0",
38
+ "unsloth_version": "2024.8",
39
+ "use_cache": true,
40
+ "vocab_size": 128256
41
+ }
LLM/Llama-3.1-8B-Lexi-Uncensored-V2/generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 128000,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 128001,
6
+ 128008,
7
+ 128009
8
+ ],
9
+ "max_length": 131072,
10
+ "pad_token_id": 128004,
11
+ "temperature": 0.6,
12
+ "top_p": 0.9,
13
+ "transformers_version": "4.44.0.dev0"
14
+ }
LLM/Llama-3.1-8B-Lexi-Uncensored-V2/model.safetensors.index.json ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 16060522496
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
243
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
244
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
245
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
246
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
247
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
248
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
249
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
250
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
251
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
252
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
253
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
254
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
255
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
256
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
257
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
258
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
259
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
260
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
278
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
279
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
281
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
282
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
283
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
284
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
288
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
289
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
290
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
291
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
292
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
293
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
294
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
295
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
296
+ "model.norm.weight": "model-00004-of-00004.safetensors"
297
+ }
298
+ }
LLM/Llama-3.1-8B-Lexi-Uncensored-V2/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin_of_text|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|eot_id|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|finetune_right_pad_id|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
LLM/Llama-3.1-8B-Lexi-Uncensored-V2/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Llama-3.1-8B-Lexi-Uncensored-V2/tokenizer_config.json ADDED
@@ -0,0 +1,2064 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "128000": {
4
+ "content": "<|begin_of_text|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "128001": {
12
+ "content": "<|end_of_text|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "128002": {
20
+ "content": "<|reserved_special_token_0|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "128003": {
28
+ "content": "<|reserved_special_token_1|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128004": {
36
+ "content": "<|finetune_right_pad_id|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "128005": {
44
+ "content": "<|reserved_special_token_2|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "128006": {
52
+ "content": "<|start_header_id|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "128007": {
60
+ "content": "<|end_header_id|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "128008": {
68
+ "content": "<|eom_id|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "128009": {
76
+ "content": "<|eot_id|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "128010": {
84
+ "content": "<|python_tag|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "128011": {
92
+ "content": "<|reserved_special_token_3|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "128012": {
100
+ "content": "<|reserved_special_token_4|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "128013": {
108
+ "content": "<|reserved_special_token_5|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "128014": {
116
+ "content": "<|reserved_special_token_6|>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "128015": {
124
+ "content": "<|reserved_special_token_7|>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "128016": {
132
+ "content": "<|reserved_special_token_8|>",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "128017": {
140
+ "content": "<|reserved_special_token_9|>",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "128018": {
148
+ "content": "<|reserved_special_token_10|>",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "128019": {
156
+ "content": "<|reserved_special_token_11|>",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "128020": {
164
+ "content": "<|reserved_special_token_12|>",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "128021": {
172
+ "content": "<|reserved_special_token_13|>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "128022": {
180
+ "content": "<|reserved_special_token_14|>",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "128023": {
188
+ "content": "<|reserved_special_token_15|>",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "128024": {
196
+ "content": "<|reserved_special_token_16|>",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "128025": {
204
+ "content": "<|reserved_special_token_17|>",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "128026": {
212
+ "content": "<|reserved_special_token_18|>",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "128027": {
220
+ "content": "<|reserved_special_token_19|>",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "128028": {
228
+ "content": "<|reserved_special_token_20|>",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ },
235
+ "128029": {
236
+ "content": "<|reserved_special_token_21|>",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": true
242
+ },
243
+ "128030": {
244
+ "content": "<|reserved_special_token_22|>",
245
+ "lstrip": false,
246
+ "normalized": false,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": true
250
+ },
251
+ "128031": {
252
+ "content": "<|reserved_special_token_23|>",
253
+ "lstrip": false,
254
+ "normalized": false,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "128032": {
260
+ "content": "<|reserved_special_token_24|>",
261
+ "lstrip": false,
262
+ "normalized": false,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": true
266
+ },
267
+ "128033": {
268
+ "content": "<|reserved_special_token_25|>",
269
+ "lstrip": false,
270
+ "normalized": false,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": true
274
+ },
275
+ "128034": {
276
+ "content": "<|reserved_special_token_26|>",
277
+ "lstrip": false,
278
+ "normalized": false,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "128035": {
284
+ "content": "<|reserved_special_token_27|>",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "128036": {
292
+ "content": "<|reserved_special_token_28|>",
293
+ "lstrip": false,
294
+ "normalized": false,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "128037": {
300
+ "content": "<|reserved_special_token_29|>",
301
+ "lstrip": false,
302
+ "normalized": false,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "128038": {
308
+ "content": "<|reserved_special_token_30|>",
309
+ "lstrip": false,
310
+ "normalized": false,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "128039": {
316
+ "content": "<|reserved_special_token_31|>",
317
+ "lstrip": false,
318
+ "normalized": false,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "128040": {
324
+ "content": "<|reserved_special_token_32|>",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "128041": {
332
+ "content": "<|reserved_special_token_33|>",
333
+ "lstrip": false,
334
+ "normalized": false,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "128042": {
340
+ "content": "<|reserved_special_token_34|>",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "128043": {
348
+ "content": "<|reserved_special_token_35|>",
349
+ "lstrip": false,
350
+ "normalized": false,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "128044": {
356
+ "content": "<|reserved_special_token_36|>",
357
+ "lstrip": false,
358
+ "normalized": false,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "128045": {
364
+ "content": "<|reserved_special_token_37|>",
365
+ "lstrip": false,
366
+ "normalized": false,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "128046": {
372
+ "content": "<|reserved_special_token_38|>",
373
+ "lstrip": false,
374
+ "normalized": false,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "128047": {
380
+ "content": "<|reserved_special_token_39|>",
381
+ "lstrip": false,
382
+ "normalized": false,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "128048": {
388
+ "content": "<|reserved_special_token_40|>",
389
+ "lstrip": false,
390
+ "normalized": false,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "128049": {
396
+ "content": "<|reserved_special_token_41|>",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "128050": {
404
+ "content": "<|reserved_special_token_42|>",
405
+ "lstrip": false,
406
+ "normalized": false,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "128051": {
412
+ "content": "<|reserved_special_token_43|>",
413
+ "lstrip": false,
414
+ "normalized": false,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "128052": {
420
+ "content": "<|reserved_special_token_44|>",
421
+ "lstrip": false,
422
+ "normalized": false,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "128053": {
428
+ "content": "<|reserved_special_token_45|>",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "128054": {
436
+ "content": "<|reserved_special_token_46|>",
437
+ "lstrip": false,
438
+ "normalized": false,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "128055": {
444
+ "content": "<|reserved_special_token_47|>",
445
+ "lstrip": false,
446
+ "normalized": false,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "128056": {
452
+ "content": "<|reserved_special_token_48|>",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": true
458
+ },
459
+ "128057": {
460
+ "content": "<|reserved_special_token_49|>",
461
+ "lstrip": false,
462
+ "normalized": false,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": true
466
+ },
467
+ "128058": {
468
+ "content": "<|reserved_special_token_50|>",
469
+ "lstrip": false,
470
+ "normalized": false,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": true
474
+ },
475
+ "128059": {
476
+ "content": "<|reserved_special_token_51|>",
477
+ "lstrip": false,
478
+ "normalized": false,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": true
482
+ },
483
+ "128060": {
484
+ "content": "<|reserved_special_token_52|>",
485
+ "lstrip": false,
486
+ "normalized": false,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": true
490
+ },
491
+ "128061": {
492
+ "content": "<|reserved_special_token_53|>",
493
+ "lstrip": false,
494
+ "normalized": false,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": true
498
+ },
499
+ "128062": {
500
+ "content": "<|reserved_special_token_54|>",
501
+ "lstrip": false,
502
+ "normalized": false,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": true
506
+ },
507
+ "128063": {
508
+ "content": "<|reserved_special_token_55|>",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": true
514
+ },
515
+ "128064": {
516
+ "content": "<|reserved_special_token_56|>",
517
+ "lstrip": false,
518
+ "normalized": false,
519
+ "rstrip": false,
520
+ "single_word": false,
521
+ "special": true
522
+ },
523
+ "128065": {
524
+ "content": "<|reserved_special_token_57|>",
525
+ "lstrip": false,
526
+ "normalized": false,
527
+ "rstrip": false,
528
+ "single_word": false,
529
+ "special": true
530
+ },
531
+ "128066": {
532
+ "content": "<|reserved_special_token_58|>",
533
+ "lstrip": false,
534
+ "normalized": false,
535
+ "rstrip": false,
536
+ "single_word": false,
537
+ "special": true
538
+ },
539
+ "128067": {
540
+ "content": "<|reserved_special_token_59|>",
541
+ "lstrip": false,
542
+ "normalized": false,
543
+ "rstrip": false,
544
+ "single_word": false,
545
+ "special": true
546
+ },
547
+ "128068": {
548
+ "content": "<|reserved_special_token_60|>",
549
+ "lstrip": false,
550
+ "normalized": false,
551
+ "rstrip": false,
552
+ "single_word": false,
553
+ "special": true
554
+ },
555
+ "128069": {
556
+ "content": "<|reserved_special_token_61|>",
557
+ "lstrip": false,
558
+ "normalized": false,
559
+ "rstrip": false,
560
+ "single_word": false,
561
+ "special": true
562
+ },
563
+ "128070": {
564
+ "content": "<|reserved_special_token_62|>",
565
+ "lstrip": false,
566
+ "normalized": false,
567
+ "rstrip": false,
568
+ "single_word": false,
569
+ "special": true
570
+ },
571
+ "128071": {
572
+ "content": "<|reserved_special_token_63|>",
573
+ "lstrip": false,
574
+ "normalized": false,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "128072": {
580
+ "content": "<|reserved_special_token_64|>",
581
+ "lstrip": false,
582
+ "normalized": false,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "128073": {
588
+ "content": "<|reserved_special_token_65|>",
589
+ "lstrip": false,
590
+ "normalized": false,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": true
594
+ },
595
+ "128074": {
596
+ "content": "<|reserved_special_token_66|>",
597
+ "lstrip": false,
598
+ "normalized": false,
599
+ "rstrip": false,
600
+ "single_word": false,
601
+ "special": true
602
+ },
603
+ "128075": {
604
+ "content": "<|reserved_special_token_67|>",
605
+ "lstrip": false,
606
+ "normalized": false,
607
+ "rstrip": false,
608
+ "single_word": false,
609
+ "special": true
610
+ },
611
+ "128076": {
612
+ "content": "<|reserved_special_token_68|>",
613
+ "lstrip": false,
614
+ "normalized": false,
615
+ "rstrip": false,
616
+ "single_word": false,
617
+ "special": true
618
+ },
619
+ "128077": {
620
+ "content": "<|reserved_special_token_69|>",
621
+ "lstrip": false,
622
+ "normalized": false,
623
+ "rstrip": false,
624
+ "single_word": false,
625
+ "special": true
626
+ },
627
+ "128078": {
628
+ "content": "<|reserved_special_token_70|>",
629
+ "lstrip": false,
630
+ "normalized": false,
631
+ "rstrip": false,
632
+ "single_word": false,
633
+ "special": true
634
+ },
635
+ "128079": {
636
+ "content": "<|reserved_special_token_71|>",
637
+ "lstrip": false,
638
+ "normalized": false,
639
+ "rstrip": false,
640
+ "single_word": false,
641
+ "special": true
642
+ },
643
+ "128080": {
644
+ "content": "<|reserved_special_token_72|>",
645
+ "lstrip": false,
646
+ "normalized": false,
647
+ "rstrip": false,
648
+ "single_word": false,
649
+ "special": true
650
+ },
651
+ "128081": {
652
+ "content": "<|reserved_special_token_73|>",
653
+ "lstrip": false,
654
+ "normalized": false,
655
+ "rstrip": false,
656
+ "single_word": false,
657
+ "special": true
658
+ },
659
+ "128082": {
660
+ "content": "<|reserved_special_token_74|>",
661
+ "lstrip": false,
662
+ "normalized": false,
663
+ "rstrip": false,
664
+ "single_word": false,
665
+ "special": true
666
+ },
667
+ "128083": {
668
+ "content": "<|reserved_special_token_75|>",
669
+ "lstrip": false,
670
+ "normalized": false,
671
+ "rstrip": false,
672
+ "single_word": false,
673
+ "special": true
674
+ },
675
+ "128084": {
676
+ "content": "<|reserved_special_token_76|>",
677
+ "lstrip": false,
678
+ "normalized": false,
679
+ "rstrip": false,
680
+ "single_word": false,
681
+ "special": true
682
+ },
683
+ "128085": {
684
+ "content": "<|reserved_special_token_77|>",
685
+ "lstrip": false,
686
+ "normalized": false,
687
+ "rstrip": false,
688
+ "single_word": false,
689
+ "special": true
690
+ },
691
+ "128086": {
692
+ "content": "<|reserved_special_token_78|>",
693
+ "lstrip": false,
694
+ "normalized": false,
695
+ "rstrip": false,
696
+ "single_word": false,
697
+ "special": true
698
+ },
699
+ "128087": {
700
+ "content": "<|reserved_special_token_79|>",
701
+ "lstrip": false,
702
+ "normalized": false,
703
+ "rstrip": false,
704
+ "single_word": false,
705
+ "special": true
706
+ },
707
+ "128088": {
708
+ "content": "<|reserved_special_token_80|>",
709
+ "lstrip": false,
710
+ "normalized": false,
711
+ "rstrip": false,
712
+ "single_word": false,
713
+ "special": true
714
+ },
715
+ "128089": {
716
+ "content": "<|reserved_special_token_81|>",
717
+ "lstrip": false,
718
+ "normalized": false,
719
+ "rstrip": false,
720
+ "single_word": false,
721
+ "special": true
722
+ },
723
+ "128090": {
724
+ "content": "<|reserved_special_token_82|>",
725
+ "lstrip": false,
726
+ "normalized": false,
727
+ "rstrip": false,
728
+ "single_word": false,
729
+ "special": true
730
+ },
731
+ "128091": {
732
+ "content": "<|reserved_special_token_83|>",
733
+ "lstrip": false,
734
+ "normalized": false,
735
+ "rstrip": false,
736
+ "single_word": false,
737
+ "special": true
738
+ },
739
+ "128092": {
740
+ "content": "<|reserved_special_token_84|>",
741
+ "lstrip": false,
742
+ "normalized": false,
743
+ "rstrip": false,
744
+ "single_word": false,
745
+ "special": true
746
+ },
747
+ "128093": {
748
+ "content": "<|reserved_special_token_85|>",
749
+ "lstrip": false,
750
+ "normalized": false,
751
+ "rstrip": false,
752
+ "single_word": false,
753
+ "special": true
754
+ },
755
+ "128094": {
756
+ "content": "<|reserved_special_token_86|>",
757
+ "lstrip": false,
758
+ "normalized": false,
759
+ "rstrip": false,
760
+ "single_word": false,
761
+ "special": true
762
+ },
763
+ "128095": {
764
+ "content": "<|reserved_special_token_87|>",
765
+ "lstrip": false,
766
+ "normalized": false,
767
+ "rstrip": false,
768
+ "single_word": false,
769
+ "special": true
770
+ },
771
+ "128096": {
772
+ "content": "<|reserved_special_token_88|>",
773
+ "lstrip": false,
774
+ "normalized": false,
775
+ "rstrip": false,
776
+ "single_word": false,
777
+ "special": true
778
+ },
779
+ "128097": {
780
+ "content": "<|reserved_special_token_89|>",
781
+ "lstrip": false,
782
+ "normalized": false,
783
+ "rstrip": false,
784
+ "single_word": false,
785
+ "special": true
786
+ },
787
+ "128098": {
788
+ "content": "<|reserved_special_token_90|>",
789
+ "lstrip": false,
790
+ "normalized": false,
791
+ "rstrip": false,
792
+ "single_word": false,
793
+ "special": true
794
+ },
795
+ "128099": {
796
+ "content": "<|reserved_special_token_91|>",
797
+ "lstrip": false,
798
+ "normalized": false,
799
+ "rstrip": false,
800
+ "single_word": false,
801
+ "special": true
802
+ },
803
+ "128100": {
804
+ "content": "<|reserved_special_token_92|>",
805
+ "lstrip": false,
806
+ "normalized": false,
807
+ "rstrip": false,
808
+ "single_word": false,
809
+ "special": true
810
+ },
811
+ "128101": {
812
+ "content": "<|reserved_special_token_93|>",
813
+ "lstrip": false,
814
+ "normalized": false,
815
+ "rstrip": false,
816
+ "single_word": false,
817
+ "special": true
818
+ },
819
+ "128102": {
820
+ "content": "<|reserved_special_token_94|>",
821
+ "lstrip": false,
822
+ "normalized": false,
823
+ "rstrip": false,
824
+ "single_word": false,
825
+ "special": true
826
+ },
827
+ "128103": {
828
+ "content": "<|reserved_special_token_95|>",
829
+ "lstrip": false,
830
+ "normalized": false,
831
+ "rstrip": false,
832
+ "single_word": false,
833
+ "special": true
834
+ },
835
+ "128104": {
836
+ "content": "<|reserved_special_token_96|>",
837
+ "lstrip": false,
838
+ "normalized": false,
839
+ "rstrip": false,
840
+ "single_word": false,
841
+ "special": true
842
+ },
843
+ "128105": {
844
+ "content": "<|reserved_special_token_97|>",
845
+ "lstrip": false,
846
+ "normalized": false,
847
+ "rstrip": false,
848
+ "single_word": false,
849
+ "special": true
850
+ },
851
+ "128106": {
852
+ "content": "<|reserved_special_token_98|>",
853
+ "lstrip": false,
854
+ "normalized": false,
855
+ "rstrip": false,
856
+ "single_word": false,
857
+ "special": true
858
+ },
859
+ "128107": {
860
+ "content": "<|reserved_special_token_99|>",
861
+ "lstrip": false,
862
+ "normalized": false,
863
+ "rstrip": false,
864
+ "single_word": false,
865
+ "special": true
866
+ },
867
+ "128108": {
868
+ "content": "<|reserved_special_token_100|>",
869
+ "lstrip": false,
870
+ "normalized": false,
871
+ "rstrip": false,
872
+ "single_word": false,
873
+ "special": true
874
+ },
875
+ "128109": {
876
+ "content": "<|reserved_special_token_101|>",
877
+ "lstrip": false,
878
+ "normalized": false,
879
+ "rstrip": false,
880
+ "single_word": false,
881
+ "special": true
882
+ },
883
+ "128110": {
884
+ "content": "<|reserved_special_token_102|>",
885
+ "lstrip": false,
886
+ "normalized": false,
887
+ "rstrip": false,
888
+ "single_word": false,
889
+ "special": true
890
+ },
891
+ "128111": {
892
+ "content": "<|reserved_special_token_103|>",
893
+ "lstrip": false,
894
+ "normalized": false,
895
+ "rstrip": false,
896
+ "single_word": false,
897
+ "special": true
898
+ },
899
+ "128112": {
900
+ "content": "<|reserved_special_token_104|>",
901
+ "lstrip": false,
902
+ "normalized": false,
903
+ "rstrip": false,
904
+ "single_word": false,
905
+ "special": true
906
+ },
907
+ "128113": {
908
+ "content": "<|reserved_special_token_105|>",
909
+ "lstrip": false,
910
+ "normalized": false,
911
+ "rstrip": false,
912
+ "single_word": false,
913
+ "special": true
914
+ },
915
+ "128114": {
916
+ "content": "<|reserved_special_token_106|>",
917
+ "lstrip": false,
918
+ "normalized": false,
919
+ "rstrip": false,
920
+ "single_word": false,
921
+ "special": true
922
+ },
923
+ "128115": {
924
+ "content": "<|reserved_special_token_107|>",
925
+ "lstrip": false,
926
+ "normalized": false,
927
+ "rstrip": false,
928
+ "single_word": false,
929
+ "special": true
930
+ },
931
+ "128116": {
932
+ "content": "<|reserved_special_token_108|>",
933
+ "lstrip": false,
934
+ "normalized": false,
935
+ "rstrip": false,
936
+ "single_word": false,
937
+ "special": true
938
+ },
939
+ "128117": {
940
+ "content": "<|reserved_special_token_109|>",
941
+ "lstrip": false,
942
+ "normalized": false,
943
+ "rstrip": false,
944
+ "single_word": false,
945
+ "special": true
946
+ },
947
+ "128118": {
948
+ "content": "<|reserved_special_token_110|>",
949
+ "lstrip": false,
950
+ "normalized": false,
951
+ "rstrip": false,
952
+ "single_word": false,
953
+ "special": true
954
+ },
955
+ "128119": {
956
+ "content": "<|reserved_special_token_111|>",
957
+ "lstrip": false,
958
+ "normalized": false,
959
+ "rstrip": false,
960
+ "single_word": false,
961
+ "special": true
962
+ },
963
+ "128120": {
964
+ "content": "<|reserved_special_token_112|>",
965
+ "lstrip": false,
966
+ "normalized": false,
967
+ "rstrip": false,
968
+ "single_word": false,
969
+ "special": true
970
+ },
971
+ "128121": {
972
+ "content": "<|reserved_special_token_113|>",
973
+ "lstrip": false,
974
+ "normalized": false,
975
+ "rstrip": false,
976
+ "single_word": false,
977
+ "special": true
978
+ },
979
+ "128122": {
980
+ "content": "<|reserved_special_token_114|>",
981
+ "lstrip": false,
982
+ "normalized": false,
983
+ "rstrip": false,
984
+ "single_word": false,
985
+ "special": true
986
+ },
987
+ "128123": {
988
+ "content": "<|reserved_special_token_115|>",
989
+ "lstrip": false,
990
+ "normalized": false,
991
+ "rstrip": false,
992
+ "single_word": false,
993
+ "special": true
994
+ },
995
+ "128124": {
996
+ "content": "<|reserved_special_token_116|>",
997
+ "lstrip": false,
998
+ "normalized": false,
999
+ "rstrip": false,
1000
+ "single_word": false,
1001
+ "special": true
1002
+ },
1003
+ "128125": {
1004
+ "content": "<|reserved_special_token_117|>",
1005
+ "lstrip": false,
1006
+ "normalized": false,
1007
+ "rstrip": false,
1008
+ "single_word": false,
1009
+ "special": true
1010
+ },
1011
+ "128126": {
1012
+ "content": "<|reserved_special_token_118|>",
1013
+ "lstrip": false,
1014
+ "normalized": false,
1015
+ "rstrip": false,
1016
+ "single_word": false,
1017
+ "special": true
1018
+ },
1019
+ "128127": {
1020
+ "content": "<|reserved_special_token_119|>",
1021
+ "lstrip": false,
1022
+ "normalized": false,
1023
+ "rstrip": false,
1024
+ "single_word": false,
1025
+ "special": true
1026
+ },
1027
+ "128128": {
1028
+ "content": "<|reserved_special_token_120|>",
1029
+ "lstrip": false,
1030
+ "normalized": false,
1031
+ "rstrip": false,
1032
+ "single_word": false,
1033
+ "special": true
1034
+ },
1035
+ "128129": {
1036
+ "content": "<|reserved_special_token_121|>",
1037
+ "lstrip": false,
1038
+ "normalized": false,
1039
+ "rstrip": false,
1040
+ "single_word": false,
1041
+ "special": true
1042
+ },
1043
+ "128130": {
1044
+ "content": "<|reserved_special_token_122|>",
1045
+ "lstrip": false,
1046
+ "normalized": false,
1047
+ "rstrip": false,
1048
+ "single_word": false,
1049
+ "special": true
1050
+ },
1051
+ "128131": {
1052
+ "content": "<|reserved_special_token_123|>",
1053
+ "lstrip": false,
1054
+ "normalized": false,
1055
+ "rstrip": false,
1056
+ "single_word": false,
1057
+ "special": true
1058
+ },
1059
+ "128132": {
1060
+ "content": "<|reserved_special_token_124|>",
1061
+ "lstrip": false,
1062
+ "normalized": false,
1063
+ "rstrip": false,
1064
+ "single_word": false,
1065
+ "special": true
1066
+ },
1067
+ "128133": {
1068
+ "content": "<|reserved_special_token_125|>",
1069
+ "lstrip": false,
1070
+ "normalized": false,
1071
+ "rstrip": false,
1072
+ "single_word": false,
1073
+ "special": true
1074
+ },
1075
+ "128134": {
1076
+ "content": "<|reserved_special_token_126|>",
1077
+ "lstrip": false,
1078
+ "normalized": false,
1079
+ "rstrip": false,
1080
+ "single_word": false,
1081
+ "special": true
1082
+ },
1083
+ "128135": {
1084
+ "content": "<|reserved_special_token_127|>",
1085
+ "lstrip": false,
1086
+ "normalized": false,
1087
+ "rstrip": false,
1088
+ "single_word": false,
1089
+ "special": true
1090
+ },
1091
+ "128136": {
1092
+ "content": "<|reserved_special_token_128|>",
1093
+ "lstrip": false,
1094
+ "normalized": false,
1095
+ "rstrip": false,
1096
+ "single_word": false,
1097
+ "special": true
1098
+ },
1099
+ "128137": {
1100
+ "content": "<|reserved_special_token_129|>",
1101
+ "lstrip": false,
1102
+ "normalized": false,
1103
+ "rstrip": false,
1104
+ "single_word": false,
1105
+ "special": true
1106
+ },
1107
+ "128138": {
1108
+ "content": "<|reserved_special_token_130|>",
1109
+ "lstrip": false,
1110
+ "normalized": false,
1111
+ "rstrip": false,
1112
+ "single_word": false,
1113
+ "special": true
1114
+ },
1115
+ "128139": {
1116
+ "content": "<|reserved_special_token_131|>",
1117
+ "lstrip": false,
1118
+ "normalized": false,
1119
+ "rstrip": false,
1120
+ "single_word": false,
1121
+ "special": true
1122
+ },
1123
+ "128140": {
1124
+ "content": "<|reserved_special_token_132|>",
1125
+ "lstrip": false,
1126
+ "normalized": false,
1127
+ "rstrip": false,
1128
+ "single_word": false,
1129
+ "special": true
1130
+ },
1131
+ "128141": {
1132
+ "content": "<|reserved_special_token_133|>",
1133
+ "lstrip": false,
1134
+ "normalized": false,
1135
+ "rstrip": false,
1136
+ "single_word": false,
1137
+ "special": true
1138
+ },
1139
+ "128142": {
1140
+ "content": "<|reserved_special_token_134|>",
1141
+ "lstrip": false,
1142
+ "normalized": false,
1143
+ "rstrip": false,
1144
+ "single_word": false,
1145
+ "special": true
1146
+ },
1147
+ "128143": {
1148
+ "content": "<|reserved_special_token_135|>",
1149
+ "lstrip": false,
1150
+ "normalized": false,
1151
+ "rstrip": false,
1152
+ "single_word": false,
1153
+ "special": true
1154
+ },
1155
+ "128144": {
1156
+ "content": "<|reserved_special_token_136|>",
1157
+ "lstrip": false,
1158
+ "normalized": false,
1159
+ "rstrip": false,
1160
+ "single_word": false,
1161
+ "special": true
1162
+ },
1163
+ "128145": {
1164
+ "content": "<|reserved_special_token_137|>",
1165
+ "lstrip": false,
1166
+ "normalized": false,
1167
+ "rstrip": false,
1168
+ "single_word": false,
1169
+ "special": true
1170
+ },
1171
+ "128146": {
1172
+ "content": "<|reserved_special_token_138|>",
1173
+ "lstrip": false,
1174
+ "normalized": false,
1175
+ "rstrip": false,
1176
+ "single_word": false,
1177
+ "special": true
1178
+ },
1179
+ "128147": {
1180
+ "content": "<|reserved_special_token_139|>",
1181
+ "lstrip": false,
1182
+ "normalized": false,
1183
+ "rstrip": false,
1184
+ "single_word": false,
1185
+ "special": true
1186
+ },
1187
+ "128148": {
1188
+ "content": "<|reserved_special_token_140|>",
1189
+ "lstrip": false,
1190
+ "normalized": false,
1191
+ "rstrip": false,
1192
+ "single_word": false,
1193
+ "special": true
1194
+ },
1195
+ "128149": {
1196
+ "content": "<|reserved_special_token_141|>",
1197
+ "lstrip": false,
1198
+ "normalized": false,
1199
+ "rstrip": false,
1200
+ "single_word": false,
1201
+ "special": true
1202
+ },
1203
+ "128150": {
1204
+ "content": "<|reserved_special_token_142|>",
1205
+ "lstrip": false,
1206
+ "normalized": false,
1207
+ "rstrip": false,
1208
+ "single_word": false,
1209
+ "special": true
1210
+ },
1211
+ "128151": {
1212
+ "content": "<|reserved_special_token_143|>",
1213
+ "lstrip": false,
1214
+ "normalized": false,
1215
+ "rstrip": false,
1216
+ "single_word": false,
1217
+ "special": true
1218
+ },
1219
+ "128152": {
1220
+ "content": "<|reserved_special_token_144|>",
1221
+ "lstrip": false,
1222
+ "normalized": false,
1223
+ "rstrip": false,
1224
+ "single_word": false,
1225
+ "special": true
1226
+ },
1227
+ "128153": {
1228
+ "content": "<|reserved_special_token_145|>",
1229
+ "lstrip": false,
1230
+ "normalized": false,
1231
+ "rstrip": false,
1232
+ "single_word": false,
1233
+ "special": true
1234
+ },
1235
+ "128154": {
1236
+ "content": "<|reserved_special_token_146|>",
1237
+ "lstrip": false,
1238
+ "normalized": false,
1239
+ "rstrip": false,
1240
+ "single_word": false,
1241
+ "special": true
1242
+ },
1243
+ "128155": {
1244
+ "content": "<|reserved_special_token_147|>",
1245
+ "lstrip": false,
1246
+ "normalized": false,
1247
+ "rstrip": false,
1248
+ "single_word": false,
1249
+ "special": true
1250
+ },
1251
+ "128156": {
1252
+ "content": "<|reserved_special_token_148|>",
1253
+ "lstrip": false,
1254
+ "normalized": false,
1255
+ "rstrip": false,
1256
+ "single_word": false,
1257
+ "special": true
1258
+ },
1259
+ "128157": {
1260
+ "content": "<|reserved_special_token_149|>",
1261
+ "lstrip": false,
1262
+ "normalized": false,
1263
+ "rstrip": false,
1264
+ "single_word": false,
1265
+ "special": true
1266
+ },
1267
+ "128158": {
1268
+ "content": "<|reserved_special_token_150|>",
1269
+ "lstrip": false,
1270
+ "normalized": false,
1271
+ "rstrip": false,
1272
+ "single_word": false,
1273
+ "special": true
1274
+ },
1275
+ "128159": {
1276
+ "content": "<|reserved_special_token_151|>",
1277
+ "lstrip": false,
1278
+ "normalized": false,
1279
+ "rstrip": false,
1280
+ "single_word": false,
1281
+ "special": true
1282
+ },
1283
+ "128160": {
1284
+ "content": "<|reserved_special_token_152|>",
1285
+ "lstrip": false,
1286
+ "normalized": false,
1287
+ "rstrip": false,
1288
+ "single_word": false,
1289
+ "special": true
1290
+ },
1291
+ "128161": {
1292
+ "content": "<|reserved_special_token_153|>",
1293
+ "lstrip": false,
1294
+ "normalized": false,
1295
+ "rstrip": false,
1296
+ "single_word": false,
1297
+ "special": true
1298
+ },
1299
+ "128162": {
1300
+ "content": "<|reserved_special_token_154|>",
1301
+ "lstrip": false,
1302
+ "normalized": false,
1303
+ "rstrip": false,
1304
+ "single_word": false,
1305
+ "special": true
1306
+ },
1307
+ "128163": {
1308
+ "content": "<|reserved_special_token_155|>",
1309
+ "lstrip": false,
1310
+ "normalized": false,
1311
+ "rstrip": false,
1312
+ "single_word": false,
1313
+ "special": true
1314
+ },
1315
+ "128164": {
1316
+ "content": "<|reserved_special_token_156|>",
1317
+ "lstrip": false,
1318
+ "normalized": false,
1319
+ "rstrip": false,
1320
+ "single_word": false,
1321
+ "special": true
1322
+ },
1323
+ "128165": {
1324
+ "content": "<|reserved_special_token_157|>",
1325
+ "lstrip": false,
1326
+ "normalized": false,
1327
+ "rstrip": false,
1328
+ "single_word": false,
1329
+ "special": true
1330
+ },
1331
+ "128166": {
1332
+ "content": "<|reserved_special_token_158|>",
1333
+ "lstrip": false,
1334
+ "normalized": false,
1335
+ "rstrip": false,
1336
+ "single_word": false,
1337
+ "special": true
1338
+ },
1339
+ "128167": {
1340
+ "content": "<|reserved_special_token_159|>",
1341
+ "lstrip": false,
1342
+ "normalized": false,
1343
+ "rstrip": false,
1344
+ "single_word": false,
1345
+ "special": true
1346
+ },
1347
+ "128168": {
1348
+ "content": "<|reserved_special_token_160|>",
1349
+ "lstrip": false,
1350
+ "normalized": false,
1351
+ "rstrip": false,
1352
+ "single_word": false,
1353
+ "special": true
1354
+ },
1355
+ "128169": {
1356
+ "content": "<|reserved_special_token_161|>",
1357
+ "lstrip": false,
1358
+ "normalized": false,
1359
+ "rstrip": false,
1360
+ "single_word": false,
1361
+ "special": true
1362
+ },
1363
+ "128170": {
1364
+ "content": "<|reserved_special_token_162|>",
1365
+ "lstrip": false,
1366
+ "normalized": false,
1367
+ "rstrip": false,
1368
+ "single_word": false,
1369
+ "special": true
1370
+ },
1371
+ "128171": {
1372
+ "content": "<|reserved_special_token_163|>",
1373
+ "lstrip": false,
1374
+ "normalized": false,
1375
+ "rstrip": false,
1376
+ "single_word": false,
1377
+ "special": true
1378
+ },
1379
+ "128172": {
1380
+ "content": "<|reserved_special_token_164|>",
1381
+ "lstrip": false,
1382
+ "normalized": false,
1383
+ "rstrip": false,
1384
+ "single_word": false,
1385
+ "special": true
1386
+ },
1387
+ "128173": {
1388
+ "content": "<|reserved_special_token_165|>",
1389
+ "lstrip": false,
1390
+ "normalized": false,
1391
+ "rstrip": false,
1392
+ "single_word": false,
1393
+ "special": true
1394
+ },
1395
+ "128174": {
1396
+ "content": "<|reserved_special_token_166|>",
1397
+ "lstrip": false,
1398
+ "normalized": false,
1399
+ "rstrip": false,
1400
+ "single_word": false,
1401
+ "special": true
1402
+ },
1403
+ "128175": {
1404
+ "content": "<|reserved_special_token_167|>",
1405
+ "lstrip": false,
1406
+ "normalized": false,
1407
+ "rstrip": false,
1408
+ "single_word": false,
1409
+ "special": true
1410
+ },
1411
+ "128176": {
1412
+ "content": "<|reserved_special_token_168|>",
1413
+ "lstrip": false,
1414
+ "normalized": false,
1415
+ "rstrip": false,
1416
+ "single_word": false,
1417
+ "special": true
1418
+ },
1419
+ "128177": {
1420
+ "content": "<|reserved_special_token_169|>",
1421
+ "lstrip": false,
1422
+ "normalized": false,
1423
+ "rstrip": false,
1424
+ "single_word": false,
1425
+ "special": true
1426
+ },
1427
+ "128178": {
1428
+ "content": "<|reserved_special_token_170|>",
1429
+ "lstrip": false,
1430
+ "normalized": false,
1431
+ "rstrip": false,
1432
+ "single_word": false,
1433
+ "special": true
1434
+ },
1435
+ "128179": {
1436
+ "content": "<|reserved_special_token_171|>",
1437
+ "lstrip": false,
1438
+ "normalized": false,
1439
+ "rstrip": false,
1440
+ "single_word": false,
1441
+ "special": true
1442
+ },
1443
+ "128180": {
1444
+ "content": "<|reserved_special_token_172|>",
1445
+ "lstrip": false,
1446
+ "normalized": false,
1447
+ "rstrip": false,
1448
+ "single_word": false,
1449
+ "special": true
1450
+ },
1451
+ "128181": {
1452
+ "content": "<|reserved_special_token_173|>",
1453
+ "lstrip": false,
1454
+ "normalized": false,
1455
+ "rstrip": false,
1456
+ "single_word": false,
1457
+ "special": true
1458
+ },
1459
+ "128182": {
1460
+ "content": "<|reserved_special_token_174|>",
1461
+ "lstrip": false,
1462
+ "normalized": false,
1463
+ "rstrip": false,
1464
+ "single_word": false,
1465
+ "special": true
1466
+ },
1467
+ "128183": {
1468
+ "content": "<|reserved_special_token_175|>",
1469
+ "lstrip": false,
1470
+ "normalized": false,
1471
+ "rstrip": false,
1472
+ "single_word": false,
1473
+ "special": true
1474
+ },
1475
+ "128184": {
1476
+ "content": "<|reserved_special_token_176|>",
1477
+ "lstrip": false,
1478
+ "normalized": false,
1479
+ "rstrip": false,
1480
+ "single_word": false,
1481
+ "special": true
1482
+ },
1483
+ "128185": {
1484
+ "content": "<|reserved_special_token_177|>",
1485
+ "lstrip": false,
1486
+ "normalized": false,
1487
+ "rstrip": false,
1488
+ "single_word": false,
1489
+ "special": true
1490
+ },
1491
+ "128186": {
1492
+ "content": "<|reserved_special_token_178|>",
1493
+ "lstrip": false,
1494
+ "normalized": false,
1495
+ "rstrip": false,
1496
+ "single_word": false,
1497
+ "special": true
1498
+ },
1499
+ "128187": {
1500
+ "content": "<|reserved_special_token_179|>",
1501
+ "lstrip": false,
1502
+ "normalized": false,
1503
+ "rstrip": false,
1504
+ "single_word": false,
1505
+ "special": true
1506
+ },
1507
+ "128188": {
1508
+ "content": "<|reserved_special_token_180|>",
1509
+ "lstrip": false,
1510
+ "normalized": false,
1511
+ "rstrip": false,
1512
+ "single_word": false,
1513
+ "special": true
1514
+ },
1515
+ "128189": {
1516
+ "content": "<|reserved_special_token_181|>",
1517
+ "lstrip": false,
1518
+ "normalized": false,
1519
+ "rstrip": false,
1520
+ "single_word": false,
1521
+ "special": true
1522
+ },
1523
+ "128190": {
1524
+ "content": "<|reserved_special_token_182|>",
1525
+ "lstrip": false,
1526
+ "normalized": false,
1527
+ "rstrip": false,
1528
+ "single_word": false,
1529
+ "special": true
1530
+ },
1531
+ "128191": {
1532
+ "content": "<|reserved_special_token_183|>",
1533
+ "lstrip": false,
1534
+ "normalized": false,
1535
+ "rstrip": false,
1536
+ "single_word": false,
1537
+ "special": true
1538
+ },
1539
+ "128192": {
1540
+ "content": "<|reserved_special_token_184|>",
1541
+ "lstrip": false,
1542
+ "normalized": false,
1543
+ "rstrip": false,
1544
+ "single_word": false,
1545
+ "special": true
1546
+ },
1547
+ "128193": {
1548
+ "content": "<|reserved_special_token_185|>",
1549
+ "lstrip": false,
1550
+ "normalized": false,
1551
+ "rstrip": false,
1552
+ "single_word": false,
1553
+ "special": true
1554
+ },
1555
+ "128194": {
1556
+ "content": "<|reserved_special_token_186|>",
1557
+ "lstrip": false,
1558
+ "normalized": false,
1559
+ "rstrip": false,
1560
+ "single_word": false,
1561
+ "special": true
1562
+ },
1563
+ "128195": {
1564
+ "content": "<|reserved_special_token_187|>",
1565
+ "lstrip": false,
1566
+ "normalized": false,
1567
+ "rstrip": false,
1568
+ "single_word": false,
1569
+ "special": true
1570
+ },
1571
+ "128196": {
1572
+ "content": "<|reserved_special_token_188|>",
1573
+ "lstrip": false,
1574
+ "normalized": false,
1575
+ "rstrip": false,
1576
+ "single_word": false,
1577
+ "special": true
1578
+ },
1579
+ "128197": {
1580
+ "content": "<|reserved_special_token_189|>",
1581
+ "lstrip": false,
1582
+ "normalized": false,
1583
+ "rstrip": false,
1584
+ "single_word": false,
1585
+ "special": true
1586
+ },
1587
+ "128198": {
1588
+ "content": "<|reserved_special_token_190|>",
1589
+ "lstrip": false,
1590
+ "normalized": false,
1591
+ "rstrip": false,
1592
+ "single_word": false,
1593
+ "special": true
1594
+ },
1595
+ "128199": {
1596
+ "content": "<|reserved_special_token_191|>",
1597
+ "lstrip": false,
1598
+ "normalized": false,
1599
+ "rstrip": false,
1600
+ "single_word": false,
1601
+ "special": true
1602
+ },
1603
+ "128200": {
1604
+ "content": "<|reserved_special_token_192|>",
1605
+ "lstrip": false,
1606
+ "normalized": false,
1607
+ "rstrip": false,
1608
+ "single_word": false,
1609
+ "special": true
1610
+ },
1611
+ "128201": {
1612
+ "content": "<|reserved_special_token_193|>",
1613
+ "lstrip": false,
1614
+ "normalized": false,
1615
+ "rstrip": false,
1616
+ "single_word": false,
1617
+ "special": true
1618
+ },
1619
+ "128202": {
1620
+ "content": "<|reserved_special_token_194|>",
1621
+ "lstrip": false,
1622
+ "normalized": false,
1623
+ "rstrip": false,
1624
+ "single_word": false,
1625
+ "special": true
1626
+ },
1627
+ "128203": {
1628
+ "content": "<|reserved_special_token_195|>",
1629
+ "lstrip": false,
1630
+ "normalized": false,
1631
+ "rstrip": false,
1632
+ "single_word": false,
1633
+ "special": true
1634
+ },
1635
+ "128204": {
1636
+ "content": "<|reserved_special_token_196|>",
1637
+ "lstrip": false,
1638
+ "normalized": false,
1639
+ "rstrip": false,
1640
+ "single_word": false,
1641
+ "special": true
1642
+ },
1643
+ "128205": {
1644
+ "content": "<|reserved_special_token_197|>",
1645
+ "lstrip": false,
1646
+ "normalized": false,
1647
+ "rstrip": false,
1648
+ "single_word": false,
1649
+ "special": true
1650
+ },
1651
+ "128206": {
1652
+ "content": "<|reserved_special_token_198|>",
1653
+ "lstrip": false,
1654
+ "normalized": false,
1655
+ "rstrip": false,
1656
+ "single_word": false,
1657
+ "special": true
1658
+ },
1659
+ "128207": {
1660
+ "content": "<|reserved_special_token_199|>",
1661
+ "lstrip": false,
1662
+ "normalized": false,
1663
+ "rstrip": false,
1664
+ "single_word": false,
1665
+ "special": true
1666
+ },
1667
+ "128208": {
1668
+ "content": "<|reserved_special_token_200|>",
1669
+ "lstrip": false,
1670
+ "normalized": false,
1671
+ "rstrip": false,
1672
+ "single_word": false,
1673
+ "special": true
1674
+ },
1675
+ "128209": {
1676
+ "content": "<|reserved_special_token_201|>",
1677
+ "lstrip": false,
1678
+ "normalized": false,
1679
+ "rstrip": false,
1680
+ "single_word": false,
1681
+ "special": true
1682
+ },
1683
+ "128210": {
1684
+ "content": "<|reserved_special_token_202|>",
1685
+ "lstrip": false,
1686
+ "normalized": false,
1687
+ "rstrip": false,
1688
+ "single_word": false,
1689
+ "special": true
1690
+ },
1691
+ "128211": {
1692
+ "content": "<|reserved_special_token_203|>",
1693
+ "lstrip": false,
1694
+ "normalized": false,
1695
+ "rstrip": false,
1696
+ "single_word": false,
1697
+ "special": true
1698
+ },
1699
+ "128212": {
1700
+ "content": "<|reserved_special_token_204|>",
1701
+ "lstrip": false,
1702
+ "normalized": false,
1703
+ "rstrip": false,
1704
+ "single_word": false,
1705
+ "special": true
1706
+ },
1707
+ "128213": {
1708
+ "content": "<|reserved_special_token_205|>",
1709
+ "lstrip": false,
1710
+ "normalized": false,
1711
+ "rstrip": false,
1712
+ "single_word": false,
1713
+ "special": true
1714
+ },
1715
+ "128214": {
1716
+ "content": "<|reserved_special_token_206|>",
1717
+ "lstrip": false,
1718
+ "normalized": false,
1719
+ "rstrip": false,
1720
+ "single_word": false,
1721
+ "special": true
1722
+ },
1723
+ "128215": {
1724
+ "content": "<|reserved_special_token_207|>",
1725
+ "lstrip": false,
1726
+ "normalized": false,
1727
+ "rstrip": false,
1728
+ "single_word": false,
1729
+ "special": true
1730
+ },
1731
+ "128216": {
1732
+ "content": "<|reserved_special_token_208|>",
1733
+ "lstrip": false,
1734
+ "normalized": false,
1735
+ "rstrip": false,
1736
+ "single_word": false,
1737
+ "special": true
1738
+ },
1739
+ "128217": {
1740
+ "content": "<|reserved_special_token_209|>",
1741
+ "lstrip": false,
1742
+ "normalized": false,
1743
+ "rstrip": false,
1744
+ "single_word": false,
1745
+ "special": true
1746
+ },
1747
+ "128218": {
1748
+ "content": "<|reserved_special_token_210|>",
1749
+ "lstrip": false,
1750
+ "normalized": false,
1751
+ "rstrip": false,
1752
+ "single_word": false,
1753
+ "special": true
1754
+ },
1755
+ "128219": {
1756
+ "content": "<|reserved_special_token_211|>",
1757
+ "lstrip": false,
1758
+ "normalized": false,
1759
+ "rstrip": false,
1760
+ "single_word": false,
1761
+ "special": true
1762
+ },
1763
+ "128220": {
1764
+ "content": "<|reserved_special_token_212|>",
1765
+ "lstrip": false,
1766
+ "normalized": false,
1767
+ "rstrip": false,
1768
+ "single_word": false,
1769
+ "special": true
1770
+ },
1771
+ "128221": {
1772
+ "content": "<|reserved_special_token_213|>",
1773
+ "lstrip": false,
1774
+ "normalized": false,
1775
+ "rstrip": false,
1776
+ "single_word": false,
1777
+ "special": true
1778
+ },
1779
+ "128222": {
1780
+ "content": "<|reserved_special_token_214|>",
1781
+ "lstrip": false,
1782
+ "normalized": false,
1783
+ "rstrip": false,
1784
+ "single_word": false,
1785
+ "special": true
1786
+ },
1787
+ "128223": {
1788
+ "content": "<|reserved_special_token_215|>",
1789
+ "lstrip": false,
1790
+ "normalized": false,
1791
+ "rstrip": false,
1792
+ "single_word": false,
1793
+ "special": true
1794
+ },
1795
+ "128224": {
1796
+ "content": "<|reserved_special_token_216|>",
1797
+ "lstrip": false,
1798
+ "normalized": false,
1799
+ "rstrip": false,
1800
+ "single_word": false,
1801
+ "special": true
1802
+ },
1803
+ "128225": {
1804
+ "content": "<|reserved_special_token_217|>",
1805
+ "lstrip": false,
1806
+ "normalized": false,
1807
+ "rstrip": false,
1808
+ "single_word": false,
1809
+ "special": true
1810
+ },
1811
+ "128226": {
1812
+ "content": "<|reserved_special_token_218|>",
1813
+ "lstrip": false,
1814
+ "normalized": false,
1815
+ "rstrip": false,
1816
+ "single_word": false,
1817
+ "special": true
1818
+ },
1819
+ "128227": {
1820
+ "content": "<|reserved_special_token_219|>",
1821
+ "lstrip": false,
1822
+ "normalized": false,
1823
+ "rstrip": false,
1824
+ "single_word": false,
1825
+ "special": true
1826
+ },
1827
+ "128228": {
1828
+ "content": "<|reserved_special_token_220|>",
1829
+ "lstrip": false,
1830
+ "normalized": false,
1831
+ "rstrip": false,
1832
+ "single_word": false,
1833
+ "special": true
1834
+ },
1835
+ "128229": {
1836
+ "content": "<|reserved_special_token_221|>",
1837
+ "lstrip": false,
1838
+ "normalized": false,
1839
+ "rstrip": false,
1840
+ "single_word": false,
1841
+ "special": true
1842
+ },
1843
+ "128230": {
1844
+ "content": "<|reserved_special_token_222|>",
1845
+ "lstrip": false,
1846
+ "normalized": false,
1847
+ "rstrip": false,
1848
+ "single_word": false,
1849
+ "special": true
1850
+ },
1851
+ "128231": {
1852
+ "content": "<|reserved_special_token_223|>",
1853
+ "lstrip": false,
1854
+ "normalized": false,
1855
+ "rstrip": false,
1856
+ "single_word": false,
1857
+ "special": true
1858
+ },
1859
+ "128232": {
1860
+ "content": "<|reserved_special_token_224|>",
1861
+ "lstrip": false,
1862
+ "normalized": false,
1863
+ "rstrip": false,
1864
+ "single_word": false,
1865
+ "special": true
1866
+ },
1867
+ "128233": {
1868
+ "content": "<|reserved_special_token_225|>",
1869
+ "lstrip": false,
1870
+ "normalized": false,
1871
+ "rstrip": false,
1872
+ "single_word": false,
1873
+ "special": true
1874
+ },
1875
+ "128234": {
1876
+ "content": "<|reserved_special_token_226|>",
1877
+ "lstrip": false,
1878
+ "normalized": false,
1879
+ "rstrip": false,
1880
+ "single_word": false,
1881
+ "special": true
1882
+ },
1883
+ "128235": {
1884
+ "content": "<|reserved_special_token_227|>",
1885
+ "lstrip": false,
1886
+ "normalized": false,
1887
+ "rstrip": false,
1888
+ "single_word": false,
1889
+ "special": true
1890
+ },
1891
+ "128236": {
1892
+ "content": "<|reserved_special_token_228|>",
1893
+ "lstrip": false,
1894
+ "normalized": false,
1895
+ "rstrip": false,
1896
+ "single_word": false,
1897
+ "special": true
1898
+ },
1899
+ "128237": {
1900
+ "content": "<|reserved_special_token_229|>",
1901
+ "lstrip": false,
1902
+ "normalized": false,
1903
+ "rstrip": false,
1904
+ "single_word": false,
1905
+ "special": true
1906
+ },
1907
+ "128238": {
1908
+ "content": "<|reserved_special_token_230|>",
1909
+ "lstrip": false,
1910
+ "normalized": false,
1911
+ "rstrip": false,
1912
+ "single_word": false,
1913
+ "special": true
1914
+ },
1915
+ "128239": {
1916
+ "content": "<|reserved_special_token_231|>",
1917
+ "lstrip": false,
1918
+ "normalized": false,
1919
+ "rstrip": false,
1920
+ "single_word": false,
1921
+ "special": true
1922
+ },
1923
+ "128240": {
1924
+ "content": "<|reserved_special_token_232|>",
1925
+ "lstrip": false,
1926
+ "normalized": false,
1927
+ "rstrip": false,
1928
+ "single_word": false,
1929
+ "special": true
1930
+ },
1931
+ "128241": {
1932
+ "content": "<|reserved_special_token_233|>",
1933
+ "lstrip": false,
1934
+ "normalized": false,
1935
+ "rstrip": false,
1936
+ "single_word": false,
1937
+ "special": true
1938
+ },
1939
+ "128242": {
1940
+ "content": "<|reserved_special_token_234|>",
1941
+ "lstrip": false,
1942
+ "normalized": false,
1943
+ "rstrip": false,
1944
+ "single_word": false,
1945
+ "special": true
1946
+ },
1947
+ "128243": {
1948
+ "content": "<|reserved_special_token_235|>",
1949
+ "lstrip": false,
1950
+ "normalized": false,
1951
+ "rstrip": false,
1952
+ "single_word": false,
1953
+ "special": true
1954
+ },
1955
+ "128244": {
1956
+ "content": "<|reserved_special_token_236|>",
1957
+ "lstrip": false,
1958
+ "normalized": false,
1959
+ "rstrip": false,
1960
+ "single_word": false,
1961
+ "special": true
1962
+ },
1963
+ "128245": {
1964
+ "content": "<|reserved_special_token_237|>",
1965
+ "lstrip": false,
1966
+ "normalized": false,
1967
+ "rstrip": false,
1968
+ "single_word": false,
1969
+ "special": true
1970
+ },
1971
+ "128246": {
1972
+ "content": "<|reserved_special_token_238|>",
1973
+ "lstrip": false,
1974
+ "normalized": false,
1975
+ "rstrip": false,
1976
+ "single_word": false,
1977
+ "special": true
1978
+ },
1979
+ "128247": {
1980
+ "content": "<|reserved_special_token_239|>",
1981
+ "lstrip": false,
1982
+ "normalized": false,
1983
+ "rstrip": false,
1984
+ "single_word": false,
1985
+ "special": true
1986
+ },
1987
+ "128248": {
1988
+ "content": "<|reserved_special_token_240|>",
1989
+ "lstrip": false,
1990
+ "normalized": false,
1991
+ "rstrip": false,
1992
+ "single_word": false,
1993
+ "special": true
1994
+ },
1995
+ "128249": {
1996
+ "content": "<|reserved_special_token_241|>",
1997
+ "lstrip": false,
1998
+ "normalized": false,
1999
+ "rstrip": false,
2000
+ "single_word": false,
2001
+ "special": true
2002
+ },
2003
+ "128250": {
2004
+ "content": "<|reserved_special_token_242|>",
2005
+ "lstrip": false,
2006
+ "normalized": false,
2007
+ "rstrip": false,
2008
+ "single_word": false,
2009
+ "special": true
2010
+ },
2011
+ "128251": {
2012
+ "content": "<|reserved_special_token_243|>",
2013
+ "lstrip": false,
2014
+ "normalized": false,
2015
+ "rstrip": false,
2016
+ "single_word": false,
2017
+ "special": true
2018
+ },
2019
+ "128252": {
2020
+ "content": "<|reserved_special_token_244|>",
2021
+ "lstrip": false,
2022
+ "normalized": false,
2023
+ "rstrip": false,
2024
+ "single_word": false,
2025
+ "special": true
2026
+ },
2027
+ "128253": {
2028
+ "content": "<|reserved_special_token_245|>",
2029
+ "lstrip": false,
2030
+ "normalized": false,
2031
+ "rstrip": false,
2032
+ "single_word": false,
2033
+ "special": true
2034
+ },
2035
+ "128254": {
2036
+ "content": "<|reserved_special_token_246|>",
2037
+ "lstrip": false,
2038
+ "normalized": false,
2039
+ "rstrip": false,
2040
+ "single_word": false,
2041
+ "special": true
2042
+ },
2043
+ "128255": {
2044
+ "content": "<|reserved_special_token_247|>",
2045
+ "lstrip": false,
2046
+ "normalized": false,
2047
+ "rstrip": false,
2048
+ "single_word": false,
2049
+ "special": true
2050
+ }
2051
+ },
2052
+ "bos_token": "<|begin_of_text|>",
2053
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
2054
+ "clean_up_tokenization_spaces": true,
2055
+ "eos_token": "<|eot_id|>",
2056
+ "model_input_names": [
2057
+ "input_ids",
2058
+ "attention_mask"
2059
+ ],
2060
+ "model_max_length": 131072,
2061
+ "pad_token": "<|finetune_right_pad_id|>",
2062
+ "padding_side": "right",
2063
+ "tokenizer_class": "PreTrainedTokenizerFast"
2064
+ }
clip/siglip-so400m-patch14-384/README.md ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - vision
5
+ widget:
6
+ - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
7
+ candidate_labels: playing music, playing sports
8
+ example_title: Cat & Dog
9
+ ---
10
+
11
+ # SigLIP (shape-optimized model)
12
+
13
+ SigLIP model pre-trained on WebLi at resolution 384x384. It was introduced in the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Zhai et al. and first released in [this repository](https://github.com/google-research/big_vision).
14
+
15
+ This model has the SoViT-400m architecture, which is the shape-optimized version as presented in [Getting ViT in Shape: Scaling Laws for Compute-Optimal Model Design](https://arxiv.org/abs/2305.13035) by Alabdulmohsin et al.
16
+
17
+ Disclaimer: The team releasing SigLIP did not write a model card for this model so this model card has been written by the Hugging Face team.
18
+
19
+ ## Model description
20
+
21
+ SigLIP is [CLIP](https://huggingface.co/docs/transformers/model_doc/clip), a multimodal model, with a better loss function. The sigmoid loss operates solely on image-text pairs and does not require a global view of the pairwise similarities for normalization. This allows further scaling up the batch size, while also performing better at smaller batch sizes.
22
+
23
+ A TLDR of SigLIP by one of the authors can be found [here](https://twitter.com/giffmana/status/1692641733459267713).
24
+
25
+ ## Intended uses & limitations
26
+
27
+ You can use the raw model for tasks like zero-shot image classification and image-text retrieval. See the [model hub](https://huggingface.co/models?search=google/siglip) to look for
28
+ other versions on a task that interests you.
29
+
30
+ ### How to use
31
+
32
+ Here is how to use this model to perform zero-shot image classification:
33
+
34
+ ```python
35
+ from PIL import Image
36
+ import requests
37
+ from transformers import AutoProcessor, AutoModel
38
+ import torch
39
+
40
+ model = AutoModel.from_pretrained("google/siglip-so400m-patch14-384")
41
+ processor = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384")
42
+
43
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
44
+ image = Image.open(requests.get(url, stream=True).raw)
45
+
46
+ texts = ["a photo of 2 cats", "a photo of 2 dogs"]
47
+ inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
48
+
49
+ with torch.no_grad():
50
+ outputs = model(**inputs)
51
+
52
+ logits_per_image = outputs.logits_per_image
53
+ probs = torch.sigmoid(logits_per_image) # these are the probabilities
54
+ print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
55
+ ```
56
+
57
+ Alternatively, one can leverage the pipeline API which abstracts away the complexity for the user:
58
+
59
+ ```python
60
+ from transformers import pipeline
61
+ from PIL import Image
62
+ import requests
63
+
64
+ # load pipe
65
+ image_classifier = pipeline(task="zero-shot-image-classification", model="google/siglip-so400m-patch14-384")
66
+
67
+ # load image
68
+ url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
69
+ image = Image.open(requests.get(url, stream=True).raw)
70
+
71
+ # inference
72
+ outputs = image_classifier(image, candidate_labels=["2 cats", "a plane", "a remote"])
73
+ outputs = [{"score": round(output["score"], 4), "label": output["label"] } for output in outputs]
74
+ print(outputs)
75
+ ```
76
+ For more code examples, refer to the [documentation](https://huggingface.co/docs/transformers/main/model_doc/siglip).
77
+
78
+ ## Training procedure
79
+
80
+ ### Training data
81
+
82
+ SigLIP is pre-trained on the WebLI dataset [(Chen et al., 2023)](https://arxiv.org/abs/2209.06794).
83
+
84
+ ### Preprocessing
85
+
86
+ Images are resized/rescaled to the same resolution (384x384) and normalized across the RGB channels with mean (0.5, 0.5, 0.5) and standard deviation (0.5, 0.5, 0.5).
87
+
88
+ Texts are tokenized and padded to the same length (64 tokens).
89
+
90
+ ### Compute
91
+
92
+ The model was trained on 16 TPU-v4 chips for three days.
93
+
94
+ ## Evaluation results
95
+
96
+ Evaluation of SigLIP compared to CLIP is shown below (taken from the paper).
97
+
98
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/siglip_table.jpeg"
99
+ alt="drawing" width="600"/>
100
+
101
+ ### BibTeX entry and citation info
102
+
103
+ ```bibtex
104
+ @misc{zhai2023sigmoid,
105
+ title={Sigmoid Loss for Language Image Pre-Training},
106
+ author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
107
+ year={2023},
108
+ eprint={2303.15343},
109
+ archivePrefix={arXiv},
110
+ primaryClass={cs.CV}
111
+ }
112
+ ```
clip/siglip-so400m-patch14-384/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SiglipModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "siglip",
7
+ "text_config": {
8
+ "hidden_size": 1152,
9
+ "intermediate_size": 4304,
10
+ "model_type": "siglip_text_model",
11
+ "num_attention_heads": 16,
12
+ "num_hidden_layers": 27
13
+ },
14
+ "torch_dtype": "float32",
15
+ "transformers_version": "4.37.0.dev0",
16
+ "vision_config": {
17
+ "hidden_size": 1152,
18
+ "image_size": 384,
19
+ "intermediate_size": 4304,
20
+ "model_type": "siglip_vision_model",
21
+ "num_attention_heads": 16,
22
+ "num_hidden_layers": 27,
23
+ "patch_size": 14
24
+ }
25
+ }
clip/siglip-so400m-patch14-384/preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_processor_type": "SiglipImageProcessor",
11
+ "image_std": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "processor_class": "SiglipProcessor",
17
+ "resample": 3,
18
+ "rescale_factor": 0.00392156862745098,
19
+ "size": {
20
+ "height": 384,
21
+ "width": 384
22
+ }
23
+ }
clip/siglip-so400m-patch14-384/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eos_token": {
3
+ "content": "</s>",
4
+ "lstrip": true,
5
+ "normalized": false,
6
+ "rstrip": true,
7
+ "single_word": false
8
+ },
9
+ "pad_token": {
10
+ "content": "</s>",
11
+ "lstrip": true,
12
+ "normalized": false,
13
+ "rstrip": true,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": true,
19
+ "normalized": false,
20
+ "rstrip": true,
21
+ "single_word": false
22
+ }
23
+ }
clip/siglip-so400m-patch14-384/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
clip/siglip-so400m-patch14-384/tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "1": {
4
+ "content": "</s>",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "2": {
12
+ "content": "<unk>",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": true
18
+ }
19
+ },
20
+ "additional_special_tokens": [],
21
+ "clean_up_tokenization_spaces": true,
22
+ "do_lower_case": true,
23
+ "eos_token": "</s>",
24
+ "model_input_names": [
25
+ "input_ids"
26
+ ],
27
+ "model_max_length": 64,
28
+ "pad_token": "</s>",
29
+ "processor_class": "SiglipProcessor",
30
+ "sp_model_kwargs": {},
31
+ "tokenizer_class": "SiglipTokenizer",
32
+ "unk_token": "<unk>"
33
+ }
clip_interrogator/models--timm--vit_large_patch14_clip_224.openai/refs/main ADDED
@@ -0,0 +1 @@
 
 
1
+ 18d0535469bb561bf468d76c1d73aa35156c922b
controlnet/FLUX.1/FLUX.1-dev-ControlNet-Union-Pro-2.0/README.md ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: flux-1-dev-non-commercial-license
4
+ license_link: https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/LICENSE.md
5
+
6
+ language:
7
+ - en
8
+ library_name: diffusers
9
+ pipeline_tag: text-to-image
10
+
11
+ tags:
12
+ - Text-to-Image
13
+ - ControlNet
14
+ - Diffusers
15
+ - Flux.1-dev
16
+ - image-generation
17
+ - Stable Diffusion
18
+ base_model: black-forest-labs/FLUX.1-dev
19
+ ---
20
+
21
+ # FLUX.1-dev-ControlNet-Union-Pro-2.0
22
+
23
+ This repository contains a unified ControlNet for the FLUX.1-dev model, released by [Shakker Labs](https://huggingface.co/Shakker-Labs). We provide an [online demo](https://huggingface.co/spaces/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro-2.0). An FP8-quantized version provided by the community can be found at [ABDALLALSWAITI/FLUX.1-dev-ControlNet-Union-Pro-2.0-fp8](https://huggingface.co/ABDALLALSWAITI/FLUX.1-dev-ControlNet-Union-Pro-2.0-fp8).
24
+
25
+ # Keynotes
26
+ In comparison with [Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro](https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro),
27
+ - Remove mode embedding, has smaller model size.
28
+ - Improve on canny and pose, better control and aesthetics.
29
+ - Add support for soft edge. Remove support for tile.
30
+
31
+ # Model Cards
32
+ - This ControlNet consists of 6 double blocks and 0 single block. Mode embedding is removed.
33
+ - We train the model from scratch for 300k steps using a dataset of 20M high-quality general and human images. We train at 512x512 resolution in BFloat16, batch size = 128, learning rate = 2e-5, the guidance is uniformly sampled from [1, 7]. We set the text drop ratio to 0.20.
34
+ - This model supports multiple control modes, including canny, soft edge, depth, pose, gray. You can use it just as a normal ControlNet.
35
+ - This model can be jointly used with other ControlNets.
36
+
37
+ # Showcases
38
+
39
+ <table>
40
+ <tr>
41
+ <td><img src="./images/canny.png" alt="canny" style="height:100%"></td>
42
+ </tr>
43
+ <tr>
44
+ <td><img src="./images/softedge.png" alt="softedge" style="height:100%"></td>
45
+ </tr>
46
+ <tr>
47
+ <td><img src="./images/pose.png" alt="pose" style="height:100%"></td>
48
+ </tr>
49
+ <tr>
50
+ <td><img src="./images/depth.png" alt="depth" style="height:100%"></td>
51
+ </tr>
52
+ <tr>
53
+ <td><img src="./images/gray.png" alt="gray" style="height:100%"></td>
54
+ </tr>
55
+ </table>
56
+
57
+ # Inference
58
+ ```python
59
+ import torch
60
+ from diffusers.utils import load_image
61
+ from diffusers import FluxControlNetPipeline, FluxControlNetModel
62
+
63
+ base_model = 'black-forest-labs/FLUX.1-dev'
64
+ controlnet_model_union = 'Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro-2.0'
65
+
66
+ controlnet = FluxControlNetModel.from_pretrained(controlnet_model_union, torch_dtype=torch.bfloat16)
67
+ pipe = FluxControlNetPipeline.from_pretrained(base_model, controlnet=controlnet, torch_dtype=torch.bfloat16)
68
+ pipe.to("cuda")
69
+
70
+ # replace with other conds
71
+ control_image = load_image("./conds/canny.png")
72
+ width, height = control_image.size
73
+
74
+ prompt = "A young girl stands gracefully at the edge of a serene beach, her long, flowing hair gently tousled by the sea breeze. She wears a soft, pastel-colored dress that complements the tranquil blues and greens of the coastal scenery. The golden hues of the setting sun cast a warm glow on her face, highlighting her serene expression. The background features a vast, azure ocean with gentle waves lapping at the shore, surrounded by distant cliffs and a clear, cloudless sky. The composition emphasizes the girl's serene presence amidst the natural beauty, with a balanced blend of warm and cool tones."
75
+
76
+ image = pipe(
77
+ prompt,
78
+ control_image=control_image,
79
+ width=width,
80
+ height=height,
81
+ controlnet_conditioning_scale=0.7,
82
+ control_guidance_end=0.8,
83
+ num_inference_steps=30,
84
+ guidance_scale=3.5,
85
+ generator=torch.Generator(device="cuda").manual_seed(42),
86
+ ).images[0]
87
+ ```
88
+
89
+ # Multi-Inference
90
+ ```python
91
+ import torch
92
+ from diffusers.utils import load_image
93
+
94
+ # https://github.com/huggingface/diffusers/pull/11350
95
+ # You can import directly from diffusers by installing the latest version from source
96
+ # from diffusers import FluxControlNetPipeline, FluxControlNetModel
97
+
98
+ # use local files for this moment
99
+ from pipeline_flux_controlnet import FluxControlNetPipeline
100
+ from controlnet_flux import FluxControlNetModel
101
+
102
+ base_model = 'black-forest-labs/FLUX.1-dev'
103
+ controlnet_model_union = 'Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro-2.0'
104
+
105
+ controlnet = FluxControlNetModel.from_pretrained(controlnet_model_union, torch_dtype=torch.bfloat16)
106
+ pipe = FluxControlNetPipeline.from_pretrained(base_model, controlnet=[controlnet], torch_dtype=torch.bfloat16) # use [] to enable multi-CNs
107
+ pipe.to("cuda")
108
+
109
+ # replace with other conds
110
+ control_image = load_image("./conds/canny.png")
111
+ width, height = control_image.size
112
+
113
+ prompt = "A young girl stands gracefully at the edge of a serene beach, her long, flowing hair gently tousled by the sea breeze. She wears a soft, pastel-colored dress that complements the tranquil blues and greens of the coastal scenery. The golden hues of the setting sun cast a warm glow on her face, highlighting her serene expression. The background features a vast, azure ocean with gentle waves lapping at the shore, surrounded by distant cliffs and a clear, cloudless sky. The composition emphasizes the girl's serene presence amidst the natural beauty, with a balanced blend of warm and cool tones."
114
+
115
+ image = pipe(
116
+ prompt,
117
+ control_image=[control_image, control_image], # try with different conds such as canny&depth, pose&depth
118
+ width=width,
119
+ height=height,
120
+ controlnet_conditioning_scale=[0.35, 0.35],
121
+ control_guidance_end=[0.8, 0.8],
122
+ num_inference_steps=30,
123
+ guidance_scale=3.5,
124
+ generator=torch.Generator(device="cuda").manual_seed(42),
125
+ ).images[0]
126
+ ```
127
+
128
+ # Recommended Parameters
129
+ You can adjust controlnet_conditioning_scale and control_guidance_end for stronger control and better detail preservation. For better stability, we highly recommend using a detailed prompt; in some cases, combining multiple conditions also helps.
130
+ - Canny: use cv2.Canny, controlnet_conditioning_scale=0.7, control_guidance_end=0.8.
131
+ - Soft Edge: use [AnylineDetector](https://github.com/huggingface/controlnet_aux), controlnet_conditioning_scale=0.7, control_guidance_end=0.8.
132
+ - Depth: use [depth-anything](https://github.com/DepthAnything/Depth-Anything-V2), controlnet_conditioning_scale=0.8, control_guidance_end=0.8.
133
+ - Pose: use [DWPose](https://github.com/IDEA-Research/DWPose/tree/onnx), controlnet_conditioning_scale=0.9, control_guidance_end=0.65.
134
+ - Gray: use cv2.cvtColor, controlnet_conditioning_scale=0.9, control_guidance_end=0.8.
135
+
136
+ # Resources
137
+ - [InstantX/FLUX.1-dev-IP-Adapter](https://huggingface.co/InstantX/FLUX.1-dev-IP-Adapter)
138
+ - [InstantX/FLUX.1-dev-Controlnet-Canny](https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Canny)
139
+ - [Shakker-Labs/FLUX.1-dev-ControlNet-Depth](https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Depth)
140
+ - [Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro](https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro)
141
+
142
+ # Acknowledgements
143
+ This model is developed by [Shakker Labs](https://huggingface.co/Shakker-Labs). The original idea is inspired by [xinsir/controlnet-union-sdxl-1.0](https://huggingface.co/xinsir/controlnet-union-sdxl-1.0). All copyright reserved.
144
+
145
+ # Citation
146
+ If you find this project useful in your research, please cite us via
147
+ ```
148
+ @misc{flux-cn-union-pro-2,
149
+ author = {Shakker-Labs},
150
+ title = {ControlNet-Union},
151
+ year = {2025},
152
+ howpublished={\url{https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro-2.0}},
153
+ }
154
+ ```
controlnet/FLUX.1/FLUX.1-dev-ControlNet-Union-Pro-2.0/conds/canny.png ADDED
controlnet/FLUX.1/FLUX.1-dev-ControlNet-Union-Pro-2.0/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "FluxControlNetModel",
3
+ "_diffusers_version": "0.31.0.dev0",
4
+ "attention_head_dim": 128,
5
+ "axes_dims_rope": [
6
+ 16,
7
+ 56,
8
+ 56
9
+ ],
10
+ "guidance_embeds": true,
11
+ "in_channels": 64,
12
+ "joint_attention_dim": 4096,
13
+ "num_attention_heads": 24,
14
+ "num_layers": 6,
15
+ "num_mode": null,
16
+ "num_single_layers": 0,
17
+ "patch_size": 1,
18
+ "pooled_projection_dim": 768
19
+ }
controlnet/FLUX.1/FLUX.1-dev-ControlNet-Union-Pro-2.0/controlnet_flux.py ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass
16
+ from typing import Any, Dict, List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+
21
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from diffusers.loaders import PeftAdapterMixin
23
+ from diffusers.models.attention_processor import AttentionProcessor
24
+ from diffusers.models.modeling_utils import ModelMixin
25
+ from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
26
+ from diffusers.models.controlnets.controlnet import ControlNetConditioningEmbedding, zero_module
27
+ from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
28
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
29
+ from diffusers.models.transformers.transformer_flux import FluxSingleTransformerBlock, FluxTransformerBlock
30
+
31
+
32
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
33
+
34
+
35
+ @dataclass
36
+ class FluxControlNetOutput(BaseOutput):
37
+ controlnet_block_samples: Tuple[torch.Tensor]
38
+ controlnet_single_block_samples: Tuple[torch.Tensor]
39
+
40
+
41
+ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
42
+ _supports_gradient_checkpointing = True
43
+
44
+ @register_to_config
45
+ def __init__(
46
+ self,
47
+ patch_size: int = 1,
48
+ in_channels: int = 64,
49
+ num_layers: int = 19,
50
+ num_single_layers: int = 38,
51
+ attention_head_dim: int = 128,
52
+ num_attention_heads: int = 24,
53
+ joint_attention_dim: int = 4096,
54
+ pooled_projection_dim: int = 768,
55
+ guidance_embeds: bool = False,
56
+ axes_dims_rope: List[int] = [16, 56, 56],
57
+ num_mode: int = None,
58
+ conditioning_embedding_channels: int = None,
59
+ ):
60
+ super().__init__()
61
+ self.out_channels = in_channels
62
+ self.inner_dim = num_attention_heads * attention_head_dim
63
+
64
+ self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
65
+ text_time_guidance_cls = (
66
+ CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
67
+ )
68
+ self.time_text_embed = text_time_guidance_cls(
69
+ embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
70
+ )
71
+
72
+ self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim)
73
+ self.x_embedder = torch.nn.Linear(in_channels, self.inner_dim)
74
+
75
+ self.transformer_blocks = nn.ModuleList(
76
+ [
77
+ FluxTransformerBlock(
78
+ dim=self.inner_dim,
79
+ num_attention_heads=num_attention_heads,
80
+ attention_head_dim=attention_head_dim,
81
+ )
82
+ for i in range(num_layers)
83
+ ]
84
+ )
85
+
86
+ self.single_transformer_blocks = nn.ModuleList(
87
+ [
88
+ FluxSingleTransformerBlock(
89
+ dim=self.inner_dim,
90
+ num_attention_heads=num_attention_heads,
91
+ attention_head_dim=attention_head_dim,
92
+ )
93
+ for i in range(num_single_layers)
94
+ ]
95
+ )
96
+
97
+ # controlnet_blocks
98
+ self.controlnet_blocks = nn.ModuleList([])
99
+ for _ in range(len(self.transformer_blocks)):
100
+ self.controlnet_blocks.append(zero_module(nn.Linear(self.inner_dim, self.inner_dim)))
101
+
102
+ self.controlnet_single_blocks = nn.ModuleList([])
103
+ for _ in range(len(self.single_transformer_blocks)):
104
+ self.controlnet_single_blocks.append(zero_module(nn.Linear(self.inner_dim, self.inner_dim)))
105
+
106
+ self.union = num_mode is not None
107
+ if self.union:
108
+ self.controlnet_mode_embedder = nn.Embedding(num_mode, self.inner_dim)
109
+
110
+ if conditioning_embedding_channels is not None:
111
+ self.input_hint_block = ControlNetConditioningEmbedding(
112
+ conditioning_embedding_channels=conditioning_embedding_channels, block_out_channels=(16, 16, 16, 16)
113
+ )
114
+ self.controlnet_x_embedder = torch.nn.Linear(in_channels, self.inner_dim)
115
+ else:
116
+ self.input_hint_block = None
117
+ self.controlnet_x_embedder = zero_module(torch.nn.Linear(in_channels, self.inner_dim))
118
+
119
+ self.gradient_checkpointing = False
120
+
121
+ @property
122
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
123
+ def attn_processors(self):
124
+ r"""
125
+ Returns:
126
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
127
+ indexed by its weight name.
128
+ """
129
+ # set recursively
130
+ processors = {}
131
+
132
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
133
+ if hasattr(module, "get_processor"):
134
+ processors[f"{name}.processor"] = module.get_processor()
135
+
136
+ for sub_name, child in module.named_children():
137
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
138
+
139
+ return processors
140
+
141
+ for name, module in self.named_children():
142
+ fn_recursive_add_processors(name, module, processors)
143
+
144
+ return processors
145
+
146
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
147
def set_attn_processor(self, processor):
    r"""
    Sets the attention processor to use to compute attention.

    Parameters:
        processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
            The instantiated processor class or a dictionary of processor classes that will be set as the processor
            for **all** `Attention` layers.

            If `processor` is a dict, the key needs to define the path to the corresponding cross attention
            processor. This is strongly recommended when setting trainable attention processors.

    Raises:
        ValueError: If `processor` is a dict whose size differs from the number of attention processors
            reported by `self.attn_processors`.
        KeyError: If `processor` is a dict that lacks the `"<module path>.processor"` key of a visited layer.
    """
    count = len(self.attn_processors.keys())

    if isinstance(processor, dict) and len(processor) != count:
        raise ValueError(
            f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
            f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
        )

    def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
        if hasattr(module, "set_processor"):
            if not isinstance(processor, dict):
                module.set_processor(processor)
            else:
                # Look the entry up instead of popping it so the caller's dict is left intact and can be
                # reused (e.g. applied to a second model).
                module.set_processor(processor[f"{name}.processor"])

        for sub_name, child in module.named_children():
            fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

    for name, module in self.named_children():
        fn_recursive_attn_processor(name, module, processor)
180
+
181
@classmethod
def from_transformer(
    cls,
    transformer,
    num_layers: int = 4,
    num_single_layers: int = 10,
    attention_head_dim: int = 128,
    num_attention_heads: int = 24,
    load_weights_from_transformer=True,
):
    r"""
    Build a ControlNet from the configuration of an existing transformer.

    Args:
        transformer: Model whose `config` seeds the ControlNet config and whose sub-modules supply weights.
        num_layers (`int`, defaults to 4): Overrides `num_layers` in the copied config.
        num_single_layers (`int`, defaults to 10): Overrides `num_single_layers` in the copied config.
        attention_head_dim (`int`, defaults to 128): Overrides `attention_head_dim` in the copied config.
        num_attention_heads (`int`, defaults to 24): Overrides `num_attention_heads` in the copied config.
        load_weights_from_transformer (`bool`, defaults to `True`):
            Whether to copy the embedder and block weights from `transformer`.

    Returns:
        The ControlNet instance created through `cls.from_config`.
    """
    config = dict(transformer.config)
    config.update(
        num_layers=num_layers,
        num_single_layers=num_single_layers,
        attention_head_dim=attention_head_dim,
        num_attention_heads=num_attention_heads,
    )

    controlnet = cls.from_config(config)
    if not load_weights_from_transformer:
        return controlnet

    # Embedders are loaded strictly.
    for attr in ("pos_embed", "time_text_embed", "context_embedder", "x_embedder"):
        getattr(controlnet, attr).load_state_dict(getattr(transformer, attr).state_dict())

    # Block stacks are loaded with strict=False so key mismatches between the two stacks are tolerated.
    for attr in ("transformer_blocks", "single_transformer_blocks"):
        getattr(controlnet, attr).load_state_dict(getattr(transformer, attr).state_dict(), strict=False)

    # NOTE(review): the source indentation was lost; zeroing the control embedder is assumed to belong to
    # the weight-copy branch -- confirm against the original file.
    controlnet.controlnet_x_embedder = zero_module(controlnet.controlnet_x_embedder)

    return controlnet
212
+
213
def forward(
    self,
    hidden_states: torch.Tensor,
    controlnet_cond: torch.Tensor,
    controlnet_mode: torch.Tensor = None,
    conditioning_scale: float = 1.0,
    encoder_hidden_states: torch.Tensor = None,
    pooled_projections: torch.Tensor = None,
    timestep: torch.LongTensor = None,
    img_ids: torch.Tensor = None,
    txt_ids: torch.Tensor = None,
    guidance: torch.Tensor = None,
    joint_attention_kwargs: Optional[Dict[str, Any]] = None,
    return_dict: bool = True,
) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
    """
    The [`FluxControlNetModel`] forward method.

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
            Input `hidden_states`.
        controlnet_cond (`torch.Tensor`):
            The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. When
            `self.input_hint_block` is set, the tensor is first passed through it and the 4D result is
            rearranged into `patch_size` x `patch_size` patches.
        controlnet_mode (`torch.Tensor`):
            The mode tensor of shape `(batch_size, 1)`. Required when `self.union` is truthy.
        conditioning_scale (`float`, defaults to `1.0`):
            The scale factor for ControlNet outputs.
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
            Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
        pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
            from the embeddings of input conditions.
        timestep ( `torch.LongTensor`):
            Used to indicate denoising step. It is multiplied by 1000 before being embedded.
        img_ids (`torch.Tensor`):
            Position ids of the image tokens. A 3D tensor is deprecated; only its first batch entry is used.
        txt_ids (`torch.Tensor`):
            Position ids of the text tokens. A 3D tensor is deprecated; only its first batch entry is used.
        guidance (`torch.Tensor`, *optional*):
            Guidance value; when given it is multiplied by 1000 and passed to the time/text embedder.
        joint_attention_kwargs (`dict`, *optional*):
            Only the `scale` entry is read here; it is used as the LoRA scale when the PEFT backend is
            active. The caller's dictionary is not modified.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a `FluxControlNetOutput` instead of a plain tuple.

    Returns:
        If `return_dict` is True, a `FluxControlNetOutput` is returned, otherwise a `tuple`
        `(controlnet_block_samples, controlnet_single_block_samples)`. Each element is a list of tensors, or
        `None` when the corresponding list is empty.

    Raises:
        ValueError: If `self.union` is truthy and `controlnet_mode` is `None`.
    """
    # Record whether the caller supplied a scale *before* it is popped below; checking afterwards (as the
    # previous code did) could never trigger the warning.
    scale_requested = joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None

    if joint_attention_kwargs is not None:
        joint_attention_kwargs = joint_attention_kwargs.copy()
        lora_scale = joint_attention_kwargs.pop("scale", 1.0)
    else:
        lora_scale = 1.0

    if USE_PEFT_BACKEND:
        # weight the lora layers by setting `lora_scale` for each PEFT layer
        scale_lora_layers(self, lora_scale)
    elif scale_requested:
        logger.warning(
            "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
        )
    hidden_states = self.x_embedder(hidden_states)

    if self.input_hint_block is not None:
        controlnet_cond = self.input_hint_block(controlnet_cond)
        batch_size, channels, height_pw, width_pw = controlnet_cond.shape
        height = height_pw // self.config.patch_size
        width = width_pw // self.config.patch_size
        # Split both spatial axes into (patches, patch_size) and flatten every patch into one token.
        controlnet_cond = controlnet_cond.reshape(
            batch_size, channels, height, self.config.patch_size, width, self.config.patch_size
        )
        controlnet_cond = controlnet_cond.permute(0, 2, 4, 1, 3, 5)
        controlnet_cond = controlnet_cond.reshape(batch_size, height * width, -1)
    # add
    hidden_states = hidden_states + self.controlnet_x_embedder(controlnet_cond)

    timestep = timestep.to(hidden_states.dtype) * 1000
    if guidance is not None:
        guidance = guidance.to(hidden_states.dtype) * 1000
        temb = self.time_text_embed(timestep, guidance, pooled_projections)
    else:
        temb = self.time_text_embed(timestep, pooled_projections)
    encoder_hidden_states = self.context_embedder(encoder_hidden_states)

    if txt_ids.ndim == 3:
        logger.warning(
            "Passing `txt_ids` 3d torch.Tensor is deprecated. "
            "Please remove the batch dimension and pass it as a 2d torch Tensor"
        )
        txt_ids = txt_ids[0]
    if img_ids.ndim == 3:
        logger.warning(
            "Passing `img_ids` 3d torch.Tensor is deprecated. "
            "Please remove the batch dimension and pass it as a 2d torch Tensor"
        )
        img_ids = img_ids[0]

    if self.union:
        # union mode
        if controlnet_mode is None:
            raise ValueError("`controlnet_mode` cannot be `None` when applying ControlNet-Union")
        # union mode emb
        controlnet_mode_emb = self.controlnet_mode_embedder(controlnet_mode)
        encoder_hidden_states = torch.cat([controlnet_mode_emb, encoder_hidden_states], dim=1)
        # The mode embedding is prepended to the text sequence, so prepend one extra id row to match.
        txt_ids = torch.cat([txt_ids[:1], txt_ids], dim=0)

    ids = torch.cat((txt_ids, img_ids), dim=0)
    image_rotary_emb = self.pos_embed(ids)

    block_samples = []
    for block in self.transformer_blocks:
        if torch.is_grad_enabled() and self.gradient_checkpointing:
            encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
                block,
                hidden_states,
                encoder_hidden_states,
                temb,
                image_rotary_emb,
            )
        else:
            encoder_hidden_states, hidden_states = block(
                hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                temb=temb,
                image_rotary_emb=image_rotary_emb,
            )
        block_samples.append(hidden_states)

    # The single-stream blocks operate on the concatenated [text, image] sequence.
    hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
    text_len = encoder_hidden_states.shape[1]

    single_block_samples = []
    for block in self.single_transformer_blocks:
        if torch.is_grad_enabled() and self.gradient_checkpointing:
            hidden_states = self._gradient_checkpointing_func(
                block,
                hidden_states,
                temb,
                image_rotary_emb,
            )
        else:
            hidden_states = block(
                hidden_states=hidden_states,
                temb=temb,
                image_rotary_emb=image_rotary_emb,
            )
        # Keep only the image part of the sequence as the residual sample.
        single_block_samples.append(hidden_states[:, text_len:])

    # controlnet block + scaling
    controlnet_block_samples = [
        controlnet_block(block_sample) * conditioning_scale
        for block_sample, controlnet_block in zip(block_samples, self.controlnet_blocks)
    ]
    controlnet_single_block_samples = [
        controlnet_block(single_block_sample) * conditioning_scale
        for single_block_sample, controlnet_block in zip(single_block_samples, self.controlnet_single_blocks)
    ]

    # Empty residual lists are reported as None.
    if not controlnet_block_samples:
        controlnet_block_samples = None
    if not controlnet_single_block_samples:
        controlnet_single_block_samples = None

    if USE_PEFT_BACKEND:
        # remove `lora_scale` from each PEFT layer
        unscale_lora_layers(self, lora_scale)

    if not return_dict:
        return (controlnet_block_samples, controlnet_single_block_samples)

    return FluxControlNetOutput(
        controlnet_block_samples=controlnet_block_samples,
        controlnet_single_block_samples=controlnet_single_block_samples,
    )
397
+
398
+
399
class FluxMultiControlNetModel(ModelMixin):
    r"""
    `FluxMultiControlNetModel` wrapper class for Multi-FluxControlNetModel

    This module is a wrapper for multiple instances of the `FluxControlNetModel`. The `forward()` API is designed to be
    compatible with `FluxControlNetModel`.

    Args:
        controlnets (`List[FluxControlNetModel]`):
            Provides additional conditioning to the unet during the denoising process. You must set multiple
            `FluxControlNetModel` as a list.
    """

    def __init__(self, controlnets):
        super().__init__()
        self.nets = nn.ModuleList(controlnets)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        controlnet_cond: List[torch.Tensor],
        controlnet_mode: List[torch.Tensor],
        conditioning_scale: List[float],
        encoder_hidden_states: torch.Tensor = None,
        pooled_projections: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        img_ids: torch.Tensor = None,
        txt_ids: torch.Tensor = None,
        guidance: torch.Tensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
    ) -> Union[FluxControlNetOutput, Tuple]:
        r"""
        Run every condition through its ControlNet and sum the resulting residuals.

        Args:
            hidden_states: Forwarded unchanged to every wrapped ControlNet.
            controlnet_cond: One conditioning input per condition.
            controlnet_mode: One mode tensor per condition; each non-`None` entry gets a trailing axis added
                (`mode[:, None]`) before it is forwarded. `None` entries are forwarded as `None`.
            conditioning_scale: One scale per condition.
            encoder_hidden_states, pooled_projections, timestep, img_ids, txt_ids, guidance,
            joint_attention_kwargs: Forwarded unchanged to every wrapped ControlNet.
            return_dict: Accepted for API compatibility. The wrapped ControlNets are always called with
                `return_dict=False` and this method always returns a plain tuple.

        Returns:
            `tuple`: `(control_block_samples, control_single_block_samples)`, the element-wise sums of the
            residuals over all conditions. Both are `None` when no condition was processed.
        """
        if len(self.nets) == 1:
            # ControlNet-Union with multiple conditions:
            # only one ControlNet is loaded (saves memory) and it is reused for every condition.
            nets = [self.nets[0]] * len(controlnet_cond)
        else:
            # Regular Multi-ControlNets: one loaded ControlNet per condition.
            nets = self.nets

        # Pre-initialised so that an empty condition list returns (None, None) instead of raising
        # UnboundLocalError.
        control_block_samples = None
        control_single_block_samples = None

        for i, (image, mode, scale, controlnet) in enumerate(
            zip(controlnet_cond, controlnet_mode, conditioning_scale, nets)
        ):
            # Always request a tuple: the result is unpacked positionally below, which is only safe for
            # the tuple form of the output.
            block_samples, single_block_samples = controlnet(
                hidden_states=hidden_states,
                controlnet_cond=image,
                controlnet_mode=mode[:, None] if mode is not None else None,
                conditioning_scale=scale,
                timestep=timestep,
                guidance=guidance,
                pooled_projections=pooled_projections,
                encoder_hidden_states=encoder_hidden_states,
                txt_ids=txt_ids,
                img_ids=img_ids,
                joint_attention_kwargs=joint_attention_kwargs,
                return_dict=False,
            )

            # merge samples
            if i == 0:
                control_block_samples = block_samples
                control_single_block_samples = single_block_samples
                continue

            if block_samples is not None and control_block_samples is not None:
                control_block_samples = [
                    control_block_sample + block_sample
                    for control_block_sample, block_sample in zip(control_block_samples, block_samples)
                ]
            if single_block_samples is not None and control_single_block_samples is not None:
                control_single_block_samples = [
                    control_single_block_sample + block_sample
                    for control_single_block_sample, block_sample in zip(
                        control_single_block_samples, single_block_samples
                    )
                ]

        return control_block_samples, control_single_block_samples
controlnet/FLUX.1/FLUX.1-dev-ControlNet-Union-Pro-2.0/pipeline_flux_controlnet.py ADDED
@@ -0,0 +1,1181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+ from transformers import (
21
+ CLIPImageProcessor,
22
+ CLIPTextModel,
23
+ CLIPTokenizer,
24
+ CLIPVisionModelWithProjection,
25
+ T5EncoderModel,
26
+ T5TokenizerFast,
27
+ )
28
+
29
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
30
+ from diffusers.loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
31
+ from diffusers.models.autoencoders import AutoencoderKL
32
+
33
+ # from diffusers.models.controlnets.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
34
+ from controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
35
+
36
+ from diffusers.models.transformers import FluxTransformer2DModel
37
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
38
+ from diffusers.utils import (
39
+ USE_PEFT_BACKEND,
40
+ is_torch_xla_available,
41
+ logging,
42
+ replace_example_docstring,
43
+ scale_lora_layers,
44
+ unscale_lora_layers,
45
+ )
46
+ from diffusers.utils.torch_utils import randn_tensor
47
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
48
+ from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
49
+
50
+
51
+ if is_torch_xla_available():
52
+ import torch_xla.core.xla_model as xm
53
+
54
+ XLA_AVAILABLE = True
55
+ else:
56
+ XLA_AVAILABLE = False
57
+
58
+
59
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
60
+
61
+ EXAMPLE_DOC_STRING = """
62
+ Examples:
63
+ ```py
64
+ >>> import torch
65
+ >>> from diffusers.utils import load_image
66
+ >>> from diffusers import FluxControlNetPipeline
67
+ >>> from diffusers import FluxControlNetModel
68
+
69
+ >>> base_model = "black-forest-labs/FLUX.1-dev"
70
+ >>> controlnet_model = "InstantX/FLUX.1-dev-controlnet-canny"
71
+ >>> controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
72
+ >>> pipe = FluxControlNetPipeline.from_pretrained(
73
+ ... base_model, controlnet=controlnet, torch_dtype=torch.bfloat16
74
+ ... )
75
+ >>> pipe.to("cuda")
76
+ >>> control_image = load_image("https://huggingface.co/InstantX/SD3-Controlnet-Canny/resolve/main/canny.jpg")
77
+ >>> prompt = "A girl in city, 25 years old, cool, futuristic"
78
+ >>> image = pipe(
79
+ ... prompt,
80
+ ... control_image=control_image,
81
+ ... control_guidance_start=0.2,
82
+ ... control_guidance_end=0.8,
83
+ ... controlnet_conditioning_scale=1.0,
84
+ ... num_inference_steps=28,
85
+ ... guidance_scale=3.5,
86
+ ... ).images[0]
87
+ >>> image.save("flux.png")
88
+ ```
89
+ """
90
+
91
+
92
+ # Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
93
def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
):
    """
    Linearly interpolate the shift value for a given image sequence length.

    The line passes through `(base_seq_len, base_shift)` and `(max_seq_len, max_shift)`; values outside that
    range are extrapolated, not clamped.

    Args:
        image_seq_len: Sequence length to evaluate the line at.
        base_seq_len (`int`): Sequence length that maps to `base_shift`.
        max_seq_len (`int`): Sequence length that maps to `max_shift`.
        base_shift (`float`): Shift at `base_seq_len`.
        max_shift (`float`): Shift at `max_seq_len`.

    Returns:
        The interpolated shift `mu`.
    """
    slope = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    intercept = base_shift - slope * base_seq_len
    return image_seq_len * slope + intercept
104
+
105
+
106
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
107
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """
    Extract latents from an encoder output object.

    Args:
        encoder_output: Object exposing either a `latent_dist` or a `latents` attribute.
        generator (`torch.Generator`, *optional*): Passed to `latent_dist.sample` when sampling.
        sample_mode (`str`): `"sample"` draws from `latent_dist`, `"argmax"` takes `latent_dist.mode()`.

    Returns:
        The sampled latents, the distribution mode, or the stored `latents` attribute.

    Raises:
        AttributeError: If none of the supported attribute/mode combinations applies.
    """
    if hasattr(encoder_output, "latent_dist"):
        distribution = encoder_output.latent_dist
        if sample_mode == "sample":
            return distribution.sample(generator)
        if sample_mode == "argmax":
            return distribution.mode()

    # Reached when there is no distribution, or the mode is neither "sample" nor "argmax".
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents

    raise AttributeError("Could not access latents of provided encoder_output")
118
+
119
+
120
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
121
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Configure the scheduler through `set_timesteps` and return the resulting schedule. Supports custom
    timesteps or custom sigmas. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.

    Raises:
        ValueError: If both `timesteps` and `sigmas` are given, or if the scheduler's `set_timesteps` does not
            accept the custom argument that was given.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    # Default path: let the scheduler derive its own spacing.
    if timesteps is None and sigmas is None:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    accepted_params = inspect.signature(scheduler.set_timesteps).parameters

    if timesteps is not None:
        if "timesteps" not in accepted_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    else:
        if "sigmas" not in accepted_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)

    # With a custom schedule the step count is whatever the scheduler ended up with.
    schedule = scheduler.timesteps
    return schedule, len(schedule)
178
+
179
+
180
+ class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin, FluxIPAdapterMixin):
181
+ r"""
182
+ The Flux pipeline for text-to-image generation.
183
+
184
+ Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
185
+
186
+ Args:
187
+ transformer ([`FluxTransformer2DModel`]):
188
+ Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
189
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
190
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
191
+ vae ([`AutoencoderKL`]):
192
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
193
+ text_encoder ([`CLIPTextModel`]):
194
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
195
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
196
+ text_encoder_2 ([`T5EncoderModel`]):
197
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
198
+ the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
199
+ tokenizer (`CLIPTokenizer`):
200
+ Tokenizer of class
201
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
202
+ tokenizer_2 (`T5TokenizerFast`):
203
+ Second Tokenizer of class
204
+ [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
205
+ """
206
+
207
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae"
208
+ _optional_components = ["image_encoder", "feature_extractor"]
209
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "control_image"]
210
+
211
def __init__(
    self,
    scheduler: FlowMatchEulerDiscreteScheduler,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    text_encoder_2: T5EncoderModel,
    tokenizer_2: T5TokenizerFast,
    transformer: FluxTransformer2DModel,
    controlnet: Union[
        FluxControlNetModel, List[FluxControlNetModel], Tuple[FluxControlNetModel], FluxMultiControlNetModel
    ],
    image_encoder: CLIPVisionModelWithProjection = None,
    feature_extractor: CLIPImageProcessor = None,
):
    """
    Register the pipeline components and derive the image-processing constants.

    A list or tuple of ControlNets is wrapped into a single `FluxMultiControlNetModel` before registration.
    """
    super().__init__()

    if isinstance(controlnet, (list, tuple)):
        controlnet = FluxMultiControlNetModel(controlnet)

    # Keyword order is kept as-is; it is the order in which the components are registered.
    components = {
        "vae": vae,
        "text_encoder": text_encoder,
        "text_encoder_2": text_encoder_2,
        "tokenizer": tokenizer,
        "tokenizer_2": tokenizer_2,
        "transformer": transformer,
        "scheduler": scheduler,
        "controlnet": controlnet,
        "image_encoder": image_encoder,
        "feature_extractor": feature_extractor,
    }
    self.register_modules(**components)

    # Falls back to 8 when no VAE is registered.
    if getattr(self, "vae", None):
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    else:
        self.vae_scale_factor = 8

    # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
    # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)

    # Falls back to 77 when no tokenizer is registered.
    tokenizer_present = hasattr(self, "tokenizer") and self.tokenizer is not None
    self.tokenizer_max_length = self.tokenizer.model_max_length if tokenizer_present else 77

    self.default_sample_size = 128
250
+
251
def _get_t5_prompt_embeds(
    self,
    prompt: Union[str, List[str]] = None,
    num_images_per_prompt: int = 1,
    max_sequence_length: int = 512,
    device: Optional[torch.device] = None,
    dtype: Optional[torch.dtype] = None,
):
    """
    Encode `prompt` with the second tokenizer / text encoder (`tokenizer_2`, `text_encoder_2`).

    Args:
        prompt (`str` or `List[str]`): Prompt or prompts to encode.
        num_images_per_prompt (`int`): Number of times each prompt embedding is repeated.
        max_sequence_length (`int`): Length the prompt is padded / truncated to.
        device (`torch.device`, *optional*): Target device; defaults to `self._execution_device`.
        dtype (`torch.dtype`, *optional*): Accepted for signature compatibility. The embeddings are always
            cast to `self.text_encoder_2.dtype`, as before.

    Returns:
        `torch.Tensor`: Embeddings viewed as `(batch_size * num_images_per_prompt, seq_len, -1)`.
    """
    device = device or self._execution_device

    prompt = [prompt] if isinstance(prompt, str) else prompt
    batch_size = len(prompt)

    if isinstance(self, TextualInversionLoaderMixin):
        # The converted prompt is tokenized by `tokenizer_2` below, so convert against that tokenizer.
        prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)

    text_inputs = self.tokenizer_2(
        prompt,
        padding="max_length",
        max_length=max_sequence_length,
        truncation=True,
        return_length=False,
        return_overflowing_tokens=False,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids

    if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
        # Truncation happens at `max_sequence_length`, so the removed part starts there -- not at the
        # CLIP limit (`self.tokenizer_max_length`) that was used here before.
        removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
        logger.warning(
            "The following part of your input was truncated because `max_sequence_length` is set to "
            f" {max_sequence_length} tokens: {removed_text}"
        )

    prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]

    # The previous code first read `self.text_encoder.dtype` (failing when the CLIP encoder is absent) and
    # then unconditionally overwrote it with this value; only the effective assignment is kept.
    dtype = self.text_encoder_2.dtype
    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

    _, seq_len, _ = prompt_embeds.shape

    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

    return prompt_embeds
299
+
300
def _get_clip_prompt_embeds(
    self,
    prompt: Union[str, List[str]],
    num_images_per_prompt: int = 1,
    device: Optional[torch.device] = None,
):
    """
    Compute pooled prompt embeddings with the first tokenizer / text encoder (`tokenizer`, `text_encoder`).

    Args:
        prompt (`str` or `List[str]`): Prompt or prompts to encode.
        num_images_per_prompt (`int`): Number of times each pooled embedding is repeated.
        device (`torch.device`, *optional*): Target device; defaults to `self._execution_device`.

    Returns:
        `torch.Tensor`: Pooled embeddings viewed as `(batch_size * num_images_per_prompt, -1)`.
    """
    device = device or self._execution_device

    if isinstance(prompt, str):
        prompt = [prompt]
    batch_size = len(prompt)

    if isinstance(self, TextualInversionLoaderMixin):
        prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

    limit = self.tokenizer_max_length
    padded_ids = self.tokenizer(
        prompt,
        padding="max_length",
        max_length=limit,
        truncation=True,
        return_overflowing_tokens=False,
        return_length=False,
        return_tensors="pt",
    ).input_ids

    # Tokenize again without truncation to find out whether anything was cut off.
    full_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
    if full_ids.shape[-1] >= padded_ids.shape[-1] and not torch.equal(padded_ids, full_ids):
        removed_text = self.tokenizer.batch_decode(full_ids[:, limit - 1 : -1])
        logger.warning(
            "The following part of your input was truncated because CLIP can only handle sequences up to"
            f" {self.tokenizer_max_length} tokens: {removed_text}"
        )

    encoder_output = self.text_encoder(padded_ids.to(device), output_hidden_states=False)

    # Use pooled output of CLIPTextModel
    pooled = encoder_output.pooler_output.to(dtype=self.text_encoder.dtype, device=device)

    # duplicate text embeddings for each generation per prompt, using mps friendly method
    pooled = pooled.repeat(1, num_images_per_prompt)
    return pooled.view(batch_size * num_images_per_prompt, -1)
343
+
344
def encode_prompt(
    self,
    prompt: Union[str, List[str]],
    prompt_2: Union[str, List[str]],
    device: Optional[torch.device] = None,
    num_images_per_prompt: int = 1,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    max_sequence_length: int = 512,
    lora_scale: Optional[float] = None,
):
    r"""
    Encode the prompt(s) into T5 prompt embeddings, pooled CLIP embeddings and zero-filled text ids.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts passed to `_get_t5_prompt_embeds`. If falsy, `prompt` is used instead.
        device: (`torch.device`, *optional*):
            torch device; falls back to `self._execution_device` when not given.
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from the prompt inputs.
        pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated pooled text embeddings. Only recomputed (from `prompt`) when `prompt_embeds` is `None`;
            otherwise the value passed in is returned unchanged.
        max_sequence_length (`int`, defaults to 512):
            Forwarded to `_get_t5_prompt_embeds` as the maximum sequence length.
        lora_scale (`float`, *optional*):
            A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.

    Returns:
        `tuple`: `(prompt_embeds, pooled_prompt_embeds, text_ids)`, where `text_ids` is a zero tensor with
        `prompt_embeds.shape[1]` rows and 3 columns.
    """
    device = device or self._execution_device

    # Publish the LoRA scale on the pipeline so the monkey patched LoRA
    # function of the text encoders can read it.
    if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
        self._lora_scale = lora_scale

        # Dynamically adjust the LoRA scale of whichever text encoders are present.
        for encoder in (self.text_encoder, self.text_encoder_2):
            if encoder is not None and USE_PEFT_BACKEND:
                scale_lora_layers(encoder, lora_scale)

    if isinstance(prompt, str):
        prompt = [prompt]

    if prompt_embeds is None:
        prompt_2 = prompt_2 or prompt
        if isinstance(prompt_2, str):
            prompt_2 = [prompt_2]

        # Only the pooled output of the CLIP text model is used.
        pooled_prompt_embeds = self._get_clip_prompt_embeds(
            prompt=prompt,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
        )
        prompt_embeds = self._get_t5_prompt_embeds(
            prompt=prompt_2,
            num_images_per_prompt=num_images_per_prompt,
            max_sequence_length=max_sequence_length,
            device=device,
        )

    # Retrieve the original scale by scaling back the LoRA layers.
    for encoder in (self.text_encoder, self.text_encoder_2):
        if encoder is not None and isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
            unscale_lora_layers(encoder, lora_scale)

    if self.text_encoder is not None:
        dtype = self.text_encoder.dtype
    else:
        dtype = self.transformer.dtype
    text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)

    return prompt_embeds, pooled_prompt_embeds, text_ids
425
+
426
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_image
427
def encode_image(self, image, device, num_images_per_prompt):
    """Run `image` through `self.image_encoder` and repeat each embedding `num_images_per_prompt` times.

    Non-tensor inputs are first converted with `self.feature_extractor`; tensors are used as given.
    The input is cast to the dtype of the image encoder's first parameter before encoding.
    """
    encoder_dtype = next(self.image_encoder.parameters()).dtype

    if isinstance(image, torch.Tensor):
        pixel_values = image
    else:
        pixel_values = self.feature_extractor(image, return_tensors="pt").pixel_values

    pixel_values = pixel_values.to(device=device, dtype=encoder_dtype)
    embeds = self.image_encoder(pixel_values).image_embeds
    # Repeat along dim 0 so that copies of the same input stay adjacent.
    return embeds.repeat_interleave(num_images_per_prompt, dim=0)
437
+
438
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_ip_adapter_image_embeds
439
def prepare_ip_adapter_image_embeds(
    self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
):
    """Build the list of per-IP-Adapter image embeddings, tiled `num_images_per_prompt` times.

    When `ip_adapter_image_embeds` is given it is used directly; otherwise every entry of
    `ip_adapter_image` is encoded with `self.encode_image`. In both cases the number of entries
    must equal `self.transformer.encoder_hid_proj.num_ip_adapters`.

    Returns:
        `list`: one tensor per IP Adapter, concatenated `num_images_per_prompt` times along dim 0 and
        moved to `device`.

    Raises:
        ValueError: if the number of images / embeddings does not match the number of IP Adapters.
    """
    per_adapter_embeds = []
    if ip_adapter_image_embeds is not None:
        if not isinstance(ip_adapter_image_embeds, list):
            ip_adapter_image_embeds = [ip_adapter_image_embeds]

        if len(ip_adapter_image_embeds) != self.transformer.encoder_hid_proj.num_ip_adapters:
            raise ValueError(
                f"`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got {len(ip_adapter_image_embeds)} image embeds and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
            )

        per_adapter_embeds.extend(ip_adapter_image_embeds)
    else:
        if not isinstance(ip_adapter_image, list):
            ip_adapter_image = [ip_adapter_image]

        if len(ip_adapter_image) != self.transformer.encoder_hid_proj.num_ip_adapters:
            raise ValueError(
                f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
            )

        for adapter_image in ip_adapter_image:
            encoded = self.encode_image(adapter_image, device, 1)
            # Add a leading axis to each freshly encoded embedding.
            per_adapter_embeds.append(encoded[None, :])

    return [
        torch.cat([embeds] * num_images_per_prompt, dim=0).to(device=device)
        for embeds in per_adapter_embeds
    ]
474
+
475
def check_inputs(
    self,
    prompt,
    prompt_2,
    height,
    width,
    negative_prompt=None,
    negative_prompt_2=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    pooled_prompt_embeds=None,
    negative_pooled_prompt_embeds=None,
    callback_on_step_end_tensor_inputs=None,
    max_sequence_length=None,
):
    """Validate the user-facing call arguments.

    A `height`/`width` that is not a multiple of `self.vae_scale_factor * 2` only produces a warning.
    Every other violation raises.

    Raises:
        ValueError: for unknown callback tensor names, for a prompt passed together with its pre-computed
            embeddings (positive or negative), for a missing prompt, for a prompt of the wrong type, for
            positive/negative embeddings of different shapes, for embeddings passed without their pooled
            counterpart, or for `max_sequence_length` above 512.
    """
    multiple = self.vae_scale_factor * 2
    if height % multiple != 0 or width % multiple != 0:
        logger.warning(
            f"`height` and `width` have to be divisible by {multiple} but are {height} and {width}. Dimensions will be resized accordingly"
        )

    if callback_on_step_end_tensor_inputs is not None and any(
        k not in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
    ):
        raise ValueError(
            f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
        )

    # Positive prompt checks. Each guard raises, so they behave like an if/elif chain.
    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if prompt_2 is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if prompt is not None and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
    if prompt_2 is not None and not isinstance(prompt_2, (str, list)):
        raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")

    # Negative prompt checks.
    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )
    if negative_prompt_2 is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    both_embeds_given = prompt_embeds is not None and negative_prompt_embeds is not None
    if both_embeds_given and prompt_embeds.shape != negative_prompt_embeds.shape:
        raise ValueError(
            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
            f" {negative_prompt_embeds.shape}."
        )

    if prompt_embeds is not None and pooled_prompt_embeds is None:
        raise ValueError(
            "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
        )
    if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
        raise ValueError(
            "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
        )

    if max_sequence_length is not None and max_sequence_length > 512:
        raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
551
+
552
@staticmethod
# Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
    """Return a `(height * width, 3)` tensor of position ids for a `height` x `width` grid.

    Column 0 stays zero, column 1 holds the row index and column 2 the column index of each grid
    cell, in row-major order. `batch_size` is accepted but not used.
    """
    grid_ids = torch.zeros(height, width, 3)
    grid_ids[..., 1] += torch.arange(height)[:, None]
    grid_ids[..., 2] += torch.arange(width)[None, :]

    # Flatten the two spatial axes into a single sequence axis.
    rows, cols, channels = grid_ids.shape
    flat_ids = grid_ids.reshape(rows * cols, channels)

    return flat_ids.to(device=device, dtype=dtype)
566
+
567
@staticmethod
# Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
def _pack_latents(latents, batch_size, num_channels_latents, height, width):
    """Fold every 2x2 spatial patch of `latents` into the channel axis.

    The input is viewed as `(batch_size, num_channels_latents, height, width)` and the result has shape
    `(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)`.
    """
    half_height = height // 2
    half_width = width // 2

    patches = latents.view(batch_size, num_channels_latents, half_height, 2, half_width, 2)
    # Move both patch-grid axes ahead of the channel axis and the in-patch axes.
    patches = patches.permute(0, 2, 4, 1, 3, 5)
    return patches.reshape(batch_size, half_height * half_width, num_channels_latents * 4)
575
+
576
@staticmethod
# Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
def _unpack_latents(latents, height, width, vae_scale_factor):
    """Turn packed `(batch, num_patches, channels)` latents back into `(batch, channels // 4, h, w)`.

    `height` and `width` are first converted to latent sizes as `2 * (size // (vae_scale_factor * 2))`,
    which is always an even number.
    """
    batch_size, _, channels = latents.shape

    # The VAE compression and the 2x2 packing are both accounted for by this divisor.
    patch_multiple = vae_scale_factor * 2
    height = 2 * (int(height) // patch_multiple)
    width = 2 * (int(width) // patch_multiple)

    grid = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
    grid = grid.permute(0, 3, 1, 4, 2, 5)

    return grid.reshape(batch_size, channels // (2 * 2), height, width)
592
+
593
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_latents
594
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
):
    """Return `(latents, latent_image_ids)` for the denoising loop.

    If `latents` is supplied it is only moved to `device`/`dtype`; otherwise noise is drawn with
    `randn_tensor` and packed with `self._pack_latents`. The position ids are built by
    `self._prepare_latent_image_ids` for a grid of half the latent height and width.

    Raises:
        ValueError: if `generator` is a list whose length differs from `batch_size` (only checked when
            `latents` is not supplied).
    """
    # VAE applies 8x compression on images but we must also account for packing which requires
    # latent height and width to be divisible by 2.
    patch_multiple = self.vae_scale_factor * 2
    height = 2 * (int(height) // patch_multiple)
    width = 2 * (int(width) // patch_multiple)

    if latents is not None:
        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
        return latents.to(device=device, dtype=dtype), latent_image_ids

    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    noise_shape = (batch_size, num_channels_latents, height, width)
    noise = randn_tensor(noise_shape, generator=generator, device=device, dtype=dtype)
    packed_latents = self._pack_latents(noise, batch_size, num_channels_latents, height, width)

    latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)

    return packed_latents, latent_image_ids
628
+
629
+ # Copied from diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet.StableDiffusion3ControlNetPipeline.prepare_image
630
def prepare_image(
    self,
    image,
    width,
    height,
    batch_size,
    num_images_per_prompt,
    device,
    dtype,
    do_classifier_free_guidance=False,
    guess_mode=False,
):
    """Preprocess and batch the control image.

    Tensors are used as given; anything else goes through `self.image_processor.preprocess` with the
    requested `height` and `width`. A single image is repeated `batch_size` times, a batch of images
    `num_images_per_prompt` times each. The batch is duplicated once more when
    `do_classifier_free_guidance` is set and `guess_mode` is not.
    """
    if not isinstance(image, torch.Tensor):
        image = self.image_processor.preprocess(image, height=height, width=width)

    # A batch larger than one is treated as already matching the prompt batch size.
    repeat_by = batch_size if image.shape[0] == 1 else num_images_per_prompt

    image = image.repeat_interleave(repeat_by, dim=0).to(device=device, dtype=dtype)

    if do_classifier_free_guidance and not guess_mode:
        image = torch.cat([image, image])

    return image
663
+
664
@property
def guidance_scale(self):
    """Guidance scale stored on the pipeline as `_guidance_scale`."""
    return self._guidance_scale
667
+
668
@property
def joint_attention_kwargs(self):
    """Joint-attention kwargs stored on the pipeline as `_joint_attention_kwargs`."""
    return self._joint_attention_kwargs
671
+
672
@property
def num_timesteps(self):
    """Number of timesteps stored on the pipeline as `_num_timesteps`."""
    return self._num_timesteps
675
+
676
@property
def interrupt(self):
    """Interrupt flag stored on the pipeline as `_interrupt`."""
    return self._interrupt
679
+
680
+ @torch.no_grad()
681
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
682
+ def __call__(
683
+ self,
684
+ prompt: Union[str, List[str]] = None,
685
+ prompt_2: Optional[Union[str, List[str]]] = None,
686
+ negative_prompt: Union[str, List[str]] = None,
687
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
688
+ true_cfg_scale: float = 1.0,
689
+ height: Optional[int] = None,
690
+ width: Optional[int] = None,
691
+ num_inference_steps: int = 28,
692
+ sigmas: Optional[List[float]] = None,
693
+ guidance_scale: float = 7.0,
694
+ control_guidance_start: Union[float, List[float]] = 0.0,
695
+ control_guidance_end: Union[float, List[float]] = 1.0,
696
+ control_image: PipelineImageInput = None,
697
+ control_mode: Optional[Union[int, List[int]]] = None,
698
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
699
+ num_images_per_prompt: Optional[int] = 1,
700
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
701
+ latents: Optional[torch.FloatTensor] = None,
702
+ prompt_embeds: Optional[torch.FloatTensor] = None,
703
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
704
+ ip_adapter_image: Optional[PipelineImageInput] = None,
705
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
706
+ negative_ip_adapter_image: Optional[PipelineImageInput] = None,
707
+ negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
708
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
709
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
710
+ output_type: Optional[str] = "pil",
711
+ return_dict: bool = True,
712
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
713
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
714
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
715
+ max_sequence_length: int = 512,
716
+ ):
717
+ r"""
718
+ Function invoked when calling the pipeline for generation.
719
+
720
+ Args:
721
+ prompt (`str` or `List[str]`, *optional*):
722
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
723
+ instead.
724
+ prompt_2 (`str` or `List[str]`, *optional*):
725
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
726
+ will be used instead
727
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
728
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
729
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
730
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
731
+ num_inference_steps (`int`, *optional*, defaults to 28):
732
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
733
+ expense of slower inference.
734
+ sigmas (`List[float]`, *optional*):
735
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
736
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
737
+ will be used.
738
+ guidance_scale (`float`, *optional*, defaults to 7.0):
739
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
740
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
741
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
742
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
743
+ usually at the expense of lower image quality.
744
+ control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
745
+ The percentage of total steps at which the ControlNet starts applying.
746
+ control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
747
+ The percentage of total steps at which the ControlNet stops applying.
748
+ control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
749
+ `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
750
+ The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
751
+ specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
752
+ as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
753
+ width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
754
+ images must be passed as a list such that each element of the list can be correctly batched for input
755
+ to a single ControlNet.
756
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
757
+ The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
758
+ to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
759
+ the corresponding scale as a list.
760
+ control_mode (`int` or `List[int]`, *optional*, defaults to None):
761
+ The control mode when applying ControlNet-Union.
762
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
763
+ The number of images to generate per prompt.
764
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
765
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
766
+ to make generation deterministic.
767
+ latents (`torch.FloatTensor`, *optional*):
768
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
769
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
770
+ tensor will be generated by sampling using the supplied random `generator`.
771
+ prompt_embeds (`torch.FloatTensor`, *optional*):
772
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
773
+ provided, text embeddings will be generated from `prompt` input argument.
774
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
775
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
776
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
777
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
778
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
779
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
780
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
781
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
782
+ negative_ip_adapter_image:
783
+ (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
784
+ negative_ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
785
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
786
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
787
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
788
+ output_type (`str`, *optional*, defaults to `"pil"`):
789
+ The output format of the generate image. Choose between
790
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
791
+ return_dict (`bool`, *optional*, defaults to `True`):
792
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
793
+ joint_attention_kwargs (`dict`, *optional*):
794
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
795
+ `self.processor` in
796
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
797
+ callback_on_step_end (`Callable`, *optional*):
798
+ A function that calls at the end of each denoising steps during the inference. The function is called
799
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
800
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
801
+ `callback_on_step_end_tensor_inputs`.
802
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
803
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
804
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
805
+ `._callback_tensor_inputs` attribute of your pipeline class.
806
+ max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
807
+
808
+ Examples:
809
+
810
+ Returns:
811
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
812
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
813
+ images.
814
+ """
815
+
816
+ height = height or self.default_sample_size * self.vae_scale_factor
817
+ width = width or self.default_sample_size * self.vae_scale_factor
818
+
819
+ if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
820
+ control_guidance_start = len(control_guidance_end) * [control_guidance_start]
821
+ elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
822
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
823
+ elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
824
+ mult = len(self.controlnet.nets) if isinstance(self.controlnet, FluxMultiControlNetModel) else 1
825
+ control_guidance_start, control_guidance_end = (
826
+ mult * [control_guidance_start],
827
+ mult * [control_guidance_end],
828
+ )
829
+
830
+ # 1. Check inputs. Raise error if not correct
831
+ self.check_inputs(
832
+ prompt,
833
+ prompt_2,
834
+ height,
835
+ width,
836
+ negative_prompt=negative_prompt,
837
+ negative_prompt_2=negative_prompt_2,
838
+ prompt_embeds=prompt_embeds,
839
+ negative_prompt_embeds=negative_prompt_embeds,
840
+ pooled_prompt_embeds=pooled_prompt_embeds,
841
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
842
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
843
+ max_sequence_length=max_sequence_length,
844
+ )
845
+
846
+ self._guidance_scale = guidance_scale
847
+ self._joint_attention_kwargs = joint_attention_kwargs
848
+ self._interrupt = False
849
+
850
+ # 2. Define call parameters
851
+ if prompt is not None and isinstance(prompt, str):
852
+ batch_size = 1
853
+ elif prompt is not None and isinstance(prompt, list):
854
+ batch_size = len(prompt)
855
+ else:
856
+ batch_size = prompt_embeds.shape[0]
857
+
858
+ device = self._execution_device
859
+ dtype = self.transformer.dtype
860
+
861
+ # 3. Prepare text embeddings
862
+ lora_scale = (
863
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
864
+ )
865
+ do_true_cfg = true_cfg_scale > 1 and negative_prompt is not None
866
+ (
867
+ prompt_embeds,
868
+ pooled_prompt_embeds,
869
+ text_ids,
870
+ ) = self.encode_prompt(
871
+ prompt=prompt,
872
+ prompt_2=prompt_2,
873
+ prompt_embeds=prompt_embeds,
874
+ pooled_prompt_embeds=pooled_prompt_embeds,
875
+ device=device,
876
+ num_images_per_prompt=num_images_per_prompt,
877
+ max_sequence_length=max_sequence_length,
878
+ lora_scale=lora_scale,
879
+ )
880
+ if do_true_cfg:
881
+ (
882
+ negative_prompt_embeds,
883
+ negative_pooled_prompt_embeds,
884
+ _,
885
+ ) = self.encode_prompt(
886
+ prompt=negative_prompt,
887
+ prompt_2=negative_prompt_2,
888
+ prompt_embeds=negative_prompt_embeds,
889
+ pooled_prompt_embeds=negative_pooled_prompt_embeds,
890
+ device=device,
891
+ num_images_per_prompt=num_images_per_prompt,
892
+ max_sequence_length=max_sequence_length,
893
+ lora_scale=lora_scale,
894
+ )
895
+
896
+ # 3. Prepare control image
897
+ num_channels_latents = self.transformer.config.in_channels // 4
898
+ if isinstance(self.controlnet, FluxControlNetModel):
899
+ control_image = self.prepare_image(
900
+ image=control_image,
901
+ width=width,
902
+ height=height,
903
+ batch_size=batch_size * num_images_per_prompt,
904
+ num_images_per_prompt=num_images_per_prompt,
905
+ device=device,
906
+ dtype=self.vae.dtype,
907
+ )
908
+ height, width = control_image.shape[-2:]
909
+
910
+ # xlab controlnet has a input_hint_block and instantx controlnet does not
911
+ controlnet_blocks_repeat = False if self.controlnet.input_hint_block is None else True
912
+ if self.controlnet.input_hint_block is None:
913
+ # vae encode
914
+ control_image = retrieve_latents(self.vae.encode(control_image), generator=generator)
915
+ control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor
916
+
917
+ # pack
918
+ height_control_image, width_control_image = control_image.shape[2:]
919
+ control_image = self._pack_latents(
920
+ control_image,
921
+ batch_size * num_images_per_prompt,
922
+ num_channels_latents,
923
+ height_control_image,
924
+ width_control_image,
925
+ )
926
+
927
+ # Here we ensure that `control_mode` has the same length as the control_image.
928
+ if control_mode is not None:
929
+ if not isinstance(control_mode, int):
930
+ raise ValueError(" For `FluxControlNet`, `control_mode` should be an `int` or `None`")
931
+ control_mode = torch.tensor(control_mode).to(device, dtype=torch.long)
932
+ control_mode = control_mode.view(-1, 1).expand(control_image.shape[0], 1)
933
+
934
+ elif isinstance(self.controlnet, FluxMultiControlNetModel):
935
+ control_images = []
936
+ # xlab controlnet has a input_hint_block and instantx controlnet does not
937
+ controlnet_blocks_repeat = False if self.controlnet.nets[0].input_hint_block is None else True
938
+ for i, control_image_ in enumerate(control_image):
939
+ control_image_ = self.prepare_image(
940
+ image=control_image_,
941
+ width=width,
942
+ height=height,
943
+ batch_size=batch_size * num_images_per_prompt,
944
+ num_images_per_prompt=num_images_per_prompt,
945
+ device=device,
946
+ dtype=self.vae.dtype,
947
+ )
948
+ height, width = control_image_.shape[-2:]
949
+
950
+ if self.controlnet.nets[0].input_hint_block is None:
951
+ # vae encode
952
+ control_image_ = retrieve_latents(self.vae.encode(control_image_), generator=generator)
953
+ control_image_ = (control_image_ - self.vae.config.shift_factor) * self.vae.config.scaling_factor
954
+
955
+ # pack
956
+ height_control_image, width_control_image = control_image_.shape[2:]
957
+ control_image_ = self._pack_latents(
958
+ control_image_,
959
+ batch_size * num_images_per_prompt,
960
+ num_channels_latents,
961
+ height_control_image,
962
+ width_control_image,
963
+ )
964
+ control_images.append(control_image_)
965
+
966
+ control_image = control_images
967
+
968
+ # Here we ensure that `control_mode` has the same length as the control_image.
969
+ if isinstance(control_mode, list) and len(control_mode) != len(control_image):
970
+ raise ValueError(
971
+ "For Multi-ControlNet, `control_mode` must be a list of the same "
972
+ + " length as the number of controlnets (control images) specified"
973
+ )
974
+ if not isinstance(control_mode, list):
975
+ control_mode = [control_mode] * len(control_image)
976
+ # set control mode
977
+ control_modes = []
978
+ for cmode in control_mode:
979
+ if cmode is None:
980
+ cmode = -1
981
+ control_mode = torch.tensor(cmode).expand(control_images[0].shape[0]).to(device, dtype=torch.long)
982
+ control_modes.append(control_mode)
983
+ control_mode = control_modes
984
+
985
+ # 4. Prepare latent variables
986
+ num_channels_latents = self.transformer.config.in_channels // 4
987
+ latents, latent_image_ids = self.prepare_latents(
988
+ batch_size * num_images_per_prompt,
989
+ num_channels_latents,
990
+ height,
991
+ width,
992
+ prompt_embeds.dtype,
993
+ device,
994
+ generator,
995
+ latents,
996
+ )
997
+
998
+ # 5. Prepare timesteps
999
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
1000
+ image_seq_len = latents.shape[1]
1001
+ mu = calculate_shift(
1002
+ image_seq_len,
1003
+ self.scheduler.config.get("base_image_seq_len", 256),
1004
+ self.scheduler.config.get("max_image_seq_len", 4096),
1005
+ self.scheduler.config.get("base_shift", 0.5),
1006
+ self.scheduler.config.get("max_shift", 1.15),
1007
+ )
1008
+ timesteps, num_inference_steps = retrieve_timesteps(
1009
+ self.scheduler,
1010
+ num_inference_steps,
1011
+ device,
1012
+ sigmas=sigmas,
1013
+ mu=mu,
1014
+ )
1015
+
1016
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
1017
+ self._num_timesteps = len(timesteps)
1018
+
1019
+ # 6. Create tensor stating which controlnets to keep
1020
+ controlnet_keep = []
1021
+ for i in range(len(timesteps)):
1022
+ keeps = [
1023
+ 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
1024
+ for s, e in zip(control_guidance_start, control_guidance_end)
1025
+ ]
1026
+ controlnet_keep.append(keeps[0] if isinstance(self.controlnet, FluxControlNetModel) else keeps)
1027
+
1028
+ if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and (
1029
+ negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
1030
+ ):
1031
+ negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
1032
+ elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
1033
+ negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
1034
+ ):
1035
+ ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
1036
+
1037
+ if self.joint_attention_kwargs is None:
1038
+ self._joint_attention_kwargs = {}
1039
+
1040
+ image_embeds = None
1041
+ negative_image_embeds = None
1042
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1043
+ image_embeds = self.prepare_ip_adapter_image_embeds(
1044
+ ip_adapter_image,
1045
+ ip_adapter_image_embeds,
1046
+ device,
1047
+ batch_size * num_images_per_prompt,
1048
+ )
1049
+ if negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None:
1050
+ negative_image_embeds = self.prepare_ip_adapter_image_embeds(
1051
+ negative_ip_adapter_image,
1052
+ negative_ip_adapter_image_embeds,
1053
+ device,
1054
+ batch_size * num_images_per_prompt,
1055
+ )
1056
+
1057
+ # 7. Denoising loop
1058
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1059
+ for i, t in enumerate(timesteps):
1060
+ if self.interrupt:
1061
+ continue
1062
+
1063
+ if image_embeds is not None:
1064
+ self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds
1065
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
1066
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
1067
+
1068
+ if isinstance(self.controlnet, FluxMultiControlNetModel):
1069
+ use_guidance = self.controlnet.nets[0].config.guidance_embeds
1070
+ else:
1071
+ use_guidance = self.controlnet.config.guidance_embeds
1072
+
1073
+ guidance = torch.tensor([guidance_scale], device=device) if use_guidance else None
1074
+ guidance = guidance.expand(latents.shape[0]) if guidance is not None else None
1075
+
1076
+ if isinstance(controlnet_keep[i], list):
1077
+ cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
1078
+ else:
1079
+ controlnet_cond_scale = controlnet_conditioning_scale
1080
+ if isinstance(controlnet_cond_scale, list):
1081
+ controlnet_cond_scale = controlnet_cond_scale[0]
1082
+ cond_scale = controlnet_cond_scale * controlnet_keep[i]
1083
+
1084
+ # controlnet
1085
+ controlnet_block_samples, controlnet_single_block_samples = self.controlnet(
1086
+ hidden_states=latents,
1087
+ controlnet_cond=control_image,
1088
+ controlnet_mode=control_mode,
1089
+ conditioning_scale=cond_scale,
1090
+ timestep=timestep / 1000,
1091
+ guidance=guidance,
1092
+ pooled_projections=pooled_prompt_embeds,
1093
+ encoder_hidden_states=prompt_embeds,
1094
+ txt_ids=text_ids,
1095
+ img_ids=latent_image_ids,
1096
+ joint_attention_kwargs=self.joint_attention_kwargs,
1097
+ return_dict=False,
1098
+ )
1099
+
1100
+ guidance = (
1101
+ torch.tensor([guidance_scale], device=device) if self.transformer.config.guidance_embeds else None
1102
+ )
1103
+ guidance = guidance.expand(latents.shape[0]) if guidance is not None else None
1104
+
1105
+ noise_pred = self.transformer(
1106
+ hidden_states=latents,
1107
+ timestep=timestep / 1000,
1108
+ guidance=guidance,
1109
+ pooled_projections=pooled_prompt_embeds,
1110
+ encoder_hidden_states=prompt_embeds,
1111
+ controlnet_block_samples=controlnet_block_samples,
1112
+ controlnet_single_block_samples=controlnet_single_block_samples,
1113
+ txt_ids=text_ids,
1114
+ img_ids=latent_image_ids,
1115
+ joint_attention_kwargs=self.joint_attention_kwargs,
1116
+ return_dict=False,
1117
+ controlnet_blocks_repeat=controlnet_blocks_repeat,
1118
+ )[0]
1119
+
1120
+ if do_true_cfg:
1121
+ if negative_image_embeds is not None:
1122
+ self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
1123
+ neg_noise_pred = self.transformer(
1124
+ hidden_states=latents,
1125
+ timestep=timestep / 1000,
1126
+ guidance=guidance,
1127
+ pooled_projections=negative_pooled_prompt_embeds,
1128
+ encoder_hidden_states=negative_prompt_embeds,
1129
+ controlnet_block_samples=controlnet_block_samples,
1130
+ controlnet_single_block_samples=controlnet_single_block_samples,
1131
+ txt_ids=text_ids,
1132
+ img_ids=latent_image_ids,
1133
+ joint_attention_kwargs=self.joint_attention_kwargs,
1134
+ return_dict=False,
1135
+ controlnet_blocks_repeat=controlnet_blocks_repeat,
1136
+ )[0]
1137
+ noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
1138
+
1139
+ # compute the previous noisy sample x_t -> x_t-1
1140
+ latents_dtype = latents.dtype
1141
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
1142
+
1143
+ if latents.dtype != latents_dtype:
1144
+ if torch.backends.mps.is_available():
1145
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
1146
+ latents = latents.to(latents_dtype)
1147
+
1148
+ if callback_on_step_end is not None:
1149
+ callback_kwargs = {}
1150
+ for k in callback_on_step_end_tensor_inputs:
1151
+ callback_kwargs[k] = locals()[k]
1152
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1153
+
1154
+ latents = callback_outputs.pop("latents", latents)
1155
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1156
+ control_image = callback_outputs.pop("control_image", control_image)
1157
+
1158
+ # update the progress bar on the last step, or after warmup on every scheduler-order boundary
1159
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1160
+ progress_bar.update()
1161
+
1162
+ if XLA_AVAILABLE:
1163
+ xm.mark_step()
1164
+
1165
+ if output_type == "latent":
1166
+ image = latents
1167
+
1168
+ else:
1169
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
1170
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
1171
+
1172
+ image = self.vae.decode(latents, return_dict=False)[0]
1173
+ image = self.image_processor.postprocess(image, output_type=output_type)
1174
+
1175
+ # Offload all models
1176
+ self.maybe_free_model_hooks()
1177
+
1178
+ if not return_dict:
1179
+ return (image,)
1180
+
1181
+ return FluxPipelineOutput(images=image)