vijayakumaran92 committed
Commit e7273b8 · verified · 1 Parent(s): 9299a88

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the rest.

Files changed (50)
  1. .gitattributes +15 -0
  2. Person_1.png +3 -0
  3. Person_1_Garment.png +3 -0
  4. Person_1_Mask.png +0 -0
  5. Reference_models/Huggin_Face_Script +18 -0
  6. Reference_models/Huggin_Face_Script.py +18 -0
  7. Reference_models/clip_l/config.json +171 -0
  8. Reference_models/clip_l/merges.txt +0 -0
  9. Reference_models/clip_l/preprocessor_config.json +19 -0
  10. Reference_models/clip_l/special_tokens_map.json +1 -0
  11. Reference_models/clip_l/tokenizer.json +0 -0
  12. Reference_models/clip_l/tokenizer_config.json +34 -0
  13. Reference_models/clip_l/vocab.json +0 -0
  14. Training_Data/Inperson_3-Mask-V1.jpg +0 -0
  15. Training_Data/Inperson_3-V1.jpg +0 -0
  16. Training_Data/Inperson_4-Garment-V2.jpg +3 -0
  17. Training_Data/Inperson_4-Mask-V2.jpg +0 -0
  18. Training_Data/Inperson_4-V2.jpg +3 -0
  19. Training_Data/Inperson_5-Mask-V2.jpg +0 -0
  20. Training_Data/Inperson_5-Mask-V2.png +3 -0
  21. Training_Data/Inperson_7-Garment-V2.jpg +3 -0
  22. Training_Data/Inperson_7-Mask-V2.jpg +0 -0
  23. Training_Data/Inperson_7-Mask-V2.png +0 -0
  24. Training_Data/Pinaperson_1-Mask-V1.jpg +0 -0
  25. Training_Data/Pinaperson_2-Mask-V1.jpg +0 -0
  26. Training_Data/Pinaperson_3-Mask-V1.jpg +0 -0
  27. Training_Data/Pinaperson_4-Mask-V1.jpg +0 -0
  28. Training_Data/TigcPerson_2-Garment.jpg +3 -0
  29. Training_Data/TigcPerson_3-Garment.jpg +3 -0
  30. Training_Data/TigcPerson_4-Garment.jpg +3 -0
  31. Training_Data/TigcPerson_4-Mask.jpg +3 -0
  32. Training_Data/Venusperson_1-Mask.jpg +0 -0
  33. Training_Data/Venusperson_1.jpg +3 -0
  34. Training_Data/Venusperson_10-Mask.jpg +0 -0
  35. Training_Data/Venusperson_11-Mask.jpg +0 -0
  36. Training_Data/Venusperson_12-Mask.jpg +0 -0
  37. Training_Data/Venusperson_2-Mask.jpg +0 -0
  38. Training_Data/Venusperson_2.jpg +3 -0
  39. Training_Data/Venusperson_3-Mask.jpg +0 -0
  40. Training_Data/Venusperson_4-Mask.jpg +0 -0
  41. Training_Data/Venusperson_5-Garment.jpg +3 -0
  42. Training_Data/Venusperson_5-Mask.jpg +0 -0
  43. Training_Data/Venusperson_5.jpg +3 -0
  44. Training_Data/Venusperson_6-Mask.jpg +0 -0
  45. Training_Data/Venusperson_7-Garment.jpg +3 -0
  46. Training_Data/Venusperson_7-Mask.jpg +0 -0
  47. Training_Data/Venusperson_8-Mask.jpg +0 -0
  48. Training_Data/Venusperson_9-Mask.jpg +0 -0
  49. Unmodel_training.sh +28 -0
  50. ace_plus_dataset_bkp.py +279 -0
.gitattributes CHANGED
@@ -33,3 +33,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Training_Data/TigcPerson_2-Garment.jpg filter=lfs diff=lfs merge=lfs -text
+ Training_Data/TigcPerson_4-Garment.jpg filter=lfs diff=lfs merge=lfs -text
+ Person_1.png filter=lfs diff=lfs merge=lfs -text
+ Training_Data/Inperson_4-V2.jpg filter=lfs diff=lfs merge=lfs -text
+ Training_Data/Inperson_4-Garment-V2.jpg filter=lfs diff=lfs merge=lfs -text
+ Person_1_Garment.png filter=lfs diff=lfs merge=lfs -text
+ Training_Data/Venusperson_5.jpg filter=lfs diff=lfs merge=lfs -text
+ Training_Data/TigcPerson_3-Garment.jpg filter=lfs diff=lfs merge=lfs -text
+ Training_Data/Inperson_5-Mask-V2.png filter=lfs diff=lfs merge=lfs -text
+ Training_Data/Venusperson_5-Garment.jpg filter=lfs diff=lfs merge=lfs -text
+ Training_Data/TigcPerson_4-Mask.jpg filter=lfs diff=lfs merge=lfs -text
+ Training_Data/Inperson_7-Garment-V2.jpg filter=lfs diff=lfs merge=lfs -text
+ Training_Data/Venusperson_7-Garment.jpg filter=lfs diff=lfs merge=lfs -text
+ Training_Data/Venusperson_1.jpg filter=lfs diff=lfs merge=lfs -text
+ Training_Data/Venusperson_2.jpg filter=lfs diff=lfs merge=lfs -text
Person_1.png ADDED

Git LFS Details

  • SHA256: 8dddceee0164e391fa185495ee833857b1210ffe129dce738a27d7b627d31006
  • Pointer size: 131 Bytes
  • Size of remote file: 256 kB
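The "Pointer size" above is the length of the small text stub that Git actually stores for an LFS-tracked file. As a sketch, the pointer committed for Person_1.png would look like the following (the exact byte count in the last line is an assumption, since only the rounded 256 kB figure is reported above):

version https://git-lfs.github.com/spec/v1
oid sha256:8dddceee0164e391fa185495ee833857b1210ffe129dce738a27d7b627d31006
size 256000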
Person_1_Garment.png ADDED

Git LFS Details

  • SHA256: 8c598d183bba37b82718a1a17fa10804be2e65ef5425683fe5dbbffc65afecda
  • Pointer size: 132 Bytes
  • Size of remote file: 1.75 MB
Person_1_Mask.png ADDED
Reference_models/Huggin_Face_Script ADDED
@@ -0,0 +1,18 @@
+ from huggingface_hub import snapshot_download
+
+ # 1a) Download only the metadata files (config and tokenizer) for T5-XXL
+ snapshot_download(
+     repo_id="google/t5-xxl-lm-adapt",
+     repo_type="model",
+     local_dir="Reference_models/t5_xxl_meta",
+     allow_patterns=["config.json", "tokenizer_config.json", "spiece.model"]
+ )
+
+ # 1b) Download only the metadata files (config and tokenizer) for CLIP-L
+ snapshot_download(
+     repo_id="openai/clip-vit-large-patch14",
+     repo_type="model",
+     local_dir="Reference_models/clip_l_meta",
+     allow_patterns=["config.json", "tokenizer_config.json", "vocab.json", "merges.txt"]
+ )
+
Reference_models/Huggin_Face_Script.py ADDED
@@ -0,0 +1,18 @@
+ from huggingface_hub import snapshot_download
+
+ # 1a) Download only the metadata files (config and tokenizer) for T5-XXL
+ snapshot_download(
+     repo_id="google/t5-xxl-lm-adapt",
+     repo_type="model",
+     local_dir="Reference_models/t5_xxl_meta",
+     allow_patterns=["config.json", "tokenizer_config.json", "spiece.model"]
+ )
+
+ # 1b) Download only the metadata files (config and tokenizer) for CLIP-L
+ snapshot_download(
+     repo_id="openai/clip-vit-large-patch14",
+     repo_type="model",
+     local_dir="Reference_models/clip_l_meta",
+     allow_patterns=["config.json", "tokenizer_config.json", "vocab.json", "merges.txt"]
+ )
+
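A quick way to verify the snapshots is to load a tokenizer back from one of the local directories. A minimal sketch, assuming the script above has completed and that transformers and sentencepiece are installed:

from transformers import T5Tokenizer

# spiece.model plus tokenizer_config.json is enough to rebuild the T5 tokenizer
t5_tok = T5Tokenizer.from_pretrained("Reference_models/t5_xxl_meta")
print(t5_tok("a quick sanity check").input_ids)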
Reference_models/clip_l/config.json ADDED
@@ -0,0 +1,171 @@
+ {
+   "_name_or_path": "clip-vit-large-patch14/",
+   "architectures": [
+     "CLIPModel"
+   ],
+   "initializer_factor": 1.0,
+   "logit_scale_init_value": 2.6592,
+   "model_type": "clip",
+   "projection_dim": 768,
+   "text_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "bos_token_id": 0,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.0,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 2,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "quick_gelu",
+     "hidden_size": 768,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 77,
+     "min_length": 0,
+     "model_type": "clip_text_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 12,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 1,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 768,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.16.0.dev0",
+     "use_bfloat16": false,
+     "vocab_size": 49408
+   },
+   "text_config_dict": {
+     "hidden_size": 768,
+     "intermediate_size": 3072,
+     "num_attention_heads": 12,
+     "num_hidden_layers": 12,
+     "projection_dim": 768
+   },
+   "torch_dtype": "float32",
+   "transformers_version": null,
+   "vision_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.0,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "quick_gelu",
+     "hidden_size": 1024,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 224,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 4096,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "clip_vision_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 24,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 768,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.16.0.dev0",
+     "use_bfloat16": false
+   },
+   "vision_config_dict": {
+     "hidden_size": 1024,
+     "intermediate_size": 4096,
+     "num_attention_heads": 16,
+     "num_hidden_layers": 24,
+     "patch_size": 14,
+     "projection_dim": 768
+   }
+ }
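Note that only configuration and tokenizer files are committed under Reference_models/clip_l; there are no model weights in this repo. A minimal sketch of building the text encoder from this config alone (the resulting module is randomly initialized, so a checkpoint must be loaded separately):

from transformers import CLIPConfig, CLIPTextModel

cfg = CLIPConfig.from_pretrained("Reference_models/clip_l")
text_encoder = CLIPTextModel(cfg.text_config)  # random weights; no checkpoint in this repo
print(text_encoder.config.hidden_size)  # 768, matching text_config above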
Reference_models/clip_l/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
Reference_models/clip_l/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "crop_size": 224,
+   "do_center_crop": true,
+   "do_normalize": true,
+   "do_resize": true,
+   "feature_extractor_type": "CLIPFeatureExtractor",
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "size": 224
+ }
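The same preprocessing can be reproduced with plain torchvision transforms. A sketch equivalent to this config, assuming RGB PIL input (resample=3 is PIL's bicubic filter):

import torchvision.transforms as T

preprocess = T.Compose([
    T.Resize(224, interpolation=T.InterpolationMode.BICUBIC),  # do_resize, size=224, resample=3
    T.CenterCrop(224),                                         # do_center_crop, crop_size=224
    T.ToTensor(),
    T.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],      # image_mean
                std=[0.26862954, 0.26130258, 0.27577711]),     # image_std
])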
Reference_models/clip_l/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": "<|endoftext|>"}
Reference_models/clip_l/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Reference_models/clip_l/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true,
+     "__type": "AddedToken"
+   },
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true,
+     "__type": "AddedToken"
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "single_word": false,
+     "lstrip": false,
+     "rstrip": false,
+     "normalized": true,
+     "__type": "AddedToken"
+   },
+   "pad_token": "<|endoftext|>",
+   "add_prefix_space": false,
+   "errors": "replace",
+   "do_lower_case": true,
+   "name_or_path": "openai/clip-vit-base-patch32",
+   "model_max_length": 77,
+   "special_tokens_map_file": "./special_tokens_map.json",
+   "tokenizer_class": "CLIPTokenizer"
+ }
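The model_max_length of 77 here is the hard context limit of CLIP text encoders, so prompts are usually padded and truncated to exactly that length. A short sketch against this tokenizer directory, assuming transformers is installed:

from transformers import CLIPTokenizer

tok = CLIPTokenizer.from_pretrained("Reference_models/clip_l")
batch = tok("a person wearing the reference garment",
            padding="max_length", truncation=True,
            max_length=77, return_tensors="pt")
print(batch.input_ids.shape)  # torch.Size([1, 77])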
Reference_models/clip_l/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Training_Data/Inperson_3-Mask-V1.jpg ADDED
Training_Data/Inperson_3-V1.jpg ADDED
Training_Data/Inperson_4-Garment-V2.jpg ADDED

Git LFS Details

  • SHA256: 3247f3b9fcd2496250a1481cb0f4069e2e2f18b2bb1df8646a96ddd03715f429
  • Pointer size: 131 Bytes
  • Size of remote file: 179 kB
Training_Data/Inperson_4-Mask-V2.jpg ADDED
Training_Data/Inperson_4-V2.jpg ADDED

Git LFS Details

  • SHA256: 273fcc2ae2eff81b3226bbca9d3f8fd52146bcee432d1e56f489cdd48396e597
  • Pointer size: 131 Bytes
  • Size of remote file: 474 kB
Training_Data/Inperson_5-Mask-V2.jpg ADDED
Training_Data/Inperson_5-Mask-V2.png ADDED

Git LFS Details

  • SHA256: fbcb6e039ecd846af36f9b9d588d0e200a2553d80a0e79d81b3e135b0d414ae3
  • Pointer size: 131 Bytes
  • Size of remote file: 164 kB
Training_Data/Inperson_7-Garment-V2.jpg ADDED

Git LFS Details

  • SHA256: 68b55a283dd54b560ac92ae4c127dbd72bca4fd19fdc13d54e2598a34b7af7ec
  • Pointer size: 131 Bytes
  • Size of remote file: 186 kB
Training_Data/Inperson_7-Mask-V2.jpg ADDED
Training_Data/Inperson_7-Mask-V2.png ADDED
Training_Data/Pinaperson_1-Mask-V1.jpg ADDED
Training_Data/Pinaperson_2-Mask-V1.jpg ADDED
Training_Data/Pinaperson_3-Mask-V1.jpg ADDED
Training_Data/Pinaperson_4-Mask-V1.jpg ADDED
Training_Data/TigcPerson_2-Garment.jpg ADDED

Git LFS Details

  • SHA256: bdea3158c0e0fa4212077c37bb744ef8851e417ba06b966445a6634be416bc26
  • Pointer size: 131 Bytes
  • Size of remote file: 284 kB
Training_Data/TigcPerson_3-Garment.jpg ADDED

Git LFS Details

  • SHA256: bdea3158c0e0fa4212077c37bb744ef8851e417ba06b966445a6634be416bc26
  • Pointer size: 131 Bytes
  • Size of remote file: 284 kB
Training_Data/TigcPerson_4-Garment.jpg ADDED

Git LFS Details

  • SHA256: bdea3158c0e0fa4212077c37bb744ef8851e417ba06b966445a6634be416bc26
  • Pointer size: 131 Bytes
  • Size of remote file: 284 kB
Training_Data/TigcPerson_4-Mask.jpg ADDED

Git LFS Details

  • SHA256: 2de80d770c44d20d4525c04984367885e2e9c7be658c788f5aec5b5e3f85f4da
  • Pointer size: 131 Bytes
  • Size of remote file: 291 kB
Training_Data/Venusperson_1-Mask.jpg ADDED
Training_Data/Venusperson_1.jpg ADDED

Git LFS Details

  • SHA256: ebd6c816f2dabf9f153c37415577d3f0a93d5829b9152b3f2cb8d2a5f0847e56
  • Pointer size: 131 Bytes
  • Size of remote file: 497 kB
Training_Data/Venusperson_10-Mask.jpg ADDED
Training_Data/Venusperson_11-Mask.jpg ADDED
Training_Data/Venusperson_12-Mask.jpg ADDED
Training_Data/Venusperson_2-Mask.jpg ADDED
Training_Data/Venusperson_2.jpg ADDED

Git LFS Details

  • SHA256: ebd6c816f2dabf9f153c37415577d3f0a93d5829b9152b3f2cb8d2a5f0847e56
  • Pointer size: 131 Bytes
  • Size of remote file: 497 kB
Training_Data/Venusperson_3-Mask.jpg ADDED
Training_Data/Venusperson_4-Mask.jpg ADDED
Training_Data/Venusperson_5-Garment.jpg ADDED

Git LFS Details

  • SHA256: 73996fb25946eab5d27933f0f77c058f8f10698eb731bcaca0472ae5bb93ce7c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.86 MB
Training_Data/Venusperson_5-Mask.jpg ADDED
Training_Data/Venusperson_5.jpg ADDED

Git LFS Details

  • SHA256: 15f39aeced4fe50df5caa207b29099d9c313fa194242499450aee4c67864aae1
  • Pointer size: 131 Bytes
  • Size of remote file: 827 kB
Training_Data/Venusperson_6-Mask.jpg ADDED
Training_Data/Venusperson_7-Garment.jpg ADDED

Git LFS Details

  • SHA256: 85be7194e7a1573345a05f0026c345041eafa69be411e42ab515bad9a0ebfba6
  • Pointer size: 132 Bytes
  • Size of remote file: 1.43 MB
Training_Data/Venusperson_7-Mask.jpg ADDED
Training_Data/Venusperson_8-Mask.jpg ADDED
Training_Data/Venusperson_9-Mask.jpg ADDED
Unmodel_training.sh ADDED
@@ -0,0 +1,28 @@
+ #!/bin/bash
+ # Script to clean GPU memory and run training
+
+ # Kill any existing Python processes (note: this kills ALL python processes for the user)
+ echo "Stopping any running Python processes..."
+ pkill -9 python
+
+ # Attempt a GPU reset (requires root and no processes still using the GPU)
+ echo "Resetting GPU..."
+ nvidia-smi --gpu-reset
+
+ # Wait a moment for cleanup
+ sleep 5
+
+ # Check GPU memory status
+ echo "Current GPU memory status:"
+ nvidia-smi
+
+ # Set memory optimization environment variables
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+ export PYTORCH_NO_CUDA_MEMORY_CACHING=1
+
+ # Run training
+ echo "Starting training..."
+ python run_train.py --cfg train_config/ace_plus_fft_lora.yaml
+
+ # Or, if you have a specific memory-optimized config:
+ # python run_train.py --cfg train_config/ace_plus_fft_lora_low_mem.yaml
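Two caveats on this script: nvidia-smi --gpu-reset generally requires root and an otherwise idle GPU, and PYTORCH_NO_CUDA_MEMORY_CACHING=1 disables PyTorch's caching allocator entirely, trading a large slowdown for lower fragmentation. A gentler in-process alternative for reclaiming memory between runs (a sketch):

import gc
import torch

gc.collect()               # drop unreachable Python objects that still hold tensors
torch.cuda.empty_cache()   # return cached, unused blocks to the driver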
ace_plus_dataset_bkp.py ADDED
@@ -0,0 +1,279 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import io
+ import math
+ import random
+ import re
+ from collections import defaultdict
+
+ import numpy as np
+ import torch
+ import torchvision.transforms as T
+ from PIL import Image
+ from torchvision.transforms import InterpolationMode
+
+ from scepter.modules.data.dataset.base_dataset import BaseDataset
+ from scepter.modules.data.dataset.registry import DATASETS
+ from scepter.modules.transform.io import pillow_convert
+ from scepter.modules.utils.directory import osp_path
+ from scepter.modules.utils.file_system import FS
+
+
+ def load_image(prefix, img_path, cvt_type=None):
+     if img_path is None or img_path == '':
+         return None
+     img_path = osp_path(prefix, img_path)
+     with FS.get_object(img_path) as image_bytes:
+         image = Image.open(io.BytesIO(image_bytes))
+         if cvt_type is not None:
+             image = pillow_convert(image, cvt_type)
+     return image
+
+
+ def transform_image(image, std=0.5, mean=0.5):
+     # HWC tensor in [0, 255] -> normalized CHW tensor in [-1, 1]
+     return (image.permute(2, 0, 1) / 255. - mean) / std
+
+
+ def transform_mask(mask):
+     # HW tensor in [0, 255] -> 1HW tensor in [0, 1]
+     return mask.unsqueeze(0) / 255.
+
+
+ def ensure_src_align_target_h_mode(src_image, size, image_id, interpolation=InterpolationMode.BILINEAR):
+     # resize each selected image directly to the target (H, W)
+     H, W = size
+     ret_image = []
+     for one_id in image_id:
+         edit_image = src_image[one_id]
+         tH, tW = H, W
+         ret_image.append(T.Resize((tH, tW), interpolation=interpolation, antialias=True)(edit_image))
+     return ret_image
+
+
+ def ensure_src_align_target_padding_mode(src_image, size, image_id, size_h=[], interpolation=InterpolationMode.BILINEAR):
+     # resize to a (possibly random) height, then pad to the target (H, W)
+     H, W = size
+     ret_data = []
+     ret_h = []
+     for idx, one_id in enumerate(image_id):
+         if len(size_h) < 1:
+             rH = random.randint(int(H / 3), int(H))
+         else:
+             rH = size_h[idx]
+         ret_h.append(rH)
+         edit_image = src_image[one_id]
+         _, eH, eW = edit_image.shape
+         scale = rH / eH
+         tH, tW = rH, int(eW * scale)
+         edit_image = T.Resize((tH, tW), interpolation=interpolation, antialias=True)(edit_image)
+         # padding
+         delta_w = 0
+         delta_h = H - tH
+         padding = (delta_w // 2, delta_h // 2, delta_w - (delta_w // 2), delta_h - (delta_h // 2))
+         ret_data.append(T.Pad(padding, fill=0, padding_mode="constant")(edit_image).float())
+     return ret_data, ret_h
+
+
+ def ensure_limit_sequence(image, max_seq_len=4096, d=16, interpolation=InterpolationMode.BILINEAR):
+     # resize image to fit max_seq_len patches while keeping the aspect ratio
+     H, W = image.shape[-2:]
+     scale = min(1.0, math.sqrt(max_seq_len / ((H / d) * (W / d))))
+     rH = int(H * scale) // d * d  # ensure divisible by d
+     rW = int(W * scale) // d * d
+     image = T.Resize((rH, rW), interpolation=interpolation, antialias=True)(image)
+     return image
+
+
+ @DATASETS.register_class()
+ class ACEPlusDataset(BaseDataset):
+     para_dict = {
+         "DELIMITER": {
+             "value": "#;#",
+             "description": "The delimiter between the fields of a record in the data list."
+         },
+         "FIELDS": {
+             "value": ["data_type", "edit_image", "edit_mask", "ref_image", "target_image", "prompt"],
+             "description": "The fields of every record."
+         },
+         "PATH_PREFIX": {
+             "value": "",
+             "description": "The path prefix for every input image."
+         },
+         "EDIT_TYPE_LIST": {
+             "value": [],
+             "description": "The edit types (regex patterns) to keep from the data list."
+         },
+         "MAX_SEQ_LEN": {
+             "value": 4096,
+             "description": "The max sequence length for the input image."
+         },
+         "D": {
+             "value": 16,
+             "description": "Patch size for the resized image."
+         }
+     }
+     para_dict.update(BaseDataset.para_dict)
+
+     def __init__(self, cfg, logger=None):
+         super().__init__(cfg, logger=logger)
+         delimiter = cfg.get("DELIMITER", "#;#")
+         fields = cfg.get("FIELDS", [])
+         prefix = cfg.get("PATH_PREFIX", "")
+         edit_type_list = cfg.get("EDIT_TYPE_LIST", [])
+         self.modify_mode = cfg.get("MODIFY_MODE", True)
+         self.max_seq_len = cfg.get("MAX_SEQ_LEN", 4096)
+         self.repainting_scale = cfg.get("REPAINTING_SCALE", 0.5)
+         self.d = cfg.get("D", 16)
+         prompt_file = cfg.DATA_LIST
+         self.items = self.read_data_list(delimiter,
+                                          fields,
+                                          prefix,
+                                          edit_type_list,
+                                          prompt_file)
+         random.shuffle(self.items)
+         use_num = int(cfg.get('USE_NUM', -1))
+         if use_num > 0:
+             self.items = self.items[:use_num]
+
+     def read_data_list(self, delimiter, fields, prefix, edit_type_list, prompt_file):
+         with FS.get_object(prompt_file) as local_data:
+             rows = local_data.decode('utf-8').strip().split('\n')
+         items = list()
+         dtype_level_num = {}
+         for i, row in enumerate(rows):
+             item = {"prefix": prefix}
+             for key, val in zip(fields, row.split(delimiter)):
+                 item[key] = val
+             edit_type = item["data_type"]
+             if len(edit_type_list) > 0:
+                 # keep only records whose data_type matches one of the patterns
+                 for re_pattern in edit_type_list:
+                     if re.match(re_pattern, edit_type):
+                         items.append(item)
+                         if edit_type not in dtype_level_num:
+                             dtype_level_num[edit_type] = 0
+                         dtype_level_num[edit_type] += 1
+                         break
+             else:
+                 items.append(item)
+                 if edit_type not in dtype_level_num:
+                     dtype_level_num[edit_type] = 0
+                 dtype_level_num[edit_type] += 1
+         for edit_type in dtype_level_num:
+             self.logger.info(f"{edit_type} has {dtype_level_num[edit_type]} samples.")
+         return items
+
+     def __len__(self):
+         return len(self.items)
+
+     def __getitem__(self, index):
+         item = self._get(index)
+         return self.pipeline(item)
+
+     def _get(self, index):
+         sample_id = index % len(self)
+         record = self.items[sample_id]
+         prefix = record.get("prefix", "")
+         edit_image = record.get("edit_image", "")
+         edit_mask = record.get("edit_mask", "")
+         ref_image = record.get("ref_image", "")
+         target_image = record.get("target_image", "")
+         prompt = record.get("prompt", "")
+
+         edit_image = load_image(prefix, edit_image, cvt_type="RGB") if edit_image != "" else None
+         edit_mask = load_image(prefix, edit_mask, cvt_type="L") if edit_mask != "" else None
+         ref_image = load_image(prefix, ref_image, cvt_type="RGB") if ref_image != "" else None
+         target_image = load_image(prefix, target_image, cvt_type="RGB") if target_image != "" else None
+         assert target_image is not None
+
+         edit_id, ref_id, src_image_list, src_mask_list = [], [], [], []
+         # parse editing image
+         if edit_image is None:
+             edit_image = Image.new("RGB", target_image.size, (255, 255, 255))
+             edit_mask = Image.new("L", edit_image.size, 255)
+         elif edit_mask is None:
+             edit_mask = Image.new("L", edit_image.size, 255)
+         src_image_list.append(edit_image)
+         edit_id.append(0)
+         src_mask_list.append(edit_mask)
+         # parse reference image
+         if ref_image is not None:
+             src_image_list.append(ref_image)
+             ref_id.append(1)
+             src_mask_list.append(Image.new("L", ref_image.size, 0))
+
+         image = transform_image(torch.tensor(np.array(target_image).astype(np.float32)))
+         if edit_mask is not None:
+             image_mask = transform_mask(torch.tensor(np.array(edit_mask).astype(np.float32)))
+         else:
+             image_mask = Image.new("L", target_image.size, 255)
+             image_mask = transform_mask(torch.tensor(np.array(image_mask).astype(np.float32)))
+
+         src_image_list = [transform_image(torch.tensor(np.array(im).astype(np.float32))) for im in src_image_list]
+         src_mask_list = [transform_mask(torch.tensor(np.array(im).astype(np.float32))) for im in src_mask_list]
+
+         # decide the repainting scale for the editing task
+         if len(ref_id) > 0:
+             repainting_scale = 1.0
+         else:
+             repainting_scale = self.repainting_scale
+         for e_i in edit_id:
+             src_image_list[e_i] = src_image_list[e_i] * (1 - repainting_scale * src_mask_list[e_i])
+
+         size = image.shape[1:]
+         ref_image_list, ret_h = ensure_src_align_target_padding_mode(src_image_list, size,
+                                                                      image_id=ref_id,
+                                                                      interpolation=InterpolationMode.NEAREST_EXACT)
+         ref_mask_list, ret_h = ensure_src_align_target_padding_mode(src_mask_list, size,
+                                                                     size_h=ret_h,
+                                                                     image_id=ref_id,
+                                                                     interpolation=InterpolationMode.NEAREST_EXACT)
+
+         edit_image_list = ensure_src_align_target_h_mode(src_image_list, size,
+                                                          image_id=edit_id,
+                                                          interpolation=InterpolationMode.NEAREST_EXACT)
+         edit_mask_list = ensure_src_align_target_h_mode(src_mask_list, size,
+                                                         image_id=edit_id,
+                                                         interpolation=InterpolationMode.NEAREST_EXACT)
+
+         # concatenate reference and edit images side by side along width
+         src_image_list = [torch.cat(ref_image_list + edit_image_list, dim=-1)]
+         src_mask_list = [torch.cat(ref_mask_list + edit_mask_list, dim=-1)]
+         image = torch.cat(ref_image_list + [image], dim=-1)
+         image_mask = torch.cat(ref_mask_list + [image_mask], dim=-1)
+
+         # limit max sequence length
+         image = ensure_limit_sequence(image, max_seq_len=self.max_seq_len,
+                                       d=self.d, interpolation=InterpolationMode.NEAREST_EXACT)
+         image_mask = ensure_limit_sequence(image_mask, max_seq_len=self.max_seq_len,
+                                            d=self.d, interpolation=InterpolationMode.NEAREST_EXACT)
+         src_image_list = [ensure_limit_sequence(i, max_seq_len=self.max_seq_len,
+                                                 d=self.d, interpolation=InterpolationMode.NEAREST_EXACT) for i in src_image_list]
+         src_mask_list = [ensure_limit_sequence(i, max_seq_len=self.max_seq_len,
+                                                d=self.d, interpolation=InterpolationMode.NEAREST_EXACT) for i in src_mask_list]
+
+         if self.modify_mode:
+             # regions to be modified, selected by the mask
+             modify_image_list = [ii * im for ii, im in zip(src_image_list, src_mask_list)]
+             # keep only the unmasked context in the source
+             src_image_list = [ii * (1 - im) for ii, im in zip(src_image_list, src_mask_list)]
+         else:
+             modify_image_list = src_image_list
+
+         item = {
+             "src_image_list": src_image_list,
+             "src_mask_list": src_mask_list,
+             "modify_image_list": modify_image_list,
+             "image": image,
+             "image_mask": image_mask,
+             "edit_id": edit_id,
+             "ref_id": ref_id,
+             "prompt": prompt,
+             "edit_key": record.get("edit_key", ""),
+             "sample_id": sample_id
+         }
+         return item
+
+     @staticmethod
+     def collate_fn(batch):
+         collect = defaultdict(list)
+         for sample in batch:
+             for k, v in sample.items():
+                 collect[k].append(v)
+         new_batch = dict()
+         for k, v in collect.items():
+             if all([i is None for i in v]):
+                 new_batch[k] = None
+             else:
+                 new_batch[k] = v
+         return new_batch
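For context on the expected input, each line of the DATA_LIST file is one "#;#"-delimited record following the default FIELDS order above (data_type, edit_image, edit_mask, ref_image, target_image, prompt). A hypothetical example line, using image paths from this commit (the data_type value and the prompt are invented for illustration):

garment_tryon#;#Training_Data/Inperson_4-V2.jpg#;#Training_Data/Inperson_4-Mask-V2.jpg#;#Training_Data/Inperson_4-Garment-V2.jpg#;#Training_Data/Inperson_4-V2.jpg#;#a person wearing the reference garment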