Shawn87377 xueyanz committed on
Commit
fa9854e
·
0 Parent(s):

Duplicate from xdecoder/Instruct-X-Decoder

Browse files

Co-authored-by: Xueyan Zou <xueyanz@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes.   See raw diff
.gitattributes ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ *.psd filter=lfs diff=lfs merge=lfs -text
36
+ images/animals.png filter=lfs diff=lfs merge=lfs -text
37
+ images/region_retrieval.png filter=lfs diff=lfs merge=lfs -text
38
+ images/girl_and_two_boys.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Instruct X-Decoder
3
+ emoji: 🖌️🎨
4
+ colorFrom: pink
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 3.14.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: afl-3.0
11
+ duplicated_from: xdecoder/Instruct-X-Decoder
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3
+ # Copyright (c) 2022 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Written by Jianwei Yang (jianwyan@microsoft.com), Xueyan Zou (xueyan@cs.wisc.edu)
6
+ # --------------------------------------------------------
7
+
8
+ import os
+ # NOTE(review): installing detectron2 at import time via os.system is fragile
+ # (no version pin, shell dependency) — confirm this is intentional for the
+ # hosted-demo runtime rather than a requirements.txt entry.
9
+ os.system("python -m pip install git+https://github.com/MaureenZOU/detectron2-xyz.git")
10
+
11
+ import gradio as gr
12
+ import torch
13
+ import argparse
14
+
15
+ from xdecoder.BaseModel import BaseModel
16
+ from xdecoder import build_model
17
+ from utils.distributed import init_distributed
18
+ from utils.arguments import load_opt_from_config_files
19
+
+ # Star import — presumably supplies referring_inpainting_gpt3 used by
+ # inference() below; verify against tasks/__init__.py.
20
+ from tasks import *
21
+
def parse_option():
    """Parse the demo's command-line options.

    Returns:
        argparse.Namespace with a single ``conf_files`` attribute, the
        path to the model configuration file (defaults to the packaged
        X-Decoder FocalT config).
    """
    cli = argparse.ArgumentParser('X-Decoder All-in-One Demo', add_help=False)
    cli.add_argument(
        '--conf_files',
        default="configs/xdecoder/svlp_focalt_lang.yaml",
        metavar="FILE",
        help='path to config file',
    )
    return cli.parse_args()
28
+
29
+ '''
30
+ build args
31
+ '''
+ # Load the YAML config selected on the command line, then normalize it for
+ # the (single-process) distributed setup expected by the model code.
32
+ args = parse_option()
33
+ opt = load_opt_from_config_files(args.conf_files)
34
+ opt = init_distributed(opt)
35
+
36
+ # META DATA
+ # os.path.join with a single argument is just the bare filename — both
+ # checkpoints are expected in the current working directory.
37
+ pretrained_pth_last = os.path.join("xdecoder_focalt_last.pt")
38
+ pretrained_pth_novg = os.path.join("xdecoder_focalt_last_novg.pt")
39
+
+ # Download the checkpoints on first launch (blocking; requires wget).
40
+ if not os.path.exists(pretrained_pth_last):
41
+ os.system("wget {}".format("https://projects4jw.blob.core.windows.net/x-decoder/release/xdecoder_focalt_last.pt"))
42
+
+ # NOTE(review): pretrained_pth_novg is downloaded but never used in this
+ # file — confirm whether the no-visual-grounding model is still needed.
43
+ if not os.path.exists(pretrained_pth_novg):
44
+ os.system("wget {}".format("https://projects4jw.blob.core.windows.net/x-decoder/release/xdecoder_focalt_last_novg.pt"))
45
+
46
+
47
+ '''
48
+ build model
49
+ '''
+ # Only the "last" checkpoint is loaded; the model runs in eval mode on CUDA.
50
+ model_last = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth_last).eval().cuda()
51
+
+ # Pre-compute the language embeddings for the fixed "background" prompt so
+ # per-request inference does not pay this cost.
52
+ with torch.no_grad():
53
+ model_last.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(["background", "background"], is_eval=True)
54
+
55
+ '''
56
+ inference model
57
+ '''
58
+
@torch.no_grad()
def inference(image, instruction, *args, **kwargs):
    """Run instruction-based image editing without tracking gradients.

    Args:
        image: PIL image from the Gradio input; normalized to RGB first.
        instruction: natural-language edit request (e.g. "remove the dog").
        *args, **kwargs: forwarded unchanged to the editing pipeline.

    Returns:
        Whatever referring_inpainting_gpt3 produces (the edited image).
    """
    rgb_image = image.convert("RGB")
    # fp16 autocast on CUDA keeps demo latency and memory down.
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        result = referring_inpainting_gpt3(
            model_last, rgb_image, instruction, *args, **kwargs)
    return result
64
+
65
+ '''
66
+ launch app
67
+ '''
68
+
+ # Static UI copy shown above the demo widgets.
69
+ title = "Instructional Image Editing"
70
+ description = """<p style='text-align: center'> <a href='https://x-decoder-vl.github.io/' target='_blank'>Project Page</a> | <a href='https://arxiv.org/pdf/2212.11270.pdf' target='_blank'>Paper</a> | <a href='https://github.com/microsoft/X-Decoder' target='_blank'>Github Repo</a> | <a href='https://youtu.be/wYp6vmyolqE' target='_blank'>Video</a></p>
71
+ <p style='text-align: center; color: red;'> NOTE: This demo is mainly for object-centric instructional image editing! For style transfer please refer to the hero demo <a href='https://huggingface.co/spaces/timbrooks/instruct-pix2pix' target='_blank'>Instruct-Pix2Pix</a></p>
72
+ <p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
73
+ <br/>
74
+ <a href="https://huggingface.co/spaces/xdecoder/Instruct-X-Decoder?duplicate=true">
75
+ <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
76
+ </p>
77
+ """
78
+
79
+ help_text = """
80
+ This demo is leveraging X-Decoder's fine-grained understanding for instruction-based image editing. You can use it to:
81
+ 1. Remove object, e.g., remove the dog in the image
82
+ 2. Replace object, e.g., change the sky with a mountain
83
+ """
84
+
+ # NOTE(review): gr.Markdown called outside a Blocks/Interface context creates
+ # a component that is never rendered — confirm whether help_text was meant to
+ # appear in the UI (e.g. via the Interface `article` parameter).
85
+ gr.Markdown(help_text)
86
+
+ # NOTE(review): gr.inputs / gr.outputs are the deprecated gradio 3.x
+ # namespaces; gr.Image / gr.Textbox are the current equivalents.
87
+ inputs = [gr.inputs.Image(type='pil'), gr.Textbox(label="instruction")]
+ # Build and launch the demo; cache_examples=True runs every example through
+ # inference() once at startup, which lengthens boot time.
88
+ gr.Interface(
89
+ fn=inference,
90
+ inputs=inputs,
91
+ outputs=[
92
+ gr.outputs.Image(
93
+ type="pil",
94
+ label="edit result"),
95
+ ],
96
+ examples=[
97
+ ["./images/blue_white_bird.jpg", "change the color of bird's feathers from blue to red."],
98
+ ["./images/house.jpg", "change the house to a modern one."],
99
+ ["./images/apples.jpg", "change green apple to a red apple"],
100
+ ["./images/Furniture_Gateway_02.jpg", "make the sofa to one with leather"],
101
+ ["./images/cat.jfif", "remove the green chair"],
102
+ ["./images/horse.png", "change the sky to mountain"],
103
+ ["./images/zebras.jpg", "change sky to Seattle skyline"]
104
+ ],
105
+ title=title,
106
+ description=description,
107
+ allow_flagging='never',
108
+ cache_examples=True,
109
+ ).launch()
configs/xdecoder/svlp_focalt_lang.yaml ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3
+ # Copyright (c) 2022 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Written by Xueyan Zou (xueyan@cs.wisc.edu)
6
+ # --------------------------------------------------------
7
+
+ # NOTE(review): the YAML nesting appears flattened in this diff view —
+ # verify indentation against the original file before reusing this text.
8
+ ##################
9
+ # Task settings
10
+ ##################
11
+ VERBOSE: true
12
+ MODEL:
13
+ NAME: xdecoder_model
14
+ HEAD: xdecoder_head
15
+ DIM_PROJ: 512
16
+ BACKBONE_DIM: 768
17
+ TEXT:
18
+ ARCH: vlpencoder
19
+ NAME: transformer
20
+ TOKENIZER: clip
21
+ CONTEXT_LENGTH: 77 # 77
22
+ WIDTH: 512
23
+ HEADS: 8
24
+ LAYERS: 12 # 6
+ # NOTE(review): "AUTOGRESSIVE" looks like a typo for AUTOREGRESSIVE, but the
+ # key name must match what the text-encoder code reads — do not rename here
+ # without updating every consumer.
25
+ AUTOGRESSIVE: True
26
+ BACKBONE:
27
+ NAME: focal_dw
28
+ PRETRAINED: ''
29
+ LOAD_PRETRAINED: false
30
+ FOCAL:
31
+ PRETRAIN_IMG_SIZE: 224
32
+ PATCH_SIZE: 4
33
+ EMBED_DIM: 96
34
+ DEPTHS: [2, 2, 6, 2]
35
+ FOCAL_LEVELS: [3, 3, 3, 3]
36
+ FOCAL_WINDOWS: [3, 3, 3, 3]
37
+ DROP_PATH_RATE: 0.3
38
+ MLP_RATIO: 4.0
39
+ DROP_RATE: 0.0
40
+ PATCH_NORM: True
41
+ USE_CONV_EMBED: True
42
+ SCALING_MODULATOR: True
43
+ USE_CHECKPOINT: False
44
+ USE_POSTLN: true
45
+ USE_POSTLN_IN_MODULATION: false
46
+ USE_LAYERSCALE: True
47
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
48
+ OUT_INDICES: [0, 1, 2, 3]
49
+ ENCODER:
50
+ NAME: transformer_encoder_fpn
51
+ IGNORE_VALUE: 255
52
+ NUM_CLASSES: 133
53
+ LOSS_WEIGHT: 1.0
54
+ CONVS_DIM: 512
55
+ MASK_DIM: 512
56
+ NORM: "GN"
57
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
58
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
59
+ COMMON_STRIDE: 4
60
+ TRANSFORMER_ENC_LAYERS: 6
61
+ DECODER:
62
+ NAME: xdecoder
63
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
64
+ MASK: True
65
+ GROUNDING:
66
+ ENABLED: True
67
+ MAX_LEN: 5
68
+ TEXT_WEIGHT: 2.0
69
+ CLASS_WEIGHT: 0.5
70
+ DETECTION: False
71
+ CAPTION:
72
+ ENABLED: True
73
+ PHRASE_PROB: 0.0
74
+ SIM_THRES: 0.95
75
+ CAPTIONING:
76
+ ENABLED: True
77
+ STEP: 50
78
+ RETRIEVAL:
79
+ ENABLED: True
80
+ DIM_IMG: 768
81
+ ENSEMBLE: True
82
+ HIDDEN_DIM: 512
83
+ NUM_OBJECT_QUERIES: 101
84
+ NHEADS: 8
85
+ DROPOUT: 0.0
86
+ DIM_FEEDFORWARD: 2048
87
+ PRE_NORM: False
88
+ ENFORCE_INPUT_PROJ: False
89
+ SIZE_DIVISIBILITY: 32
90
+ TRAIN_NUM_POINTS: 12544
91
+ OVERSAMPLE_RATIO: 3.0
92
+ IMPORTANCE_SAMPLE_RATIO: 0.75
93
+ DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
94
+ TOP_GROUNDING_LAYERS: 3
95
+ TOP_CAPTION_LAYERS: 3
96
+ TOP_CAPTIONING_LAYERS: 3
97
+ TOP_RETRIEVAL_LAYERS: 3
98
+ TOP_OPENIMAGE_LAYERS: 10
99
+ TEST:
100
+ SEMANTIC_ON: True
101
+ INSTANCE_ON: True
102
+ PANOPTIC_ON: True
103
+ OVERLAP_THRESHOLD: 0.8
104
+ OBJECT_MASK_THRESHOLD: 0.4
105
+ SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
106
+ DETECTIONS_PER_IMAGE: 100
107
+
108
+ INPUT:
+ # Presumably the standard ImageNet RGB mean/std in 0–255 range — confirm
+ # against the preprocessing code.
109
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
110
+ PIXEL_STD: [58.395, 57.120, 57.375]
images/Furniture_Gateway_02.jpg ADDED
images/Magritte_TheSonOfMan.jpg ADDED
images/animals.png ADDED

Git LFS Details

  • SHA256: a75f448104357b0199ec1bea17710cc95bb48d3bf649f2e405c31bc9aab5cf00
  • Pointer size: 132 Bytes
  • Size of remote file: 2.03 MB
images/apples.jpg ADDED
images/blue_white_bird.jpg ADDED
images/cat.jfif ADDED
Binary file (36.5 kB). View file
 
images/coco/000.jpg ADDED
images/coco/001.jpg ADDED
images/coco/002.jpg ADDED
images/coco/003.jpg ADDED
images/coco/004.jpg ADDED
images/coco/005.jpg ADDED
images/coco/006.jpg ADDED
images/coco/007.jpg ADDED
images/coco/008.jpg ADDED
images/coco/009.jpg ADDED
images/coco/010.jpg ADDED
images/coco/011.jpg ADDED
images/coco/012.jpg ADDED
images/coco/013.jpg ADDED
images/coco/014.jpg ADDED
images/coco/015.jpg ADDED
images/coco/016.jpg ADDED
images/coco/017.jpg ADDED
images/coco/018.jpg ADDED
images/coco/019.jpg ADDED
images/coco/020.jpg ADDED
images/coco/021.jpg ADDED
images/coco/022.jpg ADDED
images/coco/023.jpg ADDED
images/coco/024.jpg ADDED
images/coco/025.jpg ADDED
images/coco/026.jpg ADDED
images/coco/027.jpg ADDED
images/coco/028.jpg ADDED
images/coco/029.jpg ADDED
images/coco/030.jpg ADDED
images/coco/031.jpg ADDED
images/coco/032.jpg ADDED
images/coco/033.jpg ADDED
images/coco/034.jpg ADDED
images/coco/035.jpg ADDED
images/coco/036.jpg ADDED
images/coco/037.jpg ADDED
images/coco/038.jpg ADDED