| import gc |
| import multiprocessing as mp |
| import os |
| import shutil |
| import sys |
| import time |
| from os import path |
|
|
| import cv2 |
| import torch |
| from huggingface_hub import hf_hub_download |
| from PIL import Image |
|
|
| import ape |
| import detectron2.data.transforms as T |
| import gradio as gr |
| from ape.model_zoo import get_config_file |
| from demo_lazy import get_parser, setup_cfg |
| from detectron2.config import CfgNode |
| from detectron2.data.detection_utils import read_image |
| from detectron2.evaluation.coco_evaluation import instances_to_coco_json |
| from detectron2.utils.logger import setup_logger |
| from predictor_lazy import VisualizationDemo |
|
|
# Absolute path of the directory containing this script; the example image
# paths below are resolved relative to it.
this_dir = path.dirname(path.abspath(__file__))
|
|
| |
| |
|
|
# Demo examples shown in the Gradio UI. Each entry is
# [image_path, text_prompt, score_threshold, output_types].
# The long multi-line prompts exercise open-vocabulary detection with many
# comma-separated phrases at once.
example_list = [
    [
        this_dir + "/examples/Totoro01.png",

        "Girl with hat",

        0.25,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/Totoro01.png",
        "Sky, Water, Tree, Chinchilla, Grass, Girl",
        0.15,
        ["semantic segmentation"],
    ],
    [
        this_dir + "/examples/199_3946193540.jpg",
        "chess piece of horse head",
        0.30,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/TheGreatWall.jpg",
        "The Great Wall",
        0.1,
        ["semantic segmentation"],
    ],
    [
        this_dir + "/examples/Pisa.jpg",
        "Pisa",
        0.01,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/SolvayConference1927.jpg",

        "Madame Curie",

        0.03,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/Transformers.webp",
        "Optimus Prime",
        0.11,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/Terminator3.jpg",
        "Humanoid Robot",
        0.10,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/MatrixRevolutionForZion.jpg",
        """machine killer with gun in fighting,
donut with colored granules on the surface,
railings being crossed by horses,
a horse running or jumping,
equestrian rider's helmet,
outdoor dog led by rope,
a dog being touched,
clothed dog,
basketball in hand,
a basketball player with both feet off the ground,
player with basketball in the hand,
spoon on the plate,
coffee cup with coffee,
the nearest dessert to the coffee cup,
the bartender who is mixing wine,
a bartender in a suit,
wine glass with wine,
a person in aprons,
pot with food,
a knife being used to cut vegetables,
striped sofa in the room,
a sofa with pillows on it in the room,
lights on in the room,
an indoor lying pet,
a cat on the sofa,
one pet looking directly at the camera indoors,
a bed with patterns in the room,
the lamp on the table beside the bed,
pillow placed at the head of the bed,
a blackboard full of words in the classroom,
child sitting at desks in the classroom,
a person standing in front of bookshelves in the library,
the table someone is using in the library,
a person who touches books in the library,
a person standing in front of the cake counter,
a square plate full of cakes,
a cake decorated with cream,
hot dog with vegetables,
hot dog with sauce on the surface,
red sausage,
flowerpot with flowers potted inside,
monochrome flowerpot,
a flowerpot filled with black soil,
apple growing on trees,
red complete apple,
apple with a stalk,
a woman brushing her teeth,
toothbrush held by someone,
toilet brush with colored bristles,
a customer whose hair is being cut by barber,
a barber at work,
cloth covering the barber,
shopping cart pushed by people in the supermarket,
shopping cart with people in the supermarket,
shopping cart full of goods,
a child wearing a mask,
refrigerator with fruit,
a drink bottle in the refrigerator,
refrigerator with more than two doors,
a watch placed on a table or cloth,
a watch with three or more watch hands can be seen,
a watch with one or more small dials,
clothes hanger,
a piece of clothing hanging on the hanger,
a piece of clothing worn on plastic models,
leather bag with glossy surface,
backpack,
open package,
a fish held by people,
a person who is fishing with a fishing rod,
a fisherman standing on the shore with his body soaked in water, camera hold on someone's shoulder,
a person being interviewed,
a person with microphone hold in hand,
""",
        0.20,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/094_56726435.jpg",

        """donut with colored granules on the surface,
railings being crossed by horses,
a horse running or jumping,
equestrian rider's helmet,
outdoor dog led by rope,
a dog being touched,
clothed dog,
basketball in hand,
a basketball player with both feet off the ground,
player with basketball in the hand,
spoon on the plate,
coffee cup with coffee,
the nearest dessert to the coffee cup,
the bartender who is mixing wine,
a bartender in a suit,
wine glass with wine,
a person in aprons,
pot with food,
a knife being used to cut vegetables,
striped sofa in the room,
a sofa with pillows on it in the room,
lights on in the room,
an indoor lying pet,
a cat on the sofa,
one pet looking directly at the camera indoors,
a bed with patterns in the room,
the lamp on the table beside the bed,
pillow placed at the head of the bed,
a blackboard full of words in the classroom,
a blackboard or whiteboard with something pasted,
child sitting at desks in the classroom,
a person standing in front of bookshelves in the library,
the table someone is using in the library,
a person who touches books in the library,
a person standing in front of the cake counter,
a square plate full of cakes,
a cake decorated with cream,
hot dog with vegetables,
hot dog with sauce on the surface,
red sausage,
flowerpot with flowers potted inside,
monochrome flowerpot,
a flowerpot filled with black soil,
apple growing on trees,
red complete apple,
apple with a stalk,
a woman brushing her teeth,
toothbrush held by someone,
toilet brush with colored bristles,
a customer whose hair is being cut by barber,
a barber at work,
cloth covering the barber,
a plastic toy,
a plush toy,
a humanoid toy,
shopping cart pushed by people in the supermarket,
shopping cart with people in the supermarket,
shopping cart full of goods,
a child wearing a mask,
a mask on face with half a face exposed,
a mask on face with only eyes exposed,
refrigerator with fruit,
a drink bottle in the refrigerator,
refrigerator with more than two doors,
a watch placed on a table or cloth,
a watch with three or more watch hands can be seen,
a watch with one or more small dials,
clothes hanger,
a piece of clothing hanging on the hanger,
a piece of clothing worn on plastic models,
leather bag with glossy surface,
backpack,
open package,
a fish held by people,
a person who is fishing with a fishing rod,
a fisherman standing on the shore with his body soaked in water, camera hold on someone's shoulder,
a person being interviewed,
a person with microphone hold in hand,
""",
        0.50,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/013_438973263.jpg",

        """a male lion with a mane,
railings being crossed by horses,
a horse running or jumping,
equestrian rider's helmet,
outdoor dog led by rope,
a dog being touched,
clothed dog,
basketball in hand,
a basketball player with both feet off the ground,
player with basketball in the hand,
spoon on the plate,
coffee cup with coffee,
the nearest dessert to the coffee cup,
the bartender who is mixing wine,
a bartender in a suit,
wine glass with wine,
a person in aprons,
pot with food,
a knife being used to cut vegetables,
striped sofa in the room,
a sofa with pillows on it in the room,
lights on in the room,
an indoor lying pet,
a cat on the sofa,
one pet looking directly at the camera indoors,
a bed with patterns in the room,
the lamp on the table beside the bed,
pillow placed at the head of the bed,
a blackboard full of words in the classroom,
a blackboard or whiteboard with something pasted,
child sitting at desks in the classroom,
a person standing in front of bookshelves in the library,
the table someone is using in the library,
a person who touches books in the library,
a person standing in front of the cake counter,
a square plate full of cakes,
a cake decorated with cream,
hot dog with vegetables,
hot dog with sauce on the surface,
red sausage,
flowerpot with flowers potted inside,
monochrome flowerpot,
a flowerpot filled with black soil,
apple growing on trees,
red complete apple,
apple with a stalk,
a woman brushing her teeth,
toothbrush held by someone,
toilet brush with colored bristles,
a customer whose hair is being cut by barber,
a barber at work,
cloth covering the barber,
a plastic toy,
a plush toy,
a humanoid toy,
shopping cart pushed by people in the supermarket,
shopping cart with people in the supermarket,
shopping cart full of goods,
a child wearing a mask,
a mask on face with half a face exposed,
a mask on face with only eyes exposed,
refrigerator with fruit,
a drink bottle in the refrigerator,
refrigerator with more than two doors,
a watch placed on a table or cloth,
a watch with three or more watch hands can be seen,
a watch with one or more small dials,
clothes hanger,
a piece of clothing hanging on the hanger,
a piece of clothing worn on plastic models,
leather bag with glossy surface,
backpack,
open package,
a fish held by people,
a person who is fishing with a fishing rod,
a fisherman standing on the shore with his body soaked in water, camera hold on someone's shoulder,
a person being interviewed,
a person with microphone hold in hand,
""",

        0.50,
        ["object detection", "instance segmentation"],
    ],
]
|
|
# Hugging Face Hub repository that hosts all APE model checkpoints.
ckpt_repo_id = "shenyunhang/APE"
|
|
|
|
def setup_model(name):
    """Make ``name`` the active model when memory-saving mode is on.

    Moves the selected demo's model to ``running_device`` and parks every
    other loaded model on the CPU. Reads the module-level globals
    ``save_memory``, ``all_demo`` and ``running_device`` set in ``__main__``.

    Args:
        name: Key into ``all_demo`` (e.g. ``"APE_D"``).
    """
    gc.collect()
    torch.cuda.empty_cache()

    # Outside memory-saving mode every model already sits on its final
    # device (see the load_APE_* functions), so there is nothing to shuffle.
    if not save_memory:
        return

    for key, demo in all_demo.items():
        target = running_device if key == name else "cpu"
        demo.predictor.model.to(target)

    # Drop the tensors just evicted to CPU from the CUDA caching allocator.
    gc.collect()
    torch.cuda.empty_cache()
|
|
|
|
def run_on_image_A(input_image_path, input_text, score_threshold, output_type):
    """Run the APE_A model on one image (gradio callback).

    Applies ``score_threshold`` to the model, then delegates to
    :func:`run_on_image`. Returns ``(visualized_image, json_results)``.
    """
    # Log the model-specific name, consistent with run_on_image_C/_D
    # (previously logged the generic "run_on_image").
    logger.info("run_on_image_A")

    setup_model("APE_A")
    demo = all_demo["APE_A"]
    cfg = all_cfg["APE_A"]
    # Score threshold is a live attribute on the vision model.
    demo.predictor.model.model_vision.test_score_thresh = score_threshold

    return run_on_image(
        input_image_path,
        input_text,
        output_type,
        demo,
        cfg,
    )
|
|
|
|
def run_on_image_C(input_image_path, input_text, score_threshold, output_type):
    """Gradio callback: run the APE_C model on one image and return
    ``(visualized_image, json_results)``."""
    logger.info("run_on_image_C")

    model_key = "APE_C"
    setup_model(model_key)
    demo = all_demo[model_key]
    cfg = all_cfg[model_key]
    # Apply the UI-selected score threshold directly on the vision model.
    demo.predictor.model.model_vision.test_score_thresh = score_threshold

    return run_on_image(input_image_path, input_text, output_type, demo, cfg)
|
|
|
|
def run_on_image_D(input_image_path, input_text, score_threshold, output_type):
    """Gradio callback: run the APE_D model on one image and return
    ``(visualized_image, json_results)``."""
    logger.info("run_on_image_D")

    model_key = "APE_D"
    setup_model(model_key)
    demo = all_demo[model_key]
    cfg = all_cfg[model_key]
    # Apply the UI-selected score threshold directly on the vision model.
    demo.predictor.model.model_vision.test_score_thresh = score_threshold

    return run_on_image(input_image_path, input_text, output_type, demo, cfg)
|
|
|
|
def run_on_image_comparison(input_image_path, input_text, score_threshold, output_type):
    """Run every loaded model on the same inputs, collecting one visualized
    image per model (json results are discarded)."""
    logger.info("run_on_image_comparison")

    images = []
    for model_key in all_demo:
        logger.info("run_on_image_comparison {}".format(model_key))
        # Swap the current model onto the GPU (no-op unless save_memory).
        setup_model(model_key)
        demo = all_demo[model_key]
        cfg = all_cfg[model_key]
        demo.predictor.model.model_vision.test_score_thresh = score_threshold

        img, _ = run_on_image(input_image_path, input_text, output_type, demo, cfg)
        images.append(img)

    return images
|
|
|
|
def run_on_image(
    input_image_path,
    input_text,
    output_type,
    demo,
    cfg,
):
    """Run one model on one image.

    Args:
        input_image_path: Path to the image, or a dict with ``"image"`` and
            ``"mask"`` paths (gradio sketch input).
        input_text: Comma-separated text prompt (may be empty).
        output_type: Iterable of selected output kinds ("object detection",
            "instance segmentation", "semantic segmentation").
        demo: A ``VisualizationDemo`` instance.
        cfg: The matching config (``CfgNode`` or lazy config).

    Returns:
        Tuple ``(visualized_image, json_results)`` where ``json_results`` is a
        COCO-style list of instance dicts with an added ``category_name``.
    """
    with_box = "object detection" in output_type
    with_mask = "instance segmentation" in output_type
    with_sseg = "semantic segmentation" in output_type

    # Sketch-style gradio inputs arrive as {"image": path, "mask": path}.
    if isinstance(input_image_path, dict):
        input_mask_path = input_image_path["mask"]
        input_image_path = input_image_path["image"]
        print("input_image_path", input_image_path)
        print("input_mask_path", input_mask_path)
    else:
        input_mask_path = None

    print("input_text", input_text)

    # Resolve the configured input format from either config flavour.
    # NOTE(review): currently unused below (read_image is hard-coded to BGR);
    # kept to preserve the original behaviour — confirm before removing.
    if isinstance(cfg, CfgNode):
        input_format = cfg.INPUT.FORMAT
    else:
        if "model_vision" in cfg.model:
            input_format = cfg.model.model_vision.input_format
        else:
            input_format = cfg.model.input_format

    input_image = read_image(input_image_path, format="BGR")

    if input_mask_path is not None:
        # Single-channel mask prompt; drop the trailing channel axis.
        input_mask = read_image(input_mask_path, "L").squeeze(2)
        print("input_mask", input_mask)
        print("input_mask", input_mask.shape)
    else:
        input_mask = None

    # Nothing selected: return the unmodified image (BGR -> RGB) plus an
    # empty result list, so callers that unpack two values still work.
    # (Previously returned a bare array, breaking `img, _ = run_on_image(...)`
    # and the two-output gradio wiring.)
    if not with_box and not with_mask and not with_sseg:
        return input_image[:, :, ::-1], []

    # Cap the working resolution at 1024 px to bound memory use; the inverse
    # transform restores the original size for display below.
    if input_image.shape[0] > 1024 or input_image.shape[1] > 1024:
        transform = aug.get_transform(input_image)
        input_image = transform.apply_image(input_image)
    else:
        transform = None

    start_time = time.time()
    predictions, visualized_output, _, metadata = demo.run_on_image(
        input_image,
        text_prompt=input_text,
        mask_prompt=input_mask,
        with_box=with_box,
        with_mask=with_mask,
        with_sseg=with_sseg,
    )

    logger.info(
        "{} in {:.2f}s".format(
            "detected {} instances".format(len(predictions["instances"]))
            if "instances" in predictions
            else "finished",
            time.time() - start_time,
        )
    )

    output_image = visualized_output.get_image()
    print("output_image", output_image.shape)

    if transform:
        # Undo the downscaling so the visualization matches the input size.
        output_image = transform.inverse().apply_image(output_image)
        print("output_image", output_image.shape)

    output_image = Image.fromarray(output_image)

    gc.collect()
    torch.cuda.empty_cache()

    # COCO-style per-instance dicts, enriched with a human-readable class
    # name; image_id is meaningless here and dropped.
    json_results = instances_to_coco_json(predictions["instances"].to(demo.cpu_device), 0)
    for json_result in json_results:
        json_result["category_name"] = metadata.thing_classes[json_result["category_id"]]
        del json_result["image_id"]

    return output_image, json_results
|
|
|
|
def load_APE_A():
    """Download the APE_A checkpoint, build its config and VisualizationDemo,
    and register them under key "APE_A" in the global all_demo/all_cfg dicts.

    Reads module-level globals: ckpt_repo_id, running_device, save_memory,
    all_demo, all_cfg, logger.
    """
    # Checkpoint path inside the HF Hub repo; hf_hub_download returns the
    # local cached file path.
    init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj_cp_720k_20230504_002019/model_final.pth"
    init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj1024_cp_720k.py"
    )
    args.confidence_threshold = 0.01
    # Lazy-config overrides applied by setup_cfg.
    args.opts = [
        "train.init_checkpoint='{}'".format(init_checkpoint),
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        # Force float32 for the language model on CPU.
        args.opts += [
            "model.model_language.dtype='float32'",
        ]
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    # Disable federated loss on the training criteria for demo inference.
    cfg.model.model_vision.criterion[0].use_fed_loss = False
    cfg.model.model_vision.criterion[2].use_fed_loss = False
    cfg.train.device = running_device

    # Patch the EVA-01 CLIP model config: a single vision layer and no fused
    # LayerNorm.
    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["layers"] = 1
    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["fusedLN"] = False

    demo = VisualizationDemo(cfg, args=args)
    # In memory-saving mode the model starts on CPU and is moved to the GPU
    # on demand by setup_model().
    if save_memory:
        demo.predictor.model.to("cpu")
    else:
        demo.predictor.model.to(running_device)

    all_demo["APE_A"] = demo
    all_cfg["APE_A"] = cfg
|
|
|
|
def load_APE_B():
    """Download the APE_B checkpoint, build its config and VisualizationDemo,
    and register them under key "APE_B" in the global all_demo/all_cfg dicts."""
    # Fetch (or reuse from cache) the checkpoint from the HF Hub repo.
    ckpt_path = hf_hub_download(
        repo_id=ckpt_repo_id,
        filename="configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_225418/model_final.pth",
    )

    cli_args = get_parser().parse_args()
    cli_args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj1024_cp_1080k.py"
    )
    cli_args.confidence_threshold = 0.01
    # Lazy-config overrides applied by setup_cfg.
    cli_args.opts = [
        f"train.init_checkpoint='{ckpt_path}'",
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.text_feature_bank_reset=True",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        # Force float32 for the language model on CPU.
        cli_args.opts += [
            "model.model_language.dtype='float32'",
        ]
    logger.info("Arguments: " + str(cli_args))
    cfg = setup_cfg(cli_args)

    # Disable federated loss on the training criteria for demo inference.
    cfg.model.model_vision.criterion[0].use_fed_loss = False
    cfg.model.model_vision.criterion[2].use_fed_loss = False
    cfg.train.device = running_device

    # Patch the EVA-01 CLIP model config: single vision layer, no fused
    # LayerNorm.
    clip_vision_cfg = ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[
        cfg.model.model_language.clip_model
    ]["vision_cfg"]
    clip_vision_cfg["layers"] = 1
    clip_vision_cfg["fusedLN"] = False

    viz_demo = VisualizationDemo(cfg, args=cli_args)
    # In memory-saving mode the model starts on CPU and is moved to the GPU
    # on demand by setup_model().
    viz_demo.predictor.model.to("cpu" if save_memory else running_device)

    all_demo["APE_B"] = viz_demo
    all_cfg["APE_B"] = cfg
|
|
|
|
def load_APE_C():
    """Download the APE_C checkpoint, build its config and VisualizationDemo,
    and register them under key "APE_C" in the global all_demo/all_cfg dicts."""
    # Fetch (or reuse from cache) the checkpoint from the HF Hub repo.
    ckpt_path = hf_hub_download(
        repo_id=ckpt_repo_id,
        filename="configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_210950/model_final.pth",
    )

    cli_args = get_parser().parse_args()
    cli_args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj1024_cp_1080k.py"
    )
    cli_args.confidence_threshold = 0.01
    # Lazy-config overrides applied by setup_cfg.
    cli_args.opts = [
        f"train.init_checkpoint='{ckpt_path}'",
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.text_feature_bank_reset=True",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        # Force float32 for the language model on CPU.
        cli_args.opts += [
            "model.model_language.dtype='float32'",
        ]
    logger.info("Arguments: " + str(cli_args))
    cfg = setup_cfg(cli_args)

    # Disable federated loss on the training criteria for demo inference.
    cfg.model.model_vision.criterion[0].use_fed_loss = False
    cfg.model.model_vision.criterion[2].use_fed_loss = False
    cfg.train.device = running_device

    # Patch the EVA-01 CLIP model config: single vision layer, no fused
    # LayerNorm.
    clip_vision_cfg = ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[
        cfg.model.model_language.clip_model
    ]["vision_cfg"]
    clip_vision_cfg["layers"] = 1
    clip_vision_cfg["fusedLN"] = False

    viz_demo = VisualizationDemo(cfg, args=cli_args)
    # In memory-saving mode the model starts on CPU and is moved to the GPU
    # on demand by setup_model().
    viz_demo.predictor.model.to("cpu" if save_memory else running_device)

    all_demo["APE_C"] = viz_demo
    all_cfg["APE_C"] = cfg
|
|
|
|
def load_APE_D():
    """Download the APE_D checkpoint, build its config and VisualizationDemo,
    and register them under key "APE_D" in the global all_demo/all_cfg dicts.

    Reads module-level globals: ckpt_repo_id, running_device, save_memory,
    all_demo, all_cfg, logger.
    """
    # Checkpoint path inside the HF Hub repo; hf_hub_download returns the
    # local cached file path.
    init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k_mdl_20230829_162438/model_final.pth"
    init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k.py"
    )
    args.confidence_threshold = 0.01
    # Lazy-config overrides applied by setup_cfg.
    args.opts = [
        "train.init_checkpoint='{}'".format(init_checkpoint),
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.text_feature_bank_reset=True",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        # Force float32 for the language model on CPU.
        args.opts += [
            "model.model_language.dtype='float32'",
        ]
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    # Disable federated loss on the training criteria for demo inference.
    cfg.model.model_vision.criterion[0].use_fed_loss = False
    cfg.model.model_vision.criterion[2].use_fed_loss = False
    cfg.train.device = running_device

    # Patch the EVA-02 CLIP model config (note: eva02 factory here, unlike
    # the eva01 patch in load_APE_A/B/C) to use a single vision layer.
    ape.modeling.text.eva02_clip.factory._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["layers"] = 1

    demo = VisualizationDemo(cfg, args=args)
    # In memory-saving mode the model starts on CPU and is moved to the GPU
    # on demand by setup_model().
    if save_memory:
        demo.predictor.model.to("cpu")
    else:
        demo.predictor.model.to(running_device)

    all_demo["APE_D"] = demo
    all_cfg["APE_D"] = cfg
|
|
|
|
def APE_A_tab():
    """Build the Gradio tab for the APE_A model.

    Fixes vs. original: the example dataset referenced the undefined name
    ``examples`` (the module defines ``example_list``); the Run button called
    ``run_on_image`` (wrong signature) instead of ``run_on_image_A``; and the
    removed Gradio ``.style()`` API / undefined ``set_example`` wiring is
    replaced with ``gr.Examples``, matching APE_D_tab.
    """
    with gr.Tab("APE A"):
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                input_image = gr.Image(
                    sources=["upload"],
                    type="filepath",
                )
                input_text = gr.Textbox(
                    label="Object Prompt (optional, if not provided, will only find COCO object.)",
                    info="格式: word1,word2,word3,...",
                )

                score_threshold = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.3, step=0.01
                )

                # APE_A offers no semantic-segmentation output.
                output_type = gr.CheckboxGroup(
                    ["object detection", "instance segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Which kind of output is displayed?",
                )

                run_button = gr.Button("Run")

            with gr.Column(scale=2):
                gallery = gr.Image(
                    type="pil",
                )

        gr.Examples(
            examples=example_list,
            inputs=[input_image, input_text, score_threshold, output_type],
            examples_per_page=20,
        )

        output_json = gr.JSON(label="json results")

        run_button.click(
            fn=run_on_image_A,
            inputs=[input_image, input_text, score_threshold, output_type],
            outputs=[gallery, output_json],
        )
|
|
|
|
def APE_C_tab():
    """Build the Gradio tab for the APE_C model.

    Fixes vs. original: the removed Gradio ``.style()`` API and the
    ``gr.Dataset``/undefined ``set_example`` wiring are replaced with
    ``gr.Examples``, matching the working APE_D_tab.
    """
    with gr.Tab("APE C"):
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                input_image = gr.Image(
                    sources=["upload"],
                    type="filepath",
                )
                input_text = gr.Textbox(
                    label="Object Prompt (optional, if not provided, will only find COCO object.)",
                    info="格式: word1,word2,sentence1,sentence2,...",
                )

                score_threshold = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.3, step=0.01
                )

                output_type = gr.CheckboxGroup(
                    ["object detection", "instance segmentation", "semantic segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Which kind of output is displayed?",
                )

                run_button = gr.Button("Run")

            with gr.Column(scale=2):
                gallery = gr.Image(
                    type="pil",
                )

        gr.Examples(
            examples=example_list,
            inputs=[input_image, input_text, score_threshold, output_type],
            examples_per_page=20,
        )

        output_json = gr.JSON(label="json results")

        run_button.click(
            fn=run_on_image_C,
            inputs=[input_image, input_text, score_threshold, output_type],
            outputs=[gallery, output_json],
        )
|
|
|
|
def APE_D_tab():
    """Build the Gradio tab for the APE_D model (detection, instance and
    semantic segmentation)."""
    with gr.Tab("APE D"):
        with gr.Row(equal_height=False):
            # Left column: all input controls.
            with gr.Column(scale=1):
                image_input = gr.Image(
                    sources=["upload"],
                    type="filepath",
                )
                prompt_box = gr.Textbox(
                    label="Object Prompt (optional, if not provided, will only find COCO object.)",
                    info="格式: word1,word2,sentence1,sentence2,...",
                )
                threshold_slider = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.1, step=0.01
                )
                output_choices = gr.CheckboxGroup(
                    ["object detection", "instance segmentation", "semantic segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Which kind of output is displayed?",
                )
                start_button = gr.Button("Run")

            # Right column: the visualized result.
            with gr.Column(scale=2):
                result_image = gr.Image(type="pil")

        gr.Examples(
            examples=example_list,
            inputs=[image_input, prompt_box, threshold_slider, output_choices],
            examples_per_page=20,
        )

        json_output = gr.JSON(label="json results")

        start_button.click(
            fn=run_on_image_D,
            inputs=[image_input, prompt_box, threshold_slider, output_choices],
            outputs=[result_image, json_output],
        )
|
|
|
|
def comparison_tab():
    """Build the Gradio tab that runs every loaded model on the same input
    for a side-by-side comparison."""
    with gr.Tab("APE all"):
        with gr.Row(equal_height=False):
            # Left column: shared input controls.
            with gr.Column(scale=1):
                image_input = gr.Image(
                    sources=["upload"],
                    type="filepath",
                )
                prompt_box = gr.Textbox(
                    label="Object Prompt (optional, if not provided, will only find COCO object.)",
                    info="格式: word1,word2,sentence1,sentence2,...",
                )
                threshold_slider = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.1, step=0.01
                )
                output_choices = gr.CheckboxGroup(
                    ["object detection", "instance segmentation", "semantic segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Which kind of output is displayed?",
                )
                start_button = gr.Button("Run")

            # Right column: one output image per loaded model, labelled by
            # its key in all_demo.
            with gr.Column(scale=2):
                result_images = [
                    gr.Image(label=model_name, type="pil") for model_name in all_demo
                ]

        gr.Examples(
            examples=example_list,
            inputs=[image_input, prompt_box, threshold_slider, output_choices],
            examples_per_page=20,
        )

        start_button.click(
            fn=run_on_image_comparison,
            inputs=[image_input, prompt_box, threshold_slider, output_choices],
            outputs=result_images,
        )
|
|
|
|
def is_port_in_use(port: int) -> bool:
    """Return True if something is already listening on localhost:``port``."""
    import socket

    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex returns 0 on a successful connection, an errno otherwise.
        return probe.connect_ex(("localhost", port)) == 0
    finally:
        probe.close()
|
|
|
|
def add_head_info(max_available_memory):
    """Render the page title and environment-dependent usage notes.

    ``max_available_memory`` is truthy when a CUDA device was found; a falsy
    value selects the CPU-specific notes.
    """
    gr.Markdown(
        "# APE: Aligning and Prompting Everything All at Once for Universal Visual Perception"
    )

    if max_available_memory:
        notes = [
            "Note multiple models are deployed on single GPU, so it may take several minutes to run the models and visualize the results."
        ]
    else:
        notes = [
            "Note multiple models are deployed on CPU, so it may take a while to run the models and visualize the results.",
            "Noted results computed by CPU are slightly different to results computed by GPU, and some libraries are disabled on CPU.",
        ]
    notes.append(
        "If the demo is out of memory, try to ***decrease*** the number of object prompt and ***increase*** score threshold."
    )
    for note in notes:
        gr.Markdown(note)

    gr.Markdown("---")
|
|
|
|
def add_tail_info():
    """Render the footer: a markdown table of location-prompt examples
    (images served from the HF Hub repo)."""
    gr.Markdown("---")
    gr.Markdown("### We also support Prompt")
    gr.Markdown(
        """
| Location prompt | result | Location prompt | result |
| ---- | ---- | ---- | ---- |
|  |  |  |  |
|  |  |  |  |
"""
    )
    gr.Markdown("---")
|
|
|
|
if __name__ == "__main__":
    # --- port selection ------------------------------------------------------
    # Prefer 80 then 8080. Fall back to None when both are busy (previously
    # server_port was left unassigned, crashing with NameError).
    # NOTE(review): server_port is currently not passed to block.launch().
    server_port = None
    for port in [80, 8080]:
        if not is_port_in_use(port):
            server_port = port
            break
    print("server_port", server_port)

    # --- device selection ----------------------------------------------------
    # Free memory (GiB) per visible CUDA device; empty list when no GPU.
    available_memory = [
        torch.cuda.mem_get_info(i)[0] / 1024**3 for i in range(torch.cuda.device_count())
    ]

    if available_memory:
        max_available_memory = max(available_memory)
        device_id = available_memory.index(max_available_memory)
        running_device = "cuda:" + str(device_id)
    else:
        max_available_memory = 0
        running_device = "cpu"

    # Offload idle models to CPU when the best GPU has under 40 GiB free.
    save_memory = 0 < max_available_memory < 40

    print("available_memory", available_memory)
    print("max_available_memory", max_available_memory)
    print("running_device", running_device)
    print("save_memory", save_memory)

    mp.set_start_method("spawn", force=True)
    setup_logger(name="fvcore")
    setup_logger(name="ape")
    logger = setup_logger()

    # Resize transform used by run_on_image to cap inputs at 1024 px.
    aug = T.ResizeShortestEdge([1024, 1024], 1024)

    # Registries populated by the load_APE_* functions.
    all_demo = {}
    all_cfg = {}

    # NOTE(review): save_memory is force-disabled here, presumably because
    # only one model (APE_D) is loaded; re-enable when loading several models.
    save_memory = False
    load_APE_D()

    title = "APE: Aligning and Prompting Everything All at Once for Universal Visual Perception"
    block = gr.Blocks(title=title).queue()
    with block:
        add_head_info(max_available_memory)
        APE_D_tab()
        comparison_tab()

    block.launch(
        share=False,
        show_api=False,
        show_error=True,
    )
|
|