import os
import sys


import numpy as np
from tqdm import tqdm
from einops import rearrange, reduce

from utils.io_utils import *
from utils.cv import img_alpha_blending

import sam3
from PIL import Image
from sam3 import build_sam3_image_model
from sam3.model.box_ops import box_xywh_to_cxcywh
from sam3.model.sam3_image_processor import Sam3Processor
from sam3.visualization_utils import draw_box_on_image, normalize_bbox, plot_results

from live2d.scrap_model import VALID_BODY_PARTS_V2

import torch

# Allow TensorFloat-32 kernels for both matmul and cuDNN ops (Ampere GPUs):
# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
for _tf32_backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    _tf32_backend.allow_tf32 = True

# Enter CUDA bfloat16 autocast and deliberately never exit it, so it stays in
# effect for everything that runs after this point.
_autocast = torch.autocast("cuda", dtype=torch.bfloat16)
_autocast.__enter__()

# The BPE vocabulary is located relative to the installed sam3 package; the
# checkpoint is read from a fixed absolute path on the author's machine.
sam3_root = os.path.join(os.path.dirname(sam3.__file__), "..")
bpe_path = f"{sam3_root}/sam3/assets/bpe_simple_vocab_16e6.txt.gz"
model = build_sam3_image_model(
    bpe_path=bpe_path,
    checkpoint_path='/home/jlin/repos/live2d_parsing/local_gitclones/sam3/sam3.pt',
)


# Root output directory; the processing loop creates one sub-directory per
# input image (named after the image file stem) underneath it.
save_dir = 'workspace/datasets/l2deval_sam3_ouput'

# Source handed to load_exec_list (from utils.io_utils) to obtain the image
# paths to process.
# NOTE(review): left empty here — presumably must be filled in before running.
src = ''
exec_list = load_exec_list(src)


# Segment every image in exec_list into body-part layers with SAM 3 text
# prompts, write each layer as an RGBA PNG, and write the alpha-blended
# recomposition of all layers as reconstruction.png.
#
# Output layout: <save_dir>/<image stem>/<tag>.png and
#                <save_dir>/<image stem>/reconstruction.png

# Built once and reused for all images: per-image data travels in the state
# object returned by set_image, which is how the SAM 3 README uses it as well.
processor = Sam3Processor(model, confidence_threshold=0.5)

for image_path in tqdm(exec_list):
    # convert() returns a fully loaded copy, so the file handle can be
    # released right away instead of lingering until garbage collection.
    with Image.open(image_path) as image_file:
        image = image_file.convert('RGB')
    # (H, W, 3) colour channels, shared by every part layer of this image.
    rgb = np.array(image)
    inference_state = processor.set_image(image)

    saved = osp.join(save_dir, osp.splitext(osp.basename(image_path))[0])
    os.makedirs(saved, exist_ok=True)

    img_list = []

    for tag in VALID_BODY_PARTS_V2:
        # 'handwear' is queried with a different text prompt than its tag name.
        prompt = 'arms,hands' if tag == 'handwear' else tag

        # Drop the previous part's prompt and results before issuing the next one.
        processor.reset_all_prompts(inference_state)
        inference_state = processor.set_text_prompt(state=inference_state, prompt=prompt)

        # Union of all instance masks returned for this prompt -> one (H, W) mask.
        mask = reduce(inference_state['masks'], 'b c h w -> h w', 'any').to(device='cpu', dtype=torch.float32).numpy()

        # Binary mask becomes the alpha channel: 255 inside the part, 0 outside.
        alpha = mask.astype(np.uint8) * 255
        tag_img = np.concatenate([rgb, alpha[..., None]], axis=2)

        # Persist the per-part RGBA layer next to the reconstruction.
        Image.fromarray(tag_img).save(osp.join(saved, tag + '.png'))
        img_list.append(tag_img)

    final_size = (image.height, image.width)
    recon = img_alpha_blending(img_list, final_size=final_size)
    Image.fromarray(recon).save(osp.join(saved, 'reconstruction.png'))