#!/usr/bin/env python3
"""使用 AXModel 推理链路 (transformer + VAE decoder)。"""
from __future__ import annotations

import argparse
import json
import os
import random
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple, Union

current_file_path = os.path.abspath(__file__)
project_roots = [os.path.dirname(current_file_path), os.path.dirname(os.path.dirname(current_file_path)), os.path.dirname(os.path.dirname(os.path.dirname(current_file_path)))]
for project_root in project_roots:
    sys.path.insert(0, project_root) if project_root not in sys.path else None

SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parents[2]
if REPO_ROOT.as_posix() not in sys.path:
    sys.path.insert(0, REPO_ROOT.as_posix())

import numpy as np
import torch
from axengine import InferenceSession as AxInferenceSession
from diffusers import FlowMatchEulerDiscreteScheduler
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils.torch_utils import randn_tensor
from omegaconf import OmegaConf
from PIL import Image
from loguru import logger
from tqdm import tqdm

from videox_fun.models import AutoTokenizer, Qwen3ForCausalLM
from videox_fun.utils.fm_solvers import FlowDPMSolverMultistepScheduler
from videox_fun.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
from videox_fun.utils.utils import get_image_latent


# -----------------------------------------------------------------------------
# 模型与资源路径
# -----------------------------------------------------------------------------
MODEL_NAME = "models/Diffusion_Transformer/Z-Image-Turbo/"
CONFIG_PATH = REPO_ROOT / "VideoX-Fun" / "config" / "z_image" / "z_image.yaml"
TRANSFORMER_CONFIG_PATH = REPO_ROOT / "VideoX-Fun" / "pulsar2_configs" / "transformers_subgraph.json"
TRANSFORMER_ONNX_PATH = REPO_ROOT / "VideoX-Fun" / "compiled_subgraph_from_onnx" / "frontend" / "optimized_quant_axmodel.onnx"
TRANSFORMER_AXMODEL_DIR = REPO_ROOT / "VideoX-Fun" / "comliled_subgraph_from_all_onnx"  # compiled_slice_quant_onnx
VAE_DECODER_AXMODEL = REPO_ROOT / "VideoX-Fun" / "vae_decoder.axmodel"
SAVE_DIR = REPO_ROOT / "VideoX-Fun" / "samples" / "z-image-t2i-axmodel"

# -----------------------------------------------------------------------------
# 运行配置
# -----------------------------------------------------------------------------
DEFAULT_PROMPTS = [
    "(masterpiece, best quality) solo female on a tropical beach, golden hour rim light, cinematic grading",
    "nighttime cyberpunk boulevard, neon reflections on wet asphalt, volumetric fog, wide shot",
    "sunrise over alpine mountains, low clouds in valleys, god rays, ultra-detailed landscape",
    "modern minimal living room, soft natural light, Scandinavian design, high-resolution interior render",
    "classical oil painting of a renaissance noblewoman, chiaroscuro lighting, rich textures",
    "macro photography of a dewdrop on a leaf, extreme detail, shallow depth of field",
    "futuristic sports car parked under neon lights, glossy paint, cinematic 35mm look",
    "ancient library with towering bookshelves, warm candlelight, dust motes in air",
    "portrait of an astronaut in full suit, visor reflection showing earth, studio lighting",
    "stormy sea with a lone lighthouse, crashing waves, dramatic clouds, long exposure feel",
    "cybernetic samurai standing in rain, backlit silhouette, moody blue-orange palette",
    "lush rainforest waterfall, soft mist, saturated greens, wide-angle composition",
    "product shot of a smartwatch on marble, softbox lighting, crisp shadows, advertisement style",
    "architectural exterior of a glass skyscraper at dusk, warm interior lights, reflections",
    "vintage film photograph of a 1950s diner at night, grain and halation, neon signage",
    "hyperrealistic bowl of ramen, steam rising, glossy broth, detailed toppings",
    "fantasy castle on a floating island, waterfalls falling into clouds, sunset lighting",
    "high-fashion editorial portrait, dramatic chiaroscuro, sharp focus on eyes",
    "aerial view of winding river through autumn forest, golden and crimson leaves",
    "studio shot of running shoes mid-air, motion blur trails, vibrant background gradient",
    "noir city alley in the 1940s, hard shadows, rain-slick pavement, moody atmosphere",
    "desert caravan at twilight, silhouettes of camels, soft purple sky, cinematic scope",
    "close-up of a mechanical watch movement, intricate gears, metallic reflections",
    "bioluminescent underwater reef, glowing corals, schools of fish, deep blue tones",
    "portrait of an elderly man with weathered face, soft window light, fine skin detail",
    "snowy village at night, warm cabin lights, smoke from chimneys, peaceful mood",
    "futuristic data center aisle, cool cyan lighting, depth and symmetry",
    "oil painting of a bowl of fruit in Dutch masters style, rich textures, dramatic lighting",
    "sunlit meadow with wildflowers, shallow depth of field, pastel color palette",
    "sci-fi corridor with volumetric light shafts, pristine white surfaces, wide lens",
    "luxury wristwatch on black velvet, high contrast, advertisement macro shot",
    "medieval marketplace at dawn, merchants setting up, soft warm light, lively details",
    "((masterpiece,best quality))1 young beautiful girl,ultra detailed,official art,unity 8k wallpaper,masterpiece, best quality, official art, extremely detailed CG unity 8k wallpaper, highly detailed, 1 girl, aqua eyes, light smile, ((grey hair)), hair flower, bracelet, choker, ribbon, JK, look at viewer, on the beach, in summer,"
]
prompt_idx = random.randint(0, len(DEFAULT_PROMPTS) - 1)
PROMPT = DEFAULT_PROMPTS[prompt_idx]
NEG_PROMPT = " "
GUIDANCE_SCALE = 0.0
SEED = 42
HEIGHT, WIDTH = 512, 512
NUM_INFERENCE_STEPS = 9
NUM_CHANNELS_LATENTS = 16
VAE_SCALE_FACTOR = 8
PATCH_SIZE = 2
FPATCH_SIZE = 1
MAX_SEQ_LEN = 128
VAE_SCALING_FACTOR = 0.3611
VAE_SHIFT_FACTOR = 0.1159

SAMPLER_MAP = {
    "Flow": FlowMatchEulerDiscreteScheduler,
    "Flow_Unipc": FlowUniPCMultistepScheduler,
    "Flow_DPM++": FlowDPMSolverMultistepScheduler,
}
SAMPLER_NAME = "Flow"

# 默认最终输出，如果不存在 auto 子图则回退到最后一个 cfg 输出
DEFAULT_FINAL_OUTPUT = None

# -----------------------------------------------------------------------------
# 工具函数 (复制自原 launcher 并微调)
# -----------------------------------------------------------------------------

def _infer_module_device(module: torch.nn.Module) -> torch.device:
    param = next(module.parameters(), None)
    if param is not None:
        return param.device
    buffer = next(module.buffers(), None)
    if buffer is not None:
        return buffer.device
    return torch.device("cpu")


@contextmanager
def module_to_device(module: torch.nn.Module, target_device: torch.device):
    if module is None:
        yield module
        return
    original_device = _infer_module_device(module)
    target_device = target_device or original_device
    needs_move = original_device != target_device
    moved_to_cuda = needs_move and target_device.type == "cuda"
    if needs_move:
        module.to(target_device)
    try:
        yield module
    finally:
        if needs_move:
            module.to(original_device)
            if moved_to_cuda and torch.cuda.is_available():
                cache_device = target_device.index or torch.cuda.current_device()
                with torch.cuda.device(cache_device):
                    torch.cuda.empty_cache()


def _encode_prompt(
    tokenizer: AutoTokenizer,
    text_encoder: Qwen3ForCausalLM,
    prompt: Union[str, List[str]],
    device: torch.device,
    prompt_embeds: Optional[List[torch.FloatTensor]] = None,
    max_sequence_length: int = 512,
) -> List[torch.FloatTensor]:
    if prompt_embeds is not None:
        return prompt_embeds
    prompts = [prompt] if isinstance(prompt, str) else list(prompt)
    for idx, item in enumerate(prompts):
        messages = [{"role": "user", "content": item}]
        prompts[idx] = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, enable_thinking=True
        )
    text_inputs = tokenizer(
        prompts,
        padding="max_length",
        max_length=max_sequence_length,
        truncation=True,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids.to(device)
    prompt_masks = text_inputs.attention_mask.to(device).bool()
    with module_to_device(text_encoder, device):
        prompt_embeds = text_encoder(
            input_ids=text_input_ids,
            attention_mask=prompt_masks,
            output_hidden_states=True,
        ).hidden_states[-2]
    return [prompt_embeds[i] for i in range(len(prompt_embeds))]


def encode_prompt(
    tokenizer: AutoTokenizer,
    text_encoder: Qwen3ForCausalLM,
    prompt: Union[str, List[str]],
    device: torch.device,
    do_classifier_free_guidance: bool,
    negative_prompt: Optional[Union[str, List[str]]],
    max_sequence_length: int,
) -> Tuple[List[torch.FloatTensor], List[torch.FloatTensor]]:
    prompt_embeds = _encode_prompt(
        tokenizer, text_encoder, prompt, device, None, max_sequence_length
    )
    negative_embeds: List[torch.FloatTensor] = []
    if do_classifier_free_guidance:
        neg = negative_prompt or ""
        negative_list = [neg] if isinstance(neg, str) else list(neg)
        negative_embeds = _encode_prompt(
            tokenizer, text_encoder, negative_list, device, None, max_sequence_length
        )
    return prompt_embeds, negative_embeds


def _stack_prompt_embeddings(prompt_embeds_input):
    if isinstance(prompt_embeds_input, list):
        return torch.stack(prompt_embeds_input, dim=0)
    return prompt_embeds_input


def prepare_latents(
    batch_size: int,
    num_channels_latents: int,
    height: int,
    width: int,
    dtype: torch.dtype,
    device: torch.device,
    generator: torch.Generator,
) -> torch.FloatTensor:
    height = 2 * (int(height) // (VAE_SCALE_FACTOR * 2))
    width = 2 * (int(width) // (VAE_SCALE_FACTOR * 2))
    shape = (batch_size, num_channels_latents, height, width)
    latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
    return latents


def calculate_shift(
    image_seq_len: int,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
) -> float:
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    return image_seq_len * m + b


def retrieve_timesteps(
    scheduler,
    num_inference_steps: int,
    device: torch.device,
    **kwargs,
):
    scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
    return scheduler.timesteps


# -----------------------------------------------------------------------------
# 参数
# -----------------------------------------------------------------------------


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="AXModel 推理 (transformer + VAE)")
    parser.add_argument("--prompt", type=str, default=None, help="正向提示词，不填则使用预置随机样本")
    parser.add_argument("--negative-prompt", type=str, default=NEG_PROMPT, help="反向提示词")
    parser.add_argument("--steps", type=int, default=NUM_INFERENCE_STEPS, help="迭代步数")
    parser.add_argument("--height", type=int, default=HEIGHT, help="生成高度，需被 16 整除")
    parser.add_argument("--width", type=int, default=WIDTH, help="生成宽度，需被 16 整除")
    parser.add_argument("--seed", type=int, default=SEED, help="随机种子")
    parser.add_argument("--sampler", type=str, choices=list(SAMPLER_MAP.keys()), default=SAMPLER_NAME, help="采样器")
    parser.add_argument("--max-seq-len", type=int, default=MAX_SEQ_LEN, help="最大文本长度")
    parser.add_argument("--save-dir", type=str, default=str(SAVE_DIR), help="结果输出目录")
    parser.add_argument("--transformer-config", type=str, required=True, help="子图配置 json")
    parser.add_argument("--transformer-onnx", type=str, default=None, help="原始 transformer onnx（可选，sub_configs 已覆盖可不填）")
    parser.add_argument("--transformer-subgraph-dir", type=str, required=True, help="子图 axmodel 目录")
    parser.add_argument("--vae-axmodel", type=str, required=True, help="VAE decoder axmodel 路径")
    parser.add_argument("--final-output-name", type=str, default=None, help="指定最终输出 tensor 名称，默认自动推断")
    parser.add_argument("--save-decoder-input", action="store_true", help="是否保存 decoder 输入 npy")
    parser.add_argument("--no-progress", action="store_true", help="关闭进度条输出")
    return parser.parse_args()


# -----------------------------------------------------------------------------
# AX transformer 子图执行器
# -----------------------------------------------------------------------------

from scripts.split_quant_onnx_by_subconfigs import SubGraphSpec, sanitize


class AxSplitTransformer:
    def __init__(self, config_path: Path, onnx_path: Optional[Path], model_dir: Path):
        self.config_path = config_path
        self.onnx_path = onnx_path
        self.model_dir = model_dir
        self._session_cache: Dict[str, AxInferenceSession] = {}
        config_specs = self._load_specs()
        auto_specs = self._load_auto_specs()
        self.specs = config_specs + auto_specs
        last_group = auto_specs if auto_specs else config_specs
        self.final_outputs = list(last_group[-1].end)
        self.default_final_output = DEFAULT_FINAL_OUTPUT or self.final_outputs[0]

    def _get_session(self, spec: SubGraphSpec) -> AxInferenceSession:
        if spec.label not in self._session_cache:
            path = self._expected_path(spec)
            self._session_cache[spec.label] = AxInferenceSession(path.as_posix())
            logger.info(f"加载子图 session: {spec.label} from {path.name}")
        return self._session_cache[spec.label]

    def close(self) -> None:
        # 显式释放缓存的 session
        for key, sess in list(self._session_cache.items()):
            try:
                del sess
            finally:
                self._session_cache.pop(key, None)

    def _load_specs(self) -> List[SubGraphSpec]:
        with self.config_path.open("r", encoding="utf-8") as f:
            config = json.load(f)
        sub_configs = config.get("compiler", {}).get("sub_configs", [])
        if not sub_configs:
            raise ValueError("配置文件缺少 compiler.sub_configs")
        specs: List[SubGraphSpec] = []
        for idx, entry in enumerate(sub_configs):
            start = [name for name in entry.get("start_tensor_names", []) if name]
            end = [name for name in entry.get("end_tensor_names", []) if name]
            if not start or not end:
                raise ValueError(f"sub_config[{idx}] 定义不完整")
            spec = SubGraphSpec(
                label=f"cfg_{idx:02d}",
                start=start,
                end=end,
                node_names=set(),
                source="config",
            )
            specs.append(spec)
        return specs

    def _load_auto_specs(self) -> List[SubGraphSpec]:
        specs: List[SubGraphSpec] = []
        for path in sorted(self.model_dir.glob("auto_*.axmodel")):
            try:
                session = AxInferenceSession(path.as_posix())
                inputs = [info.name for info in session.get_inputs() if getattr(info, "name", None)]
                outputs = [info.name for info in session.get_outputs() if getattr(info, "name", None)]
                # 缓存 session，避免重复打开
                self._session_cache[path.stem] = session
            except Exception as exc:  # pragma: no cover - defensive
                logger.warning(f"跳过 {path.name}，加载/解析 IO 失败: {exc}")
                continue

            if not inputs or not outputs:
                logger.warning(f"跳过 {path.name}，未找到有效的输入/输出定义")
                continue

            specs.append(
                SubGraphSpec(
                    label=path.stem,
                    start=inputs,
                    end=outputs,
                    node_names=set(),
                    source="auto",
                    output_path=path,
                )
            )
        return specs

    def _expected_path(self, spec: SubGraphSpec) -> Path:
        if spec.output_path is not None:
            path = spec.output_path
        else:
            head = sanitize(spec.start[0]) if spec.start else "const"
            tail = sanitize(spec.end[0]) if spec.end else "out"
            filename = f"{spec.label}_{head}_to_{tail}_{spec.source}.axmodel"
            path = self.model_dir / filename
        if not path.exists():
            raise FileNotFoundError(f"缺少 AXModel: {path}")
        return path

    def __call__(
        self,
        latent_np: np.ndarray,
        prompt_np: np.ndarray,
        timestep_np: np.ndarray,
        final_output_name: Optional[str] = None,
    ) -> np.ndarray:
        tensor_store: Dict[str, np.ndarray] = {
            "latent_model_input": latent_np,
            "prompt_embeds": prompt_np,
            "timestep": timestep_np,
        }
        executed = set()
        target = final_output_name or self.default_final_output

        # 就绪驱动执行，单个子图跑完立刻释放 session
        while target not in tensor_store:
            progressed = False
            for spec in self.specs:
                if spec.label in executed:
                    continue
                if not all(name in tensor_store for name in spec.start):
                    continue
                session = self._get_session(spec)
                inputs = {name: tensor_store[name] for name in spec.start}
                outputs = session.run(spec.end, inputs)
                for out_name, value in zip(spec.end, outputs):
                    tensor_store[out_name] = value
                executed.add(spec.label)
                progressed = True
            if not progressed:
                missing = [
                    (spec.label, [name for name in spec.start if name not in tensor_store])
                    for spec in self.specs
                    if spec.label not in executed
                ]
                raise RuntimeError(
                    f"子图调度中断，缺少输入: {missing}; 当前可用: {list(tensor_store.keys())}"
                )

        return tensor_store[target]


# -----------------------------------------------------------------------------
# 主流程
# -----------------------------------------------------------------------------

def main() -> None:
    args = parse_args()
    prompt_text = args.prompt if args.prompt is not None else PROMPT
    logger.info(f"使用的 prompt: {prompt_text}")

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.set_grad_enabled(False)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, subfolder="tokenizer")
    text_encoder = Qwen3ForCausalLM.from_pretrained(
        MODEL_NAME,
        subfolder="text_encoder",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
    )
    text_encoder.eval()

    scheduler_cls = SAMPLER_MAP[args.sampler]
    scheduler = scheduler_cls.from_pretrained(MODEL_NAME, subfolder="scheduler")

    image_processor = VaeImageProcessor(vae_scale_factor=VAE_SCALE_FACTOR * 2)

    prompt_embeds, _ = encode_prompt(
        tokenizer,
        text_encoder,
        prompt_text,
        device,
        do_classifier_free_guidance=False,
        negative_prompt=args.negative_prompt,
        max_sequence_length=args.max_seq_len,
    )

    latents = prepare_latents(
        batch_size=1,
        num_channels_latents=NUM_CHANNELS_LATENTS,
        height=args.height,
        width=args.width,
        dtype=torch.float32,
        device=device,
        generator=torch.Generator(device=device).manual_seed(args.seed),
    )

    image_seq_len = (latents.shape[2] // 2) * (latents.shape[3] // 2)
    mu = calculate_shift(
        image_seq_len,
        scheduler.config.get("base_image_seq_len", 256),
        scheduler.config.get("max_image_seq_len", 4096),
        scheduler.config.get("base_shift", 0.5),
        scheduler.config.get("max_shift", 1.15),
    )
    timesteps = retrieve_timesteps(scheduler, args.steps, device=device, mu=mu)

    onnx_path = Path(args.transformer_onnx) if args.transformer_onnx else None
    transformer_runner = AxSplitTransformer(
        Path(args.transformer_config),
        onnx_path,
        Path(args.transformer_subgraph_dir),
    )
    # 优先使用 auto_* 子图里的 sample 输出，避免误用中间特征导致 shape 对不上
    available_outputs = [name for spec in transformer_runner.specs for name in getattr(spec, "end", [])]
    preferred_output = "sample" if "sample" in available_outputs else transformer_runner.default_final_output
    final_output_name = args.final_output_name or preferred_output
    if final_output_name not in available_outputs:
        raise ValueError(f"指定的输出 {final_output_name} 不存在，可选: {available_outputs}")

    prompt_embeds_tensor = _stack_prompt_embeddings(prompt_embeds)

    iterator = timesteps if args.no_progress else tqdm(timesteps, desc="AX Denoising", dynamic_ncols=True)
    for t in iterator:
        timestep = t.expand(latents.shape[0])
        timestep_model_input = (1000 - timestep) / 1000

        latent_model_input = latents.to(torch.float32)
        latent_np = latent_model_input.unsqueeze(2).to(dtype=torch.float32).cpu().numpy()
        prompt_np = prompt_embeds_tensor.to(dtype=torch.float32).cpu().numpy()
        timestep_np = timestep_model_input.to(dtype=torch.float32).cpu().numpy()

        model_out = transformer_runner(latent_np, prompt_np, timestep_np, final_output_name)
        if model_out.ndim == 5 and model_out.shape[2] == 1:
            model_out = np.squeeze(model_out, axis=2)
        model_out_tensor = torch.from_numpy(model_out).to(device=device, dtype=torch.float32)
        if model_out_tensor.dim() == 5 and model_out_tensor.size(2) == 1:
            model_out_tensor = model_out_tensor.squeeze(2)
        noise_pred = -model_out_tensor

        latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]

    # 释放 transformer 缓存的 session
    transformer_runner.close()

    latents = (latents / VAE_SCALING_FACTOR) + VAE_SHIFT_FACTOR
    decoder_input = latents.to(dtype=torch.float32).cpu().numpy()

    if args.save_decoder_input:
        save_dir_path = Path(args.save_dir)
        save_dir_path.mkdir(parents=True, exist_ok=True)
        np.save(save_dir_path / "decoder_input.npy", decoder_input)
        logger.info("已保存 decoder 输入为 npy")

    del transformer_runner

    vae_decoder_session = AxInferenceSession(Path(args.vae_axmodel).as_posix())

    if decoder_input.ndim == 5 and decoder_input.shape[2] == 1:
        decoder_input = np.squeeze(decoder_input, axis=2)
    image = vae_decoder_session.run(None, {"latents": decoder_input})[0]

    del vae_decoder_session
    image = torch.from_numpy(image).to(device=device, dtype=torch.float32)
    image = image_processor.postprocess(image, output_type="pil")

    save_dir = Path(args.save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)
    target_path = save_dir / f"z_image_axmodel_{prompt_idx}.png"
    image[0].save(target_path)
    logger.info(f"AXModel 推理完成，结果保存到 {target_path}")


if __name__ == "__main__":
    """
    # 512x512 生成示例命令：
    python3 examples/z_image_fun/launcher_axmodel.py \
        --transformer-config pulsar2_configs/transformers_subgraph.json \
        --transformer-subgraph-dir comliled_subgraph_from_all_onnx \
        --vae-axmodel vae_decoder.axmodel

    # 1728x992 生成示例命令:
    python3 examples/z_image_fun/launcher_axmodel.py \
        --transformer-config pulsar2_configs/transformers_subgraph_1728x992.json \
        --transformer-subgraph-dir transformers_body_only_1728_992_split_onnx \
        --vae-axmodel onnx-models-1728x992/vae_decoder_simp_slim.axmodel \
        --max-seq-len 256 \
        --height 1728 --width 992
    """

    main()