# -*- coding: utf-8 -*-
"""
Chinese Calligraphy Generation with Flux Model
Author and font style controllable generation
"""
import os
import json
import torch
from safetensors.torch import load_file as load_safetensors
from optimum.quanto import quantize, freeze, qint4
from PIL import Image, ImageDraw, ImageFont
from typing import Optional, List, Union, Dict, Any
from einops import rearrange
from pypinyin import lazy_pinyin
from huggingface_hub import hf_hub_download, snapshot_download

from src.flux.util import configs, load_ae, load_clip, load_t5
from src.flux.model import Flux
from src.flux.xflux_pipeline import XFluxSampler

# HuggingFace Hub model IDs
HF_MODEL_ID = "TSXu/Unicalli_Pro"
HF_CHECKPOINT_INDEX = "model.safetensors.index.json"  # Sharded safetensors index
HF_INTERNVL_ID = "OpenGVLab/InternVL3-1B"


def download_sharded_safetensors(
    model_id: str = HF_MODEL_ID,
    local_dir: Optional[str] = None,
    force_download: bool = False
) -> str:
    """
    Download a sharded safetensors model from the HuggingFace Hub.

    Downloads the index file first, reads the shard list from its
    ``weight_map``, then downloads each shard.

    Args:
        model_id: HuggingFace model repository ID
        local_dir: Local directory to save files (optional)
        force_download: Whether to force re-download

    Returns:
        Path to the downloaded index.json file

    Raises:
        Exception: re-raises any download/parse error after logging it.
    """
    print(f"Downloading sharded safetensors from HuggingFace Hub ({model_id})...")

    # Get HF token from environment for private repos
    hf_token = os.environ.get("HF_TOKEN", None)

    try:
        # First download the index file
        index_path = hf_hub_download(
            repo_id=model_id,
            filename=HF_CHECKPOINT_INDEX,
            local_dir=local_dir,
            force_download=force_download,
            token=hf_token
        )
        print(f"Index downloaded to: {index_path}")

        # Read index to get shard filenames
        with open(index_path, 'r') as f:
            index = json.load(f)

        # Get unique shard files (many tensors map to the same shard)
        shard_files = set(index['weight_map'].values())
        print(f"Downloading {len(shard_files)} shard files...")

        # Download all shards (sorted for deterministic, readable progress)
        for shard_file in sorted(shard_files):
            print(f"  Downloading {shard_file}...")
            hf_hub_download(
                repo_id=model_id,
                filename=shard_file,
                local_dir=local_dir,
                force_download=force_download,
                token=hf_token
            )

        print(f"All shards downloaded!")
        return index_path

    except Exception as e:
        print(f"Error downloading model: {e}")
        raise


def is_huggingface_repo_id(path: str) -> bool:
    """
    Check if a string looks like a HuggingFace repo ID (e.g., 'namespace/repo_name')
    NOT a local file path.
    """
    # HF repo IDs have format: namespace/repo_name (exactly one /)
    # Local paths typically have multiple / or start with / or .
    if path.startswith('/') or path.startswith('.') or path.startswith('~'):
        return False
    parts = path.split('/')
    # HF repo ID should have exactly 2 non-empty parts, neither hidden-file-like
    if len(parts) == 2 and all(part and not part.startswith('.') for part in parts):
        return True
    return False


def ensure_checkpoint_exists(checkpoint_path: str) -> str:
    """
    Ensure checkpoint exists locally, download from HF Hub if not.

    Args:
        checkpoint_path: Local path or HF model ID

    Returns:
        Path to the local checkpoint/index file

    Raises:
        FileNotFoundError: if the path neither exists locally nor looks
            like a HuggingFace repo ID.
    """
    # If it's a local path and exists, return it
    if os.path.exists(checkpoint_path):
        print(f"Using local checkpoint: {checkpoint_path}")
        return checkpoint_path

    # If it looks like a HuggingFace repo ID (e.g., "TSXu/Unicalli_Pro")
    if is_huggingface_repo_id(checkpoint_path):
        print(f"Downloading from HuggingFace Hub: {checkpoint_path}")
        return download_sharded_safetensors(model_id=checkpoint_path)

    raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")


def convert_to_pinyin(text):
    """Convert Chinese text to space-separated pinyin (e.g. '楷' -> 'kai')."""
    # lazy_pinyin normally yields strings; the isinstance guard is defensive
    # in case an element comes back as a list of candidates.
    return ' '.join([item[0] if isinstance(item, list) else item
                     for item in lazy_pinyin(text)])


class CalligraphyGenerator:
    """
    Chinese Calligraphy Generator using Flux model

    Attributes:
        device: torch device for computation
        model_name: name of the flux model (flux-dev or flux-schnell)
        font_styles: available font styles for generation
        authors: available calligrapher authors
    """

    def __init__(
        self,
        model_name: str = "flux-dev",
        device: str = "cuda",
        offload: bool = True,
        checkpoint_path: Optional[str] = None,
        intern_vlm_path: Optional[str] = None,
        ref_latent_path: Optional[str] = None,
        font_descriptions_path: str = "chirography.json",
        author_descriptions_path: str = "calligraphy_styles_en.json",
        use_deepspeed: bool = False,
        use_4bit_quantization: bool = False,
        use_float8_quantization: bool = False,
        use_torch_compile: bool = False,
        compile_mode: str = "reduce-overhead",
        deepspeed_config: Optional[str] = None,
        dtype: Optional[str] = None,
        preloaded_embedding: Optional[torch.nn.Module] = None,
        preloaded_tokenizer: Optional[Any] = None,
    ):
        """
        Initialize the calligraphy generator.

        Args:
            model_name: flux model name (flux-dev or flux-schnell)
            device: device for computation
            offload: whether to offload model to CPU when not in use
            checkpoint_path: path to model checkpoint if using fine-tuned model
            intern_vlm_path: path to InternVLM model for text embedding
            ref_latent_path: path to reference latents for recognition mode
            font_descriptions_path: path to font style descriptions JSON
            author_descriptions_path: path to author style descriptions JSON
            use_deepspeed: whether to use DeepSpeed ZeRO for memory optimization
            use_4bit_quantization: whether to use 4-bit quantization (quanto/bitsandbytes)
            use_float8_quantization: whether to use Float8 quantization (torchao) for faster inference
            use_torch_compile: whether to use torch.compile for optimized inference
            compile_mode: torch.compile mode - "reduce-overhead", "max-autotune", or "default"
            deepspeed_config: path to DeepSpeed config JSON file
            dtype: force specific dtype for inference: "fp16", "bf16", "fp32", or None for auto

        Raises:
            FileNotFoundError: if the font/author description JSON files are missing.
        """
        self.device = torch.device(device)
        self.model_name = model_name
        self.offload = offload
        self.is_schnell = model_name == "flux-schnell"
        self.use_deepspeed = use_deepspeed
        self.deepspeed_config = deepspeed_config
        self.use_4bit_quantization = use_4bit_quantization
        self.use_float8_quantization = use_float8_quantization
        self.use_torch_compile = use_torch_compile
        self.compile_mode = compile_mode
        self.forced_dtype = dtype  # "fp16", "bf16", "fp32", or None for auto

        # Load font and author style descriptions
        if os.path.exists(font_descriptions_path):
            with open(font_descriptions_path, 'r', encoding='utf-8') as f:
                self.font_style_des = json.load(f)
        else:
            raise FileNotFoundError(f"Font descriptions file not found: {font_descriptions_path}")

        if os.path.exists(author_descriptions_path):
            with open(author_descriptions_path, 'r', encoding='utf-8') as f:
                self.author_style = json.load(f)
        else:
            raise FileNotFoundError(f"Author descriptions file not found: {author_descriptions_path}")

        # Load models
        print("Loading models...")

        # When using DeepSpeed, load text encoders on CPU first to save memory during initialization
        # They will be moved to GPU after DeepSpeed initializes the main model
        if self.use_deepspeed:
            text_encoder_device = "cpu"
        elif offload:
            text_encoder_device = "cpu"  # Will be moved to GPU during inference
        else:
            text_encoder_device = self.device

        self.t5 = load_t5(text_encoder_device, max_length=256 if self.is_schnell else 512)
        self.clip = load_clip(text_encoder_device)
        self.clip.requires_grad_(False)

        # Ensure checkpoint exists (download from HF Hub if needed)
        if checkpoint_path:
            checkpoint_path = ensure_checkpoint_exists(checkpoint_path)
            print(f"Loading model from checkpoint: {checkpoint_path}")
            # When using DeepSpeed, don't move to GPU yet - let DeepSpeed handle it
            self.model = self._load_model_from_checkpoint(
                checkpoint_path, model_name, offload=offload,
                use_deepspeed=self.use_deepspeed
            )
            # Initialize DeepSpeed if requested
            if self.use_deepspeed:
                self.model = self._init_deepspeed(self.model)
        else:
            # If no checkpoint path provided, download default from HF Hub.
            # BUGFIX: previously called undefined download_model_from_hf();
            # download_sharded_safetensors() (defaults to HF_MODEL_ID) is the
            # function that actually exists for this purpose.
            print("No checkpoint path provided, downloading from HuggingFace Hub...")
            checkpoint_path = download_sharded_safetensors()
            print(f"Loading model from checkpoint: {checkpoint_path}")
            self.model = self._load_model_from_checkpoint(
                checkpoint_path, model_name, offload=offload,
                use_deepspeed=self.use_deepspeed
            )
            if self.use_deepspeed:
                self.model = self._init_deepspeed(self.model)

        # Note: Float8 quantization and torch.compile optimizations
        # are applied externally (e.g., in app.py) for better control
        # over the optimization process with ZeroGPU AOT compilation.

        # Load VAE
        if self.use_deepspeed or offload:
            vae_device = "cpu"
        else:
            vae_device = self.device
        self.vae = load_ae(model_name, device=vae_device)

        # Move VAE to GPU only if offload (not DeepSpeed).
        # NOTE(review): moving to GPU when offload=True looks contradictory to
        # the offload intent, but it is the original behavior — confirm before
        # changing.
        if offload and not self.use_deepspeed:
            self.vae = self.vae.to(self.device)

        # After DeepSpeed init, move text encoders to GPU
        if self.use_deepspeed:
            print("Moving text encoders to GPU...")
            self.t5 = self.t5.to(self.device)
            self.clip = self.clip.to(self.device)
            self.vae = self.vae.to(self.device)

        # Load reference latents if provided (used by recognition mode)
        self.ref_latent = None
        if ref_latent_path and os.path.exists(ref_latent_path):
            print(f"Loading reference latents from {ref_latent_path}")
            self.ref_latent = torch.load(ref_latent_path, map_location='cpu')

        # Create sampler (use preloaded embedding if available)
        self.sampler = XFluxSampler(
            clip=self.clip,
            t5=self.t5,
            ae=self.vae,
            ref_latent=self.ref_latent,
            model=self.model,
            device=self.device,
            intern_vlm_path=intern_vlm_path,
            preloaded_embedding=preloaded_embedding,
            preloaded_tokenizer=preloaded_tokenizer,
        )

        # Font for generating condition images
        project_root = os.path.dirname(os.path.abspath(__file__))
        local_font_path = os.path.join(project_root, "FangZhengKaiTiFanTi-1.ttf")
        self.font_path = self._ensure_font_exists(local_font_path)
        self.default_font_size = 102  # 128 * 0.8

    def _ensure_font_exists(self, font_path: str) -> str:
        """
        Ensure font file exists locally, download from HF Hub if not.

        Lookup order: UNICALLI_FONT_PATH env var -> the given local path ->
        download from the HF model repo.

        Args:
            font_path: Local path to font file

        Returns:
            Path to the local font file (may be the original, possibly
            missing, path if the download fails — rendering will then fail
            later at ImageFont.truetype).
        """
        cached_font_path = os.environ.get("UNICALLI_FONT_PATH")
        if cached_font_path and os.path.exists(cached_font_path):
            return cached_font_path

        if os.path.exists(font_path):
            return font_path

        # Try to download from HF Hub
        print(f"Font file not found locally, downloading from HuggingFace Hub...")
        hf_token = os.environ.get("HF_TOKEN", None)
        try:
            font_path = hf_hub_download(
                repo_id=HF_MODEL_ID,
                filename="FangZhengKaiTiFanTi-1.ttf",
                token=hf_token
            )
            print(f"Font downloaded to: {font_path}")
            return font_path
        except Exception as e:
            print(f"Warning: Could not download font: {e}")
            return font_path  # Return original path, may fail later

    def _load_model_from_checkpoint(self, checkpoint_path: str, model_name: str,
                                    offload: bool, use_deepspeed: bool = False):
        """
        Load model from checkpoint without loading flux pretrained weights.
        Supports both regular checkpoints and NF4 quantized checkpoints.

        Args:
            checkpoint_path: Path to your checkpoint file or NF4 model directory
            model_name: flux model name (for config)
            offload: whether to offload to CPU
            use_deepspeed: whether using DeepSpeed (keeps model on CPU)

        Returns:
            model with loaded checkpoint
        """
        print(f"Creating empty flux model structure...")
        load_device = "cpu"

        # Create model structure without loading pretrained weights (using "meta" device)
        with torch.device("meta"):
            model = Flux(configs[model_name].params)

        # Initialize module embeddings (must be done before loading checkpoint)
        print("Initializing module embeddings...")
        model.init_module_embeddings(tokens_num=320, cond_txt_channel=896)

        # Move model to loading device (to_empty allocates real storage for meta tensors)
        print(f"Moving model to {load_device} for loading...")
        model = model.to_empty(device=load_device)

        # Check if this is an NF4 quantized model
        is_nf4 = self._is_nf4_checkpoint(checkpoint_path)

        # Load checkpoint
        print(f"Loading checkpoint from {checkpoint_path}")
        if is_nf4:
            print("Detected NF4 quantized model, dequantizing...")
            checkpoint = self._load_nf4_checkpoint(checkpoint_path)
        else:
            checkpoint = self._load_checkpoint_file(checkpoint_path)

        # Determine dtype from checkpoint - keep original dtype for efficiency.
        # Assumes all tensors in the checkpoint share the first tensor's dtype.
        first_tensor = next(iter(checkpoint.values()))
        checkpoint_dtype = first_tensor.dtype
        print(f"Checkpoint dtype: {checkpoint_dtype}")

        # Check if user forced a specific dtype
        forced_dtype = getattr(self, 'forced_dtype', None)
        if forced_dtype:
            dtype_map = {
                "fp16": torch.float16,
                "bf16": torch.bfloat16,
                "fp32": torch.float32,
                "fp8": torch.float8_e4m3fn,
            }
            if forced_dtype not in dtype_map:
                print(f"Warning: Unknown dtype '{forced_dtype}', using auto selection")
                forced_dtype = None
            else:
                target_dtype = dtype_map[forced_dtype]
                print(f"Using forced dtype: {target_dtype}")
                if checkpoint_dtype != target_dtype:
                    print(f"Converting checkpoint from {checkpoint_dtype} to {target_dtype}...")
                    checkpoint = {k: v.to(target_dtype) for k, v in checkpoint.items()}

        if not forced_dtype:
            # Note: We trust the original precision (like FP8) if it is provided that way
            target_dtype = checkpoint_dtype
            print(f"Using auto-detected checkpoint dtype: {target_dtype} for inference loading")

        # Load weights into model (assign=True keeps the checkpoint tensors
        # instead of copying into the to_empty()-allocated storage)
        model.load_state_dict(checkpoint, strict=False, assign=True)
        print(f"Model dtype after loading: {next(model.parameters()).dtype}")

        # Store target dtype for inference
        self._model_dtype = target_dtype

        # Free checkpoint memory
        del checkpoint

        # Apply bitsandbytes 4-bit quantization if requested
        if hasattr(self, 'use_4bit_quantization') and self.use_4bit_quantization:
            try:
                import bitsandbytes as bnb
                print("Applying bitsandbytes NF4 quantization for 4-bit inference...")
                model = self._quantize_model_bnb(model)
                model._is_quantized = True
                print("bitsandbytes NF4 quantization complete!")
            except ImportError:
                # Fallback: optimum-quanto int4 quantization on a float copy
                print("bitsandbytes not available, using quanto quantization...")
                model = model.float()
                quantize(model, weights=qint4)
                freeze(model)
                model._is_quantized = True
                print("quanto 4-bit quantization complete!")

        # Move to GPU only if NOT using DeepSpeed
        if not use_deepspeed:
            if self.device.type != "cpu":
                print(f"Moving model to {self.device}...")
                model = model.to(self.device)

            # Enable optimized attention backends
            try:
                torch.backends.cuda.enable_flash_sdp(True)
                torch.backends.cuda.enable_mem_efficient_sdp(True)
                torch.backends.cuda.enable_math_sdp(False)
                print("Enabled FlashAttention / Memory-Efficient SDPA backends")
            except Exception as e:
                print(f"Could not configure SDPA backends: {e}")

        return model

    def _is_nf4_checkpoint(self, path: str) -> bool:
        """Check if path contains an NF4 quantized checkpoint
        (a directory with a quantization_config.json)."""
        if os.path.isdir(path):
            return os.path.exists(os.path.join(path, "quantization_config.json"))
        return False

    def _load_nf4_checkpoint(self, checkpoint_dir: str) -> dict:
        """
        Load NF4 quantized checkpoint and dequantize to float tensors.

        Each quantized tensor is stored as packed 4-bit indices
        (``.quant_data``), per-block ``.scales``, its original ``.shape``
        and a ``.pad_len`` for trailing padding.

        Args:
            checkpoint_dir: Directory containing NF4 model files

        Returns:
            Dequantized state dict
        """
        # Load quantization config
        config_path = os.path.join(checkpoint_dir, "quantization_config.json")
        with open(config_path, 'r') as f:
            quant_config = json.load(f)

        block_size = quant_config.get("block_size", 64)
        quantized_keys = set(quant_config.get("quantized_keys", []))

        # Load index
        index_path = os.path.join(checkpoint_dir, "model_nf4.safetensors.index.json")
        with open(index_path, 'r') as f:
            index = json.load(f)

        # Load all shards
        shard_files = sorted(set(index['weight_map'].values()))
        print(f"Loading {len(shard_files)} NF4 shards...")

        raw_state = {}
        for shard_file in shard_files:
            shard_path = os.path.join(checkpoint_dir, shard_file)
            print(f"  Loading {shard_file}...")
            shard_data = load_safetensors(shard_path)
            raw_state.update(shard_data)

        # NF4 lookup table for dequantization (the 16 quantiles from the
        # QLoRA paper / bitsandbytes NF4 data type).
        # BUGFIX: index 10 was 0.24611230850220; the canonical NF4 value is
        # 0.24611230194568634 — dequantization must use the exact table the
        # quantizer used or those weights are silently corrupted.
        nf4_values = torch.tensor([
            -1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453,
            -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0,
            0.07958029955625534, 0.16093020141124725, 0.24611230194568634, 0.33791524171829224,
            0.44070982933044434, 0.5626170039176941, 0.7229568362236023, 1.0
        ], dtype=torch.float32)

        # Dequantize
        state_dict = {}
        dequant_count = 0

        for key in list(raw_state.keys()):
            if key.endswith('.quant_data'):
                base_key = key.replace('.quant_data', '')
                if base_key in quantized_keys:
                    # Dequantize this tensor
                    quant_data = raw_state[f"{base_key}.quant_data"]
                    scales = raw_state[f"{base_key}.scales"]
                    shape = raw_state[f"{base_key}.shape"].tolist()
                    pad_len = raw_state[f"{base_key}.pad_len"].item()

                    # Unpack 4-bit values (two indices per stored byte)
                    high = (quant_data >> 4) & 0x0F
                    low = quant_data & 0x0F
                    indices = torch.stack([high, low], dim=-1).flatten().long()

                    # Lookup and reshape
                    values = nf4_values[indices]

                    # Apply per-block scales
                    num_blocks = len(scales)
                    values = values[:num_blocks * block_size].reshape(num_blocks, block_size)
                    values = values * scales.float().unsqueeze(1)
                    values = values.flatten()

                    # Remove padding and reshape
                    if pad_len > 0:
                        values = values[:-pad_len]
                    state_dict[base_key] = values.reshape(shape)
                    dequant_count += 1
            elif not any(key.endswith(s) for s in ['.scales', '.shape', '.block_size', '.pad_len']):
                # Non-quantized tensor, keep as-is
                state_dict[key] = raw_state[key]

        print(f"Dequantized {dequant_count} tensors")
        return state_dict

    def _quantize_model_bnb(self, model):
        """
        Quantize model using bitsandbytes NF4.
        Replaces Linear layers with Linear4bit for true 4-bit inference.
        """
        import bitsandbytes as bnb
        import torch.nn as nn

        def replace_linear_with_4bit(module, name=''):
            for child_name, child in list(module.named_children()):
                full_name = f"{name}.{child_name}" if name else child_name
                if isinstance(child, nn.Linear):
                    # Create 4-bit linear layer
                    new_layer = bnb.nn.Linear4bit(
                        child.in_features,
                        child.out_features,
                        bias=child.bias is not None,
                        compute_dtype=torch.bfloat16,
                        compress_statistics=True,
                        quant_type='nf4'
                    )
                    # Copy weights (will be quantized when moved to GPU)
                    new_layer.weight = bnb.nn.Params4bit(
                        child.weight.data,
                        requires_grad=False,
                        quant_type='nf4'
                    )
                    if child.bias is not None:
                        new_layer.bias = nn.Parameter(child.bias.data)
                    setattr(module, child_name, new_layer)
                else:
                    # Recurse into containers
                    replace_linear_with_4bit(child, full_name)

        print("Replacing Linear layers with Linear4bit...")
        replace_linear_with_4bit(model)
        return model

    def _init_deepspeed(self, model):
        """
        Initialize DeepSpeed for the model with ZeRO-3 inference optimization.

        Args:
            model: PyTorch model to wrap with DeepSpeed

        Returns:
            DeepSpeed inference engine

        Raises:
            ImportError: if deepspeed is not installed.
            FileNotFoundError: if the DeepSpeed config file is missing.
        """
        try:
            import deepspeed
        except ImportError:
            raise ImportError("DeepSpeed is not installed. Install it with: pip install deepspeed")

        # Load DeepSpeed config
        if self.deepspeed_config is None:
            self.deepspeed_config = "ds_config_zero2.json"

        if not os.path.exists(self.deepspeed_config):
            raise FileNotFoundError(f"DeepSpeed config not found: {self.deepspeed_config}")

        print(f"Initializing DeepSpeed Inference with config: {self.deepspeed_config}")

        # Initialize distributed environment for single GPU if not already initialized
        if not torch.distributed.is_initialized():
            import random
            # Set environment variables for single-process mode
            # Use a random port to avoid conflicts
            port = random.randint(29500, 29600)
            os.environ['MASTER_ADDR'] = 'localhost'
            os.environ['MASTER_PORT'] = str(port)
            os.environ['RANK'] = '0'
            os.environ['LOCAL_RANK'] = '0'
            os.environ['WORLD_SIZE'] = '1'

            # Initialize process group
            try:
                torch.distributed.init_process_group(
                    backend='nccl',
                    init_method='env://',
                    world_size=1,
                    rank=0
                )
                print(f"Initialized single-GPU distributed environment for DeepSpeed on port {port}")
            except RuntimeError as e:
                if "address already in use" in str(e):
                    print(f"Port {port} in use, trying again...")
                    # Try a different port
                    port = random.randint(29600, 29700)
                    os.environ['MASTER_PORT'] = str(port)
                    torch.distributed.init_process_group(
                        backend='nccl',
                        init_method='env://',
                        world_size=1,
                        rank=0
                    )
                    print(f"Initialized single-GPU distributed environment for DeepSpeed on port {port}")
                else:
                    raise

        # Use DeepSpeed inference API instead of initialize
        # This doesn't require an optimizer.
        # NOTE(review): ds_config is parsed (validating the JSON) but not
        # passed to init_inference — confirm whether it should be.
        with open(self.deepspeed_config) as f:
            ds_config = json.load(f)

        model_engine = deepspeed.init_inference(
            model=model,
            mp_size=1,  # model parallel size
            dtype=torch.float32,  # Use float32 for compatibility
            replace_with_kernel_inject=False,  # Don't replace with DeepSpeed kernels for custom models
        )

        print("DeepSpeed Inference initialized successfully")
        return model_engine

    def _load_checkpoint_file(self, checkpoint_path: str) -> dict:
        """
        Load checkpoint file and extract state dict.
        Supports: sharded safetensors, single safetensors, .bin/.pt files

        Args:
            checkpoint_path: Path to checkpoint file or index.json

        Returns:
            state_dict: model state dictionary
        """
        # Check if it's a sharded safetensors (index.json file)
        if checkpoint_path.endswith('.index.json'):
            print(f"Loading sharded safetensors from index: {checkpoint_path}")
            with open(checkpoint_path, 'r') as f:
                index = json.load(f)

            # Get the directory containing the shards
            shard_dir = os.path.dirname(checkpoint_path)

            # Get unique shard files
            shard_files = sorted(set(index['weight_map'].values()))
            print(f"Loading {len(shard_files)} shard files in parallel...")

            # Load shards in parallel using ThreadPoolExecutor
            from concurrent.futures import ThreadPoolExecutor, as_completed

            def load_shard(shard_file):
                shard_path = os.path.join(shard_dir, shard_file)
                return shard_file, load_safetensors(shard_path)

            state_dict = {}
            with ThreadPoolExecutor(max_workers=len(shard_files)) as executor:
                futures = {executor.submit(load_shard, sf): sf for sf in shard_files}
                for future in as_completed(futures):
                    shard_file, shard_dict = future.result()
                    print(f"  Loaded {shard_file}")
                    state_dict.update(shard_dict)

            print(f"Loaded {len(state_dict)} tensors from sharded safetensors")
            return state_dict

        # Check if it's a single safetensors file
        if checkpoint_path.endswith('.safetensors'):
            print(f"Loading safetensors: {checkpoint_path}")
            state_dict = load_safetensors(checkpoint_path)
            return state_dict

        # Check if it's a directory containing checkpoint files
        if os.path.isdir(checkpoint_path):
            # Look for index.json first (sharded safetensors)
            index_path = os.path.join(checkpoint_path, 'model.safetensors.index.json')
            if os.path.exists(index_path):
                return self._load_checkpoint_file(index_path)

            # Look for common checkpoint filenames
            possible_files = [
                'model.safetensors', 'model.pt', 'model.pth', 'model.bin',
                'checkpoint.pt', 'checkpoint.pth', 'pytorch_model.bin',
                'model_state_dict.pt'
            ]
            checkpoint_file = None
            for filename in possible_files:
                full_path = os.path.join(checkpoint_path, filename)
                if os.path.exists(full_path):
                    checkpoint_file = full_path
                    # BUGFIX: message previously printed a literal placeholder
                    # instead of the matched filename.
                    print(f"Found checkpoint file: {filename}")
                    break

            if checkpoint_file is None:
                import glob
                # Try safetensors first
                st_files = glob.glob(os.path.join(checkpoint_path, "*.safetensors"))
                if st_files:
                    checkpoint_file = st_files[0]
                else:
                    pt_files = glob.glob(os.path.join(checkpoint_path, "*.pt")) + \
                               glob.glob(os.path.join(checkpoint_path, "*.pth")) + \
                               glob.glob(os.path.join(checkpoint_path, "*.bin"))
                    if pt_files:
                        checkpoint_file = pt_files[0]
                    else:
                        raise ValueError(f"No checkpoint files found in directory: {checkpoint_path}")
                print(f"Found checkpoint file: {os.path.basename(checkpoint_file)}")

            checkpoint_path = checkpoint_file
            # Recursively call to handle the found file
            return self._load_checkpoint_file(checkpoint_path)

        # Load .bin or .pt checkpoint
        print(f"Loading checkpoint file: {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path, map_location='cpu')

        # Handle different checkpoint formats
        if isinstance(checkpoint, dict):
            if 'model' in checkpoint:
                state_dict = checkpoint['model']
            elif 'model_state_dict' in checkpoint:
                state_dict = checkpoint['model_state_dict']
            elif 'state_dict' in checkpoint:
                state_dict = checkpoint['state_dict']
            else:
                state_dict = checkpoint

            if 'epoch' in checkpoint:
                print(f"Checkpoint from epoch: {checkpoint['epoch']}")
            if 'global_step' in checkpoint:
                print(f"Checkpoint from step: {checkpoint['global_step']}")
            if 'loss' in checkpoint:
                print(f"Checkpoint loss: {checkpoint['loss']:.4f}")
        else:
            state_dict = checkpoint

        # Remove 'module.' prefix if present (DataParallel/DDP checkpoints)
        if any(key.startswith('module.') for key in state_dict.keys()):
            state_dict = {key.replace('module.', ''): value
                          for key, value in state_dict.items()}
            print("Removed 'module.' prefix from state dict keys")

        return state_dict

    def text_to_cond_image(
        self,
        text: str,
        img_size: int = 128,
        font_scale: float = 0.8,
        font_path: Optional[str] = None,
        fixed_chars: int = 7
    ) -> Image.Image:
        """
        Convert text to condition image - always creates image for fixed_chars characters.
        Text is arranged from top to bottom.

        Args:
            text: Chinese text to convert (must be <= fixed_chars characters)
            img_size: size of each character block (default 128)
            font_scale: scale of font relative to image size (default 0.8)
            font_path: path to font file
            fixed_chars: fixed number of character slots (default 7)

        Returns:
            PIL Image with text rendered (always fixed_chars * img_size height)

        Raises:
            ValueError: if text is longer than fixed_chars.
        """
        if len(text) > fixed_chars:
            raise ValueError(f"Text must be at most {fixed_chars} characters, got {len(text)}")

        if font_path is None:
            font_path = self.font_path

        # Create font - font size is scaled down from img_size
        font_size_scaled = int(font_scale * img_size)
        font = ImageFont.truetype(font_path, font_size_scaled)

        # Calculate image dimensions - always fixed_chars height
        img_width = img_size
        img_height = img_size * fixed_chars  # Fixed height for 7 characters

        # Create white background image
        cond_img = Image.new("RGB", (img_width, img_height), (255, 255, 255))
        cond_draw = ImageDraw.Draw(cond_img)

        # Draw each character from top to bottom
        # Note: font_size for positioning should be img_size, not the scaled font size
        for i, char in enumerate(text):
            # NOTE(review): margin is derived from the scaled font size; if a
            # centered glyph in the img_size cell is intended, img_size might
            # be the right base — confirm against training data rendering.
            font_space = font_size_scaled * (1 - font_scale) // 2
            # Position based on img_size blocks, not scaled font size
            font_position = (font_space, img_size * i + font_space)
            cond_draw.text(font_position, char, font=font, fill=(0, 0, 0))

        return cond_img

    def build_prompt(
        self,
        font_style: str = "楷",
        author: str = None,
        is_traditional: bool = True,
    ) -> str:
        """
        Build prompt for generation following dataset.py logic.

        Args:
            font_style: font style (楷/草/行)
            author: author name (Chinese or None for synthetic)
            is_traditional: whether generating traditional calligraphy

        Returns:
            formatted prompt string

        Raises:
            ValueError: if font_style is not a known style.
        """
        # Validate font style
        if font_style not in self.font_style_des:
            raise ValueError(f"Font style must be one of: {list(self.font_style_des.keys())}")

        # Convert font style to pinyin
        font_style_pinyin = convert_to_pinyin(font_style)

        # Build prompt based on traditional or synthetic
        if is_traditional and author and author in self.author_style:
            # Traditional calligraphy with specific author
            prompt = f"Traditional Chinese calligraphy works, background: black, font: {font_style_pinyin}, "
            prompt += self.font_style_des[font_style]
            author_info = self.author_style[author]
            prompt += f" author: {author_info}"
        else:
            # Synthetic calligraphy
            prompt = f"Synthetic calligraphy data, background: black, font: {font_style_pinyin}, "
            prompt += self.font_style_des[font_style]

        return prompt

    @torch.no_grad()
    def generate(
        self,
        text: str,
        font_style: str = "楷",
        author: str = None,
        width: int = 128,
        height: int = None,  # Fixed to 7 characters height
        num_steps: int = 50,
        guidance: float = 3.5,
        seed: int = None,
        is_traditional: bool = None,
        save_path: Optional[str] = None
    ) -> tuple[Image.Image, Image.Image]:
        """
        Generate calligraphy image from text.

        Args:
            text: Chinese text to generate (1-7 characters)
            font_style: font style (楷/草/行)
            author: author/calligrapher name from the style list
            width: image width (default 128)
            height: image height (fixed to 7 * width)
            num_steps: number of denoising steps
            guidance: guidance scale
            seed: random seed for generation
            is_traditional: whether generating traditional calligraphy (auto-determined if None)
            save_path: optional path to save the generated image

        Returns:
            tuple of (generated_image, condition_image)

        Raises:
            ValueError: if text is empty or longer than 7 characters.
        """
        # Fixed number of characters
        FIXED_CHARS = 7

        # Validate text - must have 1-7 characters
        if len(text) < 1:
            raise ValueError(f"Text must have at least 1 character, got empty string")
        if len(text) > FIXED_CHARS:
            raise ValueError(f"Text must have at most {FIXED_CHARS} characters, got {len(text)}")

        if seed is None:
            seed = torch.randint(0, 2**32, (1,)).item()

        # Fixed height for 7 characters
        num_chars = len(text)
        height = width * FIXED_CHARS  # Always 7 characters height

        # Auto-determine traditional vs synthetic
        if is_traditional is None:
            is_traditional = author is not None and author in self.author_style

        # Generate condition image (fixed size for 7 characters)
        cond_img = self.text_to_cond_image(text, img_size=width, fixed_chars=FIXED_CHARS)

        # Build prompt
        prompt = self.build_prompt(
            font_style=font_style,
            author=author,
            is_traditional=is_traditional,
        )

        print(f"Generating with prompt: {prompt}")
        print(f"Text: {text} ({num_chars} chars), Seed: {seed}")

        # Generate image.
        # NOTE(review): `guidance` is not forwarded to the sampler here —
        # confirm whether XFluxSampler should receive it.
        result_img, recognized_text = self.sampler(
            prompt=prompt,
            width=width,
            height=height,
            num_steps=num_steps,
            controlnet_image=cond_img,
            is_generation=True,
            cond_text=text,
            required_chars=FIXED_CHARS,  # Always 7 characters
            seed=seed
        )

        # Crop to actual text length if less than FIXED_CHARS
        if num_chars < FIXED_CHARS:
            actual_height = width * num_chars
            # Crop result image (top portion only)
            result_img = result_img.crop((0, 0, width, actual_height))
            # Crop condition image as well
            cond_img = cond_img.crop((0, 0, width, actual_height))

        # Save if path provided
        if save_path:
            os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
            result_img.save(save_path)
            print(f"Image saved to {save_path}")

        return result_img, cond_img

    def batch_generate(
        self,
        texts: List[str],
        font_styles: Optional[List[str]] = None,
        authors: Optional[List[str]] = None,
        output_dir: str = "./outputs",
        **kwargs
    ) -> List[tuple[Image.Image, Image.Image]]:
        """
        Batch generate calligraphy images.

        Args:
            texts: list of texts to generate (1-7 characters each)
            font_styles: list of font styles (if None, use default)
            authors: list of authors (if None, use synthetic)
            output_dir: directory to save outputs
            **kwargs: additional arguments for generate()

        Returns:
            list of (generated_image, condition_image) tuples
        """
        os.makedirs(output_dir, exist_ok=True)

        results = []

        # Default styles and authors if not provided
        if font_styles is None:
            font_styles = ["楷"] * len(texts)
        if authors is None:
            authors = [None] * len(texts)

        for i, (text, font, author) in enumerate(zip(texts, font_styles, authors)):
            # Clean author name for filename
            author_name = author if author else "synthetic"
            if author and author in self.author_style:
                author_name = convert_to_pinyin(author)

            save_path = os.path.join(
                output_dir,
                f"{text}_{font}_{author_name}_{i}.png"
            )

            result_img, cond_img = self.generate(
                text=text,
                font_style=font,
                author=author,
                save_path=save_path,
                **kwargs
            )
            results.append((result_img, cond_img))

        return results

    def get_available_authors(self) -> List[str]:
        """Get list of available author styles"""
        return list(self.author_style.keys())

    def get_available_fonts(self) -> List[str]:
        """Get list of available font styles"""
        return list(self.font_style_des.keys())


# Hugging Face Pipeline wrapper
class FluxCalligraphyPipeline:
    """Hugging Face compatible pipeline for calligraphy generation"""

    def __init__(
        self,
        model_name: str = "flux-dev",
        device: str = "cuda",
        checkpoint_path: Optional[str] = None,
        **kwargs
    ):
        """Initialize the pipeline"""
        self.generator = CalligraphyGenerator(
            model_name=model_name,
            device=device,
            checkpoint_path=checkpoint_path,
            **kwargs
        )

    def __call__(
        self,
        text: Union[str, List[str]],
        font_style: Union[str, List[str]] = "楷",
        author: Union[str, List[str]] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 3.5,
        generator: Optional[torch.Generator] = None,
        **kwargs
    ) -> Union[Image.Image, List[Image.Image]]:
        """
        Generate calligraphy images.

        Args:
            text: text or list of texts to generate (1-7 characters each)
            font_style: font style(s) (楷/草/行)
            author: author name(s) from the style list
            num_inference_steps: number of denoising steps
            guidance_scale: guidance scale for generation
            generator: torch generator for reproducibility

        Returns:
            generated image(s)
        """
        # Handle single text
        if isinstance(text, str):
            seed = None
            if generator is not None:
                seed = generator.initial_seed()
            result, _ = self.generator.generate(
                text=text,
                font_style=font_style,
                author=author,
                num_steps=num_inference_steps,
                guidance=guidance_scale,
                seed=seed,
                **kwargs
            )
            return result

        # Handle batch
        else:
            if isinstance(font_style, str):
                font_style = [font_style] * len(text)
            if isinstance(author, str) or author is None:
                author = [author] * len(text)

            results = []
            for t, f, a in zip(text, font_style, author):
                seed = None
                if generator is not None:
                    seed = generator.initial_seed()
                result, _ = self.generator.generate(
                    text=t,
                    font_style=f,
                    author=a,
                    num_steps=num_inference_steps,
                    guidance=guidance_scale,
                    seed=seed,
                    **kwargs
                )
                results.append(result)
            return results


if __name__ == "__main__":
    # Example usage
    import argparse

    parser = argparse.ArgumentParser(description="Generate Chinese calligraphy")
    parser.add_argument("--text", type=str, default="暴富且平安",
                        help="Text to generate (1-7 characters)")
    parser.add_argument("--font", type=str, default="楷",
                        help="Font style (楷/草/行)")
    parser.add_argument("--author", type=str, default=None,
                        help="Author/calligrapher name")
    parser.add_argument("--steps", type=int, default=50,
                        help="Number of inference steps")
    parser.add_argument("--seed", type=int, default=None,
                        help="Random seed")
    parser.add_argument("--output", type=str, default="output.png",
                        help="Output path")
    parser.add_argument("--device", type=str, default="cuda",
                        help="Device to use")
    parser.add_argument("--checkpoint", type=str, default=None,
                        help="Checkpoint path")
    parser.add_argument("--list-authors", action="store_true",
                        help="List available authors")
    parser.add_argument("--list-fonts", action="store_true",
                        help="List available font styles")
    parser.add_argument("--float8", action="store_true",
                        help="Use Float8 quantization (torchao) for faster inference")
    parser.add_argument("--compile", action="store_true",
                        help="Use torch.compile for optimized inference")
    parser.add_argument("--compile-mode", type=str, default="max-autotune",
                        choices=["reduce-overhead", "max-autotune", "default"],
                        help="torch.compile mode")

    args = parser.parse_args()

    # Initialize generator
    generator = CalligraphyGenerator(
        model_name="flux-dev",
        device=args.device,
        checkpoint_path=args.checkpoint,
    )

    # Apply optimizations if requested (CLI mode)
    if args.float8 or args.compile:
        from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig
        import torch._inductor.config as inductor_config

        # Inductor configs from FLUX-Kontext-fp8
        inductor_config.conv_1x1_as_mm = True
        inductor_config.coordinate_descent_tuning = True
        inductor_config.coordinate_descent_check_all_directions = True
        inductor_config.max_autotune = True

        if args.float8:
            print("Applying Float8 quantization...")
            quantize_(generator.model, Float8DynamicActivationFloat8WeightConfig())
            print("✓ Float8 quantization complete!")

        if args.compile:
            print(f"Applying torch.compile (mode={args.compile_mode})...")
            generator.model = torch.compile(
                generator.model,
                mode=args.compile_mode,
                backend="inductor",
                dynamic=True,
            )
            print("✓ torch.compile applied!")

    # List available options
    if args.list_authors:
        print("Available authors:")
        for author in generator.get_available_authors()[:20]:  # Show first 20
            print(f"  - {author}")
        print(f"  ... and {len(generator.get_available_authors()) - 20} more")
        exit(0)

    if args.list_fonts:
        print("Available font styles:")
        for font in generator.get_available_fonts():
            print(f"  - {font}: {generator.font_style_des[font]}")
        exit(0)

    # Validate text - must have 1-7 characters
    if len(args.text) < 1:
        print(f"Error: Text must have at least 1 character")
        exit(1)
    if len(args.text) > 7:
        print(f"Error: Text must have at most 7 characters, got {len(args.text)}")
        exit(1)

    # Generate
    result_img, cond_img = generator.generate(
        text=args.text,
        font_style=args.font,
        author=args.author,
        num_steps=args.steps,
        seed=args.seed,
        save_path=args.output
    )

    print(f"Generation complete! Saved to {args.output}")