Txu647 committed on
Commit
414150e
·
1 Parent(s): 11050b2

Add NF4 4-bit inference with bitsandbytes

Browse files
Files changed (5) hide show
  1. README.md +2 -5
  2. app.py +21 -3
  3. inference.py +163 -19
  4. requirements.txt +1 -0
  5. src/flux/xflux_pipeline.py +91 -9
README.md CHANGED
@@ -11,14 +11,10 @@ license: cc-by-nc-nd-4.0
11
  short_description: Chinese Calligraphy Generator
12
  ---
13
 
14
- # 🖌️ UniCalli - Chinese Calligraphy Generator
15
 
16
  **A Unified Diffusion Framework for Column-Level Generation and Recognition of Chinese Calligraphy**
17
 
18
- Generate beautiful Chinese calligraphy in various styles and by different historical masters.
19
-
20
- 用不同历史书法大师的风格生成精美的中国书法。
21
-
22
  ## Links
23
 
24
  - 🌐 **Project Page**: [https://envision-research.github.io/UniCalli/](https://envision-research.github.io/UniCalli/)
@@ -33,6 +29,7 @@ Generate beautiful Chinese calligraphy in various styles and by different histor
33
  - **Historical Masters**: 90+ calligraphers including 王羲之, 颜真卿, 赵佶/宋徽宗, etc.
34
  - **Multiple Font Styles**: 楷 (Regular), 行 (Running), 草 (Cursive)
35
  - **Interactive Session**: Generate multiple images in one GPU session
 
36
 
37
  ## Usage
38
 
 
11
  short_description: Chinese Calligraphy Generator
12
  ---
13
 
14
+ # 🖌️ UniCalli-Dev - Chinese Calligraphy Generator
15
 
16
  **A Unified Diffusion Framework for Column-Level Generation and Recognition of Chinese Calligraphy**
17
 
 
 
 
 
18
  ## Links
19
 
20
  - 🌐 **Project Page**: [https://envision-research.github.io/UniCalli/](https://envision-research.github.io/UniCalli/)
 
29
  - **Historical Masters**: 90+ calligraphers including 王羲之, 颜真卿, 赵佶/宋徽宗, etc.
30
  - **Multiple Font Styles**: 楷 (Regular), 行 (Running), 草 (Cursive)
31
  - **Interactive Session**: Generate multiple images in one GPU session
32
+ - **4-bit Quantization**: Runtime quantization for efficient inference on limited GPU memory
33
 
34
  ## Usage
35
 
app.py CHANGED
@@ -68,16 +68,34 @@ def init_generator():
68
  if generator is None:
69
  # Lazy import to avoid CUDA initialization at module load time
70
  from inference import CalligraphyGenerator
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  generator = CalligraphyGenerator(
72
  model_name="flux-dev",
73
  device="cuda",
74
  offload=True, # Enable offload to save GPU memory
75
- intern_vlm_path="OpenGVLab/InternVL3-1B",
76
- checkpoint_path="TSXu/Unicalli_Pro",
77
  font_descriptions_path='dataset/chirography.json',
78
  author_descriptions_path='dataset/calligraphy_styles_en.json',
79
  use_deepspeed=False,
80
- use_4bit_quantization=False, # Disabled - quantization overhead not worth it
81
  )
82
  return generator
83
 
 
68
  if generator is None:
69
  # Lazy import to avoid CUDA initialization at module load time
70
  from inference import CalligraphyGenerator
71
+ import os
72
+ from huggingface_hub import snapshot_download
73
+
74
+ # Download NF4 quantized model from HuggingFace (~6GB instead of 23GB)
75
+ hf_token = os.environ.get("HF_TOKEN", None)
76
+ print("Downloading NF4 quantized model from TSXu/Unicalli_Pro...")
77
+ local_dir = snapshot_download(
78
+ repo_id="TSXu/Unicalli_Pro",
79
+ allow_patterns=["unicalli_pro_chars7_nf4/*"],
80
+ token=hf_token
81
+ )
82
+ checkpoint_path = os.path.join(local_dir, "unicalli_pro_chars7_nf4")
83
+ intern_vlm_path = os.path.join(checkpoint_path, "internvl_embedding")
84
+
85
+ # Fallback to full InternVL3 if embedding not in NF4 folder
86
+ if not os.path.exists(intern_vlm_path):
87
+ intern_vlm_path = "OpenGVLab/InternVL3-1B"
88
+
89
  generator = CalligraphyGenerator(
90
  model_name="flux-dev",
91
  device="cuda",
92
  offload=True, # Enable offload to save GPU memory
93
+ intern_vlm_path=intern_vlm_path,
94
+ checkpoint_path=checkpoint_path, # NF4 quantized model
95
  font_descriptions_path='dataset/chirography.json',
96
  author_descriptions_path='dataset/calligraphy_styles_en.json',
97
  use_deepspeed=False,
98
+ use_4bit_quantization=True, # Use bitsandbytes for true 4-bit inference
99
  )
100
  return generator
101
 
inference.py CHANGED
@@ -284,10 +284,10 @@ class CalligraphyGenerator:
284
  def _load_model_from_checkpoint(self, checkpoint_path: str, model_name: str, offload: bool, use_deepspeed: bool = False):
285
  """
286
  Load model from checkpoint without loading flux pretrained weights.
287
- This creates an empty model, initializes module embeddings, then loads your checkpoint.
288
 
289
  Args:
290
- checkpoint_path: Path to your checkpoint file
291
  model_name: flux model name (for config)
292
  offload: whether to offload to CPU
293
  use_deepspeed: whether using DeepSpeed (keeps model on CPU)
@@ -296,8 +296,6 @@ class CalligraphyGenerator:
296
  model with loaded checkpoint
297
  """
298
  print(f"Creating empty flux model structure...")
299
- # Load checkpoint on CPU first to save memory
300
- # If using DeepSpeed, keep on CPU; otherwise move to GPU after loading
301
  load_device = "cpu"
302
 
303
  # Create model structure without loading pretrained weights (using "meta" device)
@@ -312,9 +310,16 @@ class CalligraphyGenerator:
312
  print(f"Moving model to {load_device} for loading...")
313
  model = model.to_empty(device=load_device)
314
 
 
 
 
315
  # Load checkpoint
316
  print(f"Loading checkpoint from {checkpoint_path}")
317
- checkpoint = self._load_checkpoint_file(checkpoint_path)
 
 
 
 
318
 
319
  # Determine dtype from checkpoint and convert to float32
320
  first_tensor = next(iter(checkpoint.values()))
@@ -326,36 +331,175 @@ class CalligraphyGenerator:
326
  print(f"Converting checkpoint from {checkpoint_dtype} to float32...")
327
  checkpoint = {k: v.float() for k, v in checkpoint.items()}
328
 
329
- # Load weights into model (assign=True to use checkpoint tensors directly, preserving dtype)
330
  model.load_state_dict(checkpoint, strict=False, assign=True)
331
  print(f"Model dtype after loading: {next(model.parameters()).dtype}")
 
 
 
332
 
333
- # Apply 4-bit quantization if requested
334
  if hasattr(self, 'use_4bit_quantization') and self.use_4bit_quantization:
335
- print("Applying 4-bit quantization...")
336
- model = model.float() # 先转为 float32
337
- quantize(model, weights=qint4)
338
- freeze(model)
339
- model._is_quantized = True # 添加标记供 xflux_pipeline 检查
340
- print("4-bit quantization complete!")
341
-
342
- # Move to GPU only if NOT using DeepSpeed (DeepSpeed will handle device placement)
 
 
 
 
 
 
 
343
  if not use_deepspeed:
344
- print(f"Moving model to {self.device} and converting to float32...")
345
- model = model.to(device=self.device, dtype=torch.float32)
346
 
347
  # Enable optimized attention backends
348
  try:
349
- # Prefer FlashAttention if available (fastest)
350
  torch.backends.cuda.enable_flash_sdp(True)
351
  torch.backends.cuda.enable_mem_efficient_sdp(True)
352
- torch.backends.cuda.enable_math_sdp(False) # Disable slowest fallback
353
  print("Enabled FlashAttention / Memory-Efficient SDPA backends")
354
  except Exception as e:
355
  print(f"Could not configure SDPA backends: {e}")
356
 
357
  return model
358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  def _init_deepspeed(self, model):
360
  """
361
  Initialize DeepSpeed for the model with ZeRO-3 inference optimization.
 
284
  def _load_model_from_checkpoint(self, checkpoint_path: str, model_name: str, offload: bool, use_deepspeed: bool = False):
285
  """
286
  Load model from checkpoint without loading flux pretrained weights.
287
+ Supports both regular checkpoints and NF4 quantized checkpoints.
288
 
289
  Args:
290
+ checkpoint_path: Path to your checkpoint file or NF4 model directory
291
  model_name: flux model name (for config)
292
  offload: whether to offload to CPU
293
  use_deepspeed: whether using DeepSpeed (keeps model on CPU)
 
296
  model with loaded checkpoint
297
  """
298
  print(f"Creating empty flux model structure...")
 
 
299
  load_device = "cpu"
300
 
301
  # Create model structure without loading pretrained weights (using "meta" device)
 
310
  print(f"Moving model to {load_device} for loading...")
311
  model = model.to_empty(device=load_device)
312
 
313
+ # Check if this is an NF4 quantized model
314
+ is_nf4 = self._is_nf4_checkpoint(checkpoint_path)
315
+
316
  # Load checkpoint
317
  print(f"Loading checkpoint from {checkpoint_path}")
318
+ if is_nf4:
319
+ print("Detected NF4 quantized model, dequantizing...")
320
+ checkpoint = self._load_nf4_checkpoint(checkpoint_path)
321
+ else:
322
+ checkpoint = self._load_checkpoint_file(checkpoint_path)
323
 
324
  # Determine dtype from checkpoint and convert to float32
325
  first_tensor = next(iter(checkpoint.values()))
 
331
  print(f"Converting checkpoint from {checkpoint_dtype} to float32...")
332
  checkpoint = {k: v.float() for k, v in checkpoint.items()}
333
 
334
+ # Load weights into model
335
  model.load_state_dict(checkpoint, strict=False, assign=True)
336
  print(f"Model dtype after loading: {next(model.parameters()).dtype}")
337
+
338
+ # Free checkpoint memory
339
+ del checkpoint
340
 
341
+ # Apply bitsandbytes 4-bit quantization if requested
342
  if hasattr(self, 'use_4bit_quantization') and self.use_4bit_quantization:
343
+ try:
344
+ import bitsandbytes as bnb
345
+ print("Applying bitsandbytes NF4 quantization for 4-bit inference...")
346
+ model = self._quantize_model_bnb(model)
347
+ model._is_quantized = True
348
+ print("bitsandbytes NF4 quantization complete!")
349
+ except ImportError:
350
+ print("bitsandbytes not available, using quanto quantization...")
351
+ model = model.float()
352
+ quantize(model, weights=qint4)
353
+ freeze(model)
354
+ model._is_quantized = True
355
+ print("quanto 4-bit quantization complete!")
356
+
357
+ # Move to GPU only if NOT using DeepSpeed
358
  if not use_deepspeed:
359
+ print(f"Moving model to {self.device}...")
360
+ model = model.to(self.device)
361
 
362
  # Enable optimized attention backends
363
  try:
 
364
  torch.backends.cuda.enable_flash_sdp(True)
365
  torch.backends.cuda.enable_mem_efficient_sdp(True)
366
+ torch.backends.cuda.enable_math_sdp(False)
367
  print("Enabled FlashAttention / Memory-Efficient SDPA backends")
368
  except Exception as e:
369
  print(f"Could not configure SDPA backends: {e}")
370
 
371
  return model
372
 
373
+ def _is_nf4_checkpoint(self, path: str) -> bool:
374
+ """Check if path contains an NF4 quantized checkpoint"""
375
+ if os.path.isdir(path):
376
+ return os.path.exists(os.path.join(path, "quantization_config.json"))
377
+ return False
378
+
379
+ def _load_nf4_checkpoint(self, checkpoint_dir: str) -> dict:
380
+ """
381
+ Load NF4 quantized checkpoint and dequantize to float tensors.
382
+
383
+ Args:
384
+ checkpoint_dir: Directory containing NF4 model files
385
+
386
+ Returns:
387
+ Dequantized state dict
388
+ """
389
+ from safetensors.torch import load_file as load_safetensors
390
+
391
+ # Load quantization config
392
+ config_path = os.path.join(checkpoint_dir, "quantization_config.json")
393
+ with open(config_path, 'r') as f:
394
+ quant_config = json.load(f)
395
+
396
+ block_size = quant_config.get("block_size", 64)
397
+ quantized_keys = set(quant_config.get("quantized_keys", []))
398
+
399
+ # Load index
400
+ index_path = os.path.join(checkpoint_dir, "model_nf4.safetensors.index.json")
401
+ with open(index_path, 'r') as f:
402
+ index = json.load(f)
403
+
404
+ # Load all shards
405
+ shard_files = sorted(set(index['weight_map'].values()))
406
+ print(f"Loading {len(shard_files)} NF4 shards...")
407
+
408
+ raw_state = {}
409
+ for shard_file in shard_files:
410
+ shard_path = os.path.join(checkpoint_dir, shard_file)
411
+ print(f" Loading {shard_file}...")
412
+ shard_data = load_safetensors(shard_path)
413
+ raw_state.update(shard_data)
414
+
415
+ # NF4 lookup table for dequantization
416
+ nf4_values = torch.tensor([
417
+ -1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453,
418
+ -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0,
419
+ 0.07958029955625534, 0.16093020141124725, 0.24611230850220, 0.33791524171829224,
420
+ 0.44070982933044434, 0.5626170039176941, 0.7229568362236023, 1.0
421
+ ], dtype=torch.float32)
422
+
423
+ # Dequantize
424
+ state_dict = {}
425
+ dequant_count = 0
426
+
427
+ for key in list(raw_state.keys()):
428
+ if key.endswith('.quant_data'):
429
+ base_key = key.replace('.quant_data', '')
430
+ if base_key in quantized_keys:
431
+ # Dequantize this tensor
432
+ quant_data = raw_state[f"{base_key}.quant_data"]
433
+ scales = raw_state[f"{base_key}.scales"]
434
+ shape = raw_state[f"{base_key}.shape"].tolist()
435
+ pad_len = raw_state[f"{base_key}.pad_len"].item()
436
+
437
+ # Unpack 4-bit values
438
+ high = (quant_data >> 4) & 0x0F
439
+ low = quant_data & 0x0F
440
+ indices = torch.stack([high, low], dim=-1).flatten().long()
441
+
442
+ # Lookup and reshape
443
+ values = nf4_values[indices]
444
+
445
+ # Apply scales
446
+ num_blocks = len(scales)
447
+ values = values[:num_blocks * block_size].reshape(num_blocks, block_size)
448
+ values = values * scales.float().unsqueeze(1)
449
+ values = values.flatten()
450
+
451
+ # Remove padding and reshape
452
+ if pad_len > 0:
453
+ values = values[:-pad_len]
454
+
455
+ state_dict[base_key] = values.reshape(shape)
456
+ dequant_count += 1
457
+ elif not any(key.endswith(s) for s in ['.scales', '.shape', '.block_size', '.pad_len']):
458
+ # Non-quantized tensor, keep as-is
459
+ state_dict[key] = raw_state[key]
460
+
461
+ print(f"Dequantized {dequant_count} tensors")
462
+ return state_dict
463
+
464
+ def _quantize_model_bnb(self, model):
465
+ """
466
+ Quantize model using bitsandbytes NF4.
467
+ Replaces Linear layers with Linear4bit for true 4-bit inference.
468
+ """
469
+ import bitsandbytes as bnb
470
+ import torch.nn as nn
471
+
472
+ def replace_linear_with_4bit(module, name=''):
473
+ for child_name, child in list(module.named_children()):
474
+ full_name = f"{name}.{child_name}" if name else child_name
475
+
476
+ if isinstance(child, nn.Linear):
477
+ # Create 4-bit linear layer
478
+ new_layer = bnb.nn.Linear4bit(
479
+ child.in_features,
480
+ child.out_features,
481
+ bias=child.bias is not None,
482
+ compute_dtype=torch.bfloat16,
483
+ compress_statistics=True,
484
+ quant_type='nf4'
485
+ )
486
+ # Copy weights (will be quantized when moved to GPU)
487
+ new_layer.weight = bnb.nn.Params4bit(
488
+ child.weight.data,
489
+ requires_grad=False,
490
+ quant_type='nf4'
491
+ )
492
+ if child.bias is not None:
493
+ new_layer.bias = nn.Parameter(child.bias.data)
494
+
495
+ setattr(module, child_name, new_layer)
496
+ else:
497
+ replace_linear_with_4bit(child, full_name)
498
+
499
+ print("Replacing Linear layers with Linear4bit...")
500
+ replace_linear_with_4bit(model)
501
+ return model
502
+
503
  def _init_deepspeed(self, model):
504
  """
505
  Initialize DeepSpeed for the model with ZeRO-3 inference optimization.
requirements.txt CHANGED
@@ -7,6 +7,7 @@ safetensors>=0.4.0
7
 
8
  # Model and inference
9
  optimum-quanto
 
10
  torch
11
  torchvision
12
  timm
 
7
 
8
  # Model and inference
9
  optimum-quanto
10
+ bitsandbytes>=0.41.0
11
  torch
12
  torchvision
13
  timm
src/flux/xflux_pipeline.py CHANGED
@@ -460,12 +460,94 @@ class XFluxSampler(XFluxPipeline):
460
  self.offload = False
461
  self.ref_latent = ref_latent
462
 
463
- self.embed_tokens = AutoModel.from_pretrained(
464
- intern_vlm_path,
465
- torch_dtype=torch.float32,
466
- device_map="cpu",
467
- trust_remote_code=True
468
- ).language_model.model.embed_tokens.eval()
469
- self.embed_tokens.requires_grad_(False)
470
- self.tokenizer = AutoTokenizer.from_pretrained(
471
- intern_vlm_path, trust_remote_code=True, use_fast=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
  self.offload = False
461
  self.ref_latent = ref_latent
462
 
463
+ # Load embedding - try lightweight extracted version first, fallback to full model
464
+ self.embed_tokens, self.tokenizer = self._load_embedding(intern_vlm_path)
465
+
466
+ def _load_embedding(self, intern_vlm_path):
467
+ """
468
+ Load embedding layer and tokenizer.
469
+ Supports three modes:
470
+ 1. HuggingFace repo with internvl_embedding subfolder (e.g., TSXu/Unicalli_Pro)
471
+ 2. Lightweight: Load from extracted embedding files (embedding.safetensors + tokenizer)
472
+ 3. Full: Load from complete InternVL3 model (fallback)
473
+ """
474
+ import os
475
+ from safetensors.torch import load_file as load_safetensors
476
+
477
+ # Check if this is a HuggingFace model ID (contains '/' but not a local path)
478
+ if '/' in intern_vlm_path and not os.path.exists(intern_vlm_path):
479
+ print(f"Downloading internvl_embedding from HuggingFace: {intern_vlm_path}")
480
+ from huggingface_hub import snapshot_download
481
+ hf_token = os.environ.get("HF_TOKEN", None)
482
+
483
+ # Download only the internvl_embedding subfolder
484
+ local_dir = snapshot_download(
485
+ repo_id=intern_vlm_path,
486
+ allow_patterns=["internvl_embedding/*", "unicalli_pro_chars7_nf4/internvl_embedding/*"],
487
+ token=hf_token
488
+ )
489
+
490
+ # Check for internvl_embedding in different locations
491
+ possible_paths = [
492
+ os.path.join(local_dir, "internvl_embedding"),
493
+ os.path.join(local_dir, "unicalli_pro_chars7_nf4", "internvl_embedding"),
494
+ ]
495
+
496
+ for path in possible_paths:
497
+ if os.path.exists(path):
498
+ intern_vlm_path = path
499
+ print(f"Found internvl_embedding at: {intern_vlm_path}")
500
+ break
501
+ else:
502
+ print(f"Warning: internvl_embedding not found, falling back to full model")
503
+
504
+ # Check if this is an extracted embedding directory
505
+ embedding_file = os.path.join(intern_vlm_path, "embedding.safetensors")
506
+ config_file = os.path.join(intern_vlm_path, "embedding_config.json")
507
+
508
+ if os.path.exists(embedding_file) and os.path.exists(config_file):
509
+ # Lightweight mode: Load extracted embedding
510
+ print(f"Loading lightweight embedding from: {intern_vlm_path}")
511
+
512
+ import json
513
+ with open(config_file, 'r') as f:
514
+ config = json.load(f)
515
+
516
+ # Create embedding layer
517
+ embed_tokens = torch.nn.Embedding(
518
+ num_embeddings=config["num_embeddings"],
519
+ embedding_dim=config["embedding_dim"],
520
+ padding_idx=config.get("padding_idx", None)
521
+ )
522
+
523
+ # Load weights
524
+ state_dict = load_safetensors(embedding_file)
525
+ embed_tokens.load_state_dict(state_dict)
526
+ embed_tokens.eval()
527
+ embed_tokens.requires_grad_(False)
528
+
529
+ # Load tokenizer
530
+ tokenizer = AutoTokenizer.from_pretrained(
531
+ intern_vlm_path, trust_remote_code=True, use_fast=False
532
+ )
533
+
534
+ print(f"Loaded lightweight embedding: {config['num_embeddings']} x {config['embedding_dim']}")
535
+ return embed_tokens, tokenizer
536
+
537
+ else:
538
+ # Full mode: Load from complete InternVL3 model
539
+ print(f"Loading full InternVL3 model from: {intern_vlm_path}")
540
+
541
+ embed_tokens = AutoModel.from_pretrained(
542
+ intern_vlm_path,
543
+ torch_dtype=torch.float32,
544
+ device_map="cpu",
545
+ trust_remote_code=True
546
+ ).language_model.model.embed_tokens.eval()
547
+ embed_tokens.requires_grad_(False)
548
+
549
+ tokenizer = AutoTokenizer.from_pretrained(
550
+ intern_vlm_path, trust_remote_code=True, use_fast=False
551
+ )
552
+
553
+ return embed_tokens, tokenizer