Spaces:

Aatricks
/

LightDiffusion-Next

Running on Zero

File size: 7,900 Bytes

b701455

"""
Prompt Attention Caching
Caches CLIP embeddings for repeated prompts to avoid re-encoding.
Training-free, lossless optimization providing 5-15% speedup.
"""

import hashlib
from functools import lru_cache
import torch
import logging

# Global cache enabled flag
_cache_enabled = True

def enable_prompt_cache(enabled: bool = True):
    """Enable or disable prompt caching globally.
    
    Args:
        enabled (bool): Whether to enable caching. Defaults to True.
    """
    global _cache_enabled
    _cache_enabled = enabled
    if not enabled:
        clear_prompt_cache()
    logging.info(f"Prompt caching {'enabled' if enabled else 'disabled'}")


def is_prompt_cache_enabled() -> bool:
    """Check if prompt caching is enabled.
    
    Returns:
        bool: True if caching is enabled.
    """
    return _cache_enabled


def get_prompt_hash(prompt: str) -> int:
    """Generate a fast hash for a prompt.
    
    Uses Python's built-in hash() which is much faster than MD5
    and sufficient for cache keying (not cryptographic).
    
    Args:
        prompt (str): The text prompt.
    
    Returns:
        int: Hash of the prompt.
    """
    return hash(prompt)


def _get_clip_identity(clip) -> str:
    """Get a stable identity string for a CLIP model instance.
    
    Uses the model's checkpoint path or class name instead of id(clip)
    which changes when a model is reloaded at the same logical identity.
    
    Args:
        clip: CLIP model instance.
        
    Returns:
        str: Stable identity string.
    """
    # Try to get a stable path-based identifier
    if hasattr(clip, 'model_path') and clip.model_path:
        return f"clip:{clip.model_path}"
    if hasattr(clip, 'patcher') and hasattr(clip.patcher, 'model_path'):
        return f"clip:{clip.patcher.model_path}"
    # Fall back to class name + parameter count for stability
    try:
        param_count = sum(p.numel() for p in clip.parameters() if hasattr(clip, 'parameters'))
        return f"clip:{clip.__class__.__name__}:{param_count}"
    except Exception:
        # Last resort: use id() (not ideal but better than crashing)
        return f"clip:id:{id(clip)}"


# LRU cache with 128 slots (enough for typical session)
# Each cached entry is ~100-500KB depending on model
@lru_cache(maxsize=128)
def _cached_encode_impl(prompt_hash: str, prompt: str, clip_id: int):
    """Internal cached encoding function.
    
    Note: This is called by get_cached_encoding and should not be called directly.
    The actual encoding happens in the calling code, this just provides the cache wrapper.
    
    Args:
        prompt_hash (str): Hash of the prompt.
        prompt (str): The actual prompt text.
        clip_id (int): Unique ID of the CLIP model instance.
    
    Returns:
        None (actual encoding happens in caller)
    """
    pass


class PromptCacheEntry:
    """Container for cached prompt encoding results."""
    
    def __init__(self, cond: torch.Tensor, pooled: torch.Tensor):
        """Initialize cache entry.
        
        Args:
            cond (torch.Tensor): Conditional embedding tensor.
            pooled (torch.Tensor): Pooled output tensor.
        """
        # We don't clone here because these tensors are treated as read-only
        # by consumers, and the producer (CLIP) creates fresh tensors
        # for each encoding. This reduces memory pressure and latency.
        self.cond = cond if cond is not None else None
        self.pooled = pooled if pooled is not None else None
        self.hits = 0
    
    def get(self) -> tuple:
        """Get cached tensors (returns references for performance).
        
        Returns:
            tuple: (cond, pooled) tensors.
        """
        self.hits += 1
        # Returns direct references. Tensors are assumed to be read-only.
        return (self.cond, self.pooled)


# Secondary cache using dict for more control
_prompt_cache_dict = {}
_cache_stats = {"hits": 0, "misses": 0, "size_mb": 0.0}


def get_cached_encoding(clip, prompt: str) -> tuple:
    """Get cached encoding or encode and cache if not present.
    
    Args:
        clip: CLIP model instance.
        prompt (str): Text prompt.
    
    Returns:
        tuple: (cond, pooled) or None if caching disabled.
    """
    if not _cache_enabled:
        return None
    
    prompt_hash = get_prompt_hash(prompt)
    clip_key = _get_clip_identity(clip)
    cache_key = f"{clip_key}_{prompt_hash}"
    
    # Check if we have it cached
    if cache_key in _prompt_cache_dict:
        _cache_stats["hits"] += 1
        entry = _prompt_cache_dict[cache_key]
        cond, pooled = entry.get()
        
        if _cache_stats["hits"] % 10 == 0:  # Log every 10 hits
            hit_rate = _cache_stats["hits"] / max(1, _cache_stats["hits"] + _cache_stats["misses"])
            logging.debug(f"Prompt cache hit rate: {hit_rate:.1%} (size: {len(_prompt_cache_dict)} entries)")
        
        return (cond, pooled)
    
    # Cache miss
    _cache_stats["misses"] += 1
    return None


def cache_encoding(clip, prompt: str, cond: torch.Tensor, pooled: torch.Tensor):
    """Cache an encoding result.
    
    Args:
        clip: CLIP model instance.
        prompt (str): Text prompt.
        cond (torch.Tensor): Conditional embedding.
        pooled (torch.Tensor): Pooled output.
    """
    if not _cache_enabled:
        return
    
    prompt_hash = get_prompt_hash(prompt)
    clip_key = _get_clip_identity(clip)
    cache_key = f"{clip_key}_{prompt_hash}"
    
    # Don't cache if already present
    if cache_key in _prompt_cache_dict:
        return
    
    # Store in cache
    entry = PromptCacheEntry(cond, pooled)
    _prompt_cache_dict[cache_key] = entry
    
    # Update size estimate (rough)
    if cond is not None:
        _cache_stats["size_mb"] = len(_prompt_cache_dict) * (cond.numel() * cond.element_size() / 1024 / 1024)
    
    # Limit cache size to prevent memory issues
    max_entries = 256
    if len(_prompt_cache_dict) > max_entries:
        # Remove oldest 25% of entries (simple FIFO)
        remove_count = max_entries // 4
        keys_to_remove = list(_prompt_cache_dict.keys())[:remove_count]
        for key in keys_to_remove:
            del _prompt_cache_dict[key]
        logging.debug(f"Prompt cache pruned: removed {remove_count} old entries")


def clear_prompt_cache():
    """Clear the entire prompt cache."""
    global _prompt_cache_dict, _cache_stats
    
    old_size = len(_prompt_cache_dict)
    _prompt_cache_dict.clear()
    _cached_encode_impl.cache_clear()  # Clear LRU cache too
    _cache_stats = {"hits": 0, "misses": 0, "size_mb": 0.0}
    
    if old_size > 0:
        logging.info(f"Prompt cache cleared ({old_size} entries removed)")


def get_cache_stats() -> dict:
    """Get cache statistics.
    
    Returns:
        dict: Stats including hits, misses, hit rate, size.
    """
    total_requests = _cache_stats["hits"] + _cache_stats["misses"]
    hit_rate = _cache_stats["hits"] / max(1, total_requests)
    
    return {
        "enabled": _cache_enabled,
        "hits": _cache_stats["hits"],
        "misses": _cache_stats["misses"],
        "total_requests": total_requests,
        "hit_rate": hit_rate,
        "cache_entries": len(_prompt_cache_dict),
        "estimated_size_mb": _cache_stats["size_mb"],
    }


def print_cache_stats():
    """Print cache statistics to console."""
    stats = get_cache_stats()
    print("\n" + "="*60)
    print("Prompt Cache Statistics")
    print("="*60)
    print(f"  Status: {'Enabled' if stats['enabled'] else 'Disabled'}")
    print(f"  Entries: {stats['cache_entries']}")
    print(f"  Size: ~{stats['estimated_size_mb']:.1f} MB")
    print(f"  Requests: {stats['total_requests']} (hits: {stats['hits']}, misses: {stats['misses']})")
    print(f"  Hit Rate: {stats['hit_rate']:.1%}")
    print("="*60 + "\n")