krystv
/

LatentRecurrentFlow

+"""
+LatentRecurrentFlow (LRF) - HuggingFace-Compatible Pipeline
+Provides:
+- LRFPipeline: Full text-to-image and image-editing pipeline
+- Model save/load compatible with HF Hub
+- Diffusers-style API for easy integration
+"""
+import torch
+import torch.nn as nn
+import json
+import os
+from typing import Optional, List, Union
+from pathlib import Path
+from lrf.model import LatentRecurrentFlow
+from lrf.training import RectifiedFlowScheduler
class LRFPipeline:
    """
    LatentRecurrentFlow Pipeline for inference.

    Bundles a ``LatentRecurrentFlow`` model (moved to ``device`` and put in
    eval mode), a ``RectifiedFlowScheduler(shift=1.0)`` and an optional
    tokenizer.

    Usage:
        pipe = LRFPipeline.from_pretrained("path/to/model")
        images = pipe("a photo of a cat", num_steps=20)
        # Or for editing:
        images = pipe("make the cat blue", image=source_image, num_steps=20)
    """

    def __init__(
        self,
        model: "LatentRecurrentFlow",
        tokenizer=None,
        device: torch.device = torch.device('cpu'),
    ):
        """Store the model (on ``device``, eval mode), scheduler and tokenizer.

        Args:
            model: The LatentRecurrentFlow network to run.
            tokenizer: Optional callable tokenizer; when ``None`` the
                hash-based fallback in ``_simple_tokenize`` is used.
            device: Device the model is moved to.
        """
        self.model = model.to(device)
        self.model.eval()
        self.device = device
        self.scheduler = RectifiedFlowScheduler(shift=1.0)
        self.tokenizer = tokenizer

    @classmethod
    def from_pretrained(cls, path: str, device: str = 'cpu', tokenizer=None):
        """Load a pipeline from a local directory.

        Reads ``config.json`` (falling back to
        ``LatentRecurrentFlow.default_config()`` when the file is absent),
        builds the model, then loads ``model.safetensors`` or, failing that,
        ``model.pt``. A ``model.pt`` checkpoint may be either a bare state
        dict or a dict holding it under ``'model_state'``.

        NOTE(review): ``path`` is treated purely as a filesystem path; this
        method performs no Hub download itself.

        Args:
            path: Directory containing the saved files.
            device: Device string the model is moved to.
            tokenizer: Optional tokenizer forwarded to the pipeline.

        Returns:
            A new pipeline instance.
        """
        import warnings

        path = Path(path)
        device = torch.device(device)

        # Load config
        config_path = path / 'config.json'
        if config_path.exists():
            with open(config_path, encoding='utf-8') as f:
                config = json.load(f)
        else:
            config = LatentRecurrentFlow.default_config()

        # Create model
        model = LatentRecurrentFlow(config)

        # Load weights if available (safetensors takes priority over .pt)
        weights_path = path / 'model.safetensors'
        pt_path = path / 'model.pt'
        if weights_path.exists():
            from safetensors.torch import load_file
            model.load_state_dict(load_file(str(weights_path)))
        elif pt_path.exists():
            checkpoint = torch.load(str(pt_path), map_location='cpu', weights_only=True)
            # Training checkpoints nest the weights under 'model_state'.
            if 'model_state' in checkpoint:
                checkpoint = checkpoint['model_state']
            model.load_state_dict(checkpoint)
        else:
            # Still return a usable pipeline, but make the situation visible:
            # generating with untrained weights yields noise.
            warnings.warn(
                f"No model.safetensors or model.pt found in '{path}'; "
                "the model keeps its freshly initialised weights.",
                stacklevel=2,
            )
        return cls(model=model, tokenizer=tokenizer, device=device)

    def save_pretrained(self, path: str):
        """Save config, weights and a generated README to a directory.

        Writes ``config.json``, then ``model.safetensors`` if the
        ``safetensors`` package can be imported (otherwise ``model.pt`` via
        ``torch.save``), then ``README.md``. The directory is created if
        needed.

        Args:
            path: Target directory.
        """
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)

        # Save config
        with open(path / 'config.json', 'w', encoding='utf-8') as f:
            json.dump(self.model.config, f, indent=2)

        # Save weights; only the import is guarded so that genuine save
        # errors are not mistaken for a missing dependency.
        try:
            from safetensors.torch import save_file
        except ImportError:
            torch.save(self.model.state_dict(), str(path / 'model.pt'))
        else:
            save_file(self.model.state_dict(), str(path / 'model.safetensors'))

        # Save README. It contains non-ASCII characters, so the encoding is
        # pinned instead of depending on the platform locale.
        with open(path / 'README.md', 'w', encoding='utf-8') as f:
            f.write(self._generate_readme())

    def _generate_readme(self):
        """Build the model-card markdown from parameter counts and config."""
        counts = self.model.count_parameters()
        cfg = self.model.config
        num_blocks = cfg.get('num_blocks', 4)
        effective_depth = cfg.get('T_outer', 2) * cfg.get('T_inner', 4) * num_blocks
        return f"""---
tags:
  - image-generation
  - latent-recurrent-flow
  - lrf
  - mobile-first
  - flow-matching
  - recursive-reasoning
library_name: lrf
pipeline_tag: text-to-image
---
# LatentRecurrentFlow (LRF)
A novel mobile-first image generation architecture combining:
- **Recursive Latent Refinement (RLR)** core — HRM-inspired iterative reasoning
- **Gated Linear Diffusion (GLD)** blocks — O(N) subquadratic spatial mixing
- **Compact f=16 VAE** with tiny decoder
- **Rectified flow** training objective
## Model Details
| Component | Parameters |
|-----------|-----------|
| VAE Encoder | {counts['vae_encoder']:,} |
| VAE Decoder | {counts['vae_decoder']:,} |
| Text Encoder | {counts['text_encoder']:,} |
| Denoising Core | {counts['core']:,} |
| **Total** | **{counts['total']:,}** |
## Architecture Innovations
1. **Recursive Latent Refinement**: Same parameter blocks applied T_outer × T_inner times,
   giving effective depth of {effective_depth} layers
   from only {num_blocks} unique parameter sets.
2. **Gated Linear Attention**: O(N) bidirectional scan with token-differential operators
   and 2D locality injection — replaces quadratic self-attention.
3. **IFT Training**: O(1) memory backpropagation through arbitrary recursion depth.
## Usage
```python
from lrf.pipeline import LRFPipeline
pipe = LRFPipeline.from_pretrained("path/to/model")
images = pipe("a beautiful sunset over the ocean", num_steps=20)
```
"""

    def _simple_tokenize(self, text: str, max_length: int = 77) -> tuple:
        """Tokenize one prompt into ``(input_ids, attention_mask)`` tensors.

        When a tokenizer was supplied it is called with max-length padding
        and truncation and its ``input_ids`` / ``attention_mask`` are
        returned unchanged. Otherwise a prototype fallback is used: the
        lower-cased text is split on whitespace and every word is mapped to
        an id in ``1..31998`` by a CRC32 checksum; id ``0`` is padding.

        Args:
            text: Prompt to tokenize.
            max_length: Fixed output length (longer prompts are truncated).

        Returns:
            Tuple of two tensors of shape ``[1, max_length]``; for the
            fallback path the ids are ``long`` and the mask is ``float``.
        """
        if self.tokenizer is not None:
            tokens = self.tokenizer(text, max_length=max_length, padding='max_length',
                                     truncation=True, return_tensors='pt')
            return tokens['input_ids'], tokens['attention_mask']

        import zlib

        # Fallback: hash-based word ids. The builtin hash() is salted per
        # process for str, which would give different ids on every run;
        # CRC32 is stable across processes and platforms.
        token_ids = [
            zlib.crc32(word.encode('utf-8')) % 31998 + 1
            for word in text.lower().split()
        ][:max_length]

        # Pad up to max_length; the mask marks real tokens with 1.0.
        pad = max_length - len(token_ids)
        attention_mask = [1.0] * len(token_ids) + [0.0] * pad
        token_ids = token_ids + [0] * pad
        return (
            torch.tensor([token_ids], dtype=torch.long),
            torch.tensor([attention_mask], dtype=torch.float),
        )

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]],
        image: Optional[torch.Tensor] = None,
        num_steps: int = 20,
        cfg_scale: float = 7.5,
        height: int = 256,
        width: int = 256,
        seed: Optional[int] = None,
    ) -> torch.Tensor:
        """
        Generate images from text prompts.

        Args:
            prompt: Text prompt or list of prompts
            image: Optional source image for editing [B, 3, H, W] in [-1, 1]
            num_steps: Number of sampling steps (4-50, default 20)
            cfg_scale: Classifier-free guidance scale
            height, width: Output image size; the latent grid is
                ``height // 16`` by ``width // 16``
            seed: Random seed for reproducibility (seeds torch's global RNG)

        Returns:
            images: Tensor [B, 3, H, W] in [-1, 1]
        """
        if seed is not None:
            torch.manual_seed(seed)

        # Handle string input
        if isinstance(prompt, str):
            prompt = [prompt]
        B = len(prompt)

        # Tokenize each prompt and stack along the batch dimension
        tokenized = [self._simple_tokenize(p) for p in prompt]
        token_ids = torch.cat([ids for ids, _ in tokenized], dim=0).to(self.device)
        attention_mask = torch.cat([mask for _, mask in tokenized], dim=0).to(self.device)

        # Encode text
        text_emb, text_global = self.model.encode_text(token_ids, attention_mask)

        # Compute latent size (spatial downsampling factor of 16)
        latent_h = height // 16
        latent_w = width // 16
        C = self.model.config['latent_channels']

        # Handle editing: encode source image. Gradients are already disabled
        # by the decorator, so no nested no_grad context is needed.
        image_cond = None
        if image is not None:
            image = image.to(self.device)
            image_cond, _, _ = self.model.encode_image(image)
        # NOTE(review): image_cond is not forwarded to scheduler.sample()
        # below, so as written the source image does not appear to influence
        # the result. Confirm how the sampler expects the conditioning before
        # wiring it in.

        # Sample
        shape = (B, C, latent_h, latent_w)
        z = self.scheduler.sample(
            self.model, shape, text_emb, text_global,
            num_steps=num_steps, cfg_scale=cfg_scale, device=self.device,
        )

        # Decode
        images = self.model.decode_latent(z)
        return images.clamp(-1, 1)

    def to(self, device):
        """Move pipeline to device and return ``self`` for chaining."""
        self.device = torch.device(device)
        self.model = self.model.to(self.device)
        return self
class LRFTrainingPipeline:
    """
    Complete training pipeline with staged curriculum.

    Stages:
    1. VAE pre-training (or use pre-trained DC-AE)
    2. Flow matching denoiser training
    3. Consistency distillation for few-step
    4. Editing fine-tuning

    Each entry of ``STAGE_CONFIGS`` describes one stage: a human-readable
    description, which components are frozen / trained, the learning rate,
    a minimum step count and, for the flow stages, the training resolution.
    """

    STAGE_CONFIGS = {
        'vae': {
            'description': 'Train VAE for image compression',
            'freeze': [],
            'train': ['vae'],
            'lr': 1e-4,
            'min_steps': 50000,
        },
        'flow_lowres': {
            'description': 'Flow matching at 64x64 (composition learning)',
            'freeze': ['vae'],
            'train': ['core', 'text_encoder'],
            'lr': 1e-4,
            'resolution': 64,
            'min_steps': 100000,
        },
        'flow_midres': {
            'description': 'Flow matching at 256x256 (texture learning)',
            'freeze': ['vae'],
            'train': ['core', 'text_encoder'],
            'lr': 5e-5,
            'resolution': 256,
            'min_steps': 200000,
        },
        'flow_highres': {
            'description': 'Flow matching at 512x512 (detail learning)',
            'freeze': ['vae'],
            'train': ['core', 'text_encoder'],
            'lr': 2e-5,
            'resolution': 512,
            'min_steps': 100000,
        },
        'consistency': {
            'description': 'Consistency distillation for 4-step generation',
            'freeze': ['vae', 'text_encoder'],
            'train': ['core'],
            'lr': 1e-5,
            'min_steps': 50000,
        },
        'editing': {
            'description': 'Fine-tune for editing tasks',
            'freeze': ['vae'],
            'train': ['core', 'text_encoder'],
            'lr': 1e-5,
            'min_steps': 50000,
        },
    }

    # Order in which the stages above are meant to be run.
    _CURRICULUM = (
        'vae',
        'flow_lowres',
        'flow_midres',
        'flow_highres',
        'consistency',
        'editing',
    )

    @classmethod
    def get_stage_config(cls, stage_name: str) -> dict:
        """Return the configuration for ``stage_name``.

        The result is an independent deep copy, so callers may tweak it
        (e.g. override ``lr`` or extend ``train``) without altering the
        shared class-level ``STAGE_CONFIGS``.

        Args:
            stage_name: Key into ``STAGE_CONFIGS``.

        Returns:
            The stage's config dict, or an empty dict for an unknown stage.
        """
        import copy

        return copy.deepcopy(cls.STAGE_CONFIGS.get(stage_name, {}))

    @classmethod
    def get_curriculum(cls) -> list:
        """Return the full training curriculum as a fresh list of stage names."""
        return list(cls._CURRICULUM)